% Conference paper: Wang, Xiao and Lee, "Crowd-based deduplication: An adaptive approach" (SIGMOD 2015).
% The citation key looks export-generated; keep it unchanged so existing citations still resolve.
% Authors are written uniformly as "Last, First". The braces around the third author's given
% name keep "Chun Hee" together as a single name token.
% Fields a bibliography style does not know about (e.g. day, language, keywords, abstract)
% are ignored by that style and serve as metadata only.
@inproceedings{3335a37824a5404e97dd3b3aaaa29230,
  author    = {Wang, Sibo and Xiao, Xiaokui and Lee, {Chun Hee}},
  title     = {Crowd-based deduplication: An adaptive approach},
  booktitle = {SIGMOD 2015 - Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data},
  series    = {Proceedings of the ACM SIGMOD International Conference on Management of Data},
  publisher = {Association for Computing Machinery},
  pages     = {1263--1277},
  year      = {2015},
  month     = may,
  day       = {27},
  doi       = {10.1145/2723372.2723739},
  language  = {English},
  keywords  = {Correlating clustering, Crowdsourcing, Data deduplication},
  abstract  = {Data deduplication stands as a building block for data integration and data cleaning. The state-of-the-art techniques focus on how to exploit crowdsourcing to improve the accuracy of deduplication. However, they either incur significant overheads on the crowd or offer inferior accuracy. This paper presents ACD, a new crowd-based algorithm for data deduplication. The basic idea of ACD is to adopt correlation clustering (which is a classic machine-based algorithm for data deduplication) under a crowd-based setting. We propose non-trivial techniques to reduce the time required in performing correlation clustering with the crowd, and devise methods to postprocess the results of correlation clustering for better accuracy of deduplication. With extensive experiments on the Amazon Mechanical Turk, we demonstrate that ACD outperforms the states of the art by offering a high precision of deduplication while incurring moderate crowdsourcing overheads.},
  note      = {ACM SIGMOD International Conference on Management of Data, SIGMOD 2015 ; Conference date: 31-05-2015 Through 04-06-2015}
}