From f7b4b38a05860d8dcfd25d24522c506d9ffa468c Mon Sep 17 00:00:00 2001 From: Dmitry Ustalov Date: Sat, 6 Apr 2024 11:20:42 +0200 Subject: [PATCH] JOSS paper is out! --- CITATION.cff | 10 +- README.md | 18 +- joss/paper.bib | 476 ------------------------------------------------- joss/paper.md | 193 -------------------- 4 files changed, 22 insertions(+), 675 deletions(-) delete mode 100644 joss/paper.bib delete mode 100644 joss/paper.md diff --git a/CITATION.cff b/CITATION.cff index 737330d..a48c02c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -27,7 +27,7 @@ keywords: - data quality license: Apache-2.0 preferred-citation: - type: generic + type: article authors: - family-names: Ustalov given-names: Dmitry @@ -39,6 +39,14 @@ preferred-citation: family-names: Tseitlin orcid: "https://orcid.org/0000-0001-8553-4260" title: "Learning from Crowds with Crowd-Kit" + year: 2024 + journal: Journal of Open Source Software + volume: 9 + issue: 96 + start: 6227 + end: 6227 + doi: "10.21105/joss.06227" + issn: 2475-9066 identifiers: - type: other value: "arXiv:2109.08584" diff --git a/README.md b/README.md index ff77bd2..594a5da 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![GitHub Tests][github_tests_badge]][github_tests_link] [![Codecov][codecov_badge]][codecov_link] [![Documentation][docs_badge]][docs_link] +[![Paper][paper_badge]][paper_link] [pypi_badge]: https://badge.fury.io/py/crowd-kit.svg [pypi_link]: https://pypi.python.org/pypi/crowd-kit @@ -15,6 +16,8 @@ [codecov_link]: https://codecov.io/gh/Toloka/crowd-kit [docs_badge]: https://readthedocs.org/projects/crowd-kit/badge/ [docs_link]: https://crowd-kit.readthedocs.io/ +[paper_badge]: https://joss.theoj.org/papers/10.21105/joss.06227/status.svg +[paper_link]: https://doi.org/10.21105/joss.06227 **Crowd-Kit** is a powerful Python library that implements commonly-used aggregation methods for crowdsourced annotation and offers the relevant metrics and datasets. We strive to implement functionality that simplifies working with crowdsourced data. @@ -117,18 +120,23 @@ Below is the list of currently implemented methods, including the already availa ## Citation -* Ustalov D., Pavlichenko N., Tseitlin B. [Learning from Crowds with Crowd-Kit](https://arxiv.org/abs/2109.08584). 2023. arXiv: [2109.08584 [cs.HC]](https://arxiv.org/abs/2109.08584). +* Ustalov D., Pavlichenko N., Tseitlin B. (2024). [Learning from Crowds with Crowd-Kit](https://doi.org/10.21105/joss.06227). Journal of Open Source Software, 9(96), 6227 ```bibtex -@misc{CrowdKit, +@article{CrowdKit, author = {Ustalov, Dmitry and Pavlichenko, Nikita and Tseitlin, Boris}, title = {{Learning from Crowds with Crowd-Kit}}, - year = {2023}, - publisher = {arXiv}, + year = {2024}, + journal = {Journal of Open Source Software}, + volume = {9}, + number = {96}, + pages = {6227}, + publisher = {The Open Journal}, + doi = {10.21105/joss.06227}, + issn = {2475-9066}, eprint = {2109.08584}, eprinttype = {arxiv}, eprintclass = {cs.HC}, - url = {https://arxiv.org/abs/2109.08584}, language = {english}, } ``` diff --git a/joss/paper.bib b/joss/paper.bib deleted file mode 100644 index aa1bb06..0000000 --- a/joss/paper.bib +++ /dev/null @@ -1,476 +0,0 @@ -@inproceedings{Bernstein:10, - author = {Bernstein, Michael S. and Little, Greg and Miller, Robert C. and Hartmann, Bj\"{o}rn and Ackerman, Mark S. and Karger, David R. 
and Crowell, David and Panovich, Katrina}, - title = {{Soylent: A Word Processor with a Crowd Inside}}, - year = {2010}, - booktitle = {Proceedings of the 23Nd Annual ACM Symposium on User Interface Software and Technology}, - series = {UIST '10}, - pages = {313--322}, - address = {New York, NY, USA}, - publisher = {ACM}, - doi = {10.1145/1866029.1866078}, - isbn = {978-1-4503-0271-5}, - language = {english}, -} - -@article{Bradley:52, - author = {Bradley, Ralph Allan and Terry, Milton E.}, - title = {{Rank Analysis of Incomplete Block Designs: I. The Method of Paired Comparisons}}, - year = {1952}, - journal = {Biometrika}, - volume = {39}, - number = {3/4}, - pages = {324--345}, - publisher = {Oxford University Press, Biometrika Trust}, - doi = {10.2307/2334029}, - issn = {0006-3444}, - language = {english}, -} - -@inproceedings{Buckley:10, - author = {Buckley, Chris and Lease, Matthew and Smucker, Mark D.}, - title = {{Overview of the TREC 2010 Relevance Feedback Track (Notebook)}}, - year = {2010}, - booktitle = {The Nineteenth TREC Notebook}, - url = {https://www.ischool.utexas.edu/~ml/papers/trec-notebook-2010.pdf}, - language = {english}, -} - -@inproceedings{Bugakova:19, - author = {Bugakova, Nadezhda and Fedorova, Valentina and Gusev, Gleb and Drutsa, Alexey}, - title = {{Aggregation of pairwise comparisons with reduction of biases}}, - year = {2019}, - booktitle = {2019 ICML Workshop on Human in the Loop Learning}, - series = {HILL~2019}, - url = {https://arxiv.org/abs/1906.03711}, - numpages = {8}, - eprint = {1906.03711}, - eprinttype = {arxiv}, - eprintclass = {cs.HC}, - language = {english}, -} - -@inproceedings{Chen:13, - author = {Chen, Xi and Bennett, Paul N. and Collins-Thompson, Kevyn and Horvitz, Eric}, - title = {{Pairwise Ranking Aggregation in a Crowdsourced Setting}}, - year = {2013}, - booktitle = {Proceedings of the Sixth ACM International Conference on Web Search and Data Mining}, - series = {WSDM '13}, - pages = {193--202}, - address = {Rome, Italy}, - publisher = {Association for Computing Machinery}, - doi = {10.1145/2433396.2433420}, - isbn = {9781450318693}, - language = {english}, -} - -@article{Chu:21, - author = {Chu, Zhendong and Ma, Jing and Wang, Hongning}, - title = {{Learning from Crowds by Modeling Common Confusions}}, - year = {2021}, - journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, - volume = {35}, - number = {7}, - pages = {5832--5840}, - doi = {10.1609/aaai.v35i7.16730}, - language = {english}, -} - -@article{Dawid:79, - author = {Dawid, Alexander Philip and Skene, Allan M.}, - title = {{Maximum Likelihood Estimation of Observer Error-Rates Using the EM Algorithm}}, - year = {1979}, - journal = {Journal of the Royal Statistical Society, Series~C (Applied Statistics)}, - volume = {28}, - number = {1}, - pages = {20--28}, - publisher = {Wiley}, - doi = {10.2307/2346806}, - issn = {0035-9254}, - language = {english}, -} - -@misc{Drutsa:21, - author = {Drutsa, Alexey and Ustalov, Dmitry and Popov, Nikita and Baidakova, Daria}, - title = {{Improving Web Ranking with Human-in-the-Loop: Methodology, Scalability, Evaluation}}, - year = {2021}, - url = {https://research.yandex.com/tutorials/crowd/www-2021}, - note = {Tutorial at TheWebConf 2021 (WWW '21)}, - language = {english}, -} - -@inproceedings{Fiscus:97, - author = {Fiscus, Jonathan G.}, - title = {{A post-processing system to yield reduced word error rates: Recognizer Output Voting Error Reduction (ROVER)}}, - year = {1997}, - booktitle = {1997 IEEE Workshop on 
Automatic Speech Recognition and Understanding Proceedings}, - pages = {347--354}, - address = {Santa Barbara, CA, USA}, - publisher = {IEEE}, - doi = {10.1109/ASRU.1997.659110}, - language = {english}, -} - -@article{Hochreiter:97, - author = {Hochreiter, Sepp and Schmidhuber, J\"{u}rgen}, - title = {{Long Short-Term Memory}}, - year = {1997}, - journal = {Neural Computation}, - volume = {9}, - number = {8}, - pages = {1735--1780}, - doi = {10.1162/neco.1997.9.8.1735}, - language = {english}, -} - -@inproceedings{Hovy:13, - author = {Hovy, Dirk and Berg-Kirkpatrick, Taylor and Vaswani, Ashish and Hovy, Eduard}, - title = {{Learning Whom to Trust with MACE}}, - year = {2013}, - booktitle = {Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, - series = {NAACL-HLT~2013}, - pages = {1120--1130}, - address = {Atlanta, GA, USA}, - publisher = {Association for Computational Linguistics}, - url = {https://aclanthology.org/N13-1132}, - language = {english}, -} - -@techreport{JungLinLee:18, - author = {Jung-Lin Lee, Doris and Das Sarma, Akash and Parameswaran, Aditya}, - title = {{Quality Evaluation Methods for Crowdsourced Image Segmentation}}, - year = {2018}, - publisher = {Stanford InfoLab}, - url = {http://ilpubs.stanford.edu:8090/1161/}, - type = {Technical Report}, - institution = {Stanford University}, - language = {english}, -} - -@article{Karger:14, - author = {Karger, David R. and Oh, Sewoong and Shah, Devavrat}, - title = {{Budget-Optimal Task Allocation for Reliable Crowdsourcing Systems}}, - year = {2014}, - journal = {Operations Research}, - volume = {62}, - number = {1}, - pages = {1--24}, - publisher = {INFORMS}, - doi = {10.1287/opre.2013.1235}, - issn = {0030-364X}, - language = {english}, -} - -@book{Krippendorff:18, - author = {Krippendorff, Klaus}, - title = {{Content Analysis: An Introduction to Its Methodology}}, - year = {2018}, - address = {Thousand Oaks, CA, USA}, - publisher = {SAGE Publications, Inc}, - isbn = {978-1-5063-9566-1}, - edition = {Fourth Edition}, - numpages = {472}, - language = {english}, -} - -@techreport{Krizhevsky:09, - author = {Krizhevsky, Alex}, - title = {{Learning Multiple Layers of Features from Tiny Images}}, - year = {2009}, - address = {Toronto, ON, Canada}, - url = {https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf}, - institution = {University of Toronto}, - language = {english}, -} - -@inproceedings{Li:19, - author = {Li, Jiyi and Fukumoto, Fumiyo}, - title = {{A Dataset of Crowdsourced Word Sequences: Collections and Answer Aggregation for Ground Truth Creation}}, - year = {2019}, - booktitle = {Proceedings of the First Workshop on Aggregating and Analysing Crowdsourced Annotations for NLP}, - series = {AnnoNLP '19}, - pages = {24--28}, - address = {Hong Kong}, - publisher = {Association for Computational Linguistics}, - doi = {10.18653/v1/D19-5904}, - language = {english}, -} - -@inproceedings{Li:20, - author = {Li, Jiyi}, - title = {{Crowdsourced Text Sequence Aggregation Based on Hybrid Reliability and Representation}}, - year = {2020}, - booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval}, - series = {SIGIR '20}, - pages = {1761--1764}, - address = {Virtual Event, China}, - publisher = {Association for Computing Machinery}, - doi = {10.1145/3397271.3401239}, - isbn = {9781450380164}, - language = {english}, -} - -@inproceedings{Lin:14, - author = {Lin, 
Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence}, - title = {{Microsoft COCO: Common Objects in Context}}, - year = {2014}, - booktitle = {Computer Vision -- ECCV 2014}, - pages = {740--755}, - address = {Cham, Switzerland}, - publisher = {Springer International Publishing}, - doi = {10.1007/978-3-319-10602-1_48}, - isbn = {978-3-319-10602-1}, - language = {english}, -} - -@misc{Liu:19, - author = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin}, - title = {{RoBERTa: A Robustly Optimized BERT Pretraining Approach}}, - year = {2019}, - url = {https://arxiv.org/abs/1907.11692}, - eprint = {1907.11692}, - eprinttype = {arxiv}, - eprintclass = {cs.CL}, - language = {english}, -} - -@inproceedings{Ma:20, - author = {Ma, Qianqian and Olshevsky, Alex}, - title = {{Adversarial Crowdsourcing Through Robust Rank-One Matrix Completion}}, - year = {2020}, - booktitle = {Advances in Neural Information Processing Systems~33}, - pages = {21841--21852}, - publisher = {Curran Associates, Inc.}, - url = {https://proceedings.neurips.cc/paper/2020/file/f86890095c957e9b949d11d15f0d0cd5-Paper.pdf}, - language = {english}, -} - -@inproceedings{Maas:11, - author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, - title = {{Learning Word Vectors for Sentiment Analysis}}, - year = {2011}, - booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, - pages = {142--150}, - address = {Portland, Oregon, USA}, - publisher = {Association for Computational Linguistics}, - url = {https://aclanthology.org/P11-1015}, - language = {english}, -} - -@phdthesis{Malinin:19, - author = {Malinin, Andrey}, - title = {{Uncertainty Estimation in Deep Learning with application to Spoken Language Assessment}}, - year = {2019}, - address = {Cambridge, England, UK}, - doi = {10.17863/CAM.45912}, - school = {University of Cambridge}, - language = {english}, -} - -@inproceedings{Marge:10, - author = {Marge, Matthew and Banerjee, Satanjeev and Rudnicky, Alexander I.}, - title = {{Using the Amazon Mechanical Turk for transcription of spoken language}}, - year = {2010}, - booktitle = {2010 IEEE International Conference on Acoustics, Speech and Signal Processing}, - series = {ICASSP~2010}, - pages = {5270--5273}, - address = {Dallas, TX, USA}, - publisher = {IEEE}, - doi = {10.1109/ICASSP.2010.5494979}, - language = {english}, -} - -@inproceedings{McKinney:10, - author = {McKinney, Wes}, - title = {{Data Structures for Statistical Computing in Python}}, - year = {2010}, - booktitle = {Proceedings of the 9th Python in Science Conference}, - pages = {56--61}, - doi = {10.25080/Majora-92bf1922-00a}, - language = {english}, -} - -@inproceedings{Pavlichenko:21:crowdspeech, - author = {Pavlichenko, Nikita and Stelmakh, Ivan and Ustalov, Dmitry}, - title = {{CrowdSpeech and Vox~DIY: Benchmark Dataset for Crowdsourced Audio Transcription}}, - year = {2021}, - booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks}, - url = {https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/65ded5353c5ee48d0b7d48c591b8f430-Abstract-round1.html}, - numpages = {14}, - eprint = {2107.01091}, - eprinttype = {arxiv}, - eprintclass = {cs.SD}, - language = 
{english}, -} - -@inproceedings{Pavlichenko:21:sbs, - author = {Pavlichenko, Nikita and Ustalov, Dmitry}, - title = {{IMDB-WIKI-SbS: An Evaluation Dataset for Crowdsourced Pairwise Comparisons}}, - year = {2021}, - booktitle = {NeurIPS Data-Centric AI Workshop}, - series = {DCAI~2021}, - url = {https://datacentricai.org/neurips21/papers/115_CameraReady_NeurIPS_2021_Data_Centric_AI_IMDB_WIKI_SbS-2.pdf}, - numpages = {5}, - eprint = {2110.14990}, - eprinttype = {arxiv}, - eprintclass = {cs.HC}, - language = {english}, -} - -@inproceedings{Paszke:19, - author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith}, - title = {{PyTorch: An Imperative Style, High-Performance Deep Learning Library}}, - year = {2019}, - booktitle = {Advances in Neural Information Processing Systems}, - volume = {32}, - pages = {}, - publisher = {Curran Associates, Inc.}, - url = {https://proceedings.neurips.cc/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf}, -} - -@article{Pedregosa:11, - author = {Pedregosa, Fabian and Varoquaux, Ga\"{e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, \'{E}douard}, - title = {{Scikit-learn: Machine Learning in Python}}, - year = {2011}, - journal = {Journal of Machine Learning Research}, - volume = {12}, - number = {85}, - pages = {2825--2830}, - issn = {1532-4435}, - url = {https://jmlr.org/papers/v12/pedregosa11a.html}, - language = {english}, -} - -@article{Rodrigo:19, - author = {Rodrigo, Enrique G. and Aledo, Juan A. 
and G\'{a}mez, Jos\'{e} A.}, - title = {{spark-crowd: A Spark Package for Learning from Crowdsourced Big Data}}, - year = {2019}, - journal = {Journal of Machine Learning Research}, - volume = {20}, - pages = {1--5}, - issn = {1532-4435}, - url = {https://jmlr.org/papers/v20/17-743.html}, - language = {english}, -} - -@article{Rodrigues:18, - author = {Rodrigues, Filipe and Pereira, Francisco C.}, - title = {{Deep Learning from Crowds}}, - year = {2018}, - journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, - volume = {32}, - number = {1}, - pages = {1611--1618}, - address = {New Orleans, LA, USA}, - publisher = {AAAI Press}, - doi = {10.1609/aaai.v32i1.11506}, - isbn = {978-1-57735-800-8}, - language = {english}, -} - -@article{Rothe:18, - author = {Rothe, Rasmus and Timofte, Radu and Van Gool, Luc}, - title = {{Deep Expectation of Real and Apparent Age from a Single Image Without Facial Landmarks}}, - year = {2018}, - journal = {International Journal of Computer Vision}, - volume = {126}, - number = {2}, - pages = {144--157}, - doi = {10.1007/s11263-016-0940-3}, - issn = {1573-1405}, - language = {english}, -} - -@article{Sheshadri:13, - author = {Sheshadri, Aashish and Lease, Matthew}, - title = {{SQUARE: A Benchmark for Research on Computing Crowd Consensus}}, - year = {2013}, - journal = {Proceedings of the AAAI Conference on Human Computation and Crowdsourcing}, - volume = {1}, - number = {1}, - pages = {156--164}, - doi = {10.1609/hcomp.v1i1.13088}, - language = {english}, -} - -@misc{Simonyan:15, - author = {Simonyan, Karen and Zisserman, Andrew}, - title = {{Very Deep Convolutional Networks for Large-Scale Image Recognition}}, - year = {2015}, - booktitle = {3rd International Conference on Learning Representations, ICLR~2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings}, - url = {https://arxiv.org/abs/1409.1556}, - eprint = {1409.1556}, - eprinttype = {arxiv}, - eprintclass = {cs.CV}, - language = {english}, -} - -@misc{TlkAgg, - author = {{Toloka}}, - title = {{Toloka Public Datasets for Machine Learning and Data Science}}, - year = {2021}, - url = {https://toloka.ai/datasets}, - language = {english}, -} - -@misc{Wawa, - author = {{Appen Limited}}, - title = {{Calculating Worker Agreement with Aggregate (Wawa)}}, - year = {2021}, - url = {https://success.appen.com/hc/en-us/articles/202703205-Calculating-Worker-Agreement-with-Aggregate-Wawa-}, - language = {english}, -} - -@incollection{Whitehill:09, - author = {Whitehill, Jacob and Wu, Ting-fan and Bergsma, Jacob and Movellan, Javier R. and Ruvolo, Paul L.}, - title = {{Whose Vote Should Count More: Optimal Integration of Labels from Labelers of Unknown Expertise}}, - year = {2009}, - booktitle = {Advances in Neural Information Processing Systems 22}, - series = {NIPS~2009}, - pages = {2035--2043}, - address = {Vancouver, BC, Canada}, - publisher = {Curran Associates, Inc.}, - isbn = {978-1-61567-911-9}, - url = {https://papers.nips.cc/paper/3644-whose-vote-should-count-more-optimal-integration-of-labels-from-labelers-of-unknown-expertise.pdf}, - language = {english}, -} - -@article{Zhang:15, - author = {Zhang, Jing and Sheng, Victor S. and Nicholson, Bryce A. 
and Wu, Xindong}, - title = {{CEKA: A Tool for Mining the Wisdom of Crowds}}, - year = {2015}, - journal = {Journal of Machine Learning Research}, - volume = {16}, - number = {88}, - pages = {2853--2858}, - issn = {1532-4435}, - url = {https://jmlr.org/papers/v16/zhang15a.html}, - language = {english}, -} - -@article{Zhdanovskaya:23, - author = {Zhdanovskaya, Anastasia and Baidakova, Daria and Ustalov, Dmitry}, - title = {{Data Labeling for Machine Learning Engineers: Project-Based Curriculum and Data-Centric Competitions}}, - year = {2023}, - journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, - volume = {37}, - number = {13}, - pages = {15886--15893}, - publisher = {AAAI Press}, - issn = {2374-3468}, - doi = {10.1609/aaai.v37i13.26886}, - language = {english}, -} - -@article{Zheng:17, - author = {Zheng, Yudian and Li, Guoliang and Li, Yuanbing and Shan, Caihua and Cheng, Reynold}, - title = {{Truth Inference in Crowdsourcing: Is the Problem Solved?}}, - year = {2017}, - journal = {Proceedings of the VLDB Endowment}, - volume = {10}, - number = {5}, - pages = {541--552}, - publisher = {VLDB Endowment}, - doi = {10.14778/3055540.3055547}, - issn = {2150-8097}, - language = {english}, -} diff --git a/joss/paper.md b/joss/paper.md deleted file mode 100644 index faa4f1c..0000000 --- a/joss/paper.md +++ /dev/null @@ -1,193 +0,0 @@ ---- -title: 'Learning from Crowds with Crowd-Kit' -tags: - - Python - - crowdsourcing - - data labeling - - answer aggregation - - truth inference - - learning from crowds - - machine learning - - quality control - - data quality -authors: - - name: Dmitry Ustalov - orcid: 0000-0002-9979-2188 - affiliation: 1 - corresponding: true - - name: Nikita Pavlichenko - orcid: 0000-0002-7330-393X - affiliation: 2 - - name: Boris Tseitlin - orcid: 0000-0001-8553-4260 - affiliation: 3 -affiliations: - - name: JetBrains, Serbia - index: 1 - - name: JetBrains, Germany - index: 2 - - name: Planet Farms, Portugal - index: 3 -date: 24 September 2023 -bibliography: paper.bib ---- - -# Summary - -This paper presents Crowd-Kit, a general-purpose computational quality control toolkit for crowdsourcing. Crowd-Kit provides efficient and convenient implementations of popular quality control algorithms in Python, including methods for truth inference, deep learning from crowds, and data quality estimation. Our toolkit supports multiple modalities of answers and provides dataset loaders and example notebooks for faster prototyping. We extensively evaluated our toolkit on several datasets of different natures, enabling benchmarking computational quality control methods in a uniform, systematic, and reproducible way using the same codebase. We release our code and data under the Apache License 2.0 at . - -# Statement of need - -A traditional approach to quality control in crowdsourcing builds upon various organizational means, such as careful task design, decomposition, and preparing golden tasks [@Zhdanovskaya:23]. These techniques yield the best results when accompanied by computational methods that leverage worker-task-label relationships and their statistical properties. - -Many studies in crowdsourcing simplify complex tasks via multi-classification or post-acceptance steps, as discussed in a pivotal paper by @Bernstein:10. Meanwhile, researchers in natural language processing and computer vision develop specialized techniques. 
However, existing toolkits such as SQUARE [@Sheshadri:13], CEKA [@Zhang:15], Truth Inference [@Zheng:17], and spark-crowd [@Rodrigo:19] require additional effort to integrate into applications and into popular data science libraries and frameworks.
-
-We propose addressing this challenge with **Crowd-Kit**, an open-source Python toolkit for computational quality control in crowdsourcing. Crowd-Kit implements popular quality control methods, providing a standardized platform for reliable experimentation and application. We extensively evaluate the Crowd-Kit library to establish a basis for comparisons. *In all the experiments in this paper, we used our implementations of the corresponding methods.*
-
-# Design
-
-Our fundamental goal in developing Crowd-Kit was to bridge the gap between crowdsourcing research and the vibrant data science ecosystem of NumPy, SciPy, pandas [@McKinney:10], and scikit-learn [@Pedregosa:11]. We implemented Crowd-Kit in Python and employed the highly optimized data structures and algorithms available in these libraries, maintaining compatibility with the application programming interface (API) of scikit-learn and the data frames/series of pandas. Even for a user who is not familiar with crowdsourcing but is familiar with scientific computing and data analysis in Python, the basic API usage is straightforward:
-
-```python
-from crowdkit.aggregation import DawidSkene
-from crowdkit.datasets import load_dataset
-
-# df is a DataFrame with labeled data in the form of (task, label, worker),
-# gt is a Series with the ground truth label per task
-df, gt = load_dataset('relevance-2')  # binary relevance sample dataset
-
-# run the Dawid-Skene categorical aggregation method
-agg_ds = DawidSkene(n_iter=10).fit_predict(df)  # same format as gt
-```
-
-We implemented all the methods in Crowd-Kit from scratch in Python. Unlike spark-crowd [@Rodrigo:19], our library does not provide a means for running on a distributed computational cluster, but it leverages efficient implementations of numerical algorithms in underlying libraries widely used in the research community. In addition to categorical aggregation methods, Crowd-Kit offers non-categorical aggregation methods, dataset loaders, and annotation quality estimators; a short sketch of the same API applied to pairwise comparisons is given just before the next section.
-
-# Maintenance and governance
-
-Crowd-Kit is not bound to any specific crowdsourcing platform, allowing one to analyze data from any crowdsourcing marketplace (as long as the labeled data can be downloaded from that platform). Crowd-Kit is an open-source library that runs on most operating systems and is available under the Apache License 2.0 both on GitHub and on the Python Package Index (PyPI).[^1] All Crowd-Kit code has strict type annotations for additional safety and clarity. By the time of submission, our library had a test coverage of 93%.
-
-[^1]: &
-
-We built Crowd-Kit on top of established open-source frameworks and best practices. We make extensive use of continuous integration via GitHub Actions for two purposes. First, every patch (*commit* in git terminology) triggers unit testing with coverage, type checking, linting, and a documentation and packaging dry run. Second, every release is automatically submitted to PyPI directly from GitHub Actions via the trusted publishing mechanism to avoid potential side effects from individual developer machines. Besides commit checks, every code change (*pull request* on GitHub) goes through a code review by the Crowd-Kit developers. We accept bug reports via GitHub Issues.
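Complementing the categorical example in the Design section above, the following is a minimal sketch of the same pandas-compatible API applied to pairwise comparisons. The `BradleyTerry` import path mirrors the `DawidSkene` one shown earlier, and the `(worker, left, right, label)` column layout of the toy data frame is an assumption made for illustration rather than a detail stated in this paper.

```python
import pandas as pd

from crowdkit.aggregation import BradleyTerry  # pairwise counterpart of DawidSkene

# A toy set of pairwise comparisons: each row records which of two items
# ('left' or 'right') a worker preferred; 'label' holds the winning item.
comparisons = pd.DataFrame({
    'worker': ['w1', 'w1', 'w2', 'w2', 'w3'],
    'left':   ['a',  'b',  'a',  'c',  'b'],
    'right':  ['b',  'c',  'c',  'b',  'a'],
    'label':  ['a',  'b',  'a',  'b',  'a'],
})

# fit_predict returns a pandas Series of latent item scores, so the result
# plugs into the same downstream tooling as the categorical aggregators.
scores = BradleyTerry(n_iter=100).fit_predict(comparisons)
print(scores.sort_values(ascending=False))
```

As in the categorical case, the result is an ordinary pandas object, so it composes directly with scikit-learn metrics and the rest of the data science stack.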
-
-# Functionality
-
-Crowd-Kit implements a selection of popular methods for answer aggregation and learning from crowds, dataset loaders, and annotation quality estimators.
-
-## Aggregating and learning with Crowd-Kit
-
-Crowd-Kit features aggregation methods suitable for most kinds of crowdsourced responses, including categorical, pairwise, sequential, and image segmentation answers (see the summary in \autoref{tab:methods}).
-
-Methods for *categorical aggregation*, which are the most widespread in practice, assume that there is only one correct objective label per task and aim at recovering a latent true label from the observed noisy data. Some of these methods, such as Dawid-Skene and GLAD, also estimate latent parameters --- also known as *skills* --- of the workers. Where the task design does not meet the latent label assumption, Crowd-Kit offers methods for aggregating *pairwise comparisons*, which are essential for subjective opinion gathering. Crowd-Kit also provides specialized methods for aggregating *sequences* (such as texts) and *image segmentations*. All these aggregation methods are implemented purely using NumPy, SciPy, pandas, and scikit-learn without any deep learning framework. Last but not least, Crowd-Kit offers methods for *deep learning from crowds* that learn an end-to-end machine learning model from raw responses submitted by the workers without the use of aggregation; these are available as ready-to-use modules for PyTorch [@Paszke:19].
-
-One can easily add a new aggregation method to Crowd-Kit. For example, without loss of generality, to create a new categorical aggregator, one should extend the base class `BaseClassificationAggregator` and implement two methods, `fit()` and `fit_predict()`, filling the instance variable `labels_` with the aggregated labels.[^2] Also, to add a new method for learning from crowds, one has to subclass `torch.nn.Module` and implement the `forward()` method.[^3] A minimal sketch of such a custom categorical aggregator is given below, after the dataset tables.
-
-: Summary of the implemented methods in Crowd-Kit.\label{tab:methods}
-
-| **Aggregation** | **Methods** |
-|-----------------|-----------------------------------------------------------|
-| Categorical | Majority Vote, Wawa [@Wawa], @Dawid:79, |
-| | GLAD [@Whitehill:09], MACE [@Hovy:13], |
-| | @Karger:14, M-MSR [@Ma:20] |
-| Pairwise | @Bradley:52, noisyBT [@Bugakova:19] |
-| Sequence | ROVER [@Fiscus:97], RASA and HRRASA [@Li:20], |
-| | Language Model [@Pavlichenko:21:crowdspeech] |
-| Segmentation | Majority Vote, Expectation-Maximization [@JungLinLee:18], |
-| | RASA and HRRASA [@Li:20] |
-| Learning | CrowdLayer [@Rodrigues:18], CoNAL [@Chu:21] |
-
-## Dataset loaders
-
-Crowd-Kit offers convenient dataset loaders for some popular or demonstrative datasets (see \autoref{tab:datasets}), allowing one to download them from the Internet in a ready-to-use form with a single line of code. It is possible to add new datasets in a declarative way and, if necessary, add the corresponding code to load the data as pandas data frames and series.
-
-: Summary of the datasets provided by Crowd-Kit.\label{tab:datasets}
-
-| **Task** | **Datasets** |
-|--------------|---------------------------------------------------------------|
-| Categorical | Toloka Relevance 2 and 5, TREC Relevance [@Buckley:10] |
-| Pairwise | IMDB-WIKI-SbS [@Pavlichenko:21:sbs] |
-| Sequence | CrowdWSA [-@Li:19], CrowdSpeech [@Pavlichenko:21:crowdspeech] |
-| Image | Common Objects in Context [@Lin:14] |
-
-[^2]: See the implementation of Majority Vote at as an example of an aggregation method.
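To make the extension contract described above concrete, here is a minimal, hedged sketch of a custom categorical aggregator. The class name `BaseClassificationAggregator` and the `labels_` attribute follow the description in this section; the `crowdkit.aggregation.base` import path and the exact method signatures are assumptions and may need adjusting against the actual base class.

```python
import pandas as pd

from crowdkit.aggregation.base import BaseClassificationAggregator  # assumed import path


class FirstAnswerAggregator(BaseClassificationAggregator):
    """Toy aggregator that keeps the first submitted label for every task."""

    def fit(self, data: pd.DataFrame) -> 'FirstAnswerAggregator':
        # data follows the (task, worker, label) layout used throughout Crowd-Kit
        self.labels_ = data.groupby('task')['label'].first()
        return self

    def fit_predict(self, data: pd.DataFrame) -> pd.Series:
        return self.fit(data).labels_
```

Such a class can then be used interchangeably with the built-in aggregators, for example `FirstAnswerAggregator().fit_predict(df)` on the `relevance-2` data frame from the Design section.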
- -[^3]: See the implementation of CrowdLayer at as an example of a method for deep learning from crowds. - -## Annotation quality estimators - -Crowd-Kit allows one to apply commonly-used techniques to evaluate data and annotation quality, providing a unified pandas-compatible API to compute $\alpha$ [@Krippendorff:18], annotation uncertainty [@Malinin:19], agreement with aggregate [@Wawa], Dawid-Skene posterior probability, etc. - -# Evaluation - -We extensively evaluate Crowd-Kit methods for answer aggregation and learning from crowds. When possible, we compare with other authors; either way, we show how the currently implemented methods perform on well-known datasets with noisy crowdsourced data, indicating the correctness of our implementations. - -## Evaluation of aggregation methods - -**Categorical.** To ensure the correctness of our implementations, we compared the observed aggregation quality with the already available implementations by @Zheng:17 and @Rodrigo:19. \autoref{tab:categorical} shows evaluation results, indicating a similar level of quality as them: *D_Product*, *D_PosSent*, *S_Rel*, and *S_Adult* are real-world datasets from @Zheng:17, and *binary1* and *binary2* are synthetic datasets from @Rodrigo:19. Our implementation of M-MSR could not process the D_Product dataset in a reasonable time, KOS can be applied to binary datasets only, and none of our implementations handled *binary3* and *binary4* synthetic datasets, which require a distributed computing cluster. - -: Comparison of the implemented categorical aggregation methods (accuracy is used).\label{tab:categorical} - -| **Method** | **D_Product** | **D_PosSent** | **S_Rel** | **S_Adult** | **binary1** | **binary2** | -| ------------|--------------:|--------------:|----------:|------------:|------------:|------------:| -| MV | $0.897$ | $0.932$ | $0.536$ | $0.763$ | $0.931$ | $0.936$ | -| Wawa | $0.897$ | $0.951$ | $0.557$ | $0.766$ | $0.981$ | $0.983$ | -| DS | $0.940$ | $0.960$ | $0.615$ | $0.748$ | $0.994$ | $0.994$ | -| GLAD | $0.928$ | $0.948$ | $0.511$ | $0.760$ | $0.994$ | $0.994$ | -| KOS | $0.895$ | $0.933$ | --- | --- | $0.993$ | $0.994$ | -| MACE | $0.929$ | $0.950$ | $0.501$ | $0.763$ | $0.995$ | $0.995$ | -| M-MSR | --- | $0.937$ | $0.425$ | $0.751$ | $0.994$ | $0.994$ | - -**Pairwise.** \autoref{tab:pairwise} shows the comparison of the *Bradley-Terry* and *noisyBT* methods implemented in Crowd-Kit to the random baseline on the graded readability dataset by @Chen:13 and a larger people age dataset by @Pavlichenko:21:sbs. - -: Comparison of implemented pairwise aggregation methods (Spearman's $\rho$ is used).\label{tab:pairwise} - -| **Method** | **@Chen:13** | **IMDB-WIKI-SBS** | -|----------------|-------------:|------------------:| -| Bradley-Terry | $0.246$ | $0.737$ | -| noisyBT | $0.238$ | $0.744$ | -| Random | $-0.013$ | $-0.001$ | - -**Sequence.** We used two datasets, CrowdWSA [@Li:19] and CrowdSpeech [@Pavlichenko:21:crowdspeech]. As the typical application for sequence aggregation in crowdsourcing is audio transcription, we used the word error rate as the quality criterion [@Fiscus:97] in \autoref{tab:sequence}. 
- -: Comparison of implemented sequence aggregation methods (average word error rate is used).\label{tab:sequence} - -| **Dataset** | **Version** | **ROVER** | **RASA** | **HRRASA** | -|--------------|:-----------:|----------:|---------:|-----------:| -| CrowdWSA | J1 | $0.612$ | $0.659$ | $0.676$ | -| | T1 | $0.514$ | $0.483$ | $0.500$ | -| | T2 | $0.524$ | $0.498$ | $0.520$ | -| CrowdSpeech | dev-clean | $0.676$ | $0.750$ | $0.745$ | -| | dev-other | $0.132$ | $0.142$ | $0.142$ | -| | test-clean | $0.729$ | $0.860$ | $0.859$ | -| | test-other | $0.134$ | $0.157$ | $0.157$ | - -**Segmentation.** We annotated on the Toloka crowdsourcing platform a sample of 2,000 images from the MS COCO [@Lin:14] dataset consisting of four object labels. For each image, nine workers submitted segmentations. The dataset is available in Crowd-Kit as `mscoco_small`. In total, we received 18,000 responses. \autoref{tab:segmentation} shows the comparison of the methods on the above-described dataset using the *intersection over union* (IoU) criterion. - -: Comparison of implemented image aggregation algorithms (IoU is used).\label{tab:segmentation} - -| **Dataset** | **MV** | **EM** | **RASA** | -|--------------|--------:|--------:|---------:| -| MS COCO | $0.839$ | $0.861$ | $0.849$ | - -## Evaluation of methods for learning from crowds - -To demonstrate the impact of learning on raw annotator labels compared to answer aggregation in crowdsourcing, we compared the implemented methods for learning from crowds with the two classical aggregation algorithms, Majority Vote (MV) and Dawid-Skene (DS). We picked the two most common machine learning tasks for which ground truth datasets are available: text classification and image classification. For text classification, we used the IMDB Movie Reviews dataset [@Maas:11], and for image classification, we chose CIFAR-10 [@Krizhevsky:09]. In each dataset, each object was annotated by three different annotators; 100 objects were used as golden tasks. - -We compared how different methods for learning from crowds impact test accuracy. We picked two different backbone networks for text classification, LSTM [@Hochreiter:97] and RoBERTa [@Liu:19], and one backbone network for image classification, VGG-16 [@Simonyan:15]. Then, we trained each backbone in three scenarios: use the fully connected layer after the backbone without taking into account any specifics of crowdsourcing (Base), CrowdLayer method by @Rodrigues:18, and CoNAL method by @Chu:21. \autoref{tab:learning} shows the evaluation results. - -: Comparison of different methods for deep learning from crowds with traditional answer aggregation methods (test set accuracy is used).\label{tab:learning} - -| **Dataset** | **Backbone** | **CoNAL** | **CrowdLayer** | **Base** | **DS** | **MV** | -|--------------|:------------:|----------:|---------------:|---------:|--------:|--------:| -| IMDb | LSTM | $0.844$ | $0.825$ | $0.835$ | $0.841$ | $0.819$ | -| IMDb | RoBERTa | $0.932$ | $0.928$ | $0.927$ | $0.932$ | $0.927$ | -| CIFAR-10 | VGG-16 | $0.825$ | $0.863$ | $0.882$ | $0.877$ | $0.865$ | - -Our experiment shows the feasibility of training a deep learning model directly from the raw annotated data, skipping trivial aggregation methods like MV. However, specialized methods like CoNAL and CrowdLayer or non-trivial aggregation methods like DS can significantly enhance prediction accuracy. It is crucial to make a well-informed model selection to achieve optimal results. 
We believe that Crowd-Kit makes it easy and reliable to integrate these methods into machine learning pipelines that use crowdsourced data.
-
-# Conclusion
-
-Our experience running Crowd-Kit in production for processing crowdsourced data at Toloka shows that it successfully handles industry-scale datasets without needing a large compute cluster. We believe that the availability of computational quality control techniques in a standardized form opens new avenues for reliably improving crowdsourcing quality beyond the traditional, well-known methods and pipelines.
-
-# Acknowledgements
-
-The work was done while the authors were with Yandex. We are grateful to Enrique G. Rodrigo for sharing the spark-crowd evaluation dataset. We want to thank Daniil Fedulov, Iulian Giliazev, Artem Grigorev, Daniil Likhobaba, Vladimir Losev, Stepan Nosov, Alisa Smirnova, Aleksey Sukhorosov, and Evgeny Tulin for their contributions to the library. Last but not least, we appreciate the improvements to our library made by open-source [contributors](https://github.com/Toloka/crowd-kit/graphs/contributors) and the reviewers of this paper. We received no external funding.
-
-# References