% Benko2019: workshop paper in the CMLC-7 (2019) proceedings.
% Accented letters are written as LaTeX escapes so the entry stays ASCII-safe
% for classic BibTeX; the proceedings title lives in booktitle, which
% inproceedings entries require.
@inproceedings{Benko2019,
  author    = {Benko, Vladim{\'\i}r},
  title     = {Deduplication in large web corpora},
  booktitle = {Proceedings of the Workshop on Challenges in the Management of Large Corpora ({CMLC-7}) 2019. Cardiff, 22nd July 2019},
  editor    = {Ba{\'n}ski, Piotr and Barbaresi, Adrien and Biber, Hanno and Breiteneder, Evelyn and Clematide, Simon and Kupietz, Marc and L{\"u}ngen, Harald and Iliadi, Caroline},
  publisher = {Leibniz-Institut f{\"u}r Deutsche Sprache},
  address   = {Mannheim},
  pages     = {17--21},
  year      = {2019},
  doi       = {10.14618/ids-pub-9022},
  url       = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-90221},
  abstract  = {Our paper tries to find answers to some questions related to deduplication process in large-scale web-crawled corpora. An experiment based on eight corpora from the Aranea family is introduced, and first results are presented.},
  language  = {en}
}