% Proceedings volume of the 8th Workshop on Challenges in the Management of
% Large Corpora (CMLC-8), held at LREC 2020; edited volume published by ELRA.
% Non-ASCII characters are written as LaTeX escapes so the entry works under
% both classic BibTeX and biblatex/Biber.
% NOTE(review): `pages = {63}` is a single number, presumably the total page
% count rather than a page range -- confirm; biblatex users may prefer the
% `pagetotal` field for that.
@proceedings{OPUS4-9811,
  editor    = {Ba{\'n}ski, Piotr and Barbaresi, Adrien and Clematide, Simon and Kupietz, Marc and L{\"u}ngen, Harald and Pisetta, Ines},
  title     = {Proceedings of the {LREC} 2020 Workshop, {Language Resources and Evaluation Conference}, 11--16 {May} 2020, 8th {Workshop on Challenges in the Management of Large Corpora} ({CMLC-8})},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris},
  year      = {2020},
  pages     = {63},
  isbn      = {979-10-95546-61-0},
  url       = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-98112},
  language  = {en},
  abstract  = {In order to satisfy the information needs of a wide range of researchers across a number of disciplines, large textual datasets require careful design, collection, cleaning, encoding, annotation, storage, retrieval, and curation. This daunting set of tasks has coalesced into a number of key themes and questions that are of interest to the contributing research communities: (a) what sampling techniques can we apply? (b) what quality issues should we be aware of? (c) what infrastructures and frameworks are being developed for the efficient storage, annotation, analysis and retrieval of large datasets? (d) what affordances do visualisation techniques offer for the exploratory analysis approaches of corpora? (e) what legal paths can be followed in dealing with IPR and data protection issues governing both the data sources and the query results? (f) how to guarantee that corpus data remain available and usable in a sustainable way?},
}