% Conference paper describing SdeWaC, from the GSCL 2013 proceedings
% (Lecture Notes in Artificial Intelligence, number 8105).
% The citation key is kept exactly as it was so that existing citations keep
% resolving; note that it contains a non-ASCII character and a "2023" suffix
% that does not match the 2013 publication year. Rename it only together with
% every document that cites it.
@inproceedings{FaaßEckart2023,
  author    = {Faa{\ss}, Gertrud and Eckart, Kerstin},
  title     = {{SdeWaC} -- A Corpus of Parsable Sentences from the Web},
  booktitle = {Language Processing and Knowledge in the Web: 25th International Conference, {GSCL} 2013, Darmstadt},
  editor    = {Gurevych, Iryna and Biemann, Chris and Zesch, Torsten},
  series    = {Lecture Notes in Artificial Intelligence},
  number    = {8105},
  pages     = {61--68},
  year      = {2013},
  isbn      = {978-3-642-40722-2},
  issn      = {1611-3349},
  doi       = {10.1007/978-3-642-40722-2_6},
  abstract  = {For a number of languages, web crawling allows researchers to collect huge text samples to build corpora. However, only part of the material found on the internet is useful for Natural Language Processing, as e.g. parsers typically cannot handle lists and tables, or very short or very long sentences. There are methods (cf. e.g. [3]) for cleaning the downloaded data before adding it to a corpus collection - but even when these are applied, not all remaining textual material might be suitable for certain research requirements. This paper describes methods utilized to prepare deWaC, a freely available German web corpus of the WaCky project, for automatic processing up to the parsing level. It then discusses ways in which this corpus, called SdeWaC, has been used since its release.},
  subject   = {Korpus},
  language  = {en},
}