% Workshop paper (CMLC-10 at LREC 2022); the proceedings title is held in booktitle.
@inproceedings{Diewald2022,
  author    = {Diewald, Nils},
  title     = {Matrix and double-array representations for efficient finite state tokenization},
  booktitle = {Proceedings of the {LREC} 2022 Workshop on Challenges in the Management of Large Corpora ({CMLC-10} 2022). Marseille, 20 June 2022},
  editor    = {Ba{\'n}ski, Piotr and Barbaresi, Adrien and Clematide, Simon and Kupietz, Marc and L{\"u}ngen, Harald},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris},
  pages     = {20--26},
  year      = {2022},
  isbn      = {979-10-95546-83-2},
  url       = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-111091},
  abstract  = {This paper presents an algorithm and an implementation for efficient tokenization of texts of space-delimited languages based on a deterministic finite state automaton. Two representations of the underlying data structure are presented and a model implementation for German is compared with state-of-the-art approaches. The presented solution is faster than other tools while maintaining comparable quality.},
  language  = {en},
}