@inproceedings{McClureAlgeeHewittDourisetal.2017,
  author    = {David McClure and Mark Algee-Hewitt and Steele Douris and Erik Fredner and Hannah Walser},
  title     = {Organizing corpora at the Stanford Literary Lab: Balancing simplicity and flexibility in metadata management},
  booktitle = {Proceedings of the Workshop on Challenges in the Management of Large Corpora and Big Data and Natural Language Processing (CMLC-5+BigNLP) 2017, including the papers from the Web-as-Corpus (WAC-XI) guest section, Birmingham, 24 July 2017},
  editor    = {Piotr Ba{\'n}ski and Marc Kupietz and Harald L{\"u}ngen and Paul Rayson and Hanno Biber and Evelyn Breiteneder and Simon Clematide and John Mariani and Mark Stevenson and Theresa Sick},
  publisher = {Institut f{\"u}r Deutsche Sprache},
  address   = {Mannheim},
  url       = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-62617},
  pages     = {25--29},
  year      = {2017},
  abstract  = {This article describes a series of ongoing efforts at the Stanford Literary Lab to manage a large collection of literary corpora (approximately 40 billion words). This work is marked by a tension between two competing requirements: the corpora need to be merged into higher-order collections that can be analyzed as units, but, at the same time, it is also necessary to preserve granular access to the original metadata and relational organization of each individual corpus. We describe a set of data management practices that try to accommodate both requirements: Apache Spark is used to index the data as Parquet tables on an HPC cluster at Stanford. Crucially, the approach distinguishes between what we call ``canonical'' and ``combined'' corpora, a variation on the well-established notion of a ``virtual corpus'' (Kupietz et al., 2014; Jakub{\'i}{\v c}ek et al., 2014; van Uytvanck, 2010).},
  language  = {en}
}