From ec46138bc3aeaf5c34d0472c0241860ba783e170 Mon Sep 17 00:00:00 2001
From: BeritJanssen <berit.janssen@gmail.com>
Date: Wed, 2 Oct 2024 17:50:09 +0200
Subject: [PATCH] add ngram field to all corpora with date field and
 main_content_mapping

---
 .../dutchnewspapers/dutchnewspapers_public.py | 353 ++++++++--------
 backend/corpora/goodreads/goodreads.py        | 237 +++++------
 .../guardianobserver/guardianobserver.py      | 123 +++---
 backend/corpora/periodicals/periodicals.py    | 203 ++++-----
 backend/corpora/rechtspraak/rechtspraak.py    | 199 +++++----
 backend/corpora/times/times.py                | 396 ++++++++----------
 6 files changed, 724 insertions(+), 787 deletions(-)

diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py
index 9d295f82b..5fc408eeb 100644
--- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py
+++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py
@@ -96,7 +96,7 @@ def sources(self, start=min_date, end=max_date):
                     if extension != '.xml':
                         logger.debug(self.non_xml_msg.format(full_path))
                         continue
-                    #def_match = self.definition_pattern.match(name)
+                    # def_match = self.definition_pattern.match(name)
                     article_match = self.article_pattern.match(name)
                     if article_match:
                         parts = name.split("_")
@@ -130,189 +130,188 @@ def sources(self, start=min_date, end=max_date):
         'issue'
     )
 
-
     @property
     def fields(self):
-        return [FieldDefinition(
-            name="url",
-            display_name="Delpher URL",
-            description="Link to record on Delpher",
-            display_type='url',
-            es_mapping=keyword_mapping(),
-            extractor=XML(
-                lambda metadata: Tag('recordIdentifier', string=metadata['id']),
-                SiblingTag('identifier'),
-                external_file=True
-            )
-        ),
-        FieldDefinition(
-            name='date',
-            display_name='Date',
-            description='Publication date.',
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
-            results_overview=True,
-            csv_core=True,
-            visualizations=['resultscount', 'termfrequency'],
-            search_filter=filters.DateFilter(
-                self.min_date,
-                self.max_date,
-                description=(
-                    'Accept only articles with publication date in this range.'
-                )
+        return [
+            FieldDefinition(
+                name="url",
+                display_name="Delpher URL",
+                description="Link to record on Delpher",
+                display_type="url",
+                es_mapping=keyword_mapping(),
+                extractor=XML(
+                    lambda metadata: Tag("recordIdentifier", string=metadata["id"]),
+                    SiblingTag("identifier"),
+                    external_file=True,
+                ),
             ),
-            extractor=Metadata('date')
-        ),
-        FieldDefinition(
-            name='ocr',
-            display_name='OCR confidence',
-            description='OCR confidence level.',
-            es_mapping={'type': 'float'},
-            search_filter=filters.RangeFilter(0, 100,
-                                              description=(
-                                                  'Accept only articles for which the Opitical Character Recognition confidence '
-                                                  'indicator is in this range.'
-                                              )
-                                              ),
-            extractor=XML(
-                Tag('OCRConfidencelevel'),
-                external_file=True,
-                transform=lambda x: float(x)*100
+            FieldDefinition(
+                name="date",
+                display_name="Date",
+                description="Publication date.",
+                es_mapping={"type": "date", "format": "yyyy-MM-dd"},
+                results_overview=True,
+                csv_core=True,
+                visualizations=["resultscount", "termfrequency"],
+                search_filter=filters.DateFilter(
+                    self.min_date,
+                    self.max_date,
+                    description=(
+                        "Accept only articles with publication date in this range."
+                    ),
+                ),
+                extractor=Metadata("date"),
             ),
-            sortable=True
-        ),
-        FieldDefinition(
-            name='newspaper_title',
-            display_name='Newspaper title',
-            description='Title of the newspaper',
-            results_overview=True,
-            search_field_core=True,
-            es_mapping={'type': 'keyword'},
-            visualizations=['resultscount', 'termfrequency'],
-            search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles in these newspapers.',
-                option_count=len(self.papers)
+            FieldDefinition(
+                name="ocr",
+                display_name="OCR confidence",
+                description="OCR confidence level.",
+                es_mapping={"type": "float"},
+                search_filter=filters.RangeFilter(
+                    0,
+                    100,
+                    description=(
+                        "Accept only articles for which the Optical Character Recognition confidence "
+                        "indicator is in this range."
+                    ),
+                ),
+                extractor=XML(
+                    Tag("OCRConfidencelevel"),
+                    external_file=True,
+                    transform=lambda x: float(x) * 100,
+                ),
+                sortable=True,
             ),
-            extractor=Metadata('title')
-        ),
-        FieldDefinition(
-            name='version_of',
-            display_name='Version of',
-            description='The newspaper is a version of this newspaper.',
-            es_mapping={'type': 'keyword'},
-            extractor=Metadata('isVersionOf')
-        ),
-        FieldDefinition(
-            name='issue_number',
-            display_name='Issue number',
-            description='Issue number of the newspaper',
-            csv_core=True,
-            es_mapping={'type': 'integer'},
-            extractor=Metadata('issuenumber')
-        ),
-        FieldDefinition(
-            name='category',
-            display_name='Category',
-            description='Whether the item is an article, advertisment, etc.',
-            csv_core=True,
-            es_mapping={'type': 'keyword'},
-            extractor=XML(
-                lambda metadata: Tag('recordIdentifier', string=metadata['id']),
-                SiblingTag('subject'),
-                external_file=True
+            FieldDefinition(
+                name="newspaper_title",
+                display_name="Newspaper title",
+                description="Title of the newspaper",
+                results_overview=True,
+                search_field_core=True,
+                es_mapping={"type": "keyword"},
+                visualizations=["resultscount", "termfrequency"],
+                search_filter=filters.MultipleChoiceFilter(
+                    description="Accept only articles in these newspapers.",
+                    option_count=len(self.papers),
+                ),
+                extractor=Metadata("title"),
             ),
-            search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles in these categories.',
-                option_count=2,
+            FieldDefinition(
+                name="version_of",
+                display_name="Version of",
+                description="The newspaper is a version of this newspaper.",
+                es_mapping={"type": "keyword"},
+                extractor=Metadata("isVersionOf"),
             ),
-        ),
-        FieldDefinition(
-            name='circulation',
-            display_name='Circulation',
-            description='The area in which the newspaper was distributed.',
-            es_mapping={'type': 'keyword'},
-            csv_core=True,
-            extractor=Metadata('spatial'),
-            search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles appearing in specific areas.',
-                option_count=7
+            FieldDefinition(
+                name="issue_number",
+                display_name="Issue number",
+                description="Issue number of the newspaper",
+                csv_core=True,
+                es_mapping={"type": "integer"},
+                extractor=Metadata("issuenumber"),
             ),
-        ),
-        FieldDefinition(
-            name='publisher',
-            display_name='Publisher',
-            description='Publisher',
-            es_mapping=keyword_mapping(),
-            search_field_core=True,
-            extractor=Metadata('publisher')
-        ),
-        FieldDefinition(
-            name='language',
-            display_name='Language',
-            description='language',
-            es_mapping={'type': 'keyword'},
-            extractor=Metadata('language')
-        ),
-        FieldDefinition(
-            name='article_title',
-            display_name='Article title',
-            description='Article title',
-            results_overview=True,
-            search_field_core=True,
-            extractor=XML(Tag('title'), flatten=True, toplevel=True)
-        ),
-        FieldDefinition(
-            name='id',
-            display_name='ID',
-            description='Unique identifier of the entry.',
-            extractor=Metadata('id')
-        ),
-        FieldDefinition(
-            name='source',
-            display_name='Source',
-            description='Library or archive which keeps the hard copy of this newspaper.',
-            es_mapping={'type': 'keyword'},
-            extractor=Metadata('source')
-        ),
-        FieldDefinition(
-            name='pub_place',
-            display_name='Publication Place',
-            description='Where the newspaper was published',
-            es_mapping={'type': 'keyword'},
-            extractor=Metadata('pub_place')
-        ),
-        FieldDefinition(
-            name='temporal',
-            display_name='Edition',
-            description='Newspaper edition for the given date',
-            results_overview=True,
-            csv_core=True,
-            es_mapping={'type': 'keyword'},
-            visualizations=['resultscount', 'termfrequency'],
-            search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles in newspapers which appeared as a given edition.',
-                option_count=3,
+            FieldDefinition(
+                name="category",
+                display_name="Category",
+                description="Whether the item is an article, advertisement, etc.",
+                csv_core=True,
+                es_mapping={"type": "keyword"},
+                extractor=XML(
+                    lambda metadata: Tag("recordIdentifier", string=metadata["id"]),
+                    SiblingTag("subject"),
+                    external_file=True,
+                ),
+                search_filter=filters.MultipleChoiceFilter(
+                    description="Accept only articles in these categories.",
+                    option_count=2,
+                ),
             ),
-            extractor=Metadata('temporal')
-        ),
-        FieldDefinition(
-            name='content',
-            display_name='Content',
-            display_type='text_content',
-            description='Text content.',
-            es_mapping=main_content_mapping(True, True, True, 'nl'),
-            results_overview=True,
-            search_field_core=True,
-            extractor=XML(
-                Tag('p'),
-                multiple=True,
-                flatten=True,
-                toplevel=True,
-                transform='\n'.join,
+            FieldDefinition(
+                name="circulation",
+                display_name="Circulation",
+                description="The area in which the newspaper was distributed.",
+                es_mapping={"type": "keyword"},
+                csv_core=True,
+                extractor=Metadata("spatial"),
+                search_filter=filters.MultipleChoiceFilter(
+                    description="Accept only articles appearing in specific areas.",
+                    option_count=7,
+                ),
             ),
-            visualizations=["wordcloud"],
-            language='nl',
-        ),
-    ]
-
-
-
+            FieldDefinition(
+                name="publisher",
+                display_name="Publisher",
+                description="Publisher",
+                es_mapping=keyword_mapping(),
+                search_field_core=True,
+                extractor=Metadata("publisher"),
+            ),
+            FieldDefinition(
+                name="language",
+                display_name="Language",
+                description="language",
+                es_mapping={"type": "keyword"},
+                extractor=Metadata("language"),
+            ),
+            FieldDefinition(
+                name="article_title",
+                display_name="Article title",
+                description="Article title",
+                results_overview=True,
+                search_field_core=True,
+                extractor=XML(Tag("title"), flatten=True, toplevel=True),
+            ),
+            FieldDefinition(
+                name="id",
+                display_name="ID",
+                description="Unique identifier of the entry.",
+                extractor=Metadata("id"),
+            ),
+            FieldDefinition(
+                name="source",
+                display_name="Source",
+                description="Library or archive which keeps the hard copy of this newspaper.",
+                es_mapping={"type": "keyword"},
+                extractor=Metadata("source"),
+            ),
+            FieldDefinition(
+                name="pub_place",
+                display_name="Publication Place",
+                description="Where the newspaper was published",
+                es_mapping={"type": "keyword"},
+                extractor=Metadata("pub_place"),
+            ),
+            FieldDefinition(
+                name="temporal",
+                display_name="Edition",
+                description="Newspaper edition for the given date",
+                results_overview=True,
+                csv_core=True,
+                es_mapping={"type": "keyword"},
+                visualizations=["resultscount", "termfrequency"],
+                search_filter=filters.MultipleChoiceFilter(
+                    description="Accept only articles in newspapers which appeared as a given edition.",
+                    option_count=3,
+                ),
+                extractor=Metadata("temporal"),
+            ),
+            FieldDefinition(
+                name="content",
+                display_name="Content",
+                display_type="text_content",
+                description="Text content.",
+                es_mapping=main_content_mapping(True, True, True, "nl"),
+                results_overview=True,
+                search_field_core=True,
+                extractor=XML(
+                    Tag("p"),
+                    multiple=True,
+                    flatten=True,
+                    toplevel=True,
+                    transform="\n".join,
+                ),
+                visualizations=["wordcloud", "ngram"],
+                language="nl",
+            ),
+        ]
diff --git a/backend/corpora/goodreads/goodreads.py b/backend/corpora/goodreads/goodreads.py
index eefb1bbb8..fcaef48d2 100644
--- a/backend/corpora/goodreads/goodreads.py
+++ b/backend/corpora/goodreads/goodreads.py
@@ -64,200 +64,192 @@ def sources(self, start, end):
 
     fields = [
         FieldDefinition(
-            name='year',
-            display_name='Year',
-            description='Year the review was written.',
+            name="year",
+            display_name="Year",
+            description="Year the review was written.",
             extractor=CSV(
-                'date',
-                transform=lambda x: datetime.strptime(
-                    x, '%b %d, %Y').strftime('%Y')
+                "date",
+                transform=lambda x: datetime.strptime(x, "%b %d, %Y").strftime("%Y"),
             ),
-            es_mapping={'type': 'integer'},
+            es_mapping={"type": "integer"},
             search_filter=RangeFilter(
                 min_date.year,
                 max_date.year,
-                description=(
-                    'Accept only book reviews written in this range.'
-                )
+                description=("Accept only book reviews written in this range."),
             ),
-            hidden=True
+            hidden=True,
         ),
         FieldDefinition(
-            name='id',
-            display_name='ID',
-            description='ID of the review.',
-            extractor=CSV('id'),
-            es_mapping={'type': 'keyword'},
+            name="id",
+            display_name="ID",
+            description="ID of the review.",
+            extractor=CSV("id"),
+            es_mapping={"type": "keyword"},
             csv_core=True,
         ),
         FieldDefinition(
-            name='book_title',
-            display_name='Book title',
-            description='The title of the book reviews were made for. Encompasses all editions.',
-            extractor=Metadata('book_title'),
-            es_mapping={'type': 'keyword'},
+            name="book_title",
+            display_name="Book title",
+            description="The title of the book reviews were made for. Encompasses all editions.",
+            extractor=Metadata("book_title"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only reviews made for these titles.',
-                option_count=154
+                description="Accept only reviews made for these titles.",
+                option_count=154,
             ),
-            csv_core=True
+            csv_core=True,
         ),
         FieldDefinition(
-            name='original_language',
-            display_name='Original language',
-            description='The original language the book reviews were made for was written in.',
-            extractor=Metadata('original_language'),
-            es_mapping={'type': 'keyword'},
+            name="original_language",
+            display_name="Original language",
+            description="The original language the book reviews were made for was written in.",
+            extractor=Metadata("original_language"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only reviews made for titles originally in this language(s).',
-                option_count=8
+                description="Accept only reviews made for titles originally in this language(s).",
+                option_count=8,
             ),
             csv_core=True,
         ),
         FieldDefinition(
-            name='edition_id',
-            display_name='Edition ID',
-            description='ID of the edition the review was made for.',
-            extractor=CSV('edition_id'),
-            es_mapping={'type': 'keyword'},
+            name="edition_id",
+            display_name="Edition ID",
+            description="ID of the edition the review was made for.",
+            extractor=CSV("edition_id"),
+            es_mapping={"type": "keyword"},
         ),
         FieldDefinition(
-            name='edition_language',
-            display_name='Edition language',
-            description='The language that the edition that the review is for was written in',
-            extractor=CSV('edition_language'),
-            es_mapping={'type': 'keyword'},
+            name="edition_language",
+            display_name="Edition language",
+            description="The language that the edition that the review is for was written in",
+            extractor=CSV("edition_language"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only editions written in these languages.',
-                option_count=8
+                description="Accept only editions written in these languages.",
+                option_count=8,
             ),
             results_overview=True,
             csv_core=True,
-            visualizations=['resultscount', 'termfrequency'],
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='book_genre',
-            display_name='Genre',
-            description='The genre of the reviewed book',
-            extractor=Metadata('book_genre'),
-            es_mapping={'type': 'keyword'},
+            name="book_genre",
+            display_name="Genre",
+            description="The genre of the reviewed book",
+            extractor=Metadata("book_genre"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only reviews of books in this genre',
-                option_count=8
+                description="Accept only reviews of books in this genre", option_count=8
             ),
-            visualizations=['resultscount', 'termfrequency']
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='age_category',
-            display_name='Age category',
-            description='The age category of the target audience of the reviewed book',
-            extractor=Metadata('age_category'),
-            es_mapping={'type': 'keyword'},
+            name="age_category",
+            display_name="Age category",
+            description="The age category of the target audience of the reviewed book",
+            extractor=Metadata("age_category"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only reviews of books written for this age category',
-                option_count=3
+                description="Accept only reviews of books written for this age category",
+                option_count=3,
             ),
-            visualizations=['resultscount', 'termfrequency']
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='url',
-            display_name='Source URL',
-            display_type='url',
-            description='Link to the the review on Goodreads',
-            extractor=CSV('url'),
-            es_mapping={'type': 'keyword'},
+            name="url",
+            display_name="Source URL",
+            display_type="url",
+            description="Link to the review on Goodreads",
+            extractor=CSV("url"),
+            es_mapping={"type": "keyword"},
         ),
         FieldDefinition(
-            name='text',
-            display_name='Text',
-            description='Fulltext of the review.',
-            extractor=CSV('text'),
+            name="text",
+            display_name="Text",
+            description="Fulltext of the review.",
+            extractor=CSV("text"),
             es_mapping=main_content_mapping(),
-            display_type='text_content',
+            display_type="text_content",
             csv_core=True,
             results_overview=True,
             searchable=True,
-            visualizations=['wordcloud'],
+            visualizations=["wordcloud"],
         ),
         FieldDefinition(
-            name='language',
-            display_name='Review language',
-            description='The language of the review.',
-            extractor=CSV('language'),
-            es_mapping={'type': 'keyword'},
+            name="language",
+            display_name="Review language",
+            description="The language of the review.",
+            extractor=CSV("language"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only reviews written in these languages.',
-                option_count=50
+                description="Accept only reviews written in these languages.",
+                option_count=50,
             ),
             results_overview=True,
             csv_core=True,
-            visualizations=['resultscount', 'termfrequency'],
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='date',
-            display_name='Date',
-            description='Date the review was written.',
+            name="date",
+            display_name="Date",
+            description="Date the review was written.",
             extractor=CSV(
-                'date',
-                transform=lambda x: datetime.strptime(
-                    x, '%b %d, %Y').strftime('%Y-%m-%d')
+                "date",
+                transform=lambda x: datetime.strptime(x, "%b %d, %Y").strftime(
+                    "%Y-%m-%d"
+                ),
             ),
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
+            es_mapping={"type": "date", "format": "yyyy-MM-dd"},
         ),
         FieldDefinition(
-            name='rating_text',
-            display_name='Goodreads rating',
-            description='Rating in the Goodreads style, e.g. \'really liked it\'.',
-            extractor=CSV('rating'),
-            es_mapping={'type': 'keyword'},
+            name="rating_text",
+            display_name="Goodreads rating",
+            description="Rating in the Goodreads style, e.g. 'really liked it'.",
+            extractor=CSV("rating"),
+            es_mapping={"type": "keyword"},
         ),
         FieldDefinition(
-            name='rating_no',
-            display_name='Rating',
-            description='Rating as a number.',
-            extractor=CSV('rating_no'),
-            es_mapping={'type': 'keyword'},
+            name="rating_no",
+            display_name="Rating",
+            description="Rating as a number.",
+            extractor=CSV("rating_no"),
+            es_mapping={"type": "keyword"},
             search_filter=MultipleChoiceFilter(
-                description='Accept only reviews with these ratings.',
-                option_count=6
+                description="Accept only reviews with these ratings.", option_count=6
             ),
             results_overview=True,
-            visualizations=['resultscount', 'termfrequency'],
-            visualization_sort='key'
+            visualizations=["resultscount", "termfrequency"],
+            visualization_sort="key",
         ),
         FieldDefinition(
-            name='word_count',
-            display_name='Word count',
-            description='Number of words (whitespace-delimited) in the review.',
-            extractor=CSV(
-                'text',
-                transform=lambda x: len(x.split(' '))
-            ),
-            es_mapping={'type': 'integer'},
+            name="word_count",
+            display_name="Word count",
+            description="Number of words (whitespace-delimited) in the review.",
+            extractor=CSV("text", transform=lambda x: len(x.split(" "))),
+            es_mapping={"type": "integer"},
             search_filter=RangeFilter(
                 1,
                 4225,
-                description=(
-                    'Accept only book reviews with word count in this range.'
-            ))
+                description=("Accept only book reviews with word count in this range."),
+            ),
         ),
         FieldDefinition(
-            name='edition_publisher',
-            display_name='Edition publisher',
-            description='Publisher of the edition the review was written for',
+            name="edition_publisher",
+            display_name="Edition publisher",
+            description="Publisher of the edition the review was written for",
             extractor=CSV(
-                'edition_publisher',
+                "edition_publisher",
             ),
-            es_mapping={'type': 'keyword'},
+            es_mapping={"type": "keyword"},
         ),
         FieldDefinition(
-            name='edition_publishing_year',
-            display_name='Edition publishing year',
-            description='Year the edition the review was written for was published.',
+            name="edition_publishing_year",
+            display_name="Edition publishing year",
+            description="Year the edition the review was written for was published.",
             extractor=CSV(
-                'edition_publishing_year',
+                "edition_publishing_year",
             ),
-            es_mapping={'type': 'keyword'},
+            es_mapping={"type": "keyword"},
         ),
     ]
 
@@ -296,4 +288,3 @@ def update_script(self):
                 }
             }
             yield update_body
-
diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py
index aa5bab986..2658cca27 100644
--- a/backend/corpora/guardianobserver/guardianobserver.py
+++ b/backend/corpora/guardianobserver/guardianobserver.py
@@ -72,113 +72,112 @@ def sources(self, start=datetime.min, end=datetime.max):
 
     fields = [
         FieldDefinition(
-            name='date',
-            display_name='Publication Date',
-            description='Publication date, parsed to yyyy-MM-dd format',
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
+            name="date",
+            display_name="Publication Date",
+            description="Publication date, parsed to yyyy-MM-dd format",
+            es_mapping={"type": "date", "format": "yyyy-MM-dd"},
             hidden=True,
-            visualizations=['resultscount', 'termfrequency'],
+            visualizations=["resultscount", "termfrequency"],
             search_filter=filters.DateFilter(
                 min_date,
                 max_date,
                 description=(
-                    'Accept only articles with publication date in this range.'
-                )
+                    "Accept only articles with publication date in this range."
+                ),
             ),
             extractor=extract.XML(
-                Tag('NumericPubDate'),
-                transform=lambda x: '{y}-{m}-{d}'.format(y=x[:4],m=x[4:6],d=x[6:])
+                Tag("NumericPubDate"),
+                transform=lambda x: "{y}-{m}-{d}".format(y=x[:4], m=x[4:6], d=x[6:]),
             ),
             sortable=True,
         ),
         FieldDefinition(
-            name='date-pub',
+            name="date-pub",
             es_mapping=keyword_mapping(),
-            display_name='Publication Date',
+            display_name="Publication Date",
             csv_core=True,
             results_overview=True,
-            description='Publication date as full string, as found in source file',
-            extractor=extract.XML(Tag('AlphaPubDate'))
+            description="Publication date as full string, as found in source file",
+            extractor=extract.XML(Tag("AlphaPubDate")),
         ),
         FieldDefinition(
-            name='id',
+            name="id",
             es_mapping=keyword_mapping(),
-            display_name='ID',
-            description='Article identifier.',
-            extractor=extract.XML(Tag('RecordID')),
+            display_name="ID",
+            description="Article identifier.",
+            extractor=extract.XML(Tag("RecordID")),
         ),
         FieldDefinition(
-            name='pub_id',
+            name="pub_id",
             es_mapping=keyword_mapping(),
-            display_name='Publication ID',
-            description='Publication identifier',
-            extractor=extract.XML(Tag('PublicationID'))
+            display_name="Publication ID",
+            description="Publication identifier",
+            extractor=extract.XML(Tag("PublicationID")),
         ),
         FieldDefinition(
-            name='page',
+            name="page",
             es_mapping=keyword_mapping(),
-            display_name='Page',
-            description='Start page label, from source (1, 2, 17A, ...).',
-            extractor=extract.XML(Tag('StartPage'))
+            display_name="Page",
+            description="Start page label, from source (1, 2, 17A, ...).",
+            extractor=extract.XML(Tag("StartPage")),
         ),
         FieldDefinition(
-            name='title',
-            display_name='Title',
+            name="title",
+            display_name="Title",
             search_field_core=True,
-            visualizations=['wordcloud'],
-            description='Article title.',
-            extractor=extract.XML(Tag('RecordTitle'))
+            visualizations=["wordcloud"],
+            description="Article title.",
+            extractor=extract.XML(Tag("RecordTitle")),
         ),
         FieldDefinition(
-            name='source-paper',
+            name="source-paper",
             es_mapping=keyword_mapping(True),
-            display_name='Source paper',
-            description='Credited as source.',
-            extractor=extract.XML(Tag('Title')),
+            display_name="Source paper",
+            description="Credited as source.",
+            extractor=extract.XML(Tag("Title")),
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles from these source papers.',
-                option_count=5
+                description="Accept only articles from these source papers.",
+                option_count=5,
             ),
         ),
         FieldDefinition(
-            name='place',
+            name="place",
             mapping=keyword_mapping(True),
-            display_name='Place',
-            description='Place in which the article was published',
-            extractor=extract.XML(Tag('Qualifier'))
+            display_name="Place",
+            description="Place in which the article was published",
+            extractor=extract.XML(Tag("Qualifier")),
         ),
         FieldDefinition(
-            name='author',
+            name="author",
             mapping=keyword_mapping(True),
-            display_name='Author',
-            description='Article author',
-            extractor=extract.XML(Tag('PersonName'))
+            display_name="Author",
+            description="Article author",
+            extractor=extract.XML(Tag("PersonName")),
         ),
         FieldDefinition(
-            name='category',
-            visualizations=['resultscount', 'termfrequency'],
-            display_name='Category',
-            description='Article subject categories.',
-            es_mapping={'type': 'keyword'},
+            name="category",
+            visualizations=["resultscount", "termfrequency"],
+            display_name="Category",
+            description="Article subject categories.",
+            es_mapping={"type": "keyword"},
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles in these categories.',
-                option_count=19
+                description="Accept only articles in these categories.", option_count=19
             ),
-            extractor=extract.XML(Tag('ObjectType')),
-            csv_core=True
+            extractor=extract.XML(Tag("ObjectType")),
+            csv_core=True,
         ),
         FieldDefinition(
-            name='content',
-            es_mapping=main_content_mapping(True, True, True, 'en'),
-            display_name='Content',
-            display_type='text_content',
-            visualizations=['wordcloud'],
-            description='Raw OCR\'ed text (content).',
+            name="content",
+            es_mapping=main_content_mapping(True, True, True, "en"),
+            display_name="Content",
+            display_type="text_content",
+            visualizations=["wordcloud", "ngram"],
+            description="Raw OCR'ed text (content).",
             results_overview=True,
             search_field_core=True,
-            extractor=extract.XML(Tag('FullText'), flatten=True),
-            language='en',
-        )
+            extractor=extract.XML(Tag("FullText"), flatten=True),
+            language="en",
+        ),
     ]
 
     document_context = {
diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py
index 24111c8a5..e6ab86e95 100644
--- a/backend/corpora/periodicals/periodicals.py
+++ b/backend/corpora/periodicals/periodicals.py
@@ -5,7 +5,7 @@
 
 import logging
 logger = logging.getLogger(__name__)
-from os.path import join, isfile, splitext
+from os.path import join, isfile
 from datetime import datetime
 import re
 import openpyxl
@@ -59,7 +59,8 @@ def sources(self, start=min_date, end=max_date):
             metadict['title'] = row[0]
             if row[1].startswith('['):
                 date = row[1][1:-1]
-            else: date = row[1]
+            else:
+                date = row[1]
             metadict['date_full'] = date
             if date=='Date Unknown':
                 metadict['date'] = None
@@ -80,172 +81,172 @@ def sources(self, start=min_date, end=max_date):
 
     fields = [
         FieldDefinition(
-            name='date',
-            display_name='Formatted Date',
-            description='Publication date, formatted from the full date',
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
+            name="date",
+            display_name="Formatted Date",
+            description="Publication date, formatted from the full date",
+            es_mapping={"type": "date", "format": "yyyy-MM-dd"},
             histogram=True,
             search_filter=filters.DateFilter(
                 min_date,
                 max_date,
                 description=(
-                    'Accept only articles with publication date in this range.'
-                )
+                    "Accept only articles with publication date in this range."
+                ),
             ),
-            extractor=extract.Metadata('date'),
+            extractor=extract.Metadata("date"),
             csv_core=True,
-            visualizations=['resultscount', 'termfrequency']
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='date_pub',
-            display_name='Publication Date',
-            description='Publication date as full string, as found in source file',
+            name="date_pub",
+            display_name="Publication Date",
+            description="Publication date as full string, as found in source file",
             es_mapping=keyword_mapping(),
             results_overview=True,
-            extractor=extract.Metadata('date_full')
+            extractor=extract.Metadata("date_full"),
         ),
         FieldDefinition(
-            name='id',
-            display_name='ID',
-            description='Unique identifier of the entry.',
+            name="id",
+            display_name="ID",
+            description="Unique identifier of the entry.",
             es_mapping=keyword_mapping(),
-            extractor=extract.XML(attribute='id'),
+            extractor=extract.XML(attribute="id"),
         ),
         FieldDefinition(
-            name='issue',
-            display_name='Issue number',
-            description='Source issue number.',
+            name="issue",
+            display_name="Issue number",
+            description="Source issue number.",
             es_mapping=keyword_mapping(),
             results_overview=False,
-            extractor=extract.Metadata('issue_id'),
+            extractor=extract.Metadata("issue_id"),
             csv_core=False,
         ),
         FieldDefinition(
-            name='periodical',
-            display_name='Periodical name',
+            name="periodical",
+            display_name="Periodical name",
             histogram=True,
             results_overview=True,
-            es_mapping={'type': 'keyword'},
-            description='Periodical name.',
+            es_mapping={"type": "keyword"},
+            description="Periodical name.",
             search_filter=filters.MultipleChoiceFilter(
-                description='Search only within these periodicals.',
-                option_count=90
+                description="Search only within these periodicals.", option_count=90
             ),
-            extractor=extract.Metadata('title'),
+            extractor=extract.Metadata("title"),
             csv_core=True,
-            visualizations=['resultscount', 'termfrequency']
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='content',
-            display_name='Content',
-            display_type='text_content',
-            description='Text content.',
-            es_mapping=main_content_mapping(True, True, True, 'en'),
+            name="content",
+            display_name="Content",
+            display_type="text_content",
+            description="Text content.",
+            es_mapping=main_content_mapping(True, True, True, "en"),
             results_overview=True,
-            extractor=extract.XML(Tag('ocrText'), flatten=True),
+            extractor=extract.XML(Tag("ocrText"), flatten=True),
             search_field_core=True,
-            visualizations=["wordcloud"],
-            language='en',
+            visualizations=["wordcloud", "ngram"],
+            language="en",
         ),
         FieldDefinition(
-            name='ocr',
-            display_name='OCR confidence',
-            description='OCR confidence level.',
-            es_mapping={'type': 'float'},
-            search_filter=filters.RangeFilter(0, 100,
-                                              description=(
-                                                  'Accept only articles for which the Opitical Character Recognition confidence '
-                                                  'indicator is in this range.'
-                                              )
-                                              ),
+            name="ocr",
+            display_name="OCR confidence",
+            description="OCR confidence level.",
+            es_mapping={"type": "float"},
+            search_filter=filters.RangeFilter(
+                0,
+                100,
+                description=(
+                    "Accept only articles for which the Optical Character Recognition confidence "
+                    "indicator is in this range."
+                ),
+            ),
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
-                SiblingTag('ocr'),
+                lambda metadata: Tag("id", string=metadata["id"]),
+                SiblingTag("ocr"),
             ),
-            sortable=True
+            sortable=True,
         ),
         FieldDefinition(
-            name='title',
-            display_name='Article title',
-            description='Title of the article.',
+            name="title",
+            display_name="Article title",
+            description="Title of the article.",
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
-                SiblingTag('ti'),
+                lambda metadata: Tag("id", string=metadata["id"]),
+                SiblingTag("ti"),
                 external_file=True,
             ),
-            visualizations=['wordcloud']
+            visualizations=["wordcloud"],
         ),
         FieldDefinition(
-            name='start_column',
-            es_mapping={'type': 'keyword'},
-            display_name='Starting column',
-            description='Which column the article starts in.',
+            name="start_column",
+            es_mapping={"type": "keyword"},
+            display_name="Starting column",
+            description="Which column the article starts in.",
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
-                SiblingTag('sc'),
+                lambda metadata: Tag("id", string=metadata["id"]),
+                SiblingTag("sc"),
                 external_file=True,
-            )
+            ),
         ),
         FieldDefinition(
-            name='page_count',
-            display_name='Page count',
-            description='How many pages the article covers.',
-            es_mapping={'type': 'integer'},
+            name="page_count",
+            display_name="Page count",
+            description="How many pages the article covers.",
+            es_mapping={"type": "integer"},
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
-                SiblingTag('pc'),
+                lambda metadata: Tag("id", string=metadata["id"]),
+                SiblingTag("pc"),
                 external_file=True,
-            )
+            ),
         ),
         FieldDefinition(
-            name='word_count',
-            display_name='Word count',
-            description='Number of words in the article.',
-            es_mapping={'type': 'integer'},
+            name="word_count",
+            display_name="Word count",
+            description="Number of words in the article.",
+            es_mapping={"type": "integer"},
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
-                SiblingTag('wordCount'),
+                lambda metadata: Tag("id", string=metadata["id"]),
+                SiblingTag("wordCount"),
                 external_file=True,
-            )
+            ),
         ),
         FieldDefinition(
-            name='category',
+            name="category",
             csv_core=True,
-            display_name='Category',
-            description='Article category.',
-            es_mapping={'type': 'keyword'},
+            display_name="Category",
+            description="Article category.",
+            es_mapping={"type": "keyword"},
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
-                SiblingTag('ct'),
+                lambda metadata: Tag("id", string=metadata["id"]),
+                SiblingTag("ct"),
                 external_file=True,
             ),
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles in these categories.',
-                option_count=26
+                description="Accept only articles in these categories.", option_count=26
             ),
-            visualizations=['resultscount', 'termfrequency']
+            visualizations=["resultscount", "termfrequency"],
         ),
         FieldDefinition(
-            name='page_no',
-            display_name='Page number',
-            description='At which page the article starts.',
-            es_mapping={'type': 'integer'},
+            name="page_no",
+            display_name="Page number",
+            description="At which page the article starts.",
+            es_mapping={"type": "integer"},
             extractor=extract.XML(
-                lambda metadata: Tag('id', string=metadata['id']),
+                lambda metadata: Tag("id", string=metadata["id"]),
                 ParentTag(2),
-                Tag('pa'),
+                Tag("pa"),
                 external_file=True,
-                transform=lambda x: re.sub('[\[\]]', '', x)
-            )
+                transform=lambda x: re.sub("[\[\]]", "", x),
+            ),
         ),
         FieldDefinition(
-            name='image_path',
-            display_name='Image path',
-            es_mapping={'type': 'keyword'},
-            description='Path of scan.',
-            extractor=extract.Metadata('image_path'),
+            name="image_path",
+            display_name="Image path",
+            es_mapping={"type": "keyword"},
+            description="Path of scan.",
+            extractor=extract.Metadata("image_path"),
             hidden=True,
-            downloadable=False
+            downloadable=False,
         ),
     ]
 
diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py
index fc46c2d39..683ae184f 100644
--- a/backend/corpora/rechtspraak/rechtspraak.py
+++ b/backend/corpora/rechtspraak/rechtspraak.py
@@ -36,7 +36,6 @@ def _rdf_description_extractor(tag: Tag, section='xml', **kwargs) -> extract.XML
     )
 
 
-
 class Rechtspraak(XMLCorpusDefinition):
     title = "Judicial system Netherlands"
     description = "Open data of (anonymised) court rulings of the Dutch judicial system"
@@ -146,179 +145,173 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None
 
     fields = [
         FieldDefinition(
-            name='id',
-            display_name='ID',
-            description='',
+            name="id",
+            display_name="ID",
+            description="",
             es_mapping=keyword_mapping(),
-            extractor=_rdf_description_extractor(Tag('dcterms:identifier')),
+            extractor=_rdf_description_extractor(Tag("dcterms:identifier")),
             csv_core=True,
         ),
         FieldDefinition(
-            name='has_content',
-            display_name='Has text content',
-            description='Document has available text content.',
-            es_mapping={'type': 'boolean'},
+            name="has_content",
+            display_name="Has text content",
+            description="Document has available text content.",
+            es_mapping={"type": "boolean"},
             extractor=extract.Backup(
-                extract.XML(Tag('uitspraak'), flatten=True),
-                extract.XML(Tag('conclusie'), flatten=True),
+                extract.XML(Tag("uitspraak"), flatten=True),
+                extract.XML(Tag("conclusie"), flatten=True),
                 extract.Constant(False),
-                transform=bool
+                transform=bool,
             ),
             search_filter=filters.BooleanFilter(
-                true='has content',
-                false='does not have content',
-                description=(
-                    'Accept only articles that have available text content.'
-                )
+                true="has content",
+                false="does not have content",
+                description=("Accept only articles that have available text content."),
             ),
         ),
         FieldDefinition(
-            name='year',
-            display_name='Year',
-            es_mapping={'type': 'integer'},
-            extractor=extract.Metadata('year'),
-            search_filter=filters.RangeFilter(min_date.year, max_date.year)
+            name="year",
+            display_name="Year",
+            es_mapping={"type": "integer"},
+            extractor=extract.Metadata("year"),
+            search_filter=filters.RangeFilter(min_date.year, max_date.year),
         ),
         FieldDefinition(
-            name='date',
-            display_name='Date',
-            extractor=_rdf_description_extractor(Tag('dcterms:date')),
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
+            name="date",
+            display_name="Date",
+            extractor=_rdf_description_extractor(Tag("dcterms:date")),
+            es_mapping={"type": "date", "format": "yyyy-MM-dd"},
             results_overview=True,
             csv_core=True,
             search_filter=filters.DateFilter(
                 min_date,
                 max_date,
-                description=(
-                    'Accept only rulings with date in this range.'
-                )
+                description=("Accept only rulings with date in this range."),
             ),
-
         ),
         FieldDefinition(
-            name='issued',
-            display_name='Publication Date',
-            extractor=_rdf_description_extractor(Tag('dcterms:issued')),
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
+            name="issued",
+            display_name="Publication Date",
+            extractor=_rdf_description_extractor(Tag("dcterms:issued")),
+            es_mapping={"type": "date", "format": "yyyy-MM-dd"},
             search_filter=filters.DateFilter(
                 min_date,
                 max_date,
                 description=(
-                    'Accept only rulings with publication date in this range.'
-                )
+                    "Accept only rulings with publication date in this range."
+                ),
             ),
         ),
         FieldDefinition(
-            name='publisher',
-            display_name='Publisher',
-            extractor=_rdf_description_extractor(Tag('dcterms:publisher')),
-            es_mapping={'type': 'keyword'},
-            language='nl',
+            name="publisher",
+            display_name="Publisher",
+            extractor=_rdf_description_extractor(Tag("dcterms:publisher")),
+            es_mapping={"type": "keyword"},
+            language="nl",
         ),
         FieldDefinition(
-            name='creator',
-            display_name='Court',
-            extractor=_rdf_description_extractor(Tag('dcterms:creator')),
-            es_mapping={'type': 'keyword'},
+            name="creator",
+            display_name="Court",
+            extractor=_rdf_description_extractor(Tag("dcterms:creator")),
+            es_mapping={"type": "keyword"},
             csv_core=True,
             results_overview=True,
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only rulings of selected courts.',
-                option_count=9999
+                description="Accept only rulings of selected courts.", option_count=9999
             ),
-            visualizations=['resultscount', 'termfrequency'],
-            language='nl',
+            visualizations=["resultscount", "termfrequency"],
+            language="nl",
         ),
         FieldDefinition(
-            name='zaaknr',
-            display_name='Case Number',
+            name="zaaknr",
+            display_name="Case Number",
             es_mapping=keyword_mapping(),
-            extractor=_rdf_description_extractor(Tag('psi:zaaknummer')),
+            extractor=_rdf_description_extractor(Tag("psi:zaaknummer")),
         ),
         FieldDefinition(
-            name='type',
-            display_name='Type',
-            extractor=_rdf_description_extractor(Tag('dcterms:type')),
-            es_mapping={'type': 'keyword'},
+            name="type",
+            display_name="Type",
+            extractor=_rdf_description_extractor(Tag("dcterms:type")),
+            es_mapping={"type": "keyword"},
             csv_core=True,
             results_overview=True,
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only rulings of selected type.',
-                option_count=2
+                description="Accept only rulings of selected type.", option_count=2
             ),
-            visualizations=['resultscount', 'termfrequency'],
-            language='nl',
+            visualizations=["resultscount", "termfrequency"],
+            language="nl",
         ),
         FieldDefinition(
-            name='procedure',
-            display_name='(type of) Procedure',
-            extractor=_rdf_description_extractor(Tag('psi:procedure')),
+            name="procedure",
+            display_name="(type of) Procedure",
+            extractor=_rdf_description_extractor(Tag("psi:procedure")),
             csv_core=True,
-            es_mapping={'type': 'keyword'},
+            es_mapping={"type": "keyword"},
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only rulings of selected procedure type.',
-                option_count=44
+                description="Accept only rulings of selected procedure type.",
+                option_count=44,
             ),
-            visualizations=['resultscount', 'termfrequency'],
-            language='nl',
+            visualizations=["resultscount", "termfrequency"],
+            language="nl",
         ),
         FieldDefinition(
-            name='spatial',
-            display_name='Location',
+            name="spatial",
+            display_name="Location",
             es_mapping=keyword_mapping(),
-            extractor=_rdf_description_extractor(Tag('dcterms:spatial')),
-            language='nl',
+            extractor=_rdf_description_extractor(Tag("dcterms:spatial")),
+            language="nl",
         ),
         FieldDefinition(
-            name='subject',
-            display_name='Area of law',
-            extractor=_rdf_description_extractor(Tag('dcterms:subject')),
+            name="subject",
+            display_name="Area of law",
+            extractor=_rdf_description_extractor(Tag("dcterms:subject")),
             csv_core=True,
-            es_mapping={'type': 'keyword'},
+            es_mapping={"type": "keyword"},
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only rulings within this area of law.',
-                option_count=32
+                description="Accept only rulings within this area of law.",
+                option_count=32,
             ),
-            visualizations=['resultscount', 'termfrequency'],
-            language='nl',
+            visualizations=["resultscount", "termfrequency"],
+            language="nl",
         ),
         FieldDefinition(
-            name='title',
-            display_name='Title',
-            extractor=_rdf_description_extractor(
-                Tag('dcterms:title'), section='html'),
+            name="title",
+            display_name="Title",
+            extractor=_rdf_description_extractor(Tag("dcterms:title"), section="html"),
             results_overview=True,
             search_field_core=True,
-            language='nl',
+            language="nl",
         ),
         FieldDefinition(
-            name='abstract',
-            display_name='Abstract',
-            extractor=extract.XML(Tag('inhoudsindicatie'), flatten=True),
+            name="abstract",
+            display_name="Abstract",
+            extractor=extract.XML(Tag("inhoudsindicatie"), flatten=True),
             results_overview=True,
-            language='nl',
+            language="nl",
         ),
         FieldDefinition(
-            name='content',
-            display_name='Content',
-            display_type='text_content',
-            es_mapping=main_content_mapping(True, True, True, 'nl'),
+            name="content",
+            display_name="Content",
+            display_type="text_content",
+            es_mapping=main_content_mapping(True, True, True, "nl"),
             extractor=extract.Backup(
-                extract.XML(Tag('uitspraak'), flatten=True),
-                extract.XML(Tag('conclusie'), flatten=True),
-                extract.Constant('Content not available')
+                extract.XML(Tag("uitspraak"), flatten=True),
+                extract.XML(Tag("conclusie"), flatten=True),
+                extract.Constant("Content not available"),
             ),
             csv_core=True,
             search_field_core=True,
-            language='nl',
+            language="nl",
+            visualizations=["ngram"],
         ),
         FieldDefinition(
-            name='url',
-            display_name='Source URL',
-            display_type='url',
-            description='URL of the case on rechtspraak.nl',
+            name="url",
+            display_name="Source URL",
+            display_type="url",
+            description="URL of the case on rechtspraak.nl",
             es_mapping=keyword_mapping(),
             extractor=_rdf_description_extractor(
-                Tag('dcterms:identifier'), section='html')
-        )
+                Tag("dcterms:identifier"), section="html"
+            ),
+        ),
     ]
diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py
index bab8a5ea7..65fbcbf09 100644
--- a/backend/corpora/times/times.py
+++ b/backend/corpora/times/times.py
@@ -96,172 +96,151 @@ def sources(self, start=datetime.min, end=datetime.max):
 
     fields = [
         FieldDefinition(
-            name='date',
-            display_name='Publication Date',
-            description='Publication date, parsed to yyyy-MM-dd format',
-            es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
+            name="date",
+            display_name="Publication Date",
+            description="Publication date, parsed to yyyy-MM-dd format",
+            es_mapping={"type": "date", "format": "yyyy-MM-dd"},
             hidden=True,
-            visualizations=['resultscount', 'termfrequency'],
+            visualizations=["resultscount", "termfrequency"],
             search_filter=filters.DateFilter(
                 min_date,
                 max_date,
                 description=(
-                    'Accept only articles with publication date in this range.'
-                )
+                    "Accept only articles with publication date in this range."
+                ),
+            ),
+            extractor=extract.Metadata(
+                "date", transform=lambda x: x.strftime("%Y-%m-%d")
             ),
-            extractor=extract.Metadata('date',
-                                       transform=lambda x: x.strftime(
-                                           '%Y-%m-%d')
-                                       )
         ),
         FieldDefinition(
-            name='source',
-            display_name='Source',
-            description='Library where the microfilm is sourced',
+            name="source",
+            display_name="Source",
+            description="Library where the microfilm is sourced",
             es_mapping=keyword_mapping(),
             extractor=extract.XML(
-                Tag('metadatainfo'),
-                Tag('sourceLibrary'),
+                Tag("metadatainfo"),
+                Tag("sourceLibrary"),
                 toplevel=True,
-                applicable=after(1985)
-            )
+                applicable=after(1985),
+            ),
         ),
         FieldDefinition(
-            name='edition',
-            display_name='Edition',
+            name="edition",
+            display_name="Edition",
             es_mapping=keyword_mapping(),
             extractor=extract.Choice(
+                extract.XML(Tag("ed"), toplevel=True, applicable=until(1985)),
                 extract.XML(
-                    Tag('ed'),
-                    toplevel=True,
-                    applicable=until(1985)
+                    Tag("ed"), toplevel=True, multiple=True, applicable=after(1985)
                 ),
-                extract.XML(
-                    Tag('ed'),
-                    toplevel=True, multiple=True,
-                    applicable=after(1985)
-                )
             ),
-            csv_core=True
+            csv_core=True,
         ),
         FieldDefinition(
-            name='issue',
-            display_name='Issue number',
-            es_mapping={'type': 'integer'},
-            description='Source issue number.',
+            name="issue",
+            display_name="Issue number",
+            es_mapping={"type": "integer"},
+            description="Source issue number.",
             extractor=extract.XML(
-                Tag('is'),
+                Tag("is"),
                 toplevel=True,
                 # Hardcoded to ignore one particular issue with source data
-                transform=lambda x: (62226 if x == "6222662226" else int(x))
+                transform=lambda x: (62226 if x == "6222662226" else int(x)),
             ),
             sortable=True,
-            csv_core=True
+            csv_core=True,
         ),
         FieldDefinition(
-            name='volume',
-            display_name='Volume',
-            description='Volume number.',
+            name="volume",
+            display_name="Volume",
+            description="Volume number.",
             es_mapping=keyword_mapping(),
-            extractor=extract.XML(
-                Tag('volNum'),
-                toplevel=True,
-                applicable=after(1985)
-            ),
-            csv_core=True
+            extractor=extract.XML(Tag("volNum"), toplevel=True, applicable=after(1985)),
+            csv_core=True,
         ),
         FieldDefinition(
-            name='date-pub',
-            display_name='Publication Date',
+            name="date-pub",
+            display_name="Publication Date",
             es_mapping=keyword_mapping(),
             csv_core=True,
             results_overview=True,
             sortable=True,
-            description='Publication date as full string, as found in source file',
-            extractor=extract.XML(
-                Tag('da'),
-                toplevel=True
-            )
+            description="Publication date as full string, as found in source file",
+            extractor=extract.XML(Tag("da"), toplevel=True),
         ),
         FieldDefinition(
-            name='ocr',
-            display_name='OCR confidence',
-            description='OCR confidence level.',
-            es_mapping={'type': 'float'},
-            search_filter=filters.RangeFilter(0, 100,
-                                              description=(
-                                                  'Accept only articles for which the Opitical Character Recognition confidence '
-                                                  'indicator is in this range.'
-                                              )
-                                              ),
-            extractor=extract.XML(Tag('ocr'), transform=float),
-            sortable=True
+            name="ocr",
+            display_name="OCR confidence",
+            description="OCR confidence level.",
+            es_mapping={"type": "float"},
+            search_filter=filters.RangeFilter(
+                0,
+                100,
+                description=(
+                    "Accept only articles for which the Optical Character Recognition confidence "
+                    "indicator is in this range."
+                ),
+            ),
+            extractor=extract.XML(Tag("ocr"), transform=float),
+            sortable=True,
         ),
         FieldDefinition(
-            name='date-end',
-            display_name='Ending date',
+            name="date-end",
+            display_name="Ending date",
             es_mapping=keyword_mapping(),
             description=(
-                'Ending date of publication. '
-                'For issues that span more than 1 day.'
+                "Ending date of publication. " "For issues that span more than 1 day."
             ),
-            extractor=extract.XML(
-                Tag('tdate'), toplevel=True,
-                applicable=after(1985)
-            )
+            extractor=extract.XML(Tag("tdate"), toplevel=True, applicable=after(1985)),
         ),
         FieldDefinition(
-            name='page-count',
-            display_name='Image count',
-            description='Page count: number of images present in the issue.',
-            es_mapping={'type': 'integer'},
-            extractor=extract.XML(
-                Tag('ip'), toplevel=True, transform=int
-            ),
-            sortable=True
+            name="page-count",
+            display_name="Image count",
+            description="Page count: number of images present in the issue.",
+            es_mapping={"type": "integer"},
+            extractor=extract.XML(Tag("ip"), toplevel=True, transform=int),
+            sortable=True,
         ),
         FieldDefinition(
-            name='page-type',
-            display_name='Page type',
-            description='Supplement in which article occurs.',
-            es_mapping={'type': 'keyword'},
+            name="page-type",
+            display_name="Page type",
+            description="Supplement in which article occurs.",
+            es_mapping={"type": "keyword"},
             search_filter=filters.MultipleChoiceFilter(
                 description=(
-                    'Accept only articles that occur in the relevant '
-                    'supplement. Only after 1985.'
+                    "Accept only articles that occur in the relevant "
+                    "supplement. Only after 1985."
                 ),
-                option_count=2
+                option_count=2,
             ),
             extractor=extract.XML(
-                ParentTag(),
-                Tag('pageid'),
-                attribute='isPartOf',
-                applicable=after(1985)
-            )
+                ParentTag(), Tag("pageid"), attribute="isPartOf", applicable=after(1985)
+            ),
         ),
         FieldDefinition(
-            name='supplement-title',
-            display_name='Supplement title',
-            description='Supplement title.',
+            name="supplement-title",
+            display_name="Supplement title",
+            description="Supplement title.",
             extractor=extract.XML(
                 ParentTag(),
-                Tag('pageid'),
-                Tag('supptitle'),
+                Tag("pageid"),
+                Tag("supptitle"),
                 multiple=True,
-                applicable=after(1985)
+                applicable=after(1985),
             ),
         ),
         FieldDefinition(
-            name='supplement-subtitle',
-            display_name='Supplement subtitle',
-            description='Supplement subtitle.',
+            name="supplement-subtitle",
+            display_name="Supplement subtitle",
+            description="Supplement subtitle.",
             extractor=extract.XML(
                 ParentTag(),
-                Tag('pageid'),
-                Tag('suppsubtitle'),
+                Tag("pageid"),
+                Tag("suppsubtitle"),
                 multiple=True,
-                applicable=after(1985)
-            )
+                applicable=after(1985),
+            ),
         ),
         # There are no datapoints where this is True, hence the outcomment
         # FieldDefinition(
@@ -284,183 +263,158 @@ def sources(self, start=datetime.min, end=datetime.max):
         #     )
         # ),
         FieldDefinition(
-            name='id',
-            display_name='ID',
-            description='Article identifier.',
+            name="id",
+            display_name="ID",
+            description="Article identifier.",
             es_mapping=keyword_mapping(),
-            extractor=extract.XML(Tag('id'))
+            extractor=extract.XML(Tag("id")),
         ),
         FieldDefinition(
-            name='ocr-relevant',
-            display_name='OCR relevant',
-            description='Whether OCR confidence level is relevant.',
-            es_mapping={'type': 'boolean'},
+            name="ocr-relevant",
+            display_name="OCR relevant",
+            description="Whether OCR confidence level is relevant.",
+            es_mapping={"type": "boolean"},
             extractor=extract.XML(
-                Tag('ocr'), attribute='relevant',
+                Tag("ocr"),
+                attribute="relevant",
                 transform=string_contains("yes"),
-            )
+            ),
         ),
         FieldDefinition(
-            name='column',
-            display_name='Column',
+            name="column",
+            display_name="Column",
             description=(
-                'Starting column: a string to label the column'
-                'where article starts.'
+                "Starting column: a string to label the column where article starts."
             ),
             es_mapping=keyword_mapping(),
-            extractor=extract.XML(Tag('sc'))
+            extractor=extract.XML(Tag("sc")),
         ),
         FieldDefinition(
-            name='page',
-            display_name='Page',
-            description='Start page label, from source (1, 2, 17A, ...).',
+            name="page",
+            display_name="Page",
+            description="Start page label, from source (1, 2, 17A, ...).",
             es_mapping=keyword_mapping(),
             extractor=extract.Choice(
-                extract.XML(Tag('pa'), applicable=until(1985)),
-                extract.XML(ParentTag(), Tag('pa'), applicable=after(1985))
-            )
+                extract.XML(Tag("pa"), applicable=until(1985)),
+                extract.XML(ParentTag(), Tag("pa"), applicable=after(1985)),
+            ),
         ),
         FieldDefinition(
-            name='pages',
-            display_name='Page count',
-            es_mapping={'type': 'integer'},
+            name="pages",
+            display_name="Page count",
+            es_mapping={"type": "integer"},
             description=(
-                'Page count: total number of pages containing sections '
-                'of the article.'
+                "Page count: total number of pages containing sections "
+                "of the article."
             ),
-            extractor=extract.XML(
-                Tag('pc'), transform=int
-            ),
-            sortable=True
+            extractor=extract.XML(Tag("pc"), transform=int),
+            sortable=True,
         ),
         FieldDefinition(
-            name='title',
-            display_name='Title',
+            name="title",
+            display_name="Title",
             results_overview=True,
             search_field_core=True,
-            visualizations=['wordcloud'],
-            description='Article title.',
-            extractor=extract.XML(Tag('ti'))
+            visualizations=["wordcloud"],
+            description="Article title.",
+            extractor=extract.XML(Tag("ti")),
         ),
         FieldDefinition(
-            name='subtitle',
-            display_name='Subtitle',
-            description='Article subtitle.',
-            extractor=extract.XML(Tag('ta'), multiple=True),
-            search_field_core=True
+            name="subtitle",
+            display_name="Subtitle",
+            description="Article subtitle.",
+            extractor=extract.XML(Tag("ta"), multiple=True),
+            search_field_core=True,
         ),
         FieldDefinition(
-            name='subheader',
-            display_name='Subheader',
-            description='Article subheader (product dependent field).',
+            name="subheader",
+            display_name="Subheader",
+            description="Article subheader (product dependent field).",
             extractor=extract.XML(
-                Tag('subheader'), multiple=True,
-                applicable=after(1985)
-            )
+                Tag("subheader"), multiple=True, applicable=after(1985)
+            ),
         ),
         FieldDefinition(
-            name='author',
-            display_name='Author',
-            description='Article author.',
+            name="author",
+            display_name="Author",
+            description="Article author.",
             es_mapping=keyword_mapping(True),
             extractor=extract.Choice(
-                extract.XML(
-                    Tag('au'), multiple=True,
-                    applicable=until(1985)
-                ),
-                extract.XML(
-                    Tag('au_composed'), multiple=True,
-                    applicable=after(1985)
-                )
+                extract.XML(Tag("au"), multiple=True, applicable=until(1985)),
+                extract.XML(Tag("au_composed"), multiple=True, applicable=after(1985)),
             ),
             search_field_core=True,
-            csv_core=True
+            csv_core=True,
         ),
         FieldDefinition(
-            name='source-paper',
-            display_name='Source paper',
-            description='Credited as source.',
+            name="source-paper",
+            display_name="Source paper",
+            description="Credited as source.",
             es_mapping=keyword_mapping(True),
-            extractor=extract.XML(
-                Tag('altSource'), multiple=True
-            )
+            extractor=extract.XML(Tag("altSource"), multiple=True),
         ),
         FieldDefinition(
-            name='category',
-            visualizations=['resultscount', 'termfrequency'],
-            display_name='Category',
-            description='Article subject categories.',
-            es_mapping={'type': 'keyword'},
+            name="category",
+            visualizations=["resultscount", "termfrequency"],
+            display_name="Category",
+            description="Article subject categories.",
+            es_mapping={"type": "keyword"},
             search_filter=filters.MultipleChoiceFilter(
-                description='Accept only articles in these categories.',
-                option_count=25
+                description="Accept only articles in these categories.", option_count=25
             ),
-            extractor=extract.XML(Tag('ct'), multiple=True),
-            csv_core=True
+            extractor=extract.XML(Tag("ct"), multiple=True),
+            csv_core=True,
         ),
         FieldDefinition(
-            name='illustration',
-            display_name='Illustration',
-            description=(
-                'Tables and other illustrations associated with the article.'
-            ),
-            es_mapping={'type': 'keyword'},
-            visualizations=['resultscount', 'termfrequency'],
+            name="illustration",
+            display_name="Illustration",
+            description=("Tables and other illustrations associated with the article."),
+            es_mapping={"type": "keyword"},
+            visualizations=["resultscount", "termfrequency"],
             search_filter=filters.MultipleChoiceFilter(
                 description=(
-                    'Accept only articles associated with these types '
-                    'of illustrations.'),
-                option_count=7
+                    "Accept only articles associated with these types "
+                    "of illustrations."
+                ),
+                option_count=7,
             ),
             extractor=extract.Choice(
+                extract.XML(Tag("il"), multiple=True, applicable=until(1985)),
                 extract.XML(
-                    Tag('il'), multiple=True,
-                    applicable=until(1985)
+                    Tag("il"), attribute="type", multiple=True, applicable=after(1985)
                 ),
-                extract.XML(
-                    Tag('il'), attribute='type', multiple=True,
-                    applicable=after(1985)
-                )
             ),
-            csv_core=True
+            csv_core=True,
         ),
         FieldDefinition(
-            name='content-preamble',
-            display_name='Content preamble',
-            description='Raw OCR\'ed text (preamble).',
-            extractor=extract.XML(
-                Tag('text'),
-                Tag('text.preamble'),
-                flatten=True
-            )
+            name="content-preamble",
+            display_name="Content preamble",
+            description="Raw OCR'ed text (preamble).",
+            extractor=extract.XML(Tag("text"), Tag("text.preamble"), flatten=True),
         ),
         FieldDefinition(
-            name='content-heading',
-            display_name='Content heading',
-            description='Raw OCR\'ed text (header).',
-            extractor=extract.XML(
-                Tag('text'),
-                Tag('text.title'),
-                flatten=True
-            )
+            name="content-heading",
+            display_name="Content heading",
+            description="Raw OCR'ed text (header).",
+            extractor=extract.XML(Tag("text"), Tag("text.title"), flatten=True),
         ),
         FieldDefinition(
-            name='content',
-            display_name='Content',
-            display_type='text_content',
-            es_mapping=main_content_mapping(True, True, True, 'en'),
-            visualizations=['wordcloud'],
-            description='Raw OCR\'ed text (content).',
+            name="content",
+            display_name="Content",
+            display_type="text_content",
+            es_mapping=main_content_mapping(True, True, True, "en"),
+            visualizations=["wordcloud", "ngram"],
+            description="Raw OCR'ed text (content).",
             results_overview=True,
             search_field_core=True,
             extractor=extract.XML(
-                Tag('text'),
-                Tag('text.cr'),
+                Tag("text"),
+                Tag("text.cr"),
                 multiple=True,
                 flatten=True,
-                transform='\n'.join,
+                transform="\n".join,
             ),
-            language='en',
+            language="en",
         ),
     ]