From ec46138bc3aeaf5c34d0472c0241860ba783e170 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 2 Oct 2024 17:50:09 +0200 Subject: [PATCH] add ngram field to all corpora with date field and main_content_mapping --- .../dutchnewspapers/dutchnewspapers_public.py | 353 ++++++++-------- backend/corpora/goodreads/goodreads.py | 237 +++++------ .../guardianobserver/guardianobserver.py | 123 +++--- backend/corpora/periodicals/periodicals.py | 203 ++++----- backend/corpora/rechtspraak/rechtspraak.py | 199 +++++---- backend/corpora/times/times.py | 396 ++++++++---------- 6 files changed, 724 insertions(+), 787 deletions(-) diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index 9d295f82b..5fc408eeb 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -96,7 +96,7 @@ def sources(self, start=min_date, end=max_date): if extension != '.xml': logger.debug(self.non_xml_msg.format(full_path)) continue - #def_match = self.definition_pattern.match(name) + # def_match = self.definition_pattern.match(name) article_match = self.article_pattern.match(name) if article_match: parts = name.split("_") @@ -130,189 +130,188 @@ def sources(self, start=min_date, end=max_date): 'issue' ) - @property def fields(self): - return [FieldDefinition( - name="url", - display_name="Delpher URL", - description="Link to record on Delpher", - display_type='url', - es_mapping=keyword_mapping(), - extractor=XML( - lambda metadata: Tag('recordIdentifier', string=metadata['id']), - SiblingTag('identifier'), - external_file=True - ) - ), - FieldDefinition( - name='date', - display_name='Date', - description='Publication date.', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, - results_overview=True, - csv_core=True, - visualizations=['resultscount', 'termfrequency'], - search_filter=filters.DateFilter( - self.min_date, - self.max_date, - description=( - 'Accept only articles with publication date in this range.' - ) + return [ + FieldDefinition( + name="url", + display_name="Delpher URL", + description="Link to record on Delpher", + display_type="url", + es_mapping=keyword_mapping(), + extractor=XML( + lambda metadata: Tag("recordIdentifier", string=metadata["id"]), + SiblingTag("identifier"), + external_file=True, + ), ), - extractor=Metadata('date') - ), - FieldDefinition( - name='ocr', - display_name='OCR confidence', - description='OCR confidence level.', - es_mapping={'type': 'float'}, - search_filter=filters.RangeFilter(0, 100, - description=( - 'Accept only articles for which the Opitical Character Recognition confidence ' - 'indicator is in this range.' - ) - ), - extractor=XML( - Tag('OCRConfidencelevel'), - external_file=True, - transform=lambda x: float(x)*100 + FieldDefinition( + name="date", + display_name="Date", + description="Publication date.", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, + results_overview=True, + csv_core=True, + visualizations=["resultscount", "termfrequency"], + search_filter=filters.DateFilter( + self.min_date, + self.max_date, + description=( + "Accept only articles with publication date in this range." + ), + ), + extractor=Metadata("date"), ), - sortable=True - ), - FieldDefinition( - name='newspaper_title', - display_name='Newspaper title', - description='Title of the newspaper', - results_overview=True, - search_field_core=True, - es_mapping={'type': 'keyword'}, - visualizations=['resultscount', 'termfrequency'], - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these newspapers.', - option_count=len(self.papers) + FieldDefinition( + name="ocr", + display_name="OCR confidence", + description="OCR confidence level.", + es_mapping={"type": "float"}, + search_filter=filters.RangeFilter( + 0, + 100, + description=( + "Accept only articles for which the Opitical Character Recognition confidence " + "indicator is in this range." + ), + ), + extractor=XML( + Tag("OCRConfidencelevel"), + external_file=True, + transform=lambda x: float(x) * 100, + ), + sortable=True, ), - extractor=Metadata('title') - ), - FieldDefinition( - name='version_of', - display_name='Version of', - description='The newspaper is a version of this newspaper.', - es_mapping={'type': 'keyword'}, - extractor=Metadata('isVersionOf') - ), - FieldDefinition( - name='issue_number', - display_name='Issue number', - description='Issue number of the newspaper', - csv_core=True, - es_mapping={'type': 'integer'}, - extractor=Metadata('issuenumber') - ), - FieldDefinition( - name='category', - display_name='Category', - description='Whether the item is an article, advertisment, etc.', - csv_core=True, - es_mapping={'type': 'keyword'}, - extractor=XML( - lambda metadata: Tag('recordIdentifier', string=metadata['id']), - SiblingTag('subject'), - external_file=True + FieldDefinition( + name="newspaper_title", + display_name="Newspaper title", + description="Title of the newspaper", + results_overview=True, + search_field_core=True, + es_mapping={"type": "keyword"}, + visualizations=["resultscount", "termfrequency"], + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles in these newspapers.", + option_count=len(self.papers), + ), + extractor=Metadata("title"), ), - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=2, + FieldDefinition( + name="version_of", + display_name="Version of", + description="The newspaper is a version of this newspaper.", + es_mapping={"type": "keyword"}, + extractor=Metadata("isVersionOf"), ), - ), - FieldDefinition( - name='circulation', - display_name='Circulation', - description='The area in which the newspaper was distributed.', - es_mapping={'type': 'keyword'}, - csv_core=True, - extractor=Metadata('spatial'), - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles appearing in specific areas.', - option_count=7 + FieldDefinition( + name="issue_number", + display_name="Issue number", + description="Issue number of the newspaper", + csv_core=True, + es_mapping={"type": "integer"}, + extractor=Metadata("issuenumber"), ), - ), - FieldDefinition( - name='publisher', - display_name='Publisher', - description='Publisher', - es_mapping=keyword_mapping(), - search_field_core=True, - extractor=Metadata('publisher') - ), - FieldDefinition( - name='language', - display_name='Language', - description='language', - es_mapping={'type': 'keyword'}, - extractor=Metadata('language') - ), - FieldDefinition( - name='article_title', - display_name='Article title', - description='Article title', - results_overview=True, - search_field_core=True, - extractor=XML(Tag('title'), flatten=True, toplevel=True) - ), - FieldDefinition( - name='id', - display_name='ID', - description='Unique identifier of the entry.', - extractor=Metadata('id') - ), - FieldDefinition( - name='source', - display_name='Source', - description='Library or archive which keeps the hard copy of this newspaper.', - es_mapping={'type': 'keyword'}, - extractor=Metadata('source') - ), - FieldDefinition( - name='pub_place', - display_name='Publication Place', - description='Where the newspaper was published', - es_mapping={'type': 'keyword'}, - extractor=Metadata('pub_place') - ), - FieldDefinition( - name='temporal', - display_name='Edition', - description='Newspaper edition for the given date', - results_overview=True, - csv_core=True, - es_mapping={'type': 'keyword'}, - visualizations=['resultscount', 'termfrequency'], - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in newspapers which appeared as a given edition.', - option_count=3, + FieldDefinition( + name="category", + display_name="Category", + description="Whether the item is an article, advertisment, etc.", + csv_core=True, + es_mapping={"type": "keyword"}, + extractor=XML( + lambda metadata: Tag("recordIdentifier", string=metadata["id"]), + SiblingTag("subject"), + external_file=True, + ), + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles in these categories.", + option_count=2, + ), ), - extractor=Metadata('temporal') - ), - FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - description='Text content.', - es_mapping=main_content_mapping(True, True, True, 'nl'), - results_overview=True, - search_field_core=True, - extractor=XML( - Tag('p'), - multiple=True, - flatten=True, - toplevel=True, - transform='\n'.join, + FieldDefinition( + name="circulation", + display_name="Circulation", + description="The area in which the newspaper was distributed.", + es_mapping={"type": "keyword"}, + csv_core=True, + extractor=Metadata("spatial"), + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles appearing in specific areas.", + option_count=7, + ), ), - visualizations=["wordcloud"], - language='nl', - ), - ] - - - + FieldDefinition( + name="publisher", + display_name="Publisher", + description="Publisher", + es_mapping=keyword_mapping(), + search_field_core=True, + extractor=Metadata("publisher"), + ), + FieldDefinition( + name="language", + display_name="Language", + description="language", + es_mapping={"type": "keyword"}, + extractor=Metadata("language"), + ), + FieldDefinition( + name="article_title", + display_name="Article title", + description="Article title", + results_overview=True, + search_field_core=True, + extractor=XML(Tag("title"), flatten=True, toplevel=True), + ), + FieldDefinition( + name="id", + display_name="ID", + description="Unique identifier of the entry.", + extractor=Metadata("id"), + ), + FieldDefinition( + name="source", + display_name="Source", + description="Library or archive which keeps the hard copy of this newspaper.", + es_mapping={"type": "keyword"}, + extractor=Metadata("source"), + ), + FieldDefinition( + name="pub_place", + display_name="Publication Place", + description="Where the newspaper was published", + es_mapping={"type": "keyword"}, + extractor=Metadata("pub_place"), + ), + FieldDefinition( + name="temporal", + display_name="Edition", + description="Newspaper edition for the given date", + results_overview=True, + csv_core=True, + es_mapping={"type": "keyword"}, + visualizations=["resultscount", "termfrequency"], + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles in newspapers which appeared as a given edition.", + option_count=3, + ), + extractor=Metadata("temporal"), + ), + FieldDefinition( + name="content", + display_name="Content", + display_type="text_content", + description="Text content.", + es_mapping=main_content_mapping(True, True, True, "nl"), + results_overview=True, + search_field_core=True, + extractor=XML( + Tag("p"), + multiple=True, + flatten=True, + toplevel=True, + transform="\n".join, + ), + visualizations=["wordcloud", "ngram"], + language="nl", + ), + ] diff --git a/backend/corpora/goodreads/goodreads.py b/backend/corpora/goodreads/goodreads.py index eefb1bbb8..fcaef48d2 100644 --- a/backend/corpora/goodreads/goodreads.py +++ b/backend/corpora/goodreads/goodreads.py @@ -64,200 +64,192 @@ def sources(self, start, end): fields = [ FieldDefinition( - name='year', - display_name='Year', - description='Year the review was written.', + name="year", + display_name="Year", + description="Year the review was written.", extractor=CSV( - 'date', - transform=lambda x: datetime.strptime( - x, '%b %d, %Y').strftime('%Y') + "date", + transform=lambda x: datetime.strptime(x, "%b %d, %Y").strftime("%Y"), ), - es_mapping={'type': 'integer'}, + es_mapping={"type": "integer"}, search_filter=RangeFilter( min_date.year, max_date.year, - description=( - 'Accept only book reviews written in this range.' - ) + description=("Accept only book reviews written in this range."), ), - hidden=True + hidden=True, ), FieldDefinition( - name='id', - display_name='ID', - description='ID of the review.', - extractor=CSV('id'), - es_mapping={'type': 'keyword'}, + name="id", + display_name="ID", + description="ID of the review.", + extractor=CSV("id"), + es_mapping={"type": "keyword"}, csv_core=True, ), FieldDefinition( - name='book_title', - display_name='Book title', - description='The title of the book reviews were made for. Encompasses all editions.', - extractor=Metadata('book_title'), - es_mapping={'type': 'keyword'}, + name="book_title", + display_name="Book title", + description="The title of the book reviews were made for. Encompasses all editions.", + extractor=Metadata("book_title"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews made for these titles.', - option_count=154 + description="Accept only reviews made for these titles.", + option_count=154, ), - csv_core=True + csv_core=True, ), FieldDefinition( - name='original_language', - display_name='Original language', - description='The original language the book reviews were made for was written in.', - extractor=Metadata('original_language'), - es_mapping={'type': 'keyword'}, + name="original_language", + display_name="Original language", + description="The original language the book reviews were made for was written in.", + extractor=Metadata("original_language"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews made for titles originally in this language(s).', - option_count=8 + description="Accept only reviews made for titles originally in this language(s).", + option_count=8, ), csv_core=True, ), FieldDefinition( - name='edition_id', - display_name='Edition ID', - description='ID of the edition the review was made for.', - extractor=CSV('edition_id'), - es_mapping={'type': 'keyword'}, + name="edition_id", + display_name="Edition ID", + description="ID of the edition the review was made for.", + extractor=CSV("edition_id"), + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='edition_language', - display_name='Edition language', - description='The language that the edition that the review is for was written in', - extractor=CSV('edition_language'), - es_mapping={'type': 'keyword'}, + name="edition_language", + display_name="Edition language", + description="The language that the edition that the review is for was written in", + extractor=CSV("edition_language"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only editions written in these languages.', - option_count=8 + description="Accept only editions written in these languages.", + option_count=8, ), results_overview=True, csv_core=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='book_genre', - display_name='Genre', - description='The genre of the reviewed book', - extractor=Metadata('book_genre'), - es_mapping={'type': 'keyword'}, + name="book_genre", + display_name="Genre", + description="The genre of the reviewed book", + extractor=Metadata("book_genre"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews of books in this genre', - option_count=8 + description="Accept only reviews of books in this genre", option_count=8 ), - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='age_category', - display_name='Age category', - description='The age category of the target audience of the reviewed book', - extractor=Metadata('age_category'), - es_mapping={'type': 'keyword'}, + name="age_category", + display_name="Age category", + description="The age category of the target audience of the reviewed book", + extractor=Metadata("age_category"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews of books written for this age category', - option_count=3 + description="Accept only reviews of books written for this age category", + option_count=3, ), - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='url', - display_name='Source URL', - display_type='url', - description='Link to the the review on Goodreads', - extractor=CSV('url'), - es_mapping={'type': 'keyword'}, + name="url", + display_name="Source URL", + display_type="url", + description="Link to the the review on Goodreads", + extractor=CSV("url"), + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='text', - display_name='Text', - description='Fulltext of the review.', - extractor=CSV('text'), + name="text", + display_name="Text", + description="Fulltext of the review.", + extractor=CSV("text"), es_mapping=main_content_mapping(), - display_type='text_content', + display_type="text_content", csv_core=True, results_overview=True, searchable=True, - visualizations=['wordcloud'], + visualizations=["wordcloud"], ), FieldDefinition( - name='language', - display_name='Review language', - description='The language of the review.', - extractor=CSV('language'), - es_mapping={'type': 'keyword'}, + name="language", + display_name="Review language", + description="The language of the review.", + extractor=CSV("language"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews written in these languages.', - option_count=50 + description="Accept only reviews written in these languages.", + option_count=50, ), results_overview=True, csv_core=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='date', - display_name='Date', - description='Date the review was written.', + name="date", + display_name="Date", + description="Date the review was written.", extractor=CSV( - 'date', - transform=lambda x: datetime.strptime( - x, '%b %d, %Y').strftime('%Y-%m-%d') + "date", + transform=lambda x: datetime.strptime(x, "%b %d, %Y").strftime( + "%Y-%m-%d" + ), ), - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, ), FieldDefinition( - name='rating_text', - display_name='Goodreads rating', - description='Rating in the Goodreads style, e.g. \'really liked it\'.', - extractor=CSV('rating'), - es_mapping={'type': 'keyword'}, + name="rating_text", + display_name="Goodreads rating", + description="Rating in the Goodreads style, e.g. 'really liked it'.", + extractor=CSV("rating"), + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='rating_no', - display_name='Rating', - description='Rating as a number.', - extractor=CSV('rating_no'), - es_mapping={'type': 'keyword'}, + name="rating_no", + display_name="Rating", + description="Rating as a number.", + extractor=CSV("rating_no"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews with these ratings.', - option_count=6 + description="Accept only reviews with these ratings.", option_count=6 ), results_overview=True, - visualizations=['resultscount', 'termfrequency'], - visualization_sort='key' + visualizations=["resultscount", "termfrequency"], + visualization_sort="key", ), FieldDefinition( - name='word_count', - display_name='Word count', - description='Number of words (whitespace-delimited) in the review.', - extractor=CSV( - 'text', - transform=lambda x: len(x.split(' ')) - ), - es_mapping={'type': 'integer'}, + name="word_count", + display_name="Word count", + description="Number of words (whitespace-delimited) in the review.", + extractor=CSV("text", transform=lambda x: len(x.split(" "))), + es_mapping={"type": "integer"}, search_filter=RangeFilter( 1, 4225, - description=( - 'Accept only book reviews with word count in this range.' - )) + description=("Accept only book reviews with word count in this range."), + ), ), FieldDefinition( - name='edition_publisher', - display_name='Edition publisher', - description='Publisher of the edition the review was written for', + name="edition_publisher", + display_name="Edition publisher", + description="Publisher of the edition the review was written for", extractor=CSV( - 'edition_publisher', + "edition_publisher", ), - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='edition_publishing_year', - display_name='Edition publishing year', - description='Year the edition the review was written for was published.', + name="edition_publishing_year", + display_name="Edition publishing year", + description="Year the edition the review was written for was published.", extractor=CSV( - 'edition_publishing_year', + "edition_publishing_year", ), - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, ), ] @@ -296,4 +288,3 @@ def update_script(self): } } yield update_body - diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index aa5bab986..2658cca27 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -72,113 +72,112 @@ def sources(self, start=datetime.min, end=datetime.max): fields = [ FieldDefinition( - name='date', - display_name='Publication Date', - description='Publication date, parsed to yyyy-MM-dd format', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Publication Date", + description="Publication date, parsed to yyyy-MM-dd format", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, hidden=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only articles with publication date in this range.' - ) + "Accept only articles with publication date in this range." + ), ), extractor=extract.XML( - Tag('NumericPubDate'), - transform=lambda x: '{y}-{m}-{d}'.format(y=x[:4],m=x[4:6],d=x[6:]) + Tag("NumericPubDate"), + transform=lambda x: "{y}-{m}-{d}".format(y=x[:4], m=x[4:6], d=x[6:]), ), sortable=True, ), FieldDefinition( - name='date-pub', + name="date-pub", es_mapping=keyword_mapping(), - display_name='Publication Date', + display_name="Publication Date", csv_core=True, results_overview=True, - description='Publication date as full string, as found in source file', - extractor=extract.XML(Tag('AlphaPubDate')) + description="Publication date as full string, as found in source file", + extractor=extract.XML(Tag("AlphaPubDate")), ), FieldDefinition( - name='id', + name="id", es_mapping=keyword_mapping(), - display_name='ID', - description='Article identifier.', - extractor=extract.XML(Tag('RecordID')), + display_name="ID", + description="Article identifier.", + extractor=extract.XML(Tag("RecordID")), ), FieldDefinition( - name='pub_id', + name="pub_id", es_mapping=keyword_mapping(), - display_name='Publication ID', - description='Publication identifier', - extractor=extract.XML(Tag('PublicationID')) + display_name="Publication ID", + description="Publication identifier", + extractor=extract.XML(Tag("PublicationID")), ), FieldDefinition( - name='page', + name="page", es_mapping=keyword_mapping(), - display_name='Page', - description='Start page label, from source (1, 2, 17A, ...).', - extractor=extract.XML(Tag('StartPage')) + display_name="Page", + description="Start page label, from source (1, 2, 17A, ...).", + extractor=extract.XML(Tag("StartPage")), ), FieldDefinition( - name='title', - display_name='Title', + name="title", + display_name="Title", search_field_core=True, - visualizations=['wordcloud'], - description='Article title.', - extractor=extract.XML(Tag('RecordTitle')) + visualizations=["wordcloud"], + description="Article title.", + extractor=extract.XML(Tag("RecordTitle")), ), FieldDefinition( - name='source-paper', + name="source-paper", es_mapping=keyword_mapping(True), - display_name='Source paper', - description='Credited as source.', - extractor=extract.XML(Tag('Title')), + display_name="Source paper", + description="Credited as source.", + extractor=extract.XML(Tag("Title")), search_filter=filters.MultipleChoiceFilter( - description='Accept only articles from these source papers.', - option_count=5 + description="Accept only articles from these source papers.", + option_count=5, ), ), FieldDefinition( - name='place', + name="place", mapping=keyword_mapping(True), - display_name='Place', - description='Place in which the article was published', - extractor=extract.XML(Tag('Qualifier')) + display_name="Place", + description="Place in which the article was published", + extractor=extract.XML(Tag("Qualifier")), ), FieldDefinition( - name='author', + name="author", mapping=keyword_mapping(True), - display_name='Author', - description='Article author', - extractor=extract.XML(Tag('PersonName')) + display_name="Author", + description="Article author", + extractor=extract.XML(Tag("PersonName")), ), FieldDefinition( - name='category', - visualizations=['resultscount', 'termfrequency'], - display_name='Category', - description='Article subject categories.', - es_mapping={'type': 'keyword'}, + name="category", + visualizations=["resultscount", "termfrequency"], + display_name="Category", + description="Article subject categories.", + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=19 + description="Accept only articles in these categories.", option_count=19 ), - extractor=extract.XML(Tag('ObjectType')), - csv_core=True + extractor=extract.XML(Tag("ObjectType")), + csv_core=True, ), FieldDefinition( - name='content', - es_mapping=main_content_mapping(True, True, True, 'en'), - display_name='Content', - display_type='text_content', - visualizations=['wordcloud'], - description='Raw OCR\'ed text (content).', + name="content", + es_mapping=main_content_mapping(True, True, True, "en"), + display_name="Content", + display_type="text_content", + visualizations=["wordcloud", "ngram"], + description="Raw OCR'ed text (content).", results_overview=True, search_field_core=True, - extractor=extract.XML(Tag('FullText'), flatten=True), - language='en', - ) + extractor=extract.XML(Tag("FullText"), flatten=True), + language="en", + ), ] document_context = { diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index 24111c8a5..e6ab86e95 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -5,7 +5,7 @@ import logging logger = logging.getLogger(__name__) -from os.path import join, isfile, splitext +from os.path import join, isfile from datetime import datetime import re import openpyxl @@ -59,7 +59,8 @@ def sources(self, start=min_date, end=max_date): metadict['title'] = row[0] if row[1].startswith('['): date = row[1][1:-1] - else: date = row[1] + else: + date = row[1] metadict['date_full'] = date if date=='Date Unknown': metadict['date'] = None @@ -80,172 +81,172 @@ def sources(self, start=min_date, end=max_date): fields = [ FieldDefinition( - name='date', - display_name='Formatted Date', - description='Publication date, formatted from the full date', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Formatted Date", + description="Publication date, formatted from the full date", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, histogram=True, search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only articles with publication date in this range.' - ) + "Accept only articles with publication date in this range." + ), ), - extractor=extract.Metadata('date'), + extractor=extract.Metadata("date"), csv_core=True, - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='date_pub', - display_name='Publication Date', - description='Publication date as full string, as found in source file', + name="date_pub", + display_name="Publication Date", + description="Publication date as full string, as found in source file", es_mapping=keyword_mapping(), results_overview=True, - extractor=extract.Metadata('date_full') + extractor=extract.Metadata("date_full"), ), FieldDefinition( - name='id', - display_name='ID', - description='Unique identifier of the entry.', + name="id", + display_name="ID", + description="Unique identifier of the entry.", es_mapping=keyword_mapping(), - extractor=extract.XML(attribute='id'), + extractor=extract.XML(attribute="id"), ), FieldDefinition( - name='issue', - display_name='Issue number', - description='Source issue number.', + name="issue", + display_name="Issue number", + description="Source issue number.", es_mapping=keyword_mapping(), results_overview=False, - extractor=extract.Metadata('issue_id'), + extractor=extract.Metadata("issue_id"), csv_core=False, ), FieldDefinition( - name='periodical', - display_name='Periodical name', + name="periodical", + display_name="Periodical name", histogram=True, results_overview=True, - es_mapping={'type': 'keyword'}, - description='Periodical name.', + es_mapping={"type": "keyword"}, + description="Periodical name.", search_filter=filters.MultipleChoiceFilter( - description='Search only within these periodicals.', - option_count=90 + description="Search only within these periodicals.", option_count=90 ), - extractor=extract.Metadata('title'), + extractor=extract.Metadata("title"), csv_core=True, - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - description='Text content.', - es_mapping=main_content_mapping(True, True, True, 'en'), + name="content", + display_name="Content", + display_type="text_content", + description="Text content.", + es_mapping=main_content_mapping(True, True, True, "en"), results_overview=True, - extractor=extract.XML(Tag('ocrText'), flatten=True), + extractor=extract.XML(Tag("ocrText"), flatten=True), search_field_core=True, - visualizations=["wordcloud"], - language='en', + visualizations=["wordcloud", "ngram"], + language="en", ), FieldDefinition( - name='ocr', - display_name='OCR confidence', - description='OCR confidence level.', - es_mapping={'type': 'float'}, - search_filter=filters.RangeFilter(0, 100, - description=( - 'Accept only articles for which the Opitical Character Recognition confidence ' - 'indicator is in this range.' - ) - ), + name="ocr", + display_name="OCR confidence", + description="OCR confidence level.", + es_mapping={"type": "float"}, + search_filter=filters.RangeFilter( + 0, + 100, + description=( + "Accept only articles for which the Opitical Character Recognition confidence " + "indicator is in this range." + ), + ), extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('ocr'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("ocr"), ), - sortable=True + sortable=True, ), FieldDefinition( - name='title', - display_name='Article title', - description='Title of the article.', + name="title", + display_name="Article title", + description="Title of the article.", extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('ti'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("ti"), external_file=True, ), - visualizations=['wordcloud'] + visualizations=["wordcloud"], ), FieldDefinition( - name='start_column', - es_mapping={'type': 'keyword'}, - display_name='Starting column', - description='Which column the article starts in.', + name="start_column", + es_mapping={"type": "keyword"}, + display_name="Starting column", + description="Which column the article starts in.", extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('sc'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("sc"), external_file=True, - ) + ), ), FieldDefinition( - name='page_count', - display_name='Page count', - description='How many pages the article covers.', - es_mapping={'type': 'integer'}, + name="page_count", + display_name="Page count", + description="How many pages the article covers.", + es_mapping={"type": "integer"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('pc'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("pc"), external_file=True, - ) + ), ), FieldDefinition( - name='word_count', - display_name='Word count', - description='Number of words in the article.', - es_mapping={'type': 'integer'}, + name="word_count", + display_name="Word count", + description="Number of words in the article.", + es_mapping={"type": "integer"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('wordCount'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("wordCount"), external_file=True, - ) + ), ), FieldDefinition( - name='category', + name="category", csv_core=True, - display_name='Category', - description='Article category.', - es_mapping={'type': 'keyword'}, + display_name="Category", + description="Article category.", + es_mapping={"type": "keyword"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('ct'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("ct"), external_file=True, ), search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=26 + description="Accept only articles in these categories.", option_count=26 ), - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='page_no', - display_name='Page number', - description='At which page the article starts.', - es_mapping={'type': 'integer'}, + name="page_no", + display_name="Page number", + description="At which page the article starts.", + es_mapping={"type": "integer"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), + lambda metadata: Tag("id", string=metadata["id"]), ParentTag(2), - Tag('pa'), + Tag("pa"), external_file=True, - transform=lambda x: re.sub('[\[\]]', '', x) - ) + transform=lambda x: re.sub("[\[\]]", "", x), + ), ), FieldDefinition( - name='image_path', - display_name='Image path', - es_mapping={'type': 'keyword'}, - description='Path of scan.', - extractor=extract.Metadata('image_path'), + name="image_path", + display_name="Image path", + es_mapping={"type": "keyword"}, + description="Path of scan.", + extractor=extract.Metadata("image_path"), hidden=True, - downloadable=False + downloadable=False, ), ] diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index fc46c2d39..683ae184f 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -36,7 +36,6 @@ def _rdf_description_extractor(tag: Tag, section='xml', **kwargs) -> extract.XML ) - class Rechtspraak(XMLCorpusDefinition): title = "Judicial system Netherlands" description = "Open data of (anonymised) court rulings of the Dutch judicial system" @@ -146,179 +145,173 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None fields = [ FieldDefinition( - name='id', - display_name='ID', - description='', + name="id", + display_name="ID", + description="", es_mapping=keyword_mapping(), - extractor=_rdf_description_extractor(Tag('dcterms:identifier')), + extractor=_rdf_description_extractor(Tag("dcterms:identifier")), csv_core=True, ), FieldDefinition( - name='has_content', - display_name='Has text content', - description='Document has available text content.', - es_mapping={'type': 'boolean'}, + name="has_content", + display_name="Has text content", + description="Document has available text content.", + es_mapping={"type": "boolean"}, extractor=extract.Backup( - extract.XML(Tag('uitspraak'), flatten=True), - extract.XML(Tag('conclusie'), flatten=True), + extract.XML(Tag("uitspraak"), flatten=True), + extract.XML(Tag("conclusie"), flatten=True), extract.Constant(False), - transform=bool + transform=bool, ), search_filter=filters.BooleanFilter( - true='has content', - false='does not have content', - description=( - 'Accept only articles that have available text content.' - ) + true="has content", + false="does not have content", + description=("Accept only articles that have available text content."), ), ), FieldDefinition( - name='year', - display_name='Year', - es_mapping={'type': 'integer'}, - extractor=extract.Metadata('year'), - search_filter=filters.RangeFilter(min_date.year, max_date.year) + name="year", + display_name="Year", + es_mapping={"type": "integer"}, + extractor=extract.Metadata("year"), + search_filter=filters.RangeFilter(min_date.year, max_date.year), ), FieldDefinition( - name='date', - display_name='Date', - extractor=_rdf_description_extractor(Tag('dcterms:date')), - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Date", + extractor=_rdf_description_extractor(Tag("dcterms:date")), + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, results_overview=True, csv_core=True, search_filter=filters.DateFilter( min_date, max_date, - description=( - 'Accept only rulings with date in this range.' - ) + description=("Accept only rulings with date in this range."), ), - ), FieldDefinition( - name='issued', - display_name='Publication Date', - extractor=_rdf_description_extractor(Tag('dcterms:issued')), - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="issued", + display_name="Publication Date", + extractor=_rdf_description_extractor(Tag("dcterms:issued")), + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only rulings with publication date in this range.' - ) + "Accept only rulings with publication date in this range." + ), ), ), FieldDefinition( - name='publisher', - display_name='Publisher', - extractor=_rdf_description_extractor(Tag('dcterms:publisher')), - es_mapping={'type': 'keyword'}, - language='nl', + name="publisher", + display_name="Publisher", + extractor=_rdf_description_extractor(Tag("dcterms:publisher")), + es_mapping={"type": "keyword"}, + language="nl", ), FieldDefinition( - name='creator', - display_name='Court', - extractor=_rdf_description_extractor(Tag('dcterms:creator')), - es_mapping={'type': 'keyword'}, + name="creator", + display_name="Court", + extractor=_rdf_description_extractor(Tag("dcterms:creator")), + es_mapping={"type": "keyword"}, csv_core=True, results_overview=True, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings of selected courts.', - option_count=9999 + description="Accept only rulings of selected courts.", option_count=9999 ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='zaaknr', - display_name='Case Number', + name="zaaknr", + display_name="Case Number", es_mapping=keyword_mapping(), - extractor=_rdf_description_extractor(Tag('psi:zaaknummer')), + extractor=_rdf_description_extractor(Tag("psi:zaaknummer")), ), FieldDefinition( - name='type', - display_name='Type', - extractor=_rdf_description_extractor(Tag('dcterms:type')), - es_mapping={'type': 'keyword'}, + name="type", + display_name="Type", + extractor=_rdf_description_extractor(Tag("dcterms:type")), + es_mapping={"type": "keyword"}, csv_core=True, results_overview=True, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings of selected type.', - option_count=2 + description="Accept only rulings of selected type.", option_count=2 ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='procedure', - display_name='(type of) Procedure', - extractor=_rdf_description_extractor(Tag('psi:procedure')), + name="procedure", + display_name="(type of) Procedure", + extractor=_rdf_description_extractor(Tag("psi:procedure")), csv_core=True, - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings of selected procedure type.', - option_count=44 + description="Accept only rulings of selected procedure type.", + option_count=44, ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='spatial', - display_name='Location', + name="spatial", + display_name="Location", es_mapping=keyword_mapping(), - extractor=_rdf_description_extractor(Tag('dcterms:spatial')), - language='nl', + extractor=_rdf_description_extractor(Tag("dcterms:spatial")), + language="nl", ), FieldDefinition( - name='subject', - display_name='Area of law', - extractor=_rdf_description_extractor(Tag('dcterms:subject')), + name="subject", + display_name="Area of law", + extractor=_rdf_description_extractor(Tag("dcterms:subject")), csv_core=True, - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings within this area of law.', - option_count=32 + description="Accept only rulings within this area of law.", + option_count=32, ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='title', - display_name='Title', - extractor=_rdf_description_extractor( - Tag('dcterms:title'), section='html'), + name="title", + display_name="Title", + extractor=_rdf_description_extractor(Tag("dcterms:title"), section="html"), results_overview=True, search_field_core=True, - language='nl', + language="nl", ), FieldDefinition( - name='abstract', - display_name='Abstract', - extractor=extract.XML(Tag('inhoudsindicatie'), flatten=True), + name="abstract", + display_name="Abstract", + extractor=extract.XML(Tag("inhoudsindicatie"), flatten=True), results_overview=True, - language='nl', + language="nl", ), FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - es_mapping=main_content_mapping(True, True, True, 'nl'), + name="content", + display_name="Content", + display_type="text_content", + es_mapping=main_content_mapping(True, True, True, "nl"), extractor=extract.Backup( - extract.XML(Tag('uitspraak'), flatten=True), - extract.XML(Tag('conclusie'), flatten=True), - extract.Constant('Content not available') + extract.XML(Tag("uitspraak"), flatten=True), + extract.XML(Tag("conclusie"), flatten=True), + extract.Constant("Content not available"), ), csv_core=True, search_field_core=True, - language='nl', + language="nl", + visualizations=["ngram"], ), FieldDefinition( - name='url', - display_name='Source URL', - display_type='url', - description='URL of the case on rechtspraak.nl', + name="url", + display_name="Source URL", + display_type="url", + description="URL of the case on rechtspraak.nl", es_mapping=keyword_mapping(), extractor=_rdf_description_extractor( - Tag('dcterms:identifier'), section='html') - ) + Tag("dcterms:identifier"), section="html" + ), + ), ] diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index bab8a5ea7..65fbcbf09 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -96,172 +96,151 @@ def sources(self, start=datetime.min, end=datetime.max): fields = [ FieldDefinition( - name='date', - display_name='Publication Date', - description='Publication date, parsed to yyyy-MM-dd format', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Publication Date", + description="Publication date, parsed to yyyy-MM-dd format", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, hidden=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only articles with publication date in this range.' - ) + "Accept only articles with publication date in this range." + ), + ), + extractor=extract.Metadata( + "date", transform=lambda x: x.strftime("%Y-%m-%d") ), - extractor=extract.Metadata('date', - transform=lambda x: x.strftime( - '%Y-%m-%d') - ) ), FieldDefinition( - name='source', - display_name='Source', - description='Library where the microfilm is sourced', + name="source", + display_name="Source", + description="Library where the microfilm is sourced", es_mapping=keyword_mapping(), extractor=extract.XML( - Tag('metadatainfo'), - Tag('sourceLibrary'), + Tag("metadatainfo"), + Tag("sourceLibrary"), toplevel=True, - applicable=after(1985) - ) + applicable=after(1985), + ), ), FieldDefinition( - name='edition', - display_name='Edition', + name="edition", + display_name="Edition", es_mapping=keyword_mapping(), extractor=extract.Choice( + extract.XML(Tag("ed"), toplevel=True, applicable=until(1985)), extract.XML( - Tag('ed'), - toplevel=True, - applicable=until(1985) + Tag("ed"), toplevel=True, multiple=True, applicable=after(1985) ), - extract.XML( - Tag('ed'), - toplevel=True, multiple=True, - applicable=after(1985) - ) ), - csv_core=True + csv_core=True, ), FieldDefinition( - name='issue', - display_name='Issue number', - es_mapping={'type': 'integer'}, - description='Source issue number.', + name="issue", + display_name="Issue number", + es_mapping={"type": "integer"}, + description="Source issue number.", extractor=extract.XML( - Tag('is'), + Tag("is"), toplevel=True, # Hardcoded to ignore one particular issue with source data - transform=lambda x: (62226 if x == "6222662226" else int(x)) + transform=lambda x: (62226 if x == "6222662226" else int(x)), ), sortable=True, - csv_core=True + csv_core=True, ), FieldDefinition( - name='volume', - display_name='Volume', - description='Volume number.', + name="volume", + display_name="Volume", + description="Volume number.", es_mapping=keyword_mapping(), - extractor=extract.XML( - Tag('volNum'), - toplevel=True, - applicable=after(1985) - ), - csv_core=True + extractor=extract.XML(Tag("volNum"), toplevel=True, applicable=after(1985)), + csv_core=True, ), FieldDefinition( - name='date-pub', - display_name='Publication Date', + name="date-pub", + display_name="Publication Date", es_mapping=keyword_mapping(), csv_core=True, results_overview=True, sortable=True, - description='Publication date as full string, as found in source file', - extractor=extract.XML( - Tag('da'), - toplevel=True - ) + description="Publication date as full string, as found in source file", + extractor=extract.XML(Tag("da"), toplevel=True), ), FieldDefinition( - name='ocr', - display_name='OCR confidence', - description='OCR confidence level.', - es_mapping={'type': 'float'}, - search_filter=filters.RangeFilter(0, 100, - description=( - 'Accept only articles for which the Opitical Character Recognition confidence ' - 'indicator is in this range.' - ) - ), - extractor=extract.XML(Tag('ocr'), transform=float), - sortable=True + name="ocr", + display_name="OCR confidence", + description="OCR confidence level.", + es_mapping={"type": "float"}, + search_filter=filters.RangeFilter( + 0, + 100, + description=( + "Accept only articles for which the Opitical Character Recognition confidence " + "indicator is in this range." + ), + ), + extractor=extract.XML(Tag("ocr"), transform=float), + sortable=True, ), FieldDefinition( - name='date-end', - display_name='Ending date', + name="date-end", + display_name="Ending date", es_mapping=keyword_mapping(), description=( - 'Ending date of publication. ' - 'For issues that span more than 1 day.' + "Ending date of publication. " "For issues that span more than 1 day." ), - extractor=extract.XML( - Tag('tdate'), toplevel=True, - applicable=after(1985) - ) + extractor=extract.XML(Tag("tdate"), toplevel=True, applicable=after(1985)), ), FieldDefinition( - name='page-count', - display_name='Image count', - description='Page count: number of images present in the issue.', - es_mapping={'type': 'integer'}, - extractor=extract.XML( - Tag('ip'), toplevel=True, transform=int - ), - sortable=True + name="page-count", + display_name="Image count", + description="Page count: number of images present in the issue.", + es_mapping={"type": "integer"}, + extractor=extract.XML(Tag("ip"), toplevel=True, transform=int), + sortable=True, ), FieldDefinition( - name='page-type', - display_name='Page type', - description='Supplement in which article occurs.', - es_mapping={'type': 'keyword'}, + name="page-type", + display_name="Page type", + description="Supplement in which article occurs.", + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( description=( - 'Accept only articles that occur in the relevant ' - 'supplement. Only after 1985.' + "Accept only articles that occur in the relevant " + "supplement. Only after 1985." ), - option_count=2 + option_count=2, ), extractor=extract.XML( - ParentTag(), - Tag('pageid'), - attribute='isPartOf', - applicable=after(1985) - ) + ParentTag(), Tag("pageid"), attribute="isPartOf", applicable=after(1985) + ), ), FieldDefinition( - name='supplement-title', - display_name='Supplement title', - description='Supplement title.', + name="supplement-title", + display_name="Supplement title", + description="Supplement title.", extractor=extract.XML( ParentTag(), - Tag('pageid'), - Tag('supptitle'), + Tag("pageid"), + Tag("supptitle"), multiple=True, - applicable=after(1985) + applicable=after(1985), ), ), FieldDefinition( - name='supplement-subtitle', - display_name='Supplement subtitle', - description='Supplement subtitle.', + name="supplement-subtitle", + display_name="Supplement subtitle", + description="Supplement subtitle.", extractor=extract.XML( ParentTag(), - Tag('pageid'), - Tag('suppsubtitle'), + Tag("pageid"), + Tag("suppsubtitle"), multiple=True, - applicable=after(1985) - ) + applicable=after(1985), + ), ), # There are no datapoints where this is True, hence the outcomment # FieldDefinition( @@ -284,183 +263,158 @@ def sources(self, start=datetime.min, end=datetime.max): # ) # ), FieldDefinition( - name='id', - display_name='ID', - description='Article identifier.', + name="id", + display_name="ID", + description="Article identifier.", es_mapping=keyword_mapping(), - extractor=extract.XML(Tag('id')) + extractor=extract.XML(Tag("id")), ), FieldDefinition( - name='ocr-relevant', - display_name='OCR relevant', - description='Whether OCR confidence level is relevant.', - es_mapping={'type': 'boolean'}, + name="ocr-relevant", + display_name="OCR relevant", + description="Whether OCR confidence level is relevant.", + es_mapping={"type": "boolean"}, extractor=extract.XML( - Tag('ocr'), attribute='relevant', + Tag("ocr"), + attribute="relevant", transform=string_contains("yes"), - ) + ), ), FieldDefinition( - name='column', - display_name='Column', + name="column", + display_name="Column", description=( - 'Starting column: a string to label the column' - 'where article starts.' + "Starting column: a string to label the column" "where article starts." ), es_mapping=keyword_mapping(), - extractor=extract.XML(Tag('sc')) + extractor=extract.XML(Tag("sc")), ), FieldDefinition( - name='page', - display_name='Page', - description='Start page label, from source (1, 2, 17A, ...).', + name="page", + display_name="Page", + description="Start page label, from source (1, 2, 17A, ...).", es_mapping=keyword_mapping(), extractor=extract.Choice( - extract.XML(Tag('pa'), applicable=until(1985)), - extract.XML(ParentTag(), Tag('pa'), applicable=after(1985)) - ) + extract.XML(Tag("pa"), applicable=until(1985)), + extract.XML(ParentTag(), Tag("pa"), applicable=after(1985)), + ), ), FieldDefinition( - name='pages', - display_name='Page count', - es_mapping={'type': 'integer'}, + name="pages", + display_name="Page count", + es_mapping={"type": "integer"}, description=( - 'Page count: total number of pages containing sections ' - 'of the article.' + "Page count: total number of pages containing sections " + "of the article." ), - extractor=extract.XML( - Tag('pc'), transform=int - ), - sortable=True + extractor=extract.XML(Tag("pc"), transform=int), + sortable=True, ), FieldDefinition( - name='title', - display_name='Title', + name="title", + display_name="Title", results_overview=True, search_field_core=True, - visualizations=['wordcloud'], - description='Article title.', - extractor=extract.XML(Tag('ti')) + visualizations=["wordcloud"], + description="Article title.", + extractor=extract.XML(Tag("ti")), ), FieldDefinition( - name='subtitle', - display_name='Subtitle', - description='Article subtitle.', - extractor=extract.XML(Tag('ta'), multiple=True), - search_field_core=True + name="subtitle", + display_name="Subtitle", + description="Article subtitle.", + extractor=extract.XML(Tag("ta"), multiple=True), + search_field_core=True, ), FieldDefinition( - name='subheader', - display_name='Subheader', - description='Article subheader (product dependent field).', + name="subheader", + display_name="Subheader", + description="Article subheader (product dependent field).", extractor=extract.XML( - Tag('subheader'), multiple=True, - applicable=after(1985) - ) + Tag("subheader"), multiple=True, applicable=after(1985) + ), ), FieldDefinition( - name='author', - display_name='Author', - description='Article author.', + name="author", + display_name="Author", + description="Article author.", es_mapping=keyword_mapping(True), extractor=extract.Choice( - extract.XML( - Tag('au'), multiple=True, - applicable=until(1985) - ), - extract.XML( - Tag('au_composed'), multiple=True, - applicable=after(1985) - ) + extract.XML(Tag("au"), multiple=True, applicable=until(1985)), + extract.XML(Tag("au_composed"), multiple=True, applicable=after(1985)), ), search_field_core=True, - csv_core=True + csv_core=True, ), FieldDefinition( - name='source-paper', - display_name='Source paper', - description='Credited as source.', + name="source-paper", + display_name="Source paper", + description="Credited as source.", es_mapping=keyword_mapping(True), - extractor=extract.XML( - Tag('altSource'), multiple=True - ) + extractor=extract.XML(Tag("altSource"), multiple=True), ), FieldDefinition( - name='category', - visualizations=['resultscount', 'termfrequency'], - display_name='Category', - description='Article subject categories.', - es_mapping={'type': 'keyword'}, + name="category", + visualizations=["resultscount", "termfrequency"], + display_name="Category", + description="Article subject categories.", + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=25 + description="Accept only articles in these categories.", option_count=25 ), - extractor=extract.XML(Tag('ct'), multiple=True), - csv_core=True + extractor=extract.XML(Tag("ct"), multiple=True), + csv_core=True, ), FieldDefinition( - name='illustration', - display_name='Illustration', - description=( - 'Tables and other illustrations associated with the article.' - ), - es_mapping={'type': 'keyword'}, - visualizations=['resultscount', 'termfrequency'], + name="illustration", + display_name="Illustration", + description=("Tables and other illustrations associated with the article."), + es_mapping={"type": "keyword"}, + visualizations=["resultscount", "termfrequency"], search_filter=filters.MultipleChoiceFilter( description=( - 'Accept only articles associated with these types ' - 'of illustrations.'), - option_count=7 + "Accept only articles associated with these types " + "of illustrations." + ), + option_count=7, ), extractor=extract.Choice( + extract.XML(Tag("il"), multiple=True, applicable=until(1985)), extract.XML( - Tag('il'), multiple=True, - applicable=until(1985) + Tag("il"), attribute="type", multiple=True, applicable=after(1985) ), - extract.XML( - Tag('il'), attribute='type', multiple=True, - applicable=after(1985) - ) ), - csv_core=True + csv_core=True, ), FieldDefinition( - name='content-preamble', - display_name='Content preamble', - description='Raw OCR\'ed text (preamble).', - extractor=extract.XML( - Tag('text'), - Tag('text.preamble'), - flatten=True - ) + name="content-preamble", + display_name="Content preamble", + description="Raw OCR'ed text (preamble).", + extractor=extract.XML(Tag("text"), Tag("text.preamble"), flatten=True), ), FieldDefinition( - name='content-heading', - display_name='Content heading', - description='Raw OCR\'ed text (header).', - extractor=extract.XML( - Tag('text'), - Tag('text.title'), - flatten=True - ) + name="content-heading", + display_name="Content heading", + description="Raw OCR'ed text (header).", + extractor=extract.XML(Tag("text"), Tag("text.title"), flatten=True), ), FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - es_mapping=main_content_mapping(True, True, True, 'en'), - visualizations=['wordcloud'], - description='Raw OCR\'ed text (content).', + name="content", + display_name="Content", + display_type="text_content", + es_mapping=main_content_mapping(True, True, True, "en"), + visualizations=["wordcloud", "ngram"], + description="Raw OCR'ed text (content).", results_overview=True, search_field_core=True, extractor=extract.XML( - Tag('text'), - Tag('text.cr'), + Tag("text"), + Tag("text.cr"), multiple=True, flatten=True, - transform='\n'.join, + transform="\n".join, ), - language='en', + language="en", ), ]