diff --git a/backend/.gitignore b/backend/.gitignore index 67e11925c..3396fb1a3 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -43,10 +43,6 @@ ianalyzer/config.py # csv downloads download/csv_files/ -# word models -corpora/*/wm/* -!corpora/*/wm/documentation.md - # file storage -test_data/ -data/ +/test_data/ +/data/ diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py index 98e297537..671e68ac2 100644 --- a/backend/addcorpus/models.py +++ b/backend/addcorpus/models.py @@ -1,5 +1,12 @@ import warnings +from django.contrib import admin +from django.contrib.auth.models import Group +from django.contrib.postgres.fields import ArrayField +from django.core.exceptions import ValidationError +from django.db import models +from django.db.models.constraints import UniqueConstraint + from addcorpus.constants import CATEGORIES, MappingType, VisualizationType from addcorpus.validation.creation import ( validate_es_mapping, validate_field_language, validate_implication, validate_language_code, @@ -12,15 +19,10 @@ ) from addcorpus.validation.indexing import (validate_essential_fields, validate_has_configuration, validate_language_field, validate_has_data_directory) -from addcorpus.validation.publishing import (validate_default_sort, - validate_ngram_has_date_field) -from django.contrib import admin -from django.contrib.auth.models import Group -from django.contrib.postgres.fields import ArrayField -from django.core.exceptions import ValidationError -from django.db import models -from django.db.models.constraints import UniqueConstraint - +from addcorpus.validation.publishing import ( + validate_default_sort, + validate_ngram_has_date_field, +) from ianalyzer.elasticsearch import elasticsearch MAX_LENGTH_NAME = 126 @@ -264,14 +266,15 @@ def clean(self): @property def has_named_entities(self): - client = elasticsearch(self.es_index) + from es.search import total_hits + + client = elasticsearch(self.corpus.name) try: - mapping = 
client.indices.get_mapping( - index=self.es_index) - # in production, the index name can be different from the object's es_index value - index_name = list(mapping.keys())[0] - fields = mapping[index_name].get('mappings', {}).get('properties', {}).keys() - if any(field.endswith(':ner') for field in fields): + # we check if any fields exist for filtering named entities + ner_exists = client.search( + index=self.es_index, query={"exists": {"field": "ner:*"}}, size=0 + ) + if total_hits(ner_exists): return True except: return False diff --git a/backend/addcorpus/python_corpora/corpus.py b/backend/addcorpus/python_corpora/corpus.py index 590edab83..3b0e2594a 100644 --- a/backend/addcorpus/python_corpora/corpus.py +++ b/backend/addcorpus/python_corpora/corpus.py @@ -14,6 +14,7 @@ from ianalyzer_readers.readers.xml import XMLReader from ianalyzer_readers.readers.csv import CSVReader from ianalyzer_readers.readers.html import HTMLReader +from ianalyzer_readers.readers.rdf import RDFReader from ianalyzer_readers.readers.xlsx import XLSXReader from addcorpus.python_corpora.filters import Filter @@ -340,6 +341,11 @@ def source2dicts(self, source, *nargs, **kwargs): yield field_dict +class RDFCorpusDefinition(CorpusDefinition, RDFReader): + ''' + A RDFCorpus is any corpus that extracts its data from Linked Data files. 
+ ''' + # Fields ###################################################################### diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index 9d295f82b..5fc408eeb 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -96,7 +96,7 @@ def sources(self, start=min_date, end=max_date): if extension != '.xml': logger.debug(self.non_xml_msg.format(full_path)) continue - #def_match = self.definition_pattern.match(name) + # def_match = self.definition_pattern.match(name) article_match = self.article_pattern.match(name) if article_match: parts = name.split("_") @@ -130,189 +130,188 @@ def sources(self, start=min_date, end=max_date): 'issue' ) - @property def fields(self): - return [FieldDefinition( - name="url", - display_name="Delpher URL", - description="Link to record on Delpher", - display_type='url', - es_mapping=keyword_mapping(), - extractor=XML( - lambda metadata: Tag('recordIdentifier', string=metadata['id']), - SiblingTag('identifier'), - external_file=True - ) - ), - FieldDefinition( - name='date', - display_name='Date', - description='Publication date.', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, - results_overview=True, - csv_core=True, - visualizations=['resultscount', 'termfrequency'], - search_filter=filters.DateFilter( - self.min_date, - self.max_date, - description=( - 'Accept only articles with publication date in this range.' 
- ) + return [ + FieldDefinition( + name="url", + display_name="Delpher URL", + description="Link to record on Delpher", + display_type="url", + es_mapping=keyword_mapping(), + extractor=XML( + lambda metadata: Tag("recordIdentifier", string=metadata["id"]), + SiblingTag("identifier"), + external_file=True, + ), ), - extractor=Metadata('date') - ), - FieldDefinition( - name='ocr', - display_name='OCR confidence', - description='OCR confidence level.', - es_mapping={'type': 'float'}, - search_filter=filters.RangeFilter(0, 100, - description=( - 'Accept only articles for which the Opitical Character Recognition confidence ' - 'indicator is in this range.' - ) - ), - extractor=XML( - Tag('OCRConfidencelevel'), - external_file=True, - transform=lambda x: float(x)*100 + FieldDefinition( + name="date", + display_name="Date", + description="Publication date.", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, + results_overview=True, + csv_core=True, + visualizations=["resultscount", "termfrequency"], + search_filter=filters.DateFilter( + self.min_date, + self.max_date, + description=( + "Accept only articles with publication date in this range." + ), + ), + extractor=Metadata("date"), ), - sortable=True - ), - FieldDefinition( - name='newspaper_title', - display_name='Newspaper title', - description='Title of the newspaper', - results_overview=True, - search_field_core=True, - es_mapping={'type': 'keyword'}, - visualizations=['resultscount', 'termfrequency'], - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these newspapers.', - option_count=len(self.papers) + FieldDefinition( + name="ocr", + display_name="OCR confidence", + description="OCR confidence level.", + es_mapping={"type": "float"}, + search_filter=filters.RangeFilter( + 0, + 100, + description=( + "Accept only articles for which the Opitical Character Recognition confidence " + "indicator is in this range." 
+ ), + ), + extractor=XML( + Tag("OCRConfidencelevel"), + external_file=True, + transform=lambda x: float(x) * 100, + ), + sortable=True, ), - extractor=Metadata('title') - ), - FieldDefinition( - name='version_of', - display_name='Version of', - description='The newspaper is a version of this newspaper.', - es_mapping={'type': 'keyword'}, - extractor=Metadata('isVersionOf') - ), - FieldDefinition( - name='issue_number', - display_name='Issue number', - description='Issue number of the newspaper', - csv_core=True, - es_mapping={'type': 'integer'}, - extractor=Metadata('issuenumber') - ), - FieldDefinition( - name='category', - display_name='Category', - description='Whether the item is an article, advertisment, etc.', - csv_core=True, - es_mapping={'type': 'keyword'}, - extractor=XML( - lambda metadata: Tag('recordIdentifier', string=metadata['id']), - SiblingTag('subject'), - external_file=True + FieldDefinition( + name="newspaper_title", + display_name="Newspaper title", + description="Title of the newspaper", + results_overview=True, + search_field_core=True, + es_mapping={"type": "keyword"}, + visualizations=["resultscount", "termfrequency"], + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles in these newspapers.", + option_count=len(self.papers), + ), + extractor=Metadata("title"), ), - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=2, + FieldDefinition( + name="version_of", + display_name="Version of", + description="The newspaper is a version of this newspaper.", + es_mapping={"type": "keyword"}, + extractor=Metadata("isVersionOf"), ), - ), - FieldDefinition( - name='circulation', - display_name='Circulation', - description='The area in which the newspaper was distributed.', - es_mapping={'type': 'keyword'}, - csv_core=True, - extractor=Metadata('spatial'), - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles appearing in 
specific areas.', - option_count=7 + FieldDefinition( + name="issue_number", + display_name="Issue number", + description="Issue number of the newspaper", + csv_core=True, + es_mapping={"type": "integer"}, + extractor=Metadata("issuenumber"), ), - ), - FieldDefinition( - name='publisher', - display_name='Publisher', - description='Publisher', - es_mapping=keyword_mapping(), - search_field_core=True, - extractor=Metadata('publisher') - ), - FieldDefinition( - name='language', - display_name='Language', - description='language', - es_mapping={'type': 'keyword'}, - extractor=Metadata('language') - ), - FieldDefinition( - name='article_title', - display_name='Article title', - description='Article title', - results_overview=True, - search_field_core=True, - extractor=XML(Tag('title'), flatten=True, toplevel=True) - ), - FieldDefinition( - name='id', - display_name='ID', - description='Unique identifier of the entry.', - extractor=Metadata('id') - ), - FieldDefinition( - name='source', - display_name='Source', - description='Library or archive which keeps the hard copy of this newspaper.', - es_mapping={'type': 'keyword'}, - extractor=Metadata('source') - ), - FieldDefinition( - name='pub_place', - display_name='Publication Place', - description='Where the newspaper was published', - es_mapping={'type': 'keyword'}, - extractor=Metadata('pub_place') - ), - FieldDefinition( - name='temporal', - display_name='Edition', - description='Newspaper edition for the given date', - results_overview=True, - csv_core=True, - es_mapping={'type': 'keyword'}, - visualizations=['resultscount', 'termfrequency'], - search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in newspapers which appeared as a given edition.', - option_count=3, + FieldDefinition( + name="category", + display_name="Category", + description="Whether the item is an article, advertisment, etc.", + csv_core=True, + es_mapping={"type": "keyword"}, + extractor=XML( + lambda metadata: 
Tag("recordIdentifier", string=metadata["id"]), + SiblingTag("subject"), + external_file=True, + ), + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles in these categories.", + option_count=2, + ), ), - extractor=Metadata('temporal') - ), - FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - description='Text content.', - es_mapping=main_content_mapping(True, True, True, 'nl'), - results_overview=True, - search_field_core=True, - extractor=XML( - Tag('p'), - multiple=True, - flatten=True, - toplevel=True, - transform='\n'.join, + FieldDefinition( + name="circulation", + display_name="Circulation", + description="The area in which the newspaper was distributed.", + es_mapping={"type": "keyword"}, + csv_core=True, + extractor=Metadata("spatial"), + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles appearing in specific areas.", + option_count=7, + ), ), - visualizations=["wordcloud"], - language='nl', - ), - ] - - - + FieldDefinition( + name="publisher", + display_name="Publisher", + description="Publisher", + es_mapping=keyword_mapping(), + search_field_core=True, + extractor=Metadata("publisher"), + ), + FieldDefinition( + name="language", + display_name="Language", + description="language", + es_mapping={"type": "keyword"}, + extractor=Metadata("language"), + ), + FieldDefinition( + name="article_title", + display_name="Article title", + description="Article title", + results_overview=True, + search_field_core=True, + extractor=XML(Tag("title"), flatten=True, toplevel=True), + ), + FieldDefinition( + name="id", + display_name="ID", + description="Unique identifier of the entry.", + extractor=Metadata("id"), + ), + FieldDefinition( + name="source", + display_name="Source", + description="Library or archive which keeps the hard copy of this newspaper.", + es_mapping={"type": "keyword"}, + extractor=Metadata("source"), + ), + FieldDefinition( + name="pub_place", + 
display_name="Publication Place", + description="Where the newspaper was published", + es_mapping={"type": "keyword"}, + extractor=Metadata("pub_place"), + ), + FieldDefinition( + name="temporal", + display_name="Edition", + description="Newspaper edition for the given date", + results_overview=True, + csv_core=True, + es_mapping={"type": "keyword"}, + visualizations=["resultscount", "termfrequency"], + search_filter=filters.MultipleChoiceFilter( + description="Accept only articles in newspapers which appeared as a given edition.", + option_count=3, + ), + extractor=Metadata("temporal"), + ), + FieldDefinition( + name="content", + display_name="Content", + display_type="text_content", + description="Text content.", + es_mapping=main_content_mapping(True, True, True, "nl"), + results_overview=True, + search_field_core=True, + extractor=XML( + Tag("p"), + multiple=True, + flatten=True, + toplevel=True, + transform="\n".join, + ), + visualizations=["wordcloud", "ngram"], + language="nl", + ), + ] diff --git a/backend/corpora/goodreads/goodreads.py b/backend/corpora/goodreads/goodreads.py index eefb1bbb8..fcaef48d2 100644 --- a/backend/corpora/goodreads/goodreads.py +++ b/backend/corpora/goodreads/goodreads.py @@ -64,200 +64,192 @@ def sources(self, start, end): fields = [ FieldDefinition( - name='year', - display_name='Year', - description='Year the review was written.', + name="year", + display_name="Year", + description="Year the review was written.", extractor=CSV( - 'date', - transform=lambda x: datetime.strptime( - x, '%b %d, %Y').strftime('%Y') + "date", + transform=lambda x: datetime.strptime(x, "%b %d, %Y").strftime("%Y"), ), - es_mapping={'type': 'integer'}, + es_mapping={"type": "integer"}, search_filter=RangeFilter( min_date.year, max_date.year, - description=( - 'Accept only book reviews written in this range.' 
- ) + description=("Accept only book reviews written in this range."), ), - hidden=True + hidden=True, ), FieldDefinition( - name='id', - display_name='ID', - description='ID of the review.', - extractor=CSV('id'), - es_mapping={'type': 'keyword'}, + name="id", + display_name="ID", + description="ID of the review.", + extractor=CSV("id"), + es_mapping={"type": "keyword"}, csv_core=True, ), FieldDefinition( - name='book_title', - display_name='Book title', - description='The title of the book reviews were made for. Encompasses all editions.', - extractor=Metadata('book_title'), - es_mapping={'type': 'keyword'}, + name="book_title", + display_name="Book title", + description="The title of the book reviews were made for. Encompasses all editions.", + extractor=Metadata("book_title"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews made for these titles.', - option_count=154 + description="Accept only reviews made for these titles.", + option_count=154, ), - csv_core=True + csv_core=True, ), FieldDefinition( - name='original_language', - display_name='Original language', - description='The original language the book reviews were made for was written in.', - extractor=Metadata('original_language'), - es_mapping={'type': 'keyword'}, + name="original_language", + display_name="Original language", + description="The original language the book reviews were made for was written in.", + extractor=Metadata("original_language"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews made for titles originally in this language(s).', - option_count=8 + description="Accept only reviews made for titles originally in this language(s).", + option_count=8, ), csv_core=True, ), FieldDefinition( - name='edition_id', - display_name='Edition ID', - description='ID of the edition the review was made for.', - extractor=CSV('edition_id'), - es_mapping={'type': 'keyword'}, + 
name="edition_id", + display_name="Edition ID", + description="ID of the edition the review was made for.", + extractor=CSV("edition_id"), + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='edition_language', - display_name='Edition language', - description='The language that the edition that the review is for was written in', - extractor=CSV('edition_language'), - es_mapping={'type': 'keyword'}, + name="edition_language", + display_name="Edition language", + description="The language that the edition that the review is for was written in", + extractor=CSV("edition_language"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only editions written in these languages.', - option_count=8 + description="Accept only editions written in these languages.", + option_count=8, ), results_overview=True, csv_core=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='book_genre', - display_name='Genre', - description='The genre of the reviewed book', - extractor=Metadata('book_genre'), - es_mapping={'type': 'keyword'}, + name="book_genre", + display_name="Genre", + description="The genre of the reviewed book", + extractor=Metadata("book_genre"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews of books in this genre', - option_count=8 + description="Accept only reviews of books in this genre", option_count=8 ), - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='age_category', - display_name='Age category', - description='The age category of the target audience of the reviewed book', - extractor=Metadata('age_category'), - es_mapping={'type': 'keyword'}, + name="age_category", + display_name="Age category", + description="The age category of the target audience of the reviewed book", + 
extractor=Metadata("age_category"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews of books written for this age category', - option_count=3 + description="Accept only reviews of books written for this age category", + option_count=3, ), - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='url', - display_name='Source URL', - display_type='url', - description='Link to the the review on Goodreads', - extractor=CSV('url'), - es_mapping={'type': 'keyword'}, + name="url", + display_name="Source URL", + display_type="url", + description="Link to the the review on Goodreads", + extractor=CSV("url"), + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='text', - display_name='Text', - description='Fulltext of the review.', - extractor=CSV('text'), + name="text", + display_name="Text", + description="Fulltext of the review.", + extractor=CSV("text"), es_mapping=main_content_mapping(), - display_type='text_content', + display_type="text_content", csv_core=True, results_overview=True, searchable=True, - visualizations=['wordcloud'], + visualizations=["wordcloud"], ), FieldDefinition( - name='language', - display_name='Review language', - description='The language of the review.', - extractor=CSV('language'), - es_mapping={'type': 'keyword'}, + name="language", + display_name="Review language", + description="The language of the review.", + extractor=CSV("language"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews written in these languages.', - option_count=50 + description="Accept only reviews written in these languages.", + option_count=50, ), results_overview=True, csv_core=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='date', - display_name='Date', - description='Date the review was 
written.', + name="date", + display_name="Date", + description="Date the review was written.", extractor=CSV( - 'date', - transform=lambda x: datetime.strptime( - x, '%b %d, %Y').strftime('%Y-%m-%d') + "date", + transform=lambda x: datetime.strptime(x, "%b %d, %Y").strftime( + "%Y-%m-%d" + ), ), - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, ), FieldDefinition( - name='rating_text', - display_name='Goodreads rating', - description='Rating in the Goodreads style, e.g. \'really liked it\'.', - extractor=CSV('rating'), - es_mapping={'type': 'keyword'}, + name="rating_text", + display_name="Goodreads rating", + description="Rating in the Goodreads style, e.g. 'really liked it'.", + extractor=CSV("rating"), + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='rating_no', - display_name='Rating', - description='Rating as a number.', - extractor=CSV('rating_no'), - es_mapping={'type': 'keyword'}, + name="rating_no", + display_name="Rating", + description="Rating as a number.", + extractor=CSV("rating_no"), + es_mapping={"type": "keyword"}, search_filter=MultipleChoiceFilter( - description='Accept only reviews with these ratings.', - option_count=6 + description="Accept only reviews with these ratings.", option_count=6 ), results_overview=True, - visualizations=['resultscount', 'termfrequency'], - visualization_sort='key' + visualizations=["resultscount", "termfrequency"], + visualization_sort="key", ), FieldDefinition( - name='word_count', - display_name='Word count', - description='Number of words (whitespace-delimited) in the review.', - extractor=CSV( - 'text', - transform=lambda x: len(x.split(' ')) - ), - es_mapping={'type': 'integer'}, + name="word_count", + display_name="Word count", + description="Number of words (whitespace-delimited) in the review.", + extractor=CSV("text", transform=lambda x: len(x.split(" "))), + es_mapping={"type": "integer"}, search_filter=RangeFilter( 1, 4225, - 
description=( - 'Accept only book reviews with word count in this range.' - )) + description=("Accept only book reviews with word count in this range."), + ), ), FieldDefinition( - name='edition_publisher', - display_name='Edition publisher', - description='Publisher of the edition the review was written for', + name="edition_publisher", + display_name="Edition publisher", + description="Publisher of the edition the review was written for", extractor=CSV( - 'edition_publisher', + "edition_publisher", ), - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, ), FieldDefinition( - name='edition_publishing_year', - display_name='Edition publishing year', - description='Year the edition the review was written for was published.', + name="edition_publishing_year", + display_name="Edition publishing year", + description="Year the edition the review was written for was published.", extractor=CSV( - 'edition_publishing_year', + "edition_publishing_year", ), - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, ), ] @@ -296,4 +288,3 @@ def update_script(self): } } yield update_body - diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index be6f0f658..2658cca27 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -42,6 +42,7 @@ class GuardianObserver(XMLCorpusDefinition): scan_image_type = getattr(settings, 'GO_SCAN_IMAGE_TYPE', 'application/pdf') languages = ['en'] category = 'periodical' + word_model_path = getattr(settings, "GO_WM", None) @property def es_settings(self): @@ -71,113 +72,112 @@ def sources(self, start=datetime.min, end=datetime.max): fields = [ FieldDefinition( - name='date', - display_name='Publication Date', - description='Publication date, parsed to yyyy-MM-dd format', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Publication Date", + 
description="Publication date, parsed to yyyy-MM-dd format", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, hidden=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only articles with publication date in this range.' - ) + "Accept only articles with publication date in this range." + ), ), extractor=extract.XML( - Tag('NumericPubDate'), - transform=lambda x: '{y}-{m}-{d}'.format(y=x[:4],m=x[4:6],d=x[6:]) + Tag("NumericPubDate"), + transform=lambda x: "{y}-{m}-{d}".format(y=x[:4], m=x[4:6], d=x[6:]), ), sortable=True, ), FieldDefinition( - name='date-pub', + name="date-pub", es_mapping=keyword_mapping(), - display_name='Publication Date', + display_name="Publication Date", csv_core=True, results_overview=True, - description='Publication date as full string, as found in source file', - extractor=extract.XML(Tag('AlphaPubDate')) + description="Publication date as full string, as found in source file", + extractor=extract.XML(Tag("AlphaPubDate")), ), FieldDefinition( - name='id', + name="id", es_mapping=keyword_mapping(), - display_name='ID', - description='Article identifier.', - extractor=extract.XML(Tag('RecordID')), + display_name="ID", + description="Article identifier.", + extractor=extract.XML(Tag("RecordID")), ), FieldDefinition( - name='pub_id', + name="pub_id", es_mapping=keyword_mapping(), - display_name='Publication ID', - description='Publication identifier', - extractor=extract.XML(Tag('PublicationID')) + display_name="Publication ID", + description="Publication identifier", + extractor=extract.XML(Tag("PublicationID")), ), FieldDefinition( - name='page', + name="page", es_mapping=keyword_mapping(), - display_name='Page', - description='Start page label, from source (1, 2, 17A, ...).', - extractor=extract.XML(Tag('StartPage')) + display_name="Page", + description="Start page label, from source (1, 2, 17A, 
...).", + extractor=extract.XML(Tag("StartPage")), ), FieldDefinition( - name='title', - display_name='Title', + name="title", + display_name="Title", search_field_core=True, - visualizations=['wordcloud'], - description='Article title.', - extractor=extract.XML(Tag('RecordTitle')) + visualizations=["wordcloud"], + description="Article title.", + extractor=extract.XML(Tag("RecordTitle")), ), FieldDefinition( - name='source-paper', + name="source-paper", es_mapping=keyword_mapping(True), - display_name='Source paper', - description='Credited as source.', - extractor=extract.XML(Tag('Title')), + display_name="Source paper", + description="Credited as source.", + extractor=extract.XML(Tag("Title")), search_filter=filters.MultipleChoiceFilter( - description='Accept only articles from these source papers.', - option_count=5 + description="Accept only articles from these source papers.", + option_count=5, ), ), FieldDefinition( - name='place', + name="place", mapping=keyword_mapping(True), - display_name='Place', - description='Place in which the article was published', - extractor=extract.XML(Tag('Qualifier')) + display_name="Place", + description="Place in which the article was published", + extractor=extract.XML(Tag("Qualifier")), ), FieldDefinition( - name='author', + name="author", mapping=keyword_mapping(True), - display_name='Author', - description='Article author', - extractor=extract.XML(Tag('PersonName')) + display_name="Author", + description="Article author", + extractor=extract.XML(Tag("PersonName")), ), FieldDefinition( - name='category', - visualizations=['resultscount', 'termfrequency'], - display_name='Category', - description='Article subject categories.', - es_mapping={'type': 'keyword'}, + name="category", + visualizations=["resultscount", "termfrequency"], + display_name="Category", + description="Article subject categories.", + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these 
categories.', - option_count=19 + description="Accept only articles in these categories.", option_count=19 ), - extractor=extract.XML(Tag('ObjectType')), - csv_core=True + extractor=extract.XML(Tag("ObjectType")), + csv_core=True, ), FieldDefinition( - name='content', - es_mapping=main_content_mapping(True, True, True, 'en'), - display_name='Content', - display_type='text_content', - visualizations=['wordcloud'], - description='Raw OCR\'ed text (content).', + name="content", + es_mapping=main_content_mapping(True, True, True, "en"), + display_name="Content", + display_type="text_content", + visualizations=["wordcloud", "ngram"], + description="Raw OCR'ed text (content).", results_overview=True, search_field_core=True, - extractor=extract.XML(Tag('FullText'), flatten=True), - language='en', - ) + extractor=extract.XML(Tag("FullText"), flatten=True), + language="en", + ), ] document_context = { @@ -244,7 +244,6 @@ def request_media(self, document, corpus_name): } return {'media': image_urls, 'info': pdf_info} - def get_media(self, request_args): ''' Given the image path of the zipfile, diff --git a/backend/corpora/jewishmigration/jewishmigration.py b/backend/corpora/jewishmigration/jewishmigration.py index 02fd3c8a3..40273b53e 100644 --- a/backend/corpora/jewishmigration/jewishmigration.py +++ b/backend/corpora/jewishmigration/jewishmigration.py @@ -51,7 +51,9 @@ class JewishMigration(PeacePortal, JSONCorpusDefinition): data_directory = settings.JMIG_DATA_DIR data_filepath = getattr(settings, 'JMIG_DATA', None) data_url = getattr(settings, 'JMIG_DATA_URL', None) + data_api_key = getattr(settings, 'JMIG_DATA_API_KEY', None) + es_alias = getattr(settings, 'JMIG_ALIAS', None) es_index = getattr(settings, 'JMIG_INDEX', 'jewishmigration') image = 'jewish_inscriptions.jpg' languages = ['en'] @@ -60,7 +62,11 @@ class JewishMigration(PeacePortal, JSONCorpusDefinition): def sources(self, start, end): if self.data_url: - response = requests.get(self.data_url) + if 
self.data_api_key: + headers = {"Authorization": f"Token {self.data_api_key}"} + response = requests.get(self.data_url, headers=headers) + else: + response = requests.get(self.data_url) list_of_sources = response.json() elif self.data_filepath: with open(self.data_filepath, 'r') as f: @@ -71,7 +77,6 @@ def sources(self, start, end): for source in list_of_sources: yield source - def __init__(self): super().__init__() self._id.extractor = extract.JSON(key='source') @@ -93,61 +98,63 @@ def __init__(self): key='transcription') extra_fields = [ FieldDefinition( - name='script', - display_name='Script', - description='Which alphabet the source was written in', + name="script", + display_name="Script", + description="Which alphabet the source was written in", es_mapping=keyword_mapping(), - extractor=extract.JSON(key='scripts'), + extractor=extract.JSON(key="scripts"), + visualizations=["resultscount"], ), FieldDefinition( - name='site_type', - display_name='Site Type', - description='Type of site where evidence for settlement was found', + name="site_type", + display_name="Site Type", + description="Type of site where evidence for settlement was found", es_mapping=keyword_mapping(), - extractor=extract.JSON(key='site_type') + extractor=extract.JSON(key="site_type"), ), FieldDefinition( - name='inscription_type', - display_name='Inscription type', - description='Type of inscription', + name="inscription_type", + display_name="Inscription type", + description="Type of inscription", es_mapping=keyword_mapping(), - extractor=extract.JSON(key='inscription_type') + extractor=extract.JSON(key="inscription_type"), ), FieldDefinition( - name='period', - display_name='Period', - description='Period in which the inscription was made', + name="period", + display_name="Period", + description="Period in which the inscription was made", es_mapping=keyword_mapping(), - extractor=extract.JSON(key='period') + extractor=extract.JSON(key="period"), ), FieldDefinition( - 
name='estimated_centuries', - display_name='Estimated Centuries', - description='Estimate of centuries in which the inscription was made', + name="estimated_centuries", + display_name="Estimated Centuries", + description="Estimate of centuries in which the inscription was made", es_mapping=int_mapping(), extractor=extract.JSON( - key='estimated_centuries', transform=transform_centuries) + key="estimated_centuries", transform=transform_centuries + ), ), FieldDefinition( - name='inscription_count', - display_name='Inscription count', - description='Number of inscriptions', + name="inscription_count", + display_name="Inscription count", + description="Number of inscriptions", es_mapping=int_mapping(), - extractor=extract.JSON(key='inscriptions_count') + extractor=extract.JSON(key="inscriptions_count"), ), FieldDefinition( - name='religious_profession', - display_name='Religious profession', - description='Religious profession of deceased', + name="religious_profession", + display_name="Religious profession", + description="Religious profession of deceased", es_mapping=keyword_mapping(), - extractor=extract.JSON(key='religious_profession') + extractor=extract.JSON(key="religious_profession"), ), FieldDefinition( - name='sex_dedicator', - display_name='Gender dedicator', - description='Gender of the dedicator', + name="sex_dedicator", + display_name="Gender dedicator", + description="Gender of the dedicator", es_mapping=keyword_mapping(), - extractor=extract.JSON(key='sex_dedicator') - ) + extractor=extract.JSON(key="sex_dedicator"), + ), ] self.fields = [*exclude_fields_without_extractor(self.fields), *extra_fields] diff --git a/backend/corpora/jewishmigration/test_jewishmigration.py b/backend/corpora/jewishmigration/test_jewishmigration.py index ef6d56fb4..7251a766f 100644 --- a/backend/corpora/jewishmigration/test_jewishmigration.py +++ b/backend/corpora/jewishmigration/test_jewishmigration.py @@ -21,7 +21,8 @@ def __init__(self, mock_content): def json(self): return 
self.mock_content -def mock_get(_dummy_path): + +def mock_get(_dummy_path, headers=None): return MockResponse(mock_content=[ { "source": "Le Bohec 1981 n. 71", diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py index 06d048c30..8989c39bc 100644 --- a/backend/corpora/parliament/conftest.py +++ b/backend/corpora/parliament/conftest.py @@ -1,6 +1,8 @@ -import pytest +from datetime import datetime import os +import pytest + here = os.path.abspath(os.path.dirname(__file__)) @pytest.fixture() @@ -20,24 +22,681 @@ def parliament_corpora_settings(settings): 'parliament-finland-old': os.path.join(here, 'finland-old.py'), 'parliament-norway': os.path.join(here, 'norway.py'), 'parliament-norway-new': os.path.join(here, 'norway-new.py'), - 'parliament-ireland': os.path.join(here, 'ireland.py') + 'parliament-ireland': os.path.join(here, 'ireland.py'), + 'parliament-europe': os.path.join(here, 'euparl.py'), } - settings.PP_UK_DATA = os.path.join(here, 'tests', 'data', 'uk') - - settings.PP_NL_DATA = os.path.join(here, 'tests', 'data', 'netherlands') - settings.PP_NL_RECENT_DATA = os.path.join(here, 'tests', 'data', 'netherlands-recent') - settings.PP_CANADA_DATA = os.path.join(here, 'tests', 'data', 'canada') - settings.PP_GERMANY_NEW_DATA = os.path.join(here, 'tests', 'data', 'germany-new') - settings.PP_GERMANY_OLD_DATA = os.path.join(here, 'tests', 'data', 'germany-old') - settings.PP_FR_DATA = os.path.join(here, 'tests', 'data', 'france') - settings.PP_SWEDEN_DATA = os.path.join(here, 'tests', 'data', 'sweden') - settings.PP_SWEDEN_OLD_DATA = os.path.join(here, 'tests', 'data', 'sweden-old') + settings.PP_DENMARK_DATA = os.path.join(here, 'tests', 'data', 'denmark') + settings.PP_DENMARK_NEW_DATA = os.path.join(here, 'tests', 'data', 'denmark-new') + settings.PP_EUPARL_DATA = os.path.join(here, 'tests', 'data', 'euparl') settings.PP_FINLAND_DATA = os.path.join(here, 'tests', 'data', 'finland') settings.PP_FINLAND_OLD_DATA = 
os.path.join(here, 'tests', 'data', 'finland-old') + settings.PP_FR_DATA = os.path.join(here, 'tests', 'data', 'france') + settings.PP_GERMANY_NEW_DATA = os.path.join(here, 'tests', 'data', 'germany-new') + settings.PP_GERMANY_OLD_DATA = os.path.join(here, 'tests', 'data', 'germany-old') + settings.PP_IRELAND_DATA = os.path.join(here, 'tests', 'data', 'ireland') + settings.PP_NL_DATA = os.path.join(here, 'tests', 'data', 'netherlands') + settings.PP_NL_RECENT_DATA = os.path.join(here, 'tests', 'data', 'netherlands-recent') settings.PP_NORWAY_DATA = os.path.join(here, 'tests', 'data', 'norway') settings.PP_NORWAY_NEW_DATA = os.path.join(here, 'tests', 'data', 'norway-new') - settings.PP_DENMARK_DATA = os.path.join(here, 'tests', 'data', 'denmark') - settings.PP_DENMARK_NEW_DATA = os.path.join(here, 'tests', 'data', 'denmark-new') - settings.PP_IRELAND_DATA = os.path.join(here, 'tests', 'data', 'ireland') + settings.PP_SWEDEN_DATA = os.path.join(here, 'tests', 'data', 'sweden') + settings.PP_SWEDEN_OLD_DATA = os.path.join(here, 'tests', 'data', 'sweden-old') + settings.PP_UK_DATA = os.path.join(here, 'tests', 'data', 'uk') + + +CORPUS_TEST_DATA = [ + { + "name": "parliament-canada", + "docs": [ + { + "date": "2015-02-02", + "country": "Canada", + "debate_title": "Government Orders", + "debate_id": "ca.proc.d.2015-02-02", + "chamber": "House of Commons", + "party": "New Democratic Party", + "role": "Interjection", + "speaker": "Jack Harris", + "speaker_id": "c846297d-8bc7-4e69-b6eb-31d0e19f7ec1", + "speaker_constituency": "St. John's East", + "speech": "Mr. 
Speaker, I suppose I could ask the member for Nanaimo—Alberni why the Government of Canada would put $280 million into last year's budget if it was intended to compensate for something that would happen in 2020.", + "id": "ca.proc.d.2015-02-02.16582.214", + "topic": "Business of Supply", + "subtopic": "Opposition Motion—Newfoundland and Labrador Fisheries Investment Fund", + } + ], + "n_documents": 3, + }, + { + "name": "parliament-france", + "docs": [ + { + "book_id": "37531030876685 37531030876685/1/58 37531030876685_1_58_7", + "chamber": "Assemblee Nationale", + "country": "France", + "date": "1881-01-11", + "date_is_estimate": False, + "debate_id": "1881-01-11", + "debate_type": None, + "era": "3Rd Republic", + "legislature": None, + "page": "7", + "page_source": "X0000007.xml", + "sequence": "0", + "speech": """SOMMAIRE + +Constitution du bureau provisoire. + +Excuses. — Demande de congé. + +Communication par M. le président de deux lettres par lesquelles MM. Lou;s Legrand et Drumel déclinent toute candidature aux fonctions de secrétaire. + +Tirage au sort des bureaux. + +Fixation de l'ordre du jour : MM. Georges Perin, de Colbert-Laplace, Guichard, Janvier de La Motte (Eure). — Demande de renvoi au 20 janvier de la prochaine séance : M. Laroche-Joubert. Adoption. + +PRÉSIDBNCE DE M. DESSEAUX, DOYEN D'AGE La séance est ouverte à deux heures un quart. + +M. le président. Aux termes de l'article 1er de la loi constitutionnelle du 16 juillet 1875, je déclare ouverte la session ordinaire de la Chambre des députés pour 1881. + +J'invite lts six membres les plus jeunes de ''Assemblée à vouloir bien répondre à 'l'appel de leur nom pour prendre place au bureau en qualité de secrétaires provisoires. + +(L'appel des noms des députés les plus jeunes est fait par un huissier.) + +Sont successivement appelés : MM. 
Georges de Cassagnac, né le 17 févrièr 1855; Adrien Bastii, né Je 1er octobre 1853; Jules André, né le 23 août 1852 ; René Gautier, né le '25 avril 1852 ; Emile Réaux, né le 20 juin 1851 ; Le Provost de Launay fils, né le 8 juin 1850; René Eschasseriaux, né le 1. 1 mai 1850; Louis Janvier de La Motte, né le 23 août 1849; Lanauve, né le 24 mai 1849; Dreyfus, né le 5 mai 1849 ; Marcellin Pellet, né le 4 mars 1849 ; De Loqueyssip, né le 1er octobre 1848; Le comte de Breteuil, né le 17 septembre 1848; Roy de Loulay, né le 8 août 1848; D3 La Porte, né le 20 juin 1848 ; Thomson, né le 21 janvier 1848. + +MM Georges de Cassagnac, Adrien Bstid, limile Réaux, Dreyfus, de Loqueyssie et Thomson répondent à l'appel de leurs noms et prennent placn au bureau. + +M. le président. Le bureau probatoire est constitué. + +MM. Fourot, de Douville-Maillefeu et Laisant s'excusent de ne pouvoir assister à la séance de ce jour. + +M. Laumond demande un congé de vingt jours. + +La demande sera renvoyéa à la commission des congés. + +J'ai reçu de M. Louis Legrand la lettre suivante, dont je donne connaissance à la Chambré : « Valenciennes, 9 janvier 1881. + +c Monsieur le président, « Je vous prie de vouloir bien annoncer à mes collègues que je ne me représente pas à leurs suffrages pour les fonctions de secrétaile. + +« je saisis cette occasion pour remercier la Chambre de l'honneur qu'elle m'a fait en me choisissant comme l'un des membres de son bureau. + +« Agréez, monsieur je président, i assurance de ma haute considération. + +c Lotis LEGRAND, « Député du Nord. » + +J'ai reçu également de M. Drumel la lettre suivante: + +« Neuvizy (Ardennes', 10 janvier 1881. + +c Monsieur le président, « Depuis deux ans, 1* Chambre m'a fait l'honneur de m'appeler à siéger, comme secrétaire, dans son bureau. 
Je lui en suis profondément reconnaissant; et, en la priant de charger un autre de ses membres des fonctions que je tenais de sa confiance, je lui exprime ma vive gratitude pour les témoignages d'estime et de sympathie qu'à différentes reprises elle a bien voulu me donner. + +c Veuillez croire, monsieur le président, à mes sentiments respectueux et dévoués. + +« DRUMEL. » + +M. le président. L'ordre du jour appelle le tirage au sort des bureaux. + +Il va y être procédé. + +(Il est procédé au tirage au sort des bureaux dans les formes réglementaires.) M. le président. Messieurs, il y aurait lieu de procéder maintenant à la fixation de l'ordre du jour, mais je crois devoir faire remarquer à la Chambre qu'elle n'est pas en très-grand nombre. (81! si! à droite, — Non! + +non ! sur un grand nombre de bancs à gauche.) M. Clémenceau. Il n'est pas nécessaire que la Chambre soit en très-grand nombre, il suffit qu'elle soit en nombre. + +M. le président. Je n'ai pas dit que U Chambre n'était pas en nombre, j'ai dit qu'elld n'était pas en très-grand nombre. + +M. Haentjens. "Etm n'a jamais été aussi nombreuse à une première séance ! + +M. le président. La date de l'ouverture dela session, qui est fixée par la loi constitutionnelle, se place cette année entre les deux scrutins relatifs aux élections municipales. + +A droite. Qu'est-ce que cela fait? + +M. le président beaucoup de nos collègues som encore retenus dans leurs d^oL^rtements. + +A droite. Mais non ! mais non ! + +M. Laroche Joubert Il ne fallait pas nous convoquer alors ! (Interruptions diverses à droite et sur plusieurs bancs à gauche.) M. de Baudry-d'Asson. N.Jus sommes revenus exprès pour procéder à la nomination du bureau ; nous demandons que le bureau soit nommé aujourd'hui!M. le président. Il a paru à beaucoup d'entre vous que l'élection du bureau définitif doit se faire par le plus grand nombre possible de membres. (Interruptions à dro te et sur quelques bancs à gauche.) 
Je soumets donc à la Chambre la proposi. + +tion de s'ajourner. (Bruyantes exclamations à droite.) Sur divers bancs à droite el à l'extrême oauchu. Non 1 non l Sur un grand nombre d'autres bancs. Mais si 1 c'est nécessaire 1 M. Georges Perin. Je demande la parole. + +M. Laroche-Joubert. Je demande la pa. + +role. + +M le comte de Colbert-Laplace. Je demande la parole. + +M. le président. La parole est à M. + +Perin. + +M. Georges Perin. Messieurs, je viens, au nom d'un certain nombre de mes amis et en mon nom personnel, demander à la Chambre de repousser la proposition d'ajournement qui vient d'être faLe pac noire honorable président. (Très bien ! très bien ! à droite et à l'extrême gauche.) Autant qu'il m'a été permis de l'entendre au milieu du bruit, je crois que la seule raison que notre honorable président ait fait valoir 7our justifier sa proposition, c'est que nous n enous pas en nombre.""", + "id": "3rd_republic_0", + "url": "http://gallica.bnf.fr/ark:/12148/bpt6k64418203", + "url_html": None, + } + ], + "n_documents": 5, + }, + { + "name": "parliament-germany-new", + "docs": [ + { + "country": "Germany", + "chamber": "Bundestag", + "date": "1949-09-22", + "debate_id": "7", + "speaker": "Gebhard Seelos", + "speaker_id": "11002141", + "speaker_aristocracy": None, + "speaker_academic_title": "Dr.", + "speaker_birth_country": "Deutschland", + "speaker_birthplace": "München", + "speaker_birth_year": 1901, + "speaker_death_year": 1984, + "speaker_gender": "male", + "speaker_profession": "Dipl.-Volkswirt, Jurist, Diplomat, Staatsrat a. 
D.", + "role": "Member of Parliament", + "role_long": None, + "party": "BP", + "party_full": "Bayernpartei", + "party_id": "2", + "speech": "Baracken sind etwas Vorübergehendes; sie halten aber immer länger, als eigentlich geplant.", + "id": "94", + "url": "https://dip21.bundestag.de/dip21/btp/01/01007.pdf", + "sequence": "94", + } + ], + "n_documents": 2, + }, + { + "name": "parliament-germany-old", + "docs": [ + { + "country": "Germany", + "book_id": "bsb00000436", + "book_label": "1867/70,1 ( Protokolle mit Sach- und Sprechregister )", + "era": "Reichstag (Norddeutscher Bund/Zollparlamente) 1867 - 1895 Norddeutscher Bund", + "date": "1867-02-25", + "date_is_estimate": "true", + "page": "27", + "url": "https://api.digitale-sammlungen.de/iiif/image/v2/bsb00000436_00027/full/full/0/default.jpg", + "speech": "Nach vorangegangenem Gottesdienste in der Königlichen Schloßcapelle und der St. Hedwigskirche versammelten sich Heute- Nachmittags 11 Uhr die durch Allerhöchstes Patent vom 13. d. M. einberufenen Mitglieder des Reichstages des Norddeutschen Bundes im Weißen Saale des Königlichen Schlosses. Bald daraus traten die Reichstags-Commifsarien ein. Nachdem dieselben links vom Throne sich ausgestellt und die Versammlung sich -geordnet hatte, machte der Vorsitzende der Reichstags-Commissarien, Gras von Bismarck, Seiner Majestät dem Könige davon Meldung. Allerhöchst dieselben begaben Sich daraus in Begleitung Ihrer Königlichen Hoheiten des Kronprinzen und der Prinzen des Königlichen Hauses in dem nach dem Programm geordneten Zuge, unter 'Vortragung der Reichs-Insignien, nach dem Weißen Saale und nahmen, mit einem lebhaften dreimaligen Hoch, welches der Wirkliche Geheime Rath von Frankenberg ausbrachte, von der Versammlung empfangen, auf dem Throne Platz, während Seine Königliche Hoheit der Kronprinz guf der mittleren Stufe desselben, Ihre Königlichen Hoheiten die Prinzen des Königlichen Hauses zur Rechten des Thrones sich aufstellten. 
Seine Majestät der König verlasen hierauf, das Haupt mit dem Helme bedeckt, die nachfolgende Rede:", + "id": "0", + } + ], + "n_documents": 1, + }, + { + "name": "parliament-netherlands", + "docs": [ + { + "country": "Netherlands", + "date": "2000-01-18", + "chamber": "Eerste Kamer", + "debate_title": "Presentie en opening (dinsdag 18 januari 2000)", + "debate_id": "nl.proc.ob.d.h-ek-19992000-493-493", + "topic": "Presentie en opening", + "speech": "\n".join( + [ + "Ik deel aan de Kamer mede, dat zijn ingekomen berichten van verhindering van de leden:", + "Kohnstamm, wegens ziekte;", + "Boorsma, wegens verblijf buitenslands.", + ] + ), + "id": "nl.proc.ob.d.h-ek-19992000-493-493.1.5.1", + "source_archive": "PoliticalMashup", + "speaker": "De voorzitter Jurgens", + "speaker_id": "nl.m.01992", + "speaker_gender": None, + "role": "Chair", + "party": None, + "party_id": None, + "party_full": None, + "page": "493", + "url": "https://zoek.officielebekendmakingen.nl/h-ek-19992000-493-493.pdf", + "sequence": 1, + } + ], + "n_documents": 4, + "end": datetime(2015, 1, 1), + }, + { + "name": "parliament-netherlands", + "docs": [ + { + "country": "Netherlands", + "date": "2017-01-31", + "chamber": "Tweede Kamer", + "debate_title": "Report of the meeting of the Dutch Lower House, Meeting 46, Session 23 (2017-01-31)", + "debate_id": "ParlaMint-NL_2017-01-31-tweedekamer-23", + "topic": 'Rapport "Welvaart in kaart"', + "speech": "Ik heet de minister van Economische Zaken van harte welkom.", + "id": "ParlaMint-NL_2017-01-31-tweedekamer-23.u1", + "speaker": "Khadija Arib", + "speaker_id": "#KhadijaArib", + "speaker_gender": "vrouw", + "role": "Chair", + "party": "PvdA", + "party_id": "#party.PvdA", + "party_full": "Partij van de Arbeid", + "page": None, + "url": None, + "sequence": 1, + } + ], + "n_documents": 98, + "start": datetime(2015, 1, 1), + }, + { + "name": "parliament-uk", + "docs": [ + { + "country": "United Kingdom", + "date": "1872-02-06", + "chamber": "House of 
Commons", + "debate_title": "New Writs During The Recess", + "debate_id": None, + "speech": "acquainted the House, —that he had issued Warrants for New Writs, for Truro, v. Hon. John Cranch Walker Vivian, Under Secretary to the Eight hon. Edward Cardwell; for Plymouth, Sir Robert Porrett Collier, knight, one of the Justices of the Court of Common Pleas; Dover, George Jessel, esquire, Solicitor General; York County (West Riding, Northern Division), Sir Francis Crossley, baronet, deceased; Limerick City, Francis William Russell, esquire, deceased; Galway County, Eight hon. William Henry Gregory, Governor and Commander in Chief of the Island of Ceylon and its dependencies; Kerry, Eight hon. Valentine Augustus Browne, commonly called Viscount Castlerosse, now Earl of Kenmare.", + "id": "guldi_c19_365565", + "speaker": "Mr. Speaker", + "speaker_id": None, + "speech_type": None, + "topic": None, + "subtopic": None, + "sequence": "365565", + }, + { + "country": "United Kingdom", + "date": "2020-01-14", + "chamber": "House of Commons", + "debate_title": "House Of Commons Debate On 14/01/2020", + "debate_id": "debates2020-01-14c", + "speech": "What steps his Department is taking to ensure that legal aid is accessible to people who need it.", + "id": "uk.org.publicwhip/debate/2020-01-14c.865.4", + "speaker": "Sarah Dines", + "speaker_id": "uk.org.publicwhip/person/25877", + "speech_type": "Start Question", + "topic": "The Secretary of State was asked—", + "subtopic": "Legal Aid Access", + "sequence": "0", + }, + ], + "n_documents": 2, + }, + { + "name": "parliament-sweden", + "docs": [ + { + "date": "2021-09-14", + "date_is_estimate": None, + "chamber": "Riksdag", + "country": "Sweden", + "speech": "Ärade ledamöter! Varmt välkomna tillbaka till riksdagen! Det känns stort att få välkomna er här på tröskeln till det fjärde riksmötet den här mandatperioden. Vi har ännu ett mycket speciellt arbetsår bakom oss, till stor del präglat av pandemin. 
Även om vi visste att det inte var helt över för ett år sedan tror jag att vi var många som hoppades att en tydligare vändning var på väg. Så blev det inte. I stället fick vi ytterligare ett riksdagsår med ett reducerat antal ledamöter vid voteringar och utskottsarbete till stor del på distans. Men förhoppningsvis börjar vi nu gå tillbaka mot mer normala arbetsformer. Ett tydligt tecken på detta är att alla 349 ledamöter kommer att vara med vid riksmötets öppnande i eftermiddag. Jag tycker att det är angeläget att riksdagens och regeringens alla ledamöter kan vara på plats vid denna högtidliga och viktiga ceremoni, särskilt som detta är det sista öppnandet under den här mandatperioden. Däremot genomförs inget upprop nu på förmiddagen, och vi vidtar den försiktighetsåtgärden att drygt en tredjedel av ledamöterna och statsråden får sitta på läktaren under ceremonin. Formerna beslutades av mig efter diskussion med gruppledarna och de vice talmännen redan i början av augusti, alltså långt innan det blev bestämt att alla ledamöter får delta i voteringar efter riksmötets öppnande. Jag såg inget skäl att med kort varsel börja ändra i planeringen för riksmötets öppnande, så just denna speciella dag får inte alla ledamöter sitta nere på golvet här i kammaren . M en från och med riksmötets första votering sitter var och en på sin plats och röstar igen på vanligt sätt. Även om pandemin inte är över är situationen i Sverige ändå en helt annan nu än för ett år sedan. Därför har vi – talmanspresidiet och gruppledarna – gjort bedömningen att det är möjligt att samla fler personer än förra året men ändå långt färre än ett vanligt år. Vi har försökt finna en så god balans som möjligt mellan nödvändiga säkerhetsåtgärder, riksdagsordningens bestämmelser och respekt för traditionen. Den sedvanliga mottagningen i Sammanbindningsbanan är som bekant inställd, och det genomförs heller inte någon konsert i Konserthuset. 
Jag är glad över att vi också kommer att få hjälp att minnas dessa föregångare och förebilder genom att de får en permanent plats på Riksplan i form av en staty. Här tillkommer det att det i trapphallen i Östra riksdagshuset kommer att invigas en tavla som föreställer de här fem pionjärerna. Statyn dröjer ett tag – den kommer att invigas nästa år – men redan i kväll vill riksdagen på dagen för riksmötets öppnande, denna demokratins högtidsdag, uppmärksamma demokratijubileet med att lysa upp Stockholmsnatten med ett ljusspel. Jag kommer att tända en fasadbelysning på Östra riksdagshuset vid en webbsänd ceremoni klockan 20. Ljusspelet kan sedan ses varje kväll till och med den 20 september. Men demokratifirandet tar inte slut där. Vad passar väl bättre på FN:s demokratidag den 15 september än att fira med ett seminarium? I morgon anordnar riksdag och regering seminariet 100 år av demokrati – vilka lärdomar tar vi med oss? Se det gärna på riksdagen.se! Efter riksmötets öppnande tror jag att vi alla ser fram emot ett nytt arbetsår i riksdagen under något mer normala former. Jag har ju, som ni alla vet, tillsammans med gruppledarna slutit en ny överenskommelse om arbetsformerna under hösten, och gruppledarna har också beslutat att inte förlänga överenskommelsen om 55 närvarande ledamöter vid voteringar. Alla ledamöter kan alltså delta vid voteringarna, men vi behåller möjligheten att delta på distans vid utskottens sammanträden. Varje utskott avgör när det är motiverat att hålla fysiska sammanträden, och när man deltar fysiskt planerar vi för att det ska gå att hålla avstånd. Vi ska däremot fortsätta hjälpas åt att hålla antalet externa besök i riksdagens hus nere. Externa åhörare vid olika arrangemang bör undvikas liksom guidade visningar och mingelsituationer. Pandemin är inte över. Vi fortsätter att anpassa verksamheten när och om det behövs, men förhoppningsvis går vi mot ett mer normalt läge. Ärade ledamöter! Det här har varit en mandatperiod som ingen annan. 
Jag tror inte att någon hade kunnat förutse de många olika, oväntade och delvis dramatiska händelser som har inträffat. Jag tänker naturligtvis i första hand på pandemin och alla dess konsekvenser men även på de två regeringsbildningarna. Och då är det ändå ett helt år kvar av mandatperio ­ den. Jag tror att vi alla kan se fram emot ännu ett händelserikt och spännan ­ de riksdagsår fram till valet. Vi vet i alla fall att det i början av november blir den tredje regeringsbildningen under den här mandatperioden. Oavsett hur man ser på det politiska läget vill jag framhålla, apropå just demokratijubileet, att regeringsbildningarna inte har inneburit någon kris för demokratin. Svensk demokrati står stark, och den är värd att fira. Alla aktörer har i regeringsbildningsprocesserna använt de olika verktyg som finns i den demokratiska, parlamentariska verktygslådan. Misstroendeomröstning, beslut att inte utlysa extraval och talmansrundor – allt sådant följer av de lagar som vi har skapat för vår demokrati. Skeendet må vara turbulent i vissa stycken, men det följer demokratins spelregler. Ärade ledamöter! Jag vill avsluta med några rader ut dikten Sommaren i Sverige av Werner Aspenström. Den skildrar på ett fint sätt vemodet och skönheten när sommaren går mot sitt slut. Då landar på min hand den förgänglighetens tanke som vi kallar trollslända. Ett gult löv lösgör sig och faller klingande mot marken. Sommaren måste hastigt bärgas. … Ty hösten närmar sig med toppeld i asparna. Låt mig nu önska er en fin höst och ett produktivt arbetsår. På återseende här i kammaren klockan 14! Stockholms kommun Stockholms län Södermanlands län Jönköpings län Kronobergs län Blekinge län Hallands län Göteborgs kommun Värmlands län Jämtlands län Norrbottens län EU-dokument Åttaveckorsfristen för att avge ett motiverat yttrande skulle gå ut den 5 november . 
EU-dokument Följande frågor för skriftliga svar hade framställts: 2020/21:3636 Amorteringskravet och ojämställd bostadsmarknad 2020/21:3637 Den kinesiske ambassadörens agerande 2020/21:3638 Vaccin 2020/21:3639 Lukasjenkos tillgång till 1 miljard dollar från IMF 2020/21:3640 Markering mot Irans idrottsminister 2020/21:3642 Kriminalitet på bostadsmarknaden Skriftliga svar på följande frågor hade kommit in: 2020/21:3535 Barns rätt till säkerställda skyddade boenden 2020/21:3537 Elbrist som hotar investeringar i Sverige 2020/21:3538 Åtgärder för att trygga boende", + "sequence": "0", + "id": "i-2a00eff84ce04676-0", + "speaker": "Andreas Norlén", + "speaker_gender": "man", + "role": "Sveriges riksdags talman", + "ministerial_role": None, + "party": None, + "speaker_birth_year": 1973, + "speaker_death_year": None, + "speaker_constituency": None, + "speaker_id": "Q4755577", + }, + ], + "n_documents": 5, + }, + { + "name": "parliament-sweden-old", + "docs": [{}] * 5 + + [ + { + "book_id": "bn_1828-30_1__01", + "book_label": "Hederwärda bonde-ståndets protokoller wid lagtima riksdagen i Stockholm åren 1828 och 1829. Första bandet.", + "country": "Sweden", + "era": "Ståndsriksdagen", + "chamber": "Bönder", + "date_earliest": "1828-01-01", + "date_latest": "1828-12-31", + "speech": """Hederwärdo + +Bonde-Ständcts + +Protokoller + +wid + +LagMa Riksdagen i Stockhol». + +Ä«tt 1828 och I82t, + +första Lander. + +STOCKHOLM, + +Kongl. Ordens-Böktryckeriet, I8Z9.""", + "page": "0", + "sequence": 1, + "url": "https://weburn.kb.se/riks/ståndsriksdagen/pdf/bn_1828-30_1_/bn_1828-30_1__01.pdf", + "url_xml": "https://weburn.kb.se/riks/ståndsriksdagen/xml/bn_1828-30_1_/bn_1828-30_1__01.xml", + } + ], + "n_documents": 10, + }, + { + "name": "parliament-denmark", + "docs": [ + { + "speech": """6546 F. t. beslutn. vedr. udbetaling af sygedagpenge + +Beslutningsforslag nr. B 142. Fremsat den 3. 
juni 2008 af Thomas Adelskov (S), Lennart Damsbo-Andersen (S), + +Egil Andersen (SF), Margrethe Vestager (RV), Morten Østergaard (RV) og Line Barfod (EL) + +Forslag til folketingsbeslutning + +om ophævelse af varighedsbegrænsningen for udbetaling af sygedagpenge + +Folketinget pålægger regeringen at fremsætte lovforslag, som ophæver varighedsbegrænsnin- gen for udbetaling af sygedagpenge, således at + +lovforslaget kan træde i kraft den 1. januar 2009.""", + "page": "546", + "date_earliest": "2007-01-01", + "date_latest": "2007-12-31", + "book_label": "Folketingstidende 2007/8 (2. samling) Tillæg A side 6001 - 6565", + "book_id": "20072A6546", + "id": "20072A6546_546", + "chamber": "Folketinget", + "country": "Denmark", + "sequence": 546, + } + ], + "n_documents": 5, + }, + { + "name": "parliament-denmark-new", + "docs": [ + { + "country": "Denmark", + "id": "20100128100025", + "date": "2010-01-28", + "speech": "Mødet er åbnet. I dag er der følgende anmeldelser: Kirkeministeren (Birthe Rønn Hornbech): Lovforslag nr. L 115 (Forslag til lov om ændring af lov om udnævnelse af biskopper og om stiftsbåndsløsning og forskellige andre love.) og L 116 (Forslag til lov om ændring af lov om begravelse og ligbrænding og lov om folkekirkens økonomi.) Beskæftigelsesministeren (Inger Støjberg): Lovforslag nr. L 117 (Forslag til lov om ændring af lov om sygedagpenge, lov om ret til orlov og dagpenge ved barsel, lov om aktiv socialpolitik og lov om arbejdsløshedsforsikring m.v. Transportministeren (Lars Barfoed): Lovforslag nr. L 118 (Forslag til lov om ændring af lov om taxikørsel m.v.) Videnskabsministeren (Helge Sander): Lovforslag nr. L 119 (Forslag til lov om ændring af universitetsloven.) Titler på de fremsatte forslag vil fremgå af www.folketingstidende.dk (jf. ovenfor). Mens vi får de sidste medlemmer ind i salen, kan jeg lige oplyse, at vi er vidende om, at der er problemer med, hvordan urene går på Christiansborg. 
Det er et lidt større problem end som så blot at justere urene, for det er hele styringssystemet – det styres af 23 V strøm – der gør, at der er problemer med overhovedet at styre urene. Nogle er slidt ned, så man skal ikke regne med tiden. Min opfordring er, at man bruger soluret og kun tæller de lyse timer. Munterhed Men det afgørende er altså, at vi er opmærksomme på det og gør, hvad vi overhovedet kan for at udskifte, hvor der skal udskiftes, og i øvrigt at få et system, så urene altid går korrekt. Jeg går nemlig ud fra, at de, der kommer for sent, her nu hvor vi skal stemme, udelukkende gør det, fordi urene går forkert.", + "speaker": "Thor Pedersen", + "speaker_gender": "Male", + "speaker_birth_year": 1945, + "role": "formand", + "party": "Venstre", + "topic": "Punkt 0", + "subject": "other", + "sequence": "100025", + } + ], + "n_documents": 4, + }, + { + "name": "parliament-norway", + "docs": [ + { + "speech": """KONGERIKET NORGES 149. STORTINGS FORHANDLINGER 2004 - 2005 + +9. del + +INNEHOLDENDE REGISTER TIL FORHANDLINGER I STORTINGET OG DETS AVDELINGER + +OSLO LOBO MEDIA AS 2005""", + "page": "2", + "book_id": "digistorting_2004_part9_vol-a", + "book_label": "Stortingsforhandlinger; 2004/2005 Vol. 149 Nr. 9", + "date_earliest": "2004-01-01", + "date_latest": "2004-12-31", + "sequence": "2", + "chamber": "Stortinget", + "country": "Norway", + } + ], + "n_documents": 5, + }, + { + "name": "parliament-norway-new", + "docs": [ + {}, + {}, + {}, + { + "subject": "Statsbudsjettet", + }, # skip a few introductory speeches to one with more metadata + { + "country": "Norway", + "chamber": "Stortinget", + "date": "1998-10-20", + "debate_title": "Sak nr. 
2", + "debate_type": "interpellasjon", + "party": "Høyre", + "party_id": "H", + "party_role": "Opposition", + "role": "Representant", + "speaker": "Sonja Irene Sjøli", + "speaker_id": "SONS", + "speaker_gender": "kvinne", + "speaker_birth_year": 1949, + "speaker_death_year": None, + "speaker_constituency": "Akershus", + "speech": "Det er en bred forståelse blant fagfolk og politikere om at norsk sykehusvesen ikke bare lider under mangel på ressurser, men at det først og fremst er behov for organisatoriske og strukturelle forandringer. Offentlige utredninger om eierskap, organisering og ledelse i sykehus viser at det er behov for en rekke endringer for å nå målet om et bedre og mer tilgjengelig helsetilbud til befolkningen. Erkjennelsen av at vi har brukt gamle og lite hensiktsmessige virkemidler i helsepolitikken, har også nådd Regjeringen. Helseministeren uttalte til Dagens Næringsliv i sommer at det ville tjene pasientene hvis vi kunne være mer dristig i bruken av etterspørselsteknikker og private bidrag innenfor sykehussektoren. Denne uttalte dristighet ser jeg fram til med spenning. Stortinget har i de siste år, etter sterkt påtrykk fra Høyre, vedtatt innsatsbasert finansiering og fritt sykehusvalg. Den naturlige konsekvens av dette er at sykehusene organiserer seg annerledes enn før. Vi er langt fra alene om disse tankene. En rekke svenske fagforbund krever en ny modell for det svenske helsevesenet. Den svenske legeforening og det svenske sykepleierforbundet har gått sammen og krever at markedet i større grad må styre helsetilbudet. De mener at fylkeskommunen har utspilt sin rolle i styringen av helsesektoren og krever en total omlegging av helsevesenet. Det er mulig at Norge har sterkere økonomi og bedre skiløpere enn svenskene, men helsedebatten i Sverige har i den senere tid vært langt mer dynamisk og spennende enn hos oss. 
Tankene om at sykehus ikke nødvendigvis må være eid og drevet av det offentlige, vinner terreng i stadig flere land og er allerede utviklet i flere miljøer også her i Norge. Til og med Jan Grund, Norges fremste helseøkonom, professor på BI og en svoren sosialdemokrat, mener at flertallet av norske politikere befinner seg i skyttergravene i debatten om private helsetjenester. Problemet er ifølge Grund at det ikke er definert hvilke grunnleggende helsetjenester vi har krav på, og hvilke tjenester som kan tilbys oss som forbrukere og kunder. Derfor er det så vanskelig å håndtere diskusjonen om privat kontra offentlig helsetilbud. Han uttrykker sterk støtte til å få private aktører inn i det offentlige helsevesen. Stiftelsen SINTEF Unimed er utpekt av Næringsdepartementet og Helsedepartementet til å lede næringsutvikling i helsesektoren. Lederen Paul Hellandsvik mener det er på høy tid å tenke nytt og utradisjonelt om hvordan det offentlige kan dra nytte av private aktører, og at det gjelder å komme i gang med noen prøveprosjekter. Erfaringer fra Sverige og andre land viser at en modell for helsevesenet hvor det offentlige drar nytte av private aktører til utbygging og drift av sykehus, gir store økonomiske gevinster og høy kvalitet på tjenestene. Forutsetningen for modellen er at det offentlige finansierer tjenestene, og at de fordeles etter behov i befolkningen. Den svenske sosialdemokratiske helseminister velsigner dette arbeidet og mener at det frigjør ressurser til å behandle enda flere pasienter, og at det gir bedre kvalitet på tjenestene. Og det er iallfall fem gode grunner til at vi bør se nærmere på disse ideene. For det første: Avstanden mellom befolkningens etterspørsel etter helsetjenester og det helsevesenet har kapasitet til å tilby, er økende. Lange helsekøer taler sitt tydelige språk. For det andre: De ideologiske motforestillingene er gledelig nok i ferd med å avta både i Arbeiderpartiet og i det såkalte sentrum. 
Som helseminister Høybråten uttrykte det i Dagens Næringsliv tidligere i sommer: «Spørsmålet om å bruke etterspørselsteknikker er … ikke først og fremst en ideologisk problemstilling, men heller et spørsmål om hvor mye og på hvilken måte det er hensiktsmessig å bruke teknikken.» Stadig flere mennesker har fått erfaring med private legesentre og private klinikker. Folk har forstått at helsepersonell som jobber i det private, er like opptatt av pasientenes beste og kvaliteten på behandlingen som helsepersonell i de offentlige sykehus. Det som måtte være igjen av ideologiske begrunnelser her i Norge, har mistet sin kraft, ikke minst fordi folk ser med egne øyne at det ikke er grunn til å frykte private tilbud som et supplement – tvert imot. I tillegg har betalingsviljen for mindre omfattende behandlingstilbud økt. For det tredje: Det offentlige har gjennom mange år brukt gamle og lite hensiktsmessige virkemidler i helsepolitikken. Offentlig monopol, hierarkiske styringssystemer, spillet mellom forvaltningsnivåene og manglende fokusering på service og kvalitet i behandlingen har skapt tillitskrise i helsevesenet, og – det må jeg si – med berettigelse. Ikke minst er inntrykket av uklare roller og uklar ansvarsfordeling mellom aktørene i helsevesenet frustrerende for pasientene. For det fjerde: Den demografiske utviklingen i den vestlige verden. Vi lever lenger, og presset på helsevesenet vil øke betraktelig i årene fremover. Teknologiutviklingen er en femte faktor. Sykehusene har nå, med den rette teknologi og de moderne medisiner, mulighet til å behandle sykdommer bedre og derigjennom gi pasienter lengre levetid og bedre livskvalitet. Jeg har registrert gjennom media i sommer at helseministeren er skeptisk til å skille mellom tilbyder- og etterspørselsrollen i helsevesenet. Han frykter at for mange private sykehus vil kanalisere tjenester og arbeidskraft bort fra de offentlige sykehusene, og at det vil bli ulik tilgang til helsetjenester. 
Men dersom ansvaret for funksjonsfordelingen mellom sykehusene ligger hos staten gjennom godkjenning av de regionale helseplaner, vil det bestemme hva som tilbys hvor. En nasjonal helseplan, slik Høyre ønsker, ville vært et enda bedre redskap. Dersom det offentlige har ansvar for finansieringen av tjenestene til den enkelte pasient, vil det sikre lik tilgang til tjenestene. Hvis pengene kunne følge pasienten direkte til sykehusene, slik Høyre vil, og slik Kristelig Folkeparti ville i opposisjon, ville vi unngå at fylkeskommunen tar deler av bevilgningen på veien. Sykehusene får klare insentiver til å behandle flere pasienter, og vi sikrer at pasientene settes først. En modell hvor man lar det offentlige og private konkurrere om å utføre tjenestene, er også den modell som best vil sikre pasientene en sterkere posisjon i forhold til sykehusvesenet. Når de politiske prioriteringer i helsesektoren, funksjonsfordelingen mellom sykehusene, kontrollsystemer og den offentlige finansieringen er på plass, blir det etter Høyres syn mindre viktig hvem som eier og driver sykehusene. Unntaket er universitets- og regionsykehusene, som etter Høyres oppfatning er i en spesiell situasjon. Private kan godt eie og ha driftsansvar for bygningene. Men selve sykehusdriften må være i offentlig regi, slik at man har en tilfredsstillende og god kontroll med universitetsfunksjonene. Vi er inne i en tid med stadig større ubalanse mellom tilbud og etterspørsel. Derfor må vi forholde oss til virkeligheten. Det er snart ingen grenser for hvilke tjenester helsevesenet skal utføre. I denne situasjonen må vi styre slik at vi får mest mulig ut av ressursene. Det offentlige må konsentrere seg om å sikre de grunnleggende helsetjenestene og lage spilleregler for de private aktørene. De bør også få en mulighet til å utføre oppgaver det offentlige definerer som «grunnleggende helsetjenester», slik man gjør i Sverige. 
Men det må, som jeg har sagt tidligere, være en forutsetning at det offentlige skal betale tjenestene, og at kontrollmekanismene er gode, slik at tjenestene holder kvalitetsmessige mål. Det viktigste er likevel at vi gir sykehusene frihet i forhold til det tungrodde politiske system, slik at det blir mulig å lede sykehusene mer profesjonelt og prøve ut ulike selskapsformer, slik en nå ser ut til å få politisk flertall for her i Oslo. Som politikere bør vi heller være opptatt av å fristille de offentlige sykehusene enn å begrense de private. Et samarbeid mellom det offentlige og det private helsevesen har vi tro på. Etter Høyres mening gjelder det å få i gang noen prøveprosjekter, for uten det tror jeg ikke vi kommer videre. Hvordan ser helseministeren på dette, og vil han ta initiativ og stimulere til et slikt samarbeid?", + "topic": "om en modell for helsevesenet hvor det offentlige drar nytte av _private aktører til utbygging og drift av sykehus_", + "sequence": "4", + "id": "tale000004", + "ministerial_role": None, + "legislature": "Bondevik I", + "subject": None, + "language": "Norwegian (Bokmål)", + "debate_id": "Saker-og-publikasjoner/Publikasjoner/Referater/Stortinget/1998-1999/981020/2/", + }, + {}, + {}, + { + # test special case of ministers answering questions + "ministerial_role": "helseministeren", + "speaker": "Presidenten", + "speaker_id": "DH", + "party": None, + "party_role": None, + "speech": "Representanten Sjøli nevnte et forslag. Betyr det at hun tar opp dette forslaget?", + }, + ], + "n_documents": 10, + }, + { + "name": "parliament-finland", + "docs": [ + { + "country": "Finland", + "speech": "Täysistunto alkaa. Toivotan kaikki tervetulleiksi tänne Sibelius-taloon Sibeliuksen juhlavuotena aloittamaan vastuullista työtämme isänmaan hyväksi. 
Iältäni vanhimpana eduskunnan jäsenenä johdan puhetta tässä valtiopäivien ensimmäisessä täysistunnossa, kunnes eduskunta on työjärjestyksen 4 §:n mukaan valinnut puhemiehen ja kaksi varapuhemiestä ja nämä ovat antaneet eduskunnalle juhlallisen vakuutuksen. Plenum börjar. Som den riksdagsledamot som är äldst till åren är det min uppgift att föra ordet vid första plenum under riksmötet till dess att riksdagen enligt 4 § i riksdagens arbets-ordning inom sig valt talman och två vice talmän och dessa har avgett högtidlig försäkran inför riksdagen.", + "speaker_id": "Pertti_Salolainen", + "speaker": "Pertti Salolainen", + "role": "Ikäpuhemies", + "party_id": "#party.KOK", + "party": "KOK", + "party_role": "Hallituspuolue", + "speaker_gender": "Male", + "speaker_birth_year": 1940, + "speech_type": "PuhemiesPuheenvuoro", + "id": "2015_1_1", + "url": "https://www.eduskunta.fi/FI/vaski/PoytakirjaAsiakohta/Sivut/PTK_1+2015+1.aspx", + "sequence": "1", + "topic": "Nimenhuuto", + "debate_id": "ptk_1_2015", + "debate_title": "PTK 1/2015", + "date": "2015-04-28", + }, + ], + "n_documents": 22, + }, + { + "name": "parliament-finland-old", + "docs": [ + { + "country": "Finland", + "speech": """FÖUDT HOS + +FINLANDS RIDDERSKAP OCR ADEL + +VID + +LANDTDAGEN ÅR 1877. + +TREDJE HÄFTET. + +Från den 1 till den 31 Oktober. 
+ +FINSKA LITTERATUR-SÄLLBKAPETS TRYCKERl, + +1878.""", + "id": "Adeln_Prot_1877_III.pdf_0", + "speech_type": "minutes", + "chamber": "nobility", + "date_earliest": "1877-01-01", + "date_latest": "1877-12-31", + "page": "0", + "language": "swe", + "source_archive": "Adeln_Prot_1877_III.pdf", + }, + ], + "n_documents": 4, + }, + { + "name": "parliament-ireland", + "end": datetime(1999, 12, 31), + "docs": [ + { + "country": "Ireland", + "id": "1", + "date": "1919-01-21", + "speaker": "Count George Noble, Count Plunkett", + "speaker_id": "977", + "speaker_constituency": "Roscommon North", + "party": "Sinn Féin", + "party_id": "22", + "speech": "Molaimse don Dáil Cathal Brugha, an Teachta ó Dhéisibh Phortláirge do bheith mar Cheann Comhairle againn indiu.", + "topic": "1. CEANN COMHAIRLE I gCOIR AN LAE.", + "chamber": "Dáil", + "sequence": 1, + "source_archive": "1919-2013", + "url": None, + "ministerial_role": None, + "role": None, + "debate_type": None, + "committee": None, + }, + ], + "n_documents": 5, + }, + { + "name": "parliament-ireland", + "start": datetime(2000, 1, 1), + "end": datetime(2013, 12, 31), + "docs": [ + { + "country": "Ireland", + "id": "3088872", + "date": "2000-01-26", + "speaker": "Mr. Ruairí Quinn", + "speaker_id": "985", + "speaker_constituency": "Dublin South-East", + "party": "The Labour Party", + "party_id": "14", + "speech": "asked the Taoiseach if he will make a statement on his visit to South Africa and Lesotho.", + "topic": "Ceisteanna &ndash Questions. - Official Engagements.", + "chamber": "Dáil", + "sequence": 3088872, + "source_archive": "1919-2013", + "url": None, + "ministerial_role": None, + "role": None, + "debate_type": None, + "committee": None, + }, + ] + + [{}] * 13 # skip ahead to the first speech from a minister + + [ + { + "id": "3088886", + "speaker_id": "5", + "speaker": "Mr. 
Bertie Ahern", + "ministerial_role": "Taoiseach, Minister for Foreign Affairs", + } + ], + "n_documents": 15, + }, + { + "name": "parliament-ireland", + "start": datetime(2014, 1, 1), + "docs": [ + { + "country": "Ireland", + "sequence": 1, + "speaker_id": "#AndrewDoyle", + "date": "2014-12-09", + "topic": "Vote 30 - Agriculture, Food and the Marine (Supplementary)", + "speaker": "Andrew Doyle", + "chamber": "Dáil", + "url": "https://data.oireachtas.ie/akn/ie/debateRecord/select_committee_on_agriculture_food_and_the_marine/2014-12-09/debate/mul@/main.xml", + "source_archive": "2014-2020", + "party": None, + "party_id": None, + "speaker_constituency": None, + "role": "Chair", + "ministerial_role": None, + "debate_type": "committee", + "committee": "Select Committee on Agriculture, Food and the Marine", + "id": "debateRecord#select_committee_on_agriculture_food_and_the_marine#2014-12-09#debate#main#spk_1", + "speech": """As we have a quorum, we will commence in public session. All mobile phones should be switched off because they cause interference. I have apologies from Deputies Michael McNamara and Martin Heydon. This meeting has been convened to consider a Supplementary Estimate on Vote 30 - Agriculture, Food and the Marine, which was referred by the Dáil to the committee on 3 December with an instruction to report back to the Dáil not later than 11 December. +I welcome the Minister, Deputy Simon Coveney, and his officials. I thank them for the briefing material provided, which has been circulated to the members of the committee. 
I invite the Minister to make his opening statement.""", + }, + { + "speaker_id": "#SimonCoveney", + "speaker": "Simon Coveney", + "role": None, + "ministerial_role": "Minister for Agriculture, Food and the Marine", + }, + ], + "n_documents": 25, + }, + { + "name": "parliament-europe", + "start": datetime(1999, 7, 20), + "docs": [ + { + "id": "1999-07-21-Speech-3-063", + "date": "1999-07-21", + "debate_id": "1999-07-21_AgendaItem_5", + "debate_title": "Statement by Mr Prodi, President-elect of the Commission", + "party": "Group for the Technical Coordination and Defence of Indipendent Groups and Members (TGI)", + "sequence": 15, + "speaker": "Francesco Enrico Speroni", + "speaker_country": "Italy", + "speech": """Mr President, as a Member of the Italian national Parliament for the\n(The Northern League for the Independence of Padania), I did not vote for Professor Prodi in Rome as I considered he would be completely useless as head of government. I was then proved right as he lost the vote of confidence of the Italian Parliament. Reckoning also that a Roman idiot would still be that stupid wherever he was, which, incidently, is reflected in the symbol on the list which bears his name for the election of this Parliament, I cannot for consistency\"s sake express my faith in the President of the Commission. As a native of the Po valley who is Italian only by passport, I am fortunately immune from the national Christian Democrat type of opportunism which brings Berlusconi together with Mastella and De Mita and sees in Prodi not the impartial President of the Commissioners uninfluenced by the States, but the lavish dispenser of favours to a wide and varied assortment of Southern Italian profiteers. 
Although I hold some of the Commissioners in high esteem, I recall the old mafioso Neapolitan saying: ‘A fish rots from the head downwards’ and I therefore have to express my negative opinion of the Prodi Presidency.""", + "source_language": "Italian", + "url": "http://purl.org/linkedpolitics/eu/plenary/1999-07-21-Speech-3-063", + } + ] + + [{}] # skip ahead to last speech + + [ + { + "id": "2017-07-06-Speech-4-146-000", + "date": "2017-07-06", + "debate_id": "2017-07-06_AgendaItem_13", + "debate_title": "Composition of committees and delegations", + "party": None, + "sequence": 2, + "source_language": "English", + "speaker": "Ashley Fox", + "speaker_country": "United Kingdom", + "speech": """Mr President, yesterday afternoon we had a lively debate, under Rule 153, on the subject of a single seat for this Parliament. Unfortunately, under that rule, it was not possible to have a resolution, but it was the clear will of this House that we bring forward a report to propose a treaty change. So, as Mr Weber and Mr Pittella are in their seats, could they please take note of the view of this House and, when the matter comes to the Conference of Presidents, could they please authorise that report?""", + "url": "http://www.europarl.europa.eu/plenary/EN/vod.html?mode=unit&vodLanguage=EN&startTime=20170706-12:02:01-324", + } + ], + "n_documents": 3, + }, +] diff --git a/backend/corpora/parliament/description/euparl.md b/backend/corpora/parliament/description/euparl.md new file mode 100644 index 000000000..0a3fd9ca7 --- /dev/null +++ b/backend/corpora/parliament/description/euparl.md @@ -0,0 +1 @@ +The debates from the European Parliament, in English (translation), as provided by the [Talk of Europe](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x62-ew3m&version=1.0) dataset. The dataset covers debates from July 1999 to July 2017. 
# diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py
# new file mode 100644
# index 000000000..6520410bf
# --- /dev/null
# +++ b/backend/corpora/parliament/euparl.py
"""Corpus definition for speeches of the European Parliament.

The speeches and their metadata are read from Turtle (``.ttl``) files in
``settings.PP_EUPARL_DATA``: one file with the speech texts, one with the
debate structure, and one with background data on members of parliament.
"""
from datetime import datetime
from itertools import chain
import os
import re
from typing import Optional, Tuple, Union

from django.conf import settings
from langcodes import standardize_tag, Language
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import DCTERMS, FOAF, RDFS, RDF as RDFNS
from ianalyzer_readers.extract import Backup, Combined, Metadata, RDF

from addcorpus.es_mappings import keyword_mapping
from addcorpus.python_corpora.corpus import FieldDefinition, RDFCorpusDefinition
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults

EVENTS_METADATA = 'Events_and_structure.ttl'
MP_METADATA = 'MembersOfParliament_background.ttl'
SPEECHES = 'English.ttl'

# Namespaces of Linked Politics (NB: the purl links resolve to dead sites)
LP_EU = Namespace('http://purl.org/linkedpolitics/eu/plenary/')
LPV_EU = Namespace('http://purl.org/linkedpolitics/vocabulary/eu/plenary/')
LP = Namespace('http://purl.org/linkedpolitics/')
LPV = Namespace('http://purl.org/linkedpolitics/vocabulary/')

# A language marker such as `(IT)` at the very start of a speech text.
_LANGUAGE_PREFIX = re.compile(r'^\s*\([A-Z]{2}\)\s*')


def _literal_value(node):
    """Return the Python value of an RDF literal, or None if `node` has none."""
    return getattr(node, 'value', None)


def add_speaker_metadata(filename: str) -> dict:
    """Parse all relevant metadata out of the MembersOfParliament ttl file.

    Returns a dict keyed by the speaker node. Each value is a dict with:
    - `name`: the speaker's `foaf:name`
    - `country`: label of the represented country (None if not available)
    - `parties`: a list of dicts describing EU party memberships, with
      `party_acronym` and `party_name` (the shortest and longest label of the
      party) and `date_start` / `date_end` (None if not available)

    Speakers without a name are left out.
    """
    speaker_dict = {}
    speaker_graph = Graph()
    speaker_graph.parse(filename)
    for speaker in speaker_graph.subjects(object=LPV.MemberOfParliament):
        name = _literal_value(speaker_graph.value(speaker, FOAF.name))
        if name is None:
            # We cannot find the name of the speaker subject
            continue

        country_node = speaker_graph.value(speaker, LPV.countryOfRepresentation)
        country_label = (
            speaker_graph.value(country_node, RDFS.label)
            if country_node is not None
            else None
        )

        party_list = []
        for function in speaker_graph.objects(speaker, LPV.politicalFunction):
            institution = speaker_graph.value(function, LPV.institution)
            if speaker_graph.value(institution, RDFNS.type) != LPV.EUParty:
                continue
            party_labels = list(speaker_graph.objects(institution, RDFS.label))
            if not party_labels:
                # a party without any label cannot be displayed: skip it
                continue
            party_list.append({
                'party_acronym': min(party_labels, key=len),
                'party_name': max(party_labels, key=len),
                'date_start': _literal_value(
                    speaker_graph.value(function, LPV.beginning)
                ),
                'date_end': _literal_value(
                    speaker_graph.value(function, LPV.end)
                ),
            })

        speaker_dict[speaker] = {
            'name': name,
            'country': _literal_value(country_label),
            'parties': party_list,
        }
    return speaker_dict


def get_identifier(input: str) -> str:
    ''' return the last path segment of a URI '''
    return input.split('/')[-1]


def language_name(lang_code: str) -> str:
    ''' convert a language code (e.g., `it`) to a display name of the language '''
    return Language.make(language=standardize_tag(lang_code)).display_name()


def get_speaker(input: Tuple[URIRef, dict]) -> Optional[str]:
    ''' look up the name of the speaker; None if the speaker is unknown '''
    (speaker, speaker_dict) = input
    return (speaker_dict.get(speaker) or {}).get('name')


def get_speaker_country(input: Tuple[URIRef, dict]) -> Optional[str]:
    ''' look up the country of the speaker; None if the speaker is unknown '''
    (speaker, speaker_dict) = input
    return (speaker_dict.get(speaker) or {}).get('country')


def get_speaker_party(input: Tuple[str, datetime, dict]) -> Optional[str]:
    ''' look up which EU party the speaker was part of at the date of their speech

    Returns `<party name> (<acronym>)` for the first membership that covers the
    date, or None if the speaker is unknown, the date is missing, or no
    membership covers the date.
    '''
    (speaker, date, party_data) = input
    if date is None:
        return None
    party_list = (party_data.get(speaker) or {}).get('parties') or []
    for party in party_list:
        start, end = party['date_start'], party['date_end']
        # NOTE(review): a missing start or end date is treated as an
        # open-ended membership - confirm against the source data
        if (start is None or date >= start) and (end is None or date <= end):
            return f"{party['party_name'].value} ({party['party_acronym'].value})"
    return None


def get_speech_index(input: Tuple[str, list]) -> Optional[int]:
    ''' find the (1-based) index of speech in array of debate parts

    Returns None if there is no speech, or if it is not part of the array.
    '''
    speech, speeches = input
    if not speech or not speeches or speech not in speeches:
        return None
    return speeches.index(speech) + 1


def get_speech_text(input: str) -> str:
    ''' remove leading language information, e.g., `(IT)`

    Only a marker at the very start of the text is removed: parentheses
    further on in the speech, e.g. `... (CEBS) has ...`, are left alone.
    '''
    if not input:
        return input
    return _LANGUAGE_PREFIX.sub('', input, count=1)


def get_uri(input: Union[URIRef, str]) -> str:
    ''' convert input from URIRef to string; other input is returned as is '''
    if isinstance(input, URIRef):
        return str(input)
    return input


class ParliamentEurope(Parliament, RDFCorpusDefinition):
    """
    Speeches of the European parliament (originally in, or translated to, English),
    provided as Linked Open Data by the "Talk of Europe" project
    """
    title = 'People & Parliament (European Parliament)'
    description = "Speeches from the European Parliament (EP)"
    min_date = datetime(year=1999, month=7, day=20)
    max_date = datetime(year=2017, month=7, day=6)
    data_directory = settings.PP_EUPARL_DATA
    es_index = getattr(settings, 'PP_EUPARL_INDEX', 'parliament-euparl')
    languages = ['en']
    description_page = 'euparl.md'
    image = 'euparl.jpeg'

    def sources(self, **kwargs):
        """yield the speeches file, with the speaker metadata to look values up in"""
        metadata = {
            "speakers": add_speaker_metadata(
                os.path.join(self.data_directory, MP_METADATA)
            )
        }
        yield os.path.join(self.data_directory, SPEECHES), metadata

    def document_subjects(self, graph: Graph):
        """return all subjects which have either translated or spoken text"""
        return chain(
            graph.subjects(predicate=LPV.translatedText),
            graph.subjects(predicate=LPV.spokenText),
        )

    def parse_graph_from_filename(self, filename: str) -> Graph:
        ''' parse the speeches and the events metadata into a single graph

        we combine the graphs in place, to keep memory load low
        '''
        graph = Graph()
        graph.parse(filename)
        graph.parse(os.path.join(self.data_directory, EVENTS_METADATA))
        return graph

    debate_id = field_defaults.debate_id()
    debate_id.extractor = RDF(
        DCTERMS.isPartOf,
        transform=get_identifier
    )

    debate_title = field_defaults.debate_title()
    debate_title.extractor = RDF(
        DCTERMS.isPartOf,
        DCTERMS.title
    )

    date = field_defaults.date(min_date, max_date)
    date.extractor = RDF(
        DCTERMS.date,
        transform=lambda x: x.strftime('%Y-%m-%d')
    )

    party = field_defaults.party()
    party.extractor = Combined(
        RDF(LPV.speaker),
        RDF(DCTERMS.date),
        Metadata('speakers'),
        transform=get_speaker_party
    )

    # position of the speech among the parts of the debate it belongs to
    sequence = field_defaults.sequence()
    sequence.extractor = Combined(
        RDF(),
        RDF(DCTERMS.isPartOf, DCTERMS.hasPart, multiple=True),
        transform=get_speech_index,
    )

    source_language = field_defaults.language()
    source_language.name = 'source_language'
    source_language.display_name = 'Source language'
    source_language.description = 'Original language of the speech'
    # no trailing comma here: that would turn the description into a tuple
    source_language.search_filter.description = 'Search only in speeches in the selected source languages'
    source_language.extractor = RDF(DCTERMS.language, transform=language_name)

    speaker = field_defaults.speaker()
    speaker.extractor = Combined(
        RDF(LPV.speaker),
        Metadata('speakers'),
        transform=get_speaker
    )

    speaker_country = FieldDefinition(
        name='speaker_country',
        display_name='Represented country',
        description='The EU country the speaker represents',
        es_mapping=keyword_mapping(),
        extractor=Combined(
            RDF(LPV.speaker),
            Metadata('speakers'),
            transform=get_speaker_country
        )
    )

    # prefer the text as spoken; fall back to the translated text
    speech = field_defaults.speech(language='en')
    speech.extractor = Backup(
        RDF(
            LPV.spokenText,
        ),
        RDF(
            LPV.translatedText,
        ),
        transform=get_speech_text
    )

    speech_id = field_defaults.speech_id()
    speech_id.extractor = RDF(transform=get_identifier)

    # prefer the video of the speech; fall back to the URI of the speech itself
    url = field_defaults.url()
    url.extractor = Backup(RDF(LPV.videoURI, transform=get_uri), RDF(transform=get_uri))

    def __init__(self):
        self.fields = [
            self.date,
            self.debate_id,
            self.debate_title,
            self.party,
            self.sequence,
            self.source_language,
            self.speaker,
            self.speaker_country,
            self.speech, self.speech_id,
            self.url
        ]

# diff --git a/backend/corpora/parliament/images/euparl.jpeg b/backend/corpora/parliament/images/euparl.jpeg
# new file mode 100644
# index 000000000..73fff6758
# Binary files /dev/null and b/backend/corpora/parliament/images/euparl.jpeg differ
# diff --git a/backend/corpora/parliament/tests/data/euparl/English.ttl b/backend/corpora/parliament/tests/data/euparl/English.ttl
# new file mode
100644 index 000000000..6c79988bf --- /dev/null +++ b/backend/corpora/parliament/tests/data/euparl/English.ttl @@ -0,0 +1,25 @@ +@prefix foaf: . +@prefix xsd: . +@prefix rdf: . +@prefix lpv_eu: . +@prefix lpv: . +@prefix lp_eu: . +@prefix lp: . +@prefix dcterms: . + +lp_eu:1999-07-21-Speech-3-063 lpv:translatedText "(IT) Mr President, as a Member of the Italian national Parliament for the\n(The Northern League for the Independence of Padania), I did not vote for Professor Prodi in Rome as I considered he would be completely useless as head of government. I was then proved right as he lost the vote of confidence of the Italian Parliament. Reckoning also that a Roman idiot would still be that stupid wherever he was, which, incidently, is reflected in the symbol on the list which bears his name for the election of this Parliament, I cannot for consistency\"s sake express my faith in the President of the Commission. As a native of the Po valley who is Italian only by passport, I am fortunately immune from the national Christian Democrat type of opportunism which brings Berlusconi together with Mastella and De Mita and sees in Prodi not the impartial President of the Commissioners uninfluenced by the States, but the lavish dispenser of favours to a wide and varied assortment of Southern Italian profiteers. Although I hold some of the Commissioners in high esteem, I recall the old mafioso Neapolitan saying: ‘A fish rots from the head downwards’ and I therefore have to express my negative opinion of the Prodi Presidency."@en . +lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Lega Nord per l'indipendenza della Padania" . +lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Speroni (NI)" . 
+ +lp_eu:2009-03-24-Speech-2-371 lpv:translatedText "Mr President, ladies and gentlemen, allow me first of all to thank you for once again giving us the opportunity to pursue the constructive dialogue that has been established between the European Investment Bank and Parliament for some years now.\nMight we go further? I would remind you that the Court of Auditors already monitors all the EIB’s activities whenever these involve the use of funds from the European budget. Should we go further towards a formal system of banking supervision? That is what Mrs Stauner was hoping for. Mr Bullmann pointed out that things were perhaps not that simple. In any event, it is worth discussing. All I can do today is confirm that the EIB is fully open to being subjected to formal banking supervision, if it is considered worthwhile.\nFor the moment, we have organised, alongside the Financial Sector Supervisory Commission in Luxembourg, a form of informal supervision.\nIn answer to Mr Audy, I would say that the action that he requested last year from the Committee of European Banking Supervisors (CEBS) has indeed been carried out. We have therefore questioned the CEBS, but it informed us that it itself did not have any authority in the area and that it could not even act in an advisory role. We are therefore still in the hands of those who would like to take an initiative in this regard. I say again that we are open to such initiatives.\nA word in conclusion on cooperation between our two institutions. Mr Mirow has already indicated that it was developing well, particularly in the Western Balkans, and with our neighbours in the East, most recently in Turkey. All I want to say, in order to keep to my speaking time, is that we are in full agreement with the recommendations featured in Mr Mitchell’s report. 
We think that it would be in the common interest of both our institutions, and of our borrowers too, for us to move towards a more rational and functional division of labour.\nA word in conclusion on Mr Seppänen’s report. I would like to say how much we have appreciated Mr Seppänen’s constructive approach. He proposes a temporary solution, which allows the EIB to continue with its activities, but which fixes a date for an in-depth discussion of the role that the EIB should play outside the European Union. I am in no doubt that this is a debate on which we will spend some time and that, I believe, has come at just the right moment.\nI am particularly happy to have the opportunity to discuss the two reports being presented to us today, because they are two reports – that of Mr Mitchell and that of Mr Seppänen – that are interesting and that raise entirely relevant issues. I hope that we will have the opportunity to return to these issues later.\nToday, of course, we are facing a crisis on an exceptional scale – probably the most serious crisis since the end of the Second World War – and it is therefore quite normal in this context for Member States to call on our two institutions to try to make a contribution to the European Union’s response to this crisis. You know that in this context the Member States, which are our shareholders, have asked the EIB to substantially increase the volume of its lending in 2009, an increase of some 30% compared to the initial forecasts, and to channel this additional effort essentially into three areas: firstly, loans to banks for small and medium-sized enterprises; secondly, energy, and in particular the fight against climate change; and finally, a special effort for those countries that are hardest hit by the crisis.\nWhat point have we reached today? 
I will give you the statistics covering the last three months of 2008 – in other words, starting from the time at which the first appeals were made to the EIB – and the first two months of 2009. During those five months we lent more than EUR 31 billion, which represents a 38% increase compared with the same period of late-2007/early-2008. In the first area, as regards loans for small and medium-sized enterprises, EUR 5.6 billion in loans were issued in this short period. Several of you have stressed the importance of aiding small and medium-sized enterprises in the current climate. In fact, we are making a very special effort in this area, and I can already tell you that the objective that we were set of releasing EUR 15 billion of these loans during the years 2008 and 2009 will be exceeded.\nAs regards the second objective, energy and the fight against climate change, here too we have made a particular effort, and it is in this context that financing for the automotive industry must be placed. We must be clear: in this sector our funding is going towards projects involving research, development and production of eco-friendly cars, that is, cars that will meet the Union’s new standards regarding the reduction of CO\nemissions.\nFinally, regarding the third area: aid for countries that have been hardest hit by the crisis: during this same five-month period we issued EUR 910 million in loans in Hungary, EUR 600 million in Latvia, EUR 1 billion in Romania and EUR 1.1 billion in Lithuania.\nI therefore think that I can say that we have been doing our best to respond to the Member States’ appeal and to implement the agreed measures without delay. 
Mr Mirow himself has already alluded to the joint International Finance Corporation-European Bank for Reconstruction and Development action plan regarding aid for the banking sector in Central and Eastern Europe.\nNaturally, this increase in the volume of our loans is only possible thanks to the increase in capital on which our shareholders have decided – it will not cost the Member States anything. However, it was decided that we needed our shareholders’ authorisation to turn our reserves into capital.\nSeveral of you have asked questions about monitoring and supervision of the EIB, and I personally think that the question is totally legitimate. When a financial institution grows in such a way, it is normal for there to be concerns about how it is monitored. There is what is already in place, which is not insignificant: there is a certain amount of internal monitoring and, above all, there is external monitoring by an independent audit committee that reports directly to our governors. Moreover, the Treaty of Lisbon makes provision for strengthening this audit committee with the addition of people who have proven experience of banking supervision."@en . +lp_eu:2009-03-24-Speech-2-371 lpv:unclassifiedMetadata "2" . + +lp_eu:2017-07-06-Speech-4-146-000 lpv:spokenText "Mr President, yesterday afternoon we had a lively debate, under Rule 153, on the subject of a single seat for this Parliament. Unfortunately, under that rule, it was not possible to have a resolution, but it was the clear will of this House that we bring forward a report to propose a treaty change. So, as Mr Weber and Mr Pittella are in their seats, could they please take note of the view of this House and, when the matter comes to the Conference of Presidents, could they please authorise that report?"@en . +lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "(Applause)" . +lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "Ashley Fox (ECR )." . 
+ +lp_eu:1999-07-21_AgendaItem_5 dcterms:title "Statement by Mr Prodi, President-elect of the Commission"@en . + +lp_eu:2009-03-24_AgendaItem_30 dcterms:title "EIB and EBRD annual reports for 2007 - Community guarantee to the European Investment Bank (debate)"@en . + +lp_eu:2017-07-06_AgendaItem_13 dcterms:title "Composition of committees and delegations"@en . diff --git a/backend/corpora/parliament/tests/data/euparl/Events_and_structure.ttl b/backend/corpora/parliament/tests/data/euparl/Events_and_structure.ttl new file mode 100644 index 000000000..121f987bf --- /dev/null +++ b/backend/corpora/parliament/tests/data/euparl/Events_and_structure.ttl @@ -0,0 +1,107 @@ +@prefix foaf: . +@prefix xsd: . +@prefix rdf: . +@prefix lpv_eu: . +@prefix lpv: . +@prefix lp_eu: . +@prefix lp: . +@prefix dcterms: . + +lp_eu:1999-07-21-Speech-3-063 a lpv_eu:Speech . +lp_eu:1999-07-21-Speech-3-063 dcterms:date "1999-07-21"^^xsd:date . +lp_eu:1999-07-21-Speech-3-063 dcterms:isPartOf lp_eu:1999-07-21_AgendaItem_5 . +lp_eu:1999-07-21-Speech-3-063 dcterms:language "it"^^xsd:language . +lp_eu:1999-07-21-Speech-3-063 lpv:docno "en.19990721.5.3-063" . +lp_eu:1999-07-21-Speech-3-063 lpv:hasSubsequent lp_eu:1999-07-21-Speech-3-064 . +lp_eu:1999-07-21-Speech-3-063 lpv:speaker lp:EUmember_997 . + +lp_eu:2009-03-24-Speech-2-371 a lpv_eu:Speech . +lp_eu:2009-03-24-Speech-2-371 dcterms:date "2009-03-24"^^xsd:date . +lp_eu:2009-03-24-Speech-2-371 dcterms:isPartOf lp_eu:2009-03-24_AgendaItem_30 . +lp_eu:2009-03-24-Speech-2-371 dcterms:language "fr"^^xsd:language . +lp_eu:2009-03-24-Speech-2-371 lpv:docno "en.20090324.30.2-371" . +lp_eu:2009-03-24-Speech-2-371 lpv:hasSubsequent lp_eu:2009-03-24-Speech-2-372 . +lp_eu:2009-03-24-Speech-2-371 lpv:speaker lp:Speaker_Philippe_Maystadt . +lp_eu:2009-03-24-Speech-2-371 lpv:videoURI . + +lp_eu:2017-07-06-Speech-4-146-000 a lpv_eu:Speech . +lp_eu:2017-07-06-Speech-4-146-000 dcterms:date "2017-07-06"^^xsd:date . 
+lp_eu:2017-07-06-Speech-4-146-000 dcterms:isPartOf lp_eu:2017-07-06_AgendaItem_13 . +lp_eu:2017-07-06-Speech-4-146-000 dcterms:language "en"^^xsd:language . +lp_eu:2017-07-06-Speech-4-146-000 lpv:docno "en.20170706.13.4-146-000" . +lp_eu:2017-07-06-Speech-4-146-000 lpv:hasSubsequent lp_eu:2017-07-06-Speech-4-147-000 . +lp_eu:2017-07-06-Speech-4-146-000 lpv:speaker lp:EUmember_96957 . +lp_eu:2017-07-06-Speech-4-146-000 lpv:videoURI . + +lp_eu:1999-07-21_AgendaItem_5 a lpv_eu:AgendaItem . +lp_eu:1999-07-21_AgendaItem_5 dcterms:date "1999-07-21"^^xsd:date . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-049 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-050 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-051 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-052 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-053 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-054 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-055 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-056 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-057 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-058 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-059 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-060 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-061 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-062 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-063 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-064 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-065 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-066 . 
+lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-067 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-068 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-069 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-070 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-071 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-072 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-996 . +lp_eu:1999-07-21_AgendaItem_5 dcterms:isPartOf lp_eu:1999-07-21_SessionDay . +lp_eu:1999-07-21_AgendaItem_5 lpv:docno "en.19990721.5" . +lp_eu:1999-07-21_AgendaItem_5 lpv:hasSubsequent lp_eu:1999-07-21_AgendaItem_6 . + +lp_eu:2009-03-24_AgendaItem_30 a lpv_eu:AgendaItem . +lp_eu:2009-03-24_AgendaItem_30 dcterms:date "2009-03-24"^^xsd:date . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-353 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-354 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-355 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-356 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-357 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-358 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-359 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-360 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-361 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-362 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-363 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-364 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-365 . 
+lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-366 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-367 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-368 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-369 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-370 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-371 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-372 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-373 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-374 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-375 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-376 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-377 . +lp_eu:2009-03-24_AgendaItem_30 dcterms:isPartOf lp_eu:2009-03-24_SessionDay . +lp_eu:2009-03-24_AgendaItem_30 lpv:docno "en.20090324.30" . +lp_eu:2009-03-24_AgendaItem_30 lpv:hasSubsequent lp_eu:2009-03-24_AgendaItem_31 . + +lp_eu:2017-07-06_AgendaItem_13 a lpv_eu:AgendaItem . +lp_eu:2017-07-06_AgendaItem_13 dcterms:date "2017-07-06"^^xsd:date . +lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-145-000 . +lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-146-000 . +lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-147-000 . +lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-148-000 . +lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-149-000 . +lp_eu:2017-07-06_AgendaItem_13 dcterms:isPartOf lp_eu:2017-07-06_SessionDay . +lp_eu:2017-07-06_AgendaItem_13 lpv:docno "en.20170706.13" . +lp_eu:2017-07-06_AgendaItem_13 lpv:hasSubsequent lp_eu:2017-07-06_AgendaItem_14 . 
diff --git a/backend/corpora/parliament/tests/data/euparl/MembersOfParliament_background.ttl b/backend/corpora/parliament/tests/data/euparl/MembersOfParliament_background.ttl new file mode 100644 index 000000000..ea1cd2748 --- /dev/null +++ b/backend/corpora/parliament/tests/data/euparl/MembersOfParliament_background.ttl @@ -0,0 +1,807 @@ +@prefix foaf: . +@prefix lp: . +@prefix lpv: . +@prefix ns1: . +@prefix rdfs: . +@prefix xsd: . + + a lpv:EUParty ; + rdfs:label "AGRI", + "European Democratic Group" ; + lpv:acronym "AGRI" ; + lpv:featuredRoleDescriptions "European Democratic Group - Chair", + "European Democratic Group - Member", + "European Democratic Group - Member of the Bureau", + "European Democratic Group - Treasurer", + "European Democratic Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "ALDE", + "Group of the Alliance of Liberals and Democrats for Europe" ; + lpv:acronym "ALDE" ; + lpv:featuredRoleDescriptions "Group of the Alliance of Liberals and Democrats for Europe -", + "Group of the Alliance of Liberals and Democrats for Europe - Chair", + "Group of the Alliance of Liberals and Democrats for Europe - Member", + "Group of the Alliance of Liberals and Democrats for Europe - Member of the Bureau", + "Group of the Alliance of Liberals and Democrats for Europe - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "COM", + "Communist and Allies Group" ; + lpv:acronym "COM" ; + lpv:featuredRoleDescriptions "Communist and Allies Group -", + "Communist and Allies Group - Chair", + "Communist and Allies Group - Member", + "Communist and Allies Group - Treasurer", + "Communist and Allies Group - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "DR", + "Technical Group of the European Right" ; + lpv:acronym "DR" ; + lpv:featuredRoleDescriptions "Technical Group of the European Right -", + "Technical Group of the European Right - Chair", + "Technical Group of the European Right - Member", + "Technical Group of the European Right - Member of the Bureau", + "Technical Group of the European Right - Treasurer", + "Technical Group of the European Right - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "EDA", + "Group of the European Democratic Alliance" ; + lpv:acronym "EDA" ; + lpv:featuredRoleDescriptions "Group of the European Democratic Alliance - Chair", + "Group of the European Democratic Alliance - Member", + "Group of the European Democratic Alliance - Member of the Bureau", + "Group of the European Democratic Alliance - Treasurer", + "Group of the European Democratic Alliance - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "EDD", + "Group for a Europe of Democracies and Diversities" ; + lpv:acronym "EDD" ; + lpv:featuredRoleDescriptions "Group for a Europe of Democracies and Diversities - Chair", + "Group for a Europe of Democracies and Diversities - Co-Chair", + "Group for a Europe of Democracies and Diversities - Member", + "Group for a Europe of Democracies and Diversities - Member of the Bureau" . + + a lpv:EUParty ; + rdfs:label "EFDD", + "Europe of Freedom and Direct Democracy Group" ; + lpv:acronym "EFDD" ; + lpv:featuredRoleDescriptions "Europe of Freedom and Direct Democracy Group - Chair of the Bureau", + "Europe of Freedom and Direct Democracy Group - Co-Chair", + "Europe of Freedom and Direct Democracy Group - Member", + "Europe of Freedom and Direct Democracy Group - Member of the Bureau", + "Europe of Freedom and Direct Democracy Group - Treasurer", + "Europe of Freedom and Direct Democracy Group - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "ELDR", + "Group of the European Liberal, Democrat and Reform Party" ; + lpv:acronym "ELDR" ; + lpv:featuredRoleDescriptions "Group of the European Liberal, Democrat and Reform Party -", + "Group of the European Liberal, Democrat and Reform Party - Chair", + "Group of the European Liberal, Democrat and Reform Party - Member", + "Group of the European Liberal, Democrat and Reform Party - Member of the Bureau", + "Group of the European Liberal, Democrat and Reform Party - Treasurer", + "Group of the European Liberal, Democrat and Reform Party - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "EN", + "Europe of Nations Group (Coordination Group)" ; + lpv:acronym "EN" ; + lpv:featuredRoleDescriptions "Europe of Nations Group (Coordination Group) -", + "Europe of Nations Group (Coordination Group) - Chair", + "Europe of Nations Group (Coordination Group) - Member", + "Europe of Nations Group (Coordination Group) - Treasurer", + "Europe of Nations Group (Coordination Group) - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "EPD", + "Group of European Progressive Democrats" ; + lpv:acronym "EPD" ; + lpv:featuredRoleDescriptions "Group of European Progressive Democrats - Chair", + "Group of European Progressive Democrats - Member", + "Group of European Progressive Democrats - Member of the Bureau", + "Group of European Progressive Democrats - Treasurer", + "Group of European Progressive Democrats - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "EPP", + "Group of the European People's Party (Christian Democrats)", + "Group of the European People's Party (Christian-Democratic Group)" ; + lpv:acronym "EPP" ; + lpv:featuredRoleDescriptions "Group of the European People's Party (Christian Democrats) -", + "Group of the European People's Party (Christian Democrats) - Chair", + "Group of the European People's Party (Christian Democrats) - Member", + "Group of the European People's Party (Christian Democrats) - Member of the Bureau", + "Group of the European People's Party (Christian Democrats) - Vice-Chair", + "Group of the European People's Party (Christian-Democratic Group) -", + "Group of the European People's Party (Christian-Democratic Group) - Chair", + "Group of the European People's Party (Christian-Democratic Group) - Member", + "Group of the European People's Party (Christian-Democratic Group) - Member of the Bureau", + "Group of the European People's Party (Christian-Democratic Group) - Treasurer", + "Group of the European People's Party (Christian-Democratic Group) - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "EPP-ED", + "Group of the European People's Party (Christian Democrats) and European Democrats" ; + lpv:acronym "EPP-ED" ; + lpv:featuredRoleDescriptions "Group of the European People's Party (Christian Democrats) and European Democrats -", + "Group of the European People's Party (Christian Democrats) and European Democrats - Chair", + "Group of the European People's Party (Christian Democrats) and European Democrats - Member", + "Group of the European People's Party (Christian Democrats) and European Democrats - Member of the Bureau", + "Group of the European People's Party (Christian Democrats) and European Democrats - Treasurer", + "Group of the European People's Party (Christian Democrats) and European Democrats - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "ER", + "Group of the European Right" ; + lpv:acronym "ER" ; + lpv:featuredRoleDescriptions "Group of the European Right -", + "Group of the European Right - Chair", + "Group of the European Right - Member", + "Group of the European Right - Member of the Bureau", + "Group of the European Right - Treasurer", + "Group of the European Right - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "ERA", + "Group of the European Radical Alliance" ; + lpv:acronym "ERA" ; + lpv:featuredRoleDescriptions "Group of the European Radical Alliance - Chair", + "Group of the European Radical Alliance - Member", + "Group of the European Radical Alliance - Member of the Bureau", + "Group of the European Radical Alliance - Treasurer" . + + a lpv:EUParty ; + rdfs:label "Confederal Group of the European United Left", + "EUL", + "Group for the European United Left" ; + lpv:acronym "EUL" ; + lpv:featuredRoleDescriptions "Confederal Group of the European United Left - Chair", + "Confederal Group of the European United Left - Member", + "Confederal Group of the European United Left - Treasurer", + "Confederal Group of the European United Left - Vice-Chair", + "Group for the European United Left - Chair", + "Group for the European United Left - Member", + "Group for the European United Left - Member of the Bureau", + "Group for the European United Left - Treasurer", + "Group for the European United Left - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "Confederal Group of the European United Left - Nordic Green Left", + "Confederal Group of the European United Left/Nordic Green Left", + "EUL/NGL" ; + lpv:acronym "EUL/NGL" ; + lpv:featuredRoleDescriptions "Confederal Group of the European United Left - Nordic Green Left -", + "Confederal Group of the European United Left - Nordic Green Left - Chair", + "Confederal Group of the European United Left - Nordic Green Left - Member", + "Confederal Group of the European United Left - Nordic Green Left - Member of the Bureau", + "Confederal Group of the European United Left - Nordic Green Left - Treasurer", + "Confederal Group of the European United Left - Nordic Green Left - Vice-Chair", + "Confederal Group of the European United Left/Nordic Green Left -", + "Confederal Group of the European United Left/Nordic Green Left - Chair", + "Confederal Group of the European United Left/Nordic Green Left - Member", + "Confederal Group of the European United Left/Nordic Green Left - Member of the Bureau", + "Confederal Group of the European United Left/Nordic Green Left - Treasurer", + "Confederal Group of the European United Left/Nordic Green Left - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "FE", + "Forza Europa Group" ; + lpv:acronym "FE" ; + lpv:featuredRoleDescriptions "Forza Europa Group - Chair", + "Forza Europa Group - Member", + "Forza Europa Group - Member of the Bureau", + "Forza Europa Group - Treasurer", + "Forza Europa Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "G", + "The Green Group in the European Parliament" ; + lpv:acronym "G" ; + lpv:featuredRoleDescriptions "The Green Group in the European Parliament - Chair", + "The Green Group in the European Parliament - Member", + "The Green Group in the European Parliament - Member of the Bureau", + "The Green Group in the European Parliament - Treasurer", + "The Green Group in the European Parliament - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "G/EFA", + "Group of the Greens/European Free Alliance" ; + lpv:acronym "G/EFA" ; + lpv:featuredRoleDescriptions "Group of the Greens/European Free Alliance -", + "Group of the Greens/European Free Alliance - Chair", + "Group of the Greens/European Free Alliance - Co-Chair", + "Group of the Greens/European Free Alliance - Member", + "Group of the Greens/European Free Alliance - Member of the Bureau", + "Group of the Greens/European Free Alliance - Treasurer", + "Group of the Greens/European Free Alliance - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "Group of Independents for a Europe of Nations", + "I-EN" ; + lpv:acronym "I-EN" ; + lpv:featuredRoleDescriptions "Group of Independents for a Europe of Nations -", + "Group of Independents for a Europe of Nations - Chair", + "Group of Independents for a Europe of Nations - Co-Chair", + "Group of Independents for a Europe of Nations - Member", + "Group of Independents for a Europe of Nations - Treasurer", + "Group of Independents for a Europe of Nations - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "ITS", + "Identity, Tradition and Sovereignty Group" ; + lpv:acronym "ITS" ; + lpv:featuredRoleDescriptions "Identity, Tradition and Sovereignty Group - Chair", + "Identity, Tradition and Sovereignty Group - Member", + "Identity, Tradition and Sovereignty Group - Member of the Bureau", + "Identity, Tradition and Sovereignty Group - Treasurer", + "Identity, Tradition and Sovereignty Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "LD", + "Liberal and Democratic Group" ; + lpv:acronym "LD" ; + lpv:featuredRoleDescriptions "Liberal and Democratic Group -", + "Liberal and Democratic Group - Chair", + "Liberal and Democratic Group - Member", + "Liberal and Democratic Group - Treasurer", + "Liberal and Democratic Group - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "LDR", + "Liberal and Democratic Reformist Group" ; + lpv:acronym "LDR" ; + lpv:featuredRoleDescriptions "Liberal and Democratic Reformist Group -", + "Liberal and Democratic Reformist Group - Chair", + "Liberal and Democratic Reformist Group - Member", + "Liberal and Democratic Reformist Group - Treasurer", + "Liberal and Democratic Reformist Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "LU", + "Left Unity" ; + lpv:acronym "LU" ; + lpv:featuredRoleDescriptions "Left Unity -", + "Left Unity - Chair", + "Left Unity - Member", + "Left Unity - Treasurer", + "Left Unity - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "Group of the Party of European Socialists", + "PES", + "Socialist Group in the European Parliament" ; + lpv:acronym "PES" ; + lpv:featuredRoleDescriptions "Group of the Party of European Socialists -", + "Group of the Party of European Socialists - Chair", + "Group of the Party of European Socialists - Member", + "Group of the Party of European Socialists - Member of the Bureau", + "Group of the Party of European Socialists - Treasurer", + "Group of the Party of European Socialists - Vice-Chair", + "Socialist Group in the European Parliament -", + "Socialist Group in the European Parliament - Chair", + "Socialist Group in the European Parliament - Member", + "Socialist Group in the European Parliament - Treasurer", + "Socialist Group in the European Parliament - Vice-Chair" . 
+ + a lpv:EUParty ; + rdfs:label "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament", + "S&D" ; + lpv:acronym "S&D" ; + lpv:featuredRoleDescriptions "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament -", + "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Chair", + "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Member", + "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Treasurer", + "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "SOC", + "Socialist Group" ; + lpv:acronym "SOC" ; + lpv:featuredRoleDescriptions "Socialist Group -", + "Socialist Group - Chair", + "Socialist Group - Member", + "Socialist Group - Member of the Bureau", + "Socialist Group - Treasurer", + "Socialist Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "Group Union for Europe", + "UFE" ; + lpv:acronym "UFE" ; + lpv:featuredRoleDescriptions "Group Union for Europe - Chair", + "Group Union for Europe - Member", + "Group Union for Europe - Member of the Bureau", + "Group Union for Europe - Treasurer", + "Group Union for Europe - Vice-Chair" . 
+ +lp:EUmember_96957 a ns1:MemberOfParliament ; + ns1:MEP_ID "96957" ; + ns1:countryOfRepresentation lp:EUCountry_GB ; + ns1:dateOfBirth "1969-11-15"^^xsd:date ; + ns1:placeOfBirth "Sutton Coldfield" ; + ns1:politicalFunction lp:pf00f88c83af0e95f9d4ba9f0c9bd0a093, + lp:pf0f78c03fefe0985f764dea70022d0043, + lp:pf2bdc66e4eb24629cd4d221b54b45a3ca, + lp:pf350caf782f08b96e2a3af35fc5616001, + lp:pf399a6d922ff4c20a852fc22d6600906e, + lp:pf52bab8d1a349df571ebeb55761db513c, + lp:pf58aecb3742c3fe35d5080e16645b07c8, + lp:pf8240375d0714e974c78fb753d862122a, + lp:pf99c4158caafcfba5bc9fda500f3d0c60, + lp:pfba1033551486f93285992b1f262dc79e, + lp:pfc8ae7a828c60553e27378b19a12a862a, + lp:pfcedffd5e8d734d1203db8fa3f7038f78, + lp:pfd9732399a8c6bbb7c5e041837244a2f8, + lp:pfe61b1fb146891c9d4d0bbf88349d10f8, + lp:pfe6b98484dd76e8c4c9e3e9783f2b0583, + lp:pff56829256d54fb49a895889d87467924 ; + foaf:name "Ashley Fox" . + +lp:EUmember_997 a ns1:MemberOfParliament ; + ns1:MEP_ID "997" ; + ns1:countryOfRepresentation lp:EUCountry_IT ; + ns1:dateOfBirth "1946-10-04"^^xsd:date ; + ns1:placeOfBirth "Busto Arsizio" ; + ns1:politicalFunction lp:pf00850f07d83ff7511d816f2906631631, + lp:pf1617b45a9e5df3163b8429577b4a3555, + lp:pf18c223c3363102672b51b21c184056cc, + lp:pf1ef5ada5f958b0aba640e490e460a221, + lp:pf2170f06ac1f449772b48a5e844b1f7cd, + lp:pf232468ecbffeb758cc38b17eb407a0e8, + lp:pf27013a6352e31f1747cb0925a8d315a9, + lp:pf2cff760ac2aa89d9fb6a7c65e2055670, + lp:pf3520fdebb83d60437179f91787dfecf9, + lp:pf36596b04e9f7439a17fd0764130a0a16, + lp:pf4009f4a6d184651935c94eaf0e8e3b31, + lp:pf42381eeb3b04b2383aa048ad07ce1723, + lp:pf46b57676499000ff10b62596ebaaff0a, + lp:pf4e79d2756b56075d1ba8ead84562bd8b, + lp:pf50f29b8e60bb70630a741c75d9e15850, + lp:pf5520a69a514cdd1f889ff24e68b34128, + lp:pf57173b89fb6ac13826d452fc74067ff3, + lp:pf5a2de253c92e0c12b84cac7c1e155666, + lp:pf6356d7122a0fb25ece5db75f61c19671, + lp:pf64ca0f3f6dde49e4a542560ed5122891, + lp:pf6579712af153fc6ce1b0db2de821f92d, + 
lp:pf76dd5ad131d30ed3306a78709e7ce251, + lp:pf870c0cf3766606d20cec0aecf1851569, + lp:pf8e8990b89425d1d5fefd0526973a771a, + lp:pf8e9b064d23472496eb5097e9e61f664f, + lp:pf9d156e058e9b318c9b9df62e3a5ab42a, + lp:pfb592b70d3ae90102ecf4f9cf0dbbf362, + lp:pfb8d606b79e721a9f0dd3232ebbe34ca4, + lp:pfbb4b02d1622a4615842f6f422386345c, + lp:pfc0aa2605dab67ed5619e571ffaa06fbc, + lp:pfc4defbc2ab4682bb4ffda87d34391823, + lp:pfd69065d8553a1a36e8aa0811ef83e0d3, + lp:pfd941468c3faf619ad5d0a0156420880c, + lp:pfddd269fab021f83ba0a89efc4298e439, + lp:pfe33ac318848090cbc8bce7f6e4f26f6f, + lp:pff4a2c78f09a9956180c64fe9dc17b9c1, + lp:pff75a3ca6e96a14db43823be10fb693d2, + lp:pffa15145f38d5a44e3ef93a4534a2df63, + lp:pffeee7811d1a25e86cc33849665eeb484 ; + foaf:name "Francesco Enrico Speroni" . + +lp:EUCountry_GB rdfs:label "United Kingdom"@en . + +lp:EUCountry_IT rdfs:label "Italy"@en . + + a lpv:EUParty ; + rdfs:label "EFD", + "Europe of freedom and democracy Group" ; + lpv:acronym "EFD" ; + lpv:featuredRoleDescriptions "Europe of freedom and democracy Group -", + "Europe of freedom and democracy Group - Chair of the Bureau", + "Europe of freedom and democracy Group - Co-Chair", + "Europe of freedom and democracy Group - Member", + "Europe of freedom and democracy Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "IND/DEM", + "Independence/Democracy Group" ; + lpv:acronym "IND/DEM" ; + lpv:featuredRoleDescriptions "Independence/Democracy Group - Chair", + "Independence/Democracy Group - Chair of the Bureau", + "Independence/Democracy Group - Co-Chair", + "Independence/Democracy Group - Member", + "Independence/Democracy Group - Member of the Bureau", + "Independence/Democracy Group - Treasurer" . 
+ + a lpv:EUParty ; + rdfs:label "RBW", + "Rainbow Group in the European Parliament", + "Rainbow Group: Federation of the Green-Alternative European Links, Agelev-Ecolo, the Danish People's Movement against Membership of the European Community and the European Free Alliance in the European Parliament" ; + lpv:acronym "RBW" ; + lpv:featuredRoleDescriptions "Rainbow Group in the European Parliament - Chair", + "Rainbow Group in the European Parliament - Member", + "Rainbow Group in the European Parliament - Member of the Bureau", + "Rainbow Group in the European Parliament - Treasurer", + "Rainbow Group: Federation of the Green-Alternative European Links, Agelev-Ecolo, the Danish People's Movement against Membership of the European Community and the European Free Alliance in the European Parliament - Chair", + "Rainbow Group: Federation of the Green-Alternative European Links, Agelev-Ecolo, the Danish People's Movement against Membership of the European Community and the European Free Alliance in the European Parliament - Member" . + + a lpv:EUParty ; + rdfs:label "UEN", + "Union for Europe of the Nations Group" ; + lpv:acronym "UEN" ; + lpv:featuredRoleDescriptions "Union for Europe of the Nations Group -", + "Union for Europe of the Nations Group - Chair", + "Union for Europe of the Nations Group - Co-Chair", + "Union for Europe of the Nations Group - Member", + "Union for Europe of the Nations Group - Treasurer", + "Union for Europe of the Nations Group - Vice-Chair" . + +lp:pf00850f07d83ff7511d816f2906631631 a ns1:PoliticalFunction ; + ns1:beginning "1999-07-21"^^xsd:date ; + ns1:end "2002-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf00f88c83af0e95f9d4ba9f0c9bd0a093 a ns1:PoliticalFunction ; + ns1:beginning "2014-07-01"^^xsd:date ; + ns1:end "2017-03-31"^^xsd:date ; + ns1:institution ; + ns1:role . 
+ +lp:pf0f78c03fefe0985f764dea70022d0043 a ns1:PoliticalFunction ; + ns1:beginning "2012-01-25"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf1617b45a9e5df3163b8429577b4a3555 a ns1:PoliticalFunction ; + ns1:beginning "2012-01-19"^^xsd:date ; + ns1:end "2014-04-02"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf18c223c3363102672b51b21c184056cc a ns1:PoliticalFunction ; + ns1:beginning "1991-10-10"^^xsd:date ; + ns1:end "1992-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf1ef5ada5f958b0aba640e490e460a221 a ns1:PoliticalFunction ; + ns1:beginning "2000-11-16"^^xsd:date ; + ns1:end "2002-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf2170f06ac1f449772b48a5e844b1f7cd a ns1:PoliticalFunction ; + ns1:beginning "1999-07-20"^^xsd:date ; + ns1:end "1999-07-21"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf232468ecbffeb758cc38b17eb407a0e8 a ns1:PoliticalFunction ; + ns1:beginning "1999-07-20"^^xsd:date ; + ns1:end "2004-07-19"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf27013a6352e31f1747cb0925a8d315a9 a ns1:PoliticalFunction ; + ns1:beginning "1992-01-15"^^xsd:date ; + ns1:end "1992-10-25"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf2bdc66e4eb24629cd4d221b54b45a3ca a ns1:PoliticalFunction ; + ns1:beginning "2009-09-15"^^xsd:date ; + ns1:end "2012-01-18"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf2cff760ac2aa89d9fb6a7c65e2055670 a ns1:PoliticalFunction ; + ns1:beginning "1989-07-26"^^xsd:date ; + ns1:end "1991-10-09"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf350caf782f08b96e2a3af35fc5616001 a ns1:PoliticalFunction ; + ns1:beginning "2009-07-21"^^xsd:date ; + ns1:end "2012-01-18"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf3520fdebb83d60437179f91787dfecf9 a ns1:PoliticalFunction ; + ns1:beginning "2007-01-31"^^xsd:date ; + ns1:end "2009-07-13"^^xsd:date ; + ns1:institution ; + ns1:role . 
+ +lp:pf36596b04e9f7439a17fd0764130a0a16 a ns1:PoliticalFunction ; + ns1:beginning "1992-01-15"^^xsd:date ; + ns1:end "1994-05-11"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf399a6d922ff4c20a852fc22d6600906e a ns1:PoliticalFunction ; + ns1:beginning "2014-07-01"^^xsd:date ; + ns1:end "2017-03-31"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf4009f4a6d184651935c94eaf0e8e3b31 a ns1:PoliticalFunction ; + ns1:beginning "2007-01-15"^^xsd:date ; + ns1:end "2007-01-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf42381eeb3b04b2383aa048ad07ce1723 a ns1:PoliticalFunction ; + ns1:beginning "2009-07-14"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf46b57676499000ff10b62596ebaaff0a a ns1:PoliticalFunction ; + ns1:beginning "2002-01-17"^^xsd:date ; + ns1:end "2004-07-19"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf4e79d2756b56075d1ba8ead84562bd8b a ns1:PoliticalFunction ; + ns1:beginning "2009-07-16"^^xsd:date ; + ns1:end "2011-10-04"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf50f29b8e60bb70630a741c75d9e15850 a ns1:PoliticalFunction ; + ns1:beginning "2004-07-20"^^xsd:date ; + ns1:end "2004-07-20"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf52bab8d1a349df571ebeb55761db513c a ns1:PoliticalFunction ; + ns1:beginning "2012-01-19"^^xsd:date ; + ns1:end "2012-01-24"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf5520a69a514cdd1f889ff24e68b34128 a ns1:PoliticalFunction ; + ns1:beginning "2001-10-03"^^xsd:date ; + ns1:end "2004-07-19"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf57173b89fb6ac13826d452fc74067ff3 a ns1:PoliticalFunction ; + ns1:beginning "1989-07-26"^^xsd:date ; + ns1:end "1992-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf58aecb3742c3fe35d5080e16645b07c8 a ns1:PoliticalFunction ; + ns1:beginning "2011-03-09"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . 
+ +lp:pf5a2de253c92e0c12b84cac7c1e155666 a ns1:PoliticalFunction ; + ns1:beginning "2004-07-20"^^xsd:date ; + ns1:end "2009-07-13"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf6356d7122a0fb25ece5db75f61c19671 a ns1:PoliticalFunction ; + ns1:beginning "1989-07-26"^^xsd:date ; + ns1:end "1992-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf64ca0f3f6dde49e4a542560ed5122891 a ns1:PoliticalFunction ; + ns1:beginning "1992-10-26"^^xsd:date ; + ns1:end "1994-05-11"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf6579712af153fc6ce1b0db2de821f92d a ns1:PoliticalFunction ; + ns1:beginning "2004-09-14"^^xsd:date ; + ns1:end "2007-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf76dd5ad131d30ed3306a78709e7ce251 a ns1:PoliticalFunction ; + ns1:beginning "2006-04-27"^^xsd:date ; + ns1:end "2006-12-12"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf8240375d0714e974c78fb753d862122a a ns1:PoliticalFunction ; + ns1:beginning "2014-09-16"^^xsd:date ; + ns1:end "2014-11-11"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf870c0cf3766606d20cec0aecf1851569 a ns1:PoliticalFunction ; + ns1:beginning "1994-04-21"^^xsd:date ; + ns1:end "1994-05-11"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf8e8990b89425d1d5fefd0526973a771a a ns1:PoliticalFunction ; + ns1:beginning "2007-01-15"^^xsd:date ; + ns1:end "2007-01-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf8e9b064d23472496eb5097e9e61f664f a ns1:PoliticalFunction ; + ns1:beginning "1989-07-25"^^xsd:date ; + ns1:end "1994-05-11"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf99c4158caafcfba5bc9fda500f3d0c60 a ns1:PoliticalFunction ; + ns1:beginning "2012-01-19"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pf9d156e058e9b318c9b9df62e3a5ab42a a ns1:PoliticalFunction ; + ns1:beginning "2004-07-21"^^xsd:date ; + ns1:end "2007-01-14"^^xsd:date ; + ns1:institution ; + ns1:role . 
+ +lp:pfb592b70d3ae90102ecf4f9cf0dbbf362 a ns1:PoliticalFunction ; + ns1:beginning "2010-01-07"^^xsd:date ; + ns1:end "2012-01-18"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfb8d606b79e721a9f0dd3232ebbe34ca4 a ns1:PoliticalFunction ; + ns1:beginning "2006-12-13"^^xsd:date ; + ns1:end "2009-07-13"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfba1033551486f93285992b1f262dc79e a ns1:PoliticalFunction ; + ns1:beginning "2009-07-14"^^xsd:date ; + ns1:end "2011-03-08"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfbb4b02d1622a4615842f6f422386345c a ns1:PoliticalFunction ; + ns1:beginning "2009-07-16"^^xsd:date ; + ns1:end "2012-01-18"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfc0aa2605dab67ed5619e571ffaa06fbc a ns1:PoliticalFunction ; + ns1:beginning "2007-01-31"^^xsd:date ; + ns1:end "2009-07-13"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfc4defbc2ab4682bb4ffda87d34391823 a ns1:PoliticalFunction ; + ns1:beginning "2002-01-17"^^xsd:date ; + ns1:end "2004-07-19"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfc8ae7a828c60553e27378b19a12a862a a ns1:PoliticalFunction ; + ns1:beginning "2014-07-01"^^xsd:date ; + ns1:end "2017-03-31"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfcedffd5e8d734d1203db8fa3f7038f78 a ns1:PoliticalFunction ; + ns1:beginning "2009-07-16"^^xsd:date ; + ns1:end "2011-02-06"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfd69065d8553a1a36e8aa0811ef83e0d3 a ns1:PoliticalFunction ; + ns1:beginning "1992-01-15"^^xsd:date ; + ns1:end "1994-05-11"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfd941468c3faf619ad5d0a0156420880c a ns1:PoliticalFunction ; + ns1:beginning "2007-01-31"^^xsd:date ; + ns1:end "2009-07-13"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfd9732399a8c6bbb7c5e041837244a2f8 a ns1:PoliticalFunction ; + ns1:beginning "2012-01-19"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . 
+ +lp:pfddd269fab021f83ba0a89efc4298e439 a ns1:PoliticalFunction ; + ns1:beginning "1999-09-09"^^xsd:date ; + ns1:end "1999-09-16"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfe33ac318848090cbc8bce7f6e4f26f6f a ns1:PoliticalFunction ; + ns1:beginning "1999-07-22"^^xsd:date ; + ns1:end "2001-10-02"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfe61b1fb146891c9d4d0bbf88349d10f8 a ns1:PoliticalFunction ; + ns1:beginning "2009-07-14"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pfe6b98484dd76e8c4c9e3e9783f2b0583 a ns1:PoliticalFunction ; + ns1:beginning "2011-02-07"^^xsd:date ; + ns1:end "2012-01-18"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pff4a2c78f09a9956180c64fe9dc17b9c1 a ns1:PoliticalFunction ; + ns1:beginning "2004-07-21"^^xsd:date ; + ns1:end "2006-04-26"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pff56829256d54fb49a895889d87467924 a ns1:PoliticalFunction ; + ns1:beginning "2014-07-01"^^xsd:date ; + ns1:end "2017-03-31"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pff75a3ca6e96a14db43823be10fb693d2 a ns1:PoliticalFunction ; + ns1:beginning "2009-07-14"^^xsd:date ; + ns1:end "2014-06-30"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pffa15145f38d5a44e3ef93a4534a2df63 a ns1:PoliticalFunction ; + ns1:beginning "2004-07-21"^^xsd:date ; + ns1:end "2006-04-26"^^xsd:date ; + ns1:institution ; + ns1:role . + +lp:pffeee7811d1a25e86cc33849665eeb484 a ns1:PoliticalFunction ; + ns1:beginning "1989-07-25"^^xsd:date ; + ns1:end "1994-04-20"^^xsd:date ; + ns1:institution ; + ns1:role . 
+ + a lpv:EUParty ; + rdfs:label "Group for the Technical Coordination and Defence of Indipendent Groups and Members", + "TGI", + "Technical Coordination and Defence of Independent Groups and Members", + "Technical Group of Independent Members - mixed group" ; + lpv:acronym "TGI" ; + lpv:featuredRoleDescriptions "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Chair", + "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Member", + "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Member of the Bureau", + "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Vice-Chair", + "Technical Coordination and Defence of Independent Groups and Members - Member", + "Technical Coordination and Defence of Independent Groups and Members - Member of the Bureau", + "Technical Group of Independent Members - mixed group - Co-Chair", + "Technical Group of Independent Members - mixed group - Member", + "Technical Group of Independent Members - mixed group - Member of the Bureau", + "Technical Group of Independent Members - mixed group - Treasurer" . + + a lpv:EUParty ; + rdfs:label "ECR", + "European Conservatives and Reformists Group" ; + lpv:acronym "ECR" ; + lpv:featuredRoleDescriptions "European Conservatives and Reformists Group -", + "European Conservatives and Reformists Group - Chair", + "European Conservatives and Reformists Group - Co-treasurer", + "European Conservatives and Reformists Group - Member", + "European Conservatives and Reformists Group - Member of the Bureau", + "European Conservatives and Reformists Group - Treasurer", + "European Conservatives and Reformists Group - Vice-Chair" . + + a lpv:EUParty ; + rdfs:label "NA", + "Non-attached", + "Non-attached Members" ; + lpv:acronym "NA" ; + lpv:featuredRoleDescriptions "Non-attached -", + "Non-attached - Member", + "Non-attached Members -", + "Non-attached Members - Member" . 
+ diff --git a/backend/corpora/parliament/tests/test_import.py b/backend/corpora/parliament/tests/test_import.py index e41e03de5..3eacb714e 100644 --- a/backend/corpora/parliament/tests/test_import.py +++ b/backend/corpora/parliament/tests/test_import.py @@ -1,616 +1,10 @@ import os import warnings import pytest -from datetime import datetime +from addcorpus.python_corpora.corpus import CorpusDefinition from addcorpus.python_corpora.load_corpus import load_corpus_definition - -CORPUS_TEST_DATA = [ - { - 'name': 'parliament-canada', - 'docs': [ - { - 'date': '2015-02-02', - 'country': 'Canada', - 'debate_title': 'Government Orders', - 'debate_id': 'ca.proc.d.2015-02-02', - 'chamber': 'House of Commons', - 'party': 'New Democratic Party', - 'role': 'Interjection', - 'speaker': 'Jack Harris', - 'speaker_id': 'c846297d-8bc7-4e69-b6eb-31d0e19f7ec1', - 'speaker_constituency': 'St. John\'s East', - 'speech': 'Mr. Speaker, I suppose I could ask the member for Nanaimo—Alberni why the Government of Canada would put $280 million into last year\'s budget if it was intended to compensate for something that would happen in 2020.', - 'id': 'ca.proc.d.2015-02-02.16582.214', - 'topic': 'Business of Supply', - 'subtopic': 'Opposition Motion—Newfoundland and Labrador Fisheries Investment Fund', - }], - 'n_documents': 3 - }, - { - 'name': 'parliament-france', - 'docs': [ - { - "book_id" : "37531030876685 37531030876685/1/58 37531030876685_1_58_7", - "chamber" : "Assemblee Nationale", - "country" : "France", - "date" : "1881-01-11", - "date_is_estimate" : False, - "debate_id" : "1881-01-11", - "debate_type" : None, - "era" : "3Rd Republic", - "legislature" : None, - "page" : "7", - "page_source" : "X0000007.xml", - "sequence" : "0", - "speech" : """SOMMAIRE - -Constitution du bureau provisoire. - -Excuses. — Demande de congé. - -Communication par M. le président de deux lettres par lesquelles MM. Lou;s Legrand et Drumel déclinent toute candidature aux fonctions de secrétaire. 
- -Tirage au sort des bureaux. - -Fixation de l'ordre du jour : MM. Georges Perin, de Colbert-Laplace, Guichard, Janvier de La Motte (Eure). — Demande de renvoi au 20 janvier de la prochaine séance : M. Laroche-Joubert. Adoption. - -PRÉSIDBNCE DE M. DESSEAUX, DOYEN D'AGE La séance est ouverte à deux heures un quart. - -M. le président. Aux termes de l'article 1er de la loi constitutionnelle du 16 juillet 1875, je déclare ouverte la session ordinaire de la Chambre des députés pour 1881. - -J'invite lts six membres les plus jeunes de ''Assemblée à vouloir bien répondre à 'l'appel de leur nom pour prendre place au bureau en qualité de secrétaires provisoires. - -(L'appel des noms des députés les plus jeunes est fait par un huissier.) - -Sont successivement appelés : MM. Georges de Cassagnac, né le 17 févrièr 1855; Adrien Bastii, né Je 1er octobre 1853; Jules André, né le 23 août 1852 ; René Gautier, né le '25 avril 1852 ; Emile Réaux, né le 20 juin 1851 ; Le Provost de Launay fils, né le 8 juin 1850; René Eschasseriaux, né le 1. 1 mai 1850; Louis Janvier de La Motte, né le 23 août 1849; Lanauve, né le 24 mai 1849; Dreyfus, né le 5 mai 1849 ; Marcellin Pellet, né le 4 mars 1849 ; De Loqueyssip, né le 1er octobre 1848; Le comte de Breteuil, né le 17 septembre 1848; Roy de Loulay, né le 8 août 1848; D3 La Porte, né le 20 juin 1848 ; Thomson, né le 21 janvier 1848. - -MM Georges de Cassagnac, Adrien Bstid, limile Réaux, Dreyfus, de Loqueyssie et Thomson répondent à l'appel de leurs noms et prennent placn au bureau. - -M. le président. Le bureau probatoire est constitué. - -MM. Fourot, de Douville-Maillefeu et Laisant s'excusent de ne pouvoir assister à la séance de ce jour. - -M. Laumond demande un congé de vingt jours. - -La demande sera renvoyéa à la commission des congés. - -J'ai reçu de M. Louis Legrand la lettre suivante, dont je donne connaissance à la Chambré : « Valenciennes, 9 janvier 1881. 
- -c Monsieur le président, « Je vous prie de vouloir bien annoncer à mes collègues que je ne me représente pas à leurs suffrages pour les fonctions de secrétaile. - -« je saisis cette occasion pour remercier la Chambre de l'honneur qu'elle m'a fait en me choisissant comme l'un des membres de son bureau. - -« Agréez, monsieur je président, i assurance de ma haute considération. - -c Lotis LEGRAND, « Député du Nord. » - -J'ai reçu également de M. Drumel la lettre suivante: - -« Neuvizy (Ardennes', 10 janvier 1881. - -c Monsieur le président, « Depuis deux ans, 1* Chambre m'a fait l'honneur de m'appeler à siéger, comme secrétaire, dans son bureau. Je lui en suis profondément reconnaissant; et, en la priant de charger un autre de ses membres des fonctions que je tenais de sa confiance, je lui exprime ma vive gratitude pour les témoignages d'estime et de sympathie qu'à différentes reprises elle a bien voulu me donner. - -c Veuillez croire, monsieur le président, à mes sentiments respectueux et dévoués. - -« DRUMEL. » - -M. le président. L'ordre du jour appelle le tirage au sort des bureaux. - -Il va y être procédé. - -(Il est procédé au tirage au sort des bureaux dans les formes réglementaires.) M. le président. Messieurs, il y aurait lieu de procéder maintenant à la fixation de l'ordre du jour, mais je crois devoir faire remarquer à la Chambre qu'elle n'est pas en très-grand nombre. (81! si! à droite, — Non! - -non ! sur un grand nombre de bancs à gauche.) M. Clémenceau. Il n'est pas nécessaire que la Chambre soit en très-grand nombre, il suffit qu'elle soit en nombre. - -M. le président. Je n'ai pas dit que U Chambre n'était pas en nombre, j'ai dit qu'elld n'était pas en très-grand nombre. - -M. Haentjens. "Etm n'a jamais été aussi nombreuse à une première séance ! - -M. le président. La date de l'ouverture dela session, qui est fixée par la loi constitutionnelle, se place cette année entre les deux scrutins relatifs aux élections municipales. - -A droite. 
Qu'est-ce que cela fait? - -M. le président beaucoup de nos collègues som encore retenus dans leurs d^oL^rtements. - -A droite. Mais non ! mais non ! - -M. Laroche Joubert Il ne fallait pas nous convoquer alors ! (Interruptions diverses à droite et sur plusieurs bancs à gauche.) M. de Baudry-d'Asson. N.Jus sommes revenus exprès pour procéder à la nomination du bureau ; nous demandons que le bureau soit nommé aujourd'hui!M. le président. Il a paru à beaucoup d'entre vous que l'élection du bureau définitif doit se faire par le plus grand nombre possible de membres. (Interruptions à dro te et sur quelques bancs à gauche.) Je soumets donc à la Chambre la proposi. - -tion de s'ajourner. (Bruyantes exclamations à droite.) Sur divers bancs à droite el à l'extrême oauchu. Non 1 non l Sur un grand nombre d'autres bancs. Mais si 1 c'est nécessaire 1 M. Georges Perin. Je demande la parole. - -M. Laroche-Joubert. Je demande la pa. - -role. - -M le comte de Colbert-Laplace. Je demande la parole. - -M. le président. La parole est à M. - -Perin. - -M. Georges Perin. Messieurs, je viens, au nom d'un certain nombre de mes amis et en mon nom personnel, demander à la Chambre de repousser la proposition d'ajournement qui vient d'être faLe pac noire honorable président. (Très bien ! très bien ! à droite et à l'extrême gauche.) 
Autant qu'il m'a été permis de l'entendre au milieu du bruit, je crois que la seule raison que notre honorable président ait fait valoir 7our justifier sa proposition, c'est que nous n enous pas en nombre.""", - "id" : "3rd_republic_0", - "url" : "http://gallica.bnf.fr/ark:/12148/bpt6k64418203", - "url_html": None - }], - 'n_documents': 5 - }, - { - 'name': 'parliament-germany-new', - 'docs': [ - { - 'country': 'Germany', - 'chamber': 'Bundestag', - 'date': '1949-09-22', - 'debate_id': '7', - 'speaker': 'Gebhard Seelos', - 'speaker_id': '11002141', - 'speaker_aristocracy': None, - 'speaker_academic_title': 'Dr.', - 'speaker_birth_country': 'Deutschland', - 'speaker_birthplace': 'München', - 'speaker_birth_year': 1901, - 'speaker_death_year': 1984, - 'speaker_gender': 'male', - 'speaker_profession': 'Dipl.-Volkswirt, Jurist, Diplomat, Staatsrat a. D.', - 'role': 'Member of Parliament', - 'role_long': None, - 'party': 'BP', - 'party_full': 'Bayernpartei', - 'party_id': '2', - 'speech': 'Baracken sind etwas Vorübergehendes; sie halten aber immer länger, als eigentlich geplant.', - 'id': '94', - 'url': 'https://dip21.bundestag.de/dip21/btp/01/01007.pdf', - 'sequence': '94' - }], - 'n_documents': 2 - }, - { - 'name': 'parliament-germany-old', - 'docs': [ - { - 'country': 'Germany', - 'book_id': 'bsb00000436', - 'book_label': '1867/70,1 ( Protokolle mit Sach- und Sprechregister )', - 'era': 'Reichstag (Norddeutscher Bund/Zollparlamente) 1867 - 1895 Norddeutscher Bund', - 'date': '1867-02-25', - 'date_is_estimate': 'true', - 'page': '27', - 'url': 'https://api.digitale-sammlungen.de/iiif/image/v2/bsb00000436_00027/full/full/0/default.jpg', - 'speech': "Nach vorangegangenem Gottesdienste in der Königlichen Schloßcapelle und der St. Hedwigskirche versammelten sich Heute- Nachmittags 11 Uhr die durch Allerhöchstes Patent vom 13. d. M. einberufenen Mitglieder des Reichstages des Norddeutschen Bundes im Weißen Saale des Königlichen Schlosses. 
Bald daraus traten die Reichstags-Commifsarien ein. Nachdem dieselben links vom Throne sich ausgestellt und die Versammlung sich -geordnet hatte, machte der Vorsitzende der Reichstags-Commissarien, Gras von Bismarck, Seiner Majestät dem Könige davon Meldung. Allerhöchst dieselben begaben Sich daraus in Begleitung Ihrer Königlichen Hoheiten des Kronprinzen und der Prinzen des Königlichen Hauses in dem nach dem Programm geordneten Zuge, unter 'Vortragung der Reichs-Insignien, nach dem Weißen Saale und nahmen, mit einem lebhaften dreimaligen Hoch, welches der Wirkliche Geheime Rath von Frankenberg ausbrachte, von der Versammlung empfangen, auf dem Throne Platz, während Seine Königliche Hoheit der Kronprinz guf der mittleren Stufe desselben, Ihre Königlichen Hoheiten die Prinzen des Königlichen Hauses zur Rechten des Thrones sich aufstellten. Seine Majestät der König verlasen hierauf, das Haupt mit dem Helme bedeckt, die nachfolgende Rede:", - 'id': '0', - }], - 'n_documents': 1 - }, - { - 'name': 'parliament-netherlands', - 'docs': [ - { - 'country': 'Netherlands', - 'date': '2000-01-18', - 'chamber': 'Eerste Kamer', - 'debate_title': 'Presentie en opening (dinsdag 18 januari 2000)', - 'debate_id': 'nl.proc.ob.d.h-ek-19992000-493-493', - 'topic': 'Presentie en opening', - 'speech': '\n'.join([ - 'Ik deel aan de Kamer mede, dat zijn ingekomen berichten van verhindering van de leden:', - 'Kohnstamm, wegens ziekte;', - 'Boorsma, wegens verblijf buitenslands.', - ]), - 'id': 'nl.proc.ob.d.h-ek-19992000-493-493.1.5.1', - 'source_archive': 'PoliticalMashup', - 'speaker': 'De voorzitter Jurgens', - 'speaker_id': 'nl.m.01992', - 'speaker_gender': None, - 'role': 'Chair', - 'party': None, - 'party_id': None, - 'party_full': None, - 'page': '493', - 'url': 'https://zoek.officielebekendmakingen.nl/h-ek-19992000-493-493.pdf', - 'sequence': 1, - } - ], - 'n_documents': 4, - 'end': datetime(2015, 1, 1), - }, - { - 'name': 'parliament-netherlands', - 'docs': [ - { - 'country': 
'Netherlands', - 'date': '2017-01-31', - 'chamber': 'Tweede Kamer', - 'debate_title': 'Report of the meeting of the Dutch Lower House, Meeting 46, Session 23 (2017-01-31)', - 'debate_id': 'ParlaMint-NL_2017-01-31-tweedekamer-23', - 'topic': 'Rapport "Welvaart in kaart"', - 'speech': 'Ik heet de minister van Economische Zaken van harte welkom.', - 'id': 'ParlaMint-NL_2017-01-31-tweedekamer-23.u1', - 'speaker': 'Khadija Arib', - 'speaker_id': '#KhadijaArib', - 'speaker_gender': 'vrouw', - 'role': 'Chair', - 'party': 'PvdA', - 'party_id': '#party.PvdA', - 'party_full': 'Partij van de Arbeid', - 'page': None, - 'url': None, - 'sequence': 1, - } - ], - 'n_documents': 98, - 'start': datetime(2015, 1, 1), - }, - { - 'name': 'parliament-uk', - 'docs': [ - { - 'country': 'United Kingdom', - 'date': '1872-02-06', - 'chamber': 'House of Commons', - 'debate_title': 'New Writs During The Recess', - 'debate_id': None, - 'speech': "acquainted the House, —that he had issued Warrants for New Writs, for Truro, v. Hon. John Cranch Walker Vivian, Under Secretary to the Eight hon. Edward Cardwell; for Plymouth, Sir Robert Porrett Collier, knight, one of the Justices of the Court of Common Pleas; Dover, George Jessel, esquire, Solicitor General; York County (West Riding, Northern Division), Sir Francis Crossley, baronet, deceased; Limerick City, Francis William Russell, esquire, deceased; Galway County, Eight hon. William Henry Gregory, Governor and Commander in Chief of the Island of Ceylon and its dependencies; Kerry, Eight hon. Valentine Augustus Browne, commonly called Viscount Castlerosse, now Earl of Kenmare.", - 'id': 'guldi_c19_365565', - 'speaker': 'Mr. 
Speaker', - 'speaker_id': None, - 'speech_type': None, - 'topic': None, - 'subtopic': None, - 'sequence': '365565' - }, - { - 'country': 'United Kingdom', - 'date': '2020-01-14', - 'chamber': 'House of Commons', - 'debate_title': 'House Of Commons Debate On 14/01/2020', - 'debate_id': 'debates2020-01-14c', - 'speech': "What steps his Department is taking to ensure that legal aid is accessible to people who need it.", - 'id': 'uk.org.publicwhip/debate/2020-01-14c.865.4', - 'speaker': 'Sarah Dines', - 'speaker_id': 'uk.org.publicwhip/person/25877', - 'speech_type': 'Start Question', - 'topic': 'The Secretary of State was asked—', - 'subtopic': 'Legal Aid Access', - 'sequence': '0' - }], - 'n_documents': 2 - }, - { - 'name': 'parliament-sweden', - 'docs': [ - { - 'date': '2021-09-14', - 'date_is_estimate': None, - 'chamber': 'Riksdag', - 'country': 'Sweden', - 'speech': 'Ärade ledamöter! Varmt välkomna tillbaka till riksdagen! Det känns stort att få välkomna er här på tröskeln till det fjärde riksmötet den här mandatperioden. Vi har ännu ett mycket speciellt arbetsår bakom oss, till stor del präglat av pandemin. Även om vi visste att det inte var helt över för ett år sedan tror jag att vi var många som hoppades att en tydligare vändning var på väg. Så blev det inte. I stället fick vi ytterligare ett riksdagsår med ett reducerat antal ledamöter vid voteringar och utskottsarbete till stor del på distans. Men förhoppningsvis börjar vi nu gå tillbaka mot mer normala arbetsformer. Ett tydligt tecken på detta är att alla 349 ledamöter kommer att vara med vid riksmötets öppnande i eftermiddag. Jag tycker att det är angeläget att riksdagens och regeringens alla ledamöter kan vara på plats vid denna högtidliga och viktiga ceremoni, särskilt som detta är det sista öppnandet under den här mandatperioden. 
Däremot genomförs inget upprop nu på förmiddagen, och vi vidtar den försiktighetsåtgärden att drygt en tredjedel av ledamöterna och statsråden får sitta på läktaren under ceremonin. Formerna beslutades av mig efter diskussion med gruppledarna och de vice talmännen redan i början av augusti, alltså långt innan det blev bestämt att alla ledamöter får delta i voteringar efter riksmötets öppnande. Jag såg inget skäl att med kort varsel börja ändra i planeringen för riksmötets öppnande, så just denna speciella dag får inte alla ledamöter sitta nere på golvet här i kammaren . M en från och med riksmötets första votering sitter var och en på sin plats och röstar igen på vanligt sätt. Även om pandemin inte är över är situationen i Sverige ändå en helt annan nu än för ett år sedan. Därför har vi – talmanspresidiet och gruppledarna – gjort bedömningen att det är möjligt att samla fler personer än förra året men ändå långt färre än ett vanligt år. Vi har försökt finna en så god balans som möjligt mellan nödvändiga säkerhetsåtgärder, riksdagsordningens bestämmelser och respekt för traditionen. Den sedvanliga mottagningen i Sammanbindningsbanan är som bekant inställd, och det genomförs heller inte någon konsert i Konserthuset. Jag är glad över att vi också kommer att få hjälp att minnas dessa föregångare och förebilder genom att de får en permanent plats på Riksplan i form av en staty. Här tillkommer det att det i trapphallen i Östra riksdagshuset kommer att invigas en tavla som föreställer de här fem pionjärerna. Statyn dröjer ett tag – den kommer att invigas nästa år – men redan i kväll vill riksdagen på dagen för riksmötets öppnande, denna demokratins högtidsdag, uppmärksamma demokratijubileet med att lysa upp Stockholmsnatten med ett ljusspel. Jag kommer att tända en fasadbelysning på Östra riksdagshuset vid en webbsänd ceremoni klockan 20. Ljusspelet kan sedan ses varje kväll till och med den 20 september. Men demokratifirandet tar inte slut där. 
Vad passar väl bättre på FN:s demokratidag den 15 september än att fira med ett seminarium? I morgon anordnar riksdag och regering seminariet 100 år av demokrati – vilka lärdomar tar vi med oss? Se det gärna på riksdagen.se! Efter riksmötets öppnande tror jag att vi alla ser fram emot ett nytt arbetsår i riksdagen under något mer normala former. Jag har ju, som ni alla vet, tillsammans med gruppledarna slutit en ny överenskommelse om arbetsformerna under hösten, och gruppledarna har också beslutat att inte förlänga överenskommelsen om 55 närvarande ledamöter vid voteringar. Alla ledamöter kan alltså delta vid voteringarna, men vi behåller möjligheten att delta på distans vid utskottens sammanträden. Varje utskott avgör när det är motiverat att hålla fysiska sammanträden, och när man deltar fysiskt planerar vi för att det ska gå att hålla avstånd. Vi ska däremot fortsätta hjälpas åt att hålla antalet externa besök i riksdagens hus nere. Externa åhörare vid olika arrangemang bör undvikas liksom guidade visningar och mingelsituationer. Pandemin är inte över. Vi fortsätter att anpassa verksamheten när och om det behövs, men förhoppningsvis går vi mot ett mer normalt läge. Ärade ledamöter! Det här har varit en mandatperiod som ingen annan. Jag tror inte att någon hade kunnat förutse de många olika, oväntade och delvis dramatiska händelser som har inträffat. Jag tänker naturligtvis i första hand på pandemin och alla dess konsekvenser men även på de två regeringsbildningarna. Och då är det ändå ett helt år kvar av mandatperio ­ den. Jag tror att vi alla kan se fram emot ännu ett händelserikt och spännan ­ de riksdagsår fram till valet. Vi vet i alla fall att det i början av november blir den tredje regeringsbildningen under den här mandatperioden. Oavsett hur man ser på det politiska läget vill jag framhålla, apropå just demokratijubileet, att regeringsbildningarna inte har inneburit någon kris för demokratin. Svensk demokrati står stark, och den är värd att fira. 
Alla aktörer har i regeringsbildningsprocesserna använt de olika verktyg som finns i den demokratiska, parlamentariska verktygslådan. Misstroendeomröstning, beslut att inte utlysa extraval och talmansrundor – allt sådant följer av de lagar som vi har skapat för vår demokrati. Skeendet må vara turbulent i vissa stycken, men det följer demokratins spelregler. Ärade ledamöter! Jag vill avsluta med några rader ut dikten Sommaren i Sverige av Werner Aspenström. Den skildrar på ett fint sätt vemodet och skönheten när sommaren går mot sitt slut. Då landar på min hand den förgänglighetens tanke som vi kallar trollslända. Ett gult löv lösgör sig och faller klingande mot marken. Sommaren måste hastigt bärgas. … Ty hösten närmar sig med toppeld i asparna. Låt mig nu önska er en fin höst och ett produktivt arbetsår. På återseende här i kammaren klockan 14! Stockholms kommun Stockholms län Södermanlands län Jönköpings län Kronobergs län Blekinge län Hallands län Göteborgs kommun Värmlands län Jämtlands län Norrbottens län EU-dokument Åttaveckorsfristen för att avge ett motiverat yttrande skulle gå ut den 5 november . 
EU-dokument Följande frågor för skriftliga svar hade framställts: 2020/21:3636 Amorteringskravet och ojämställd bostadsmarknad 2020/21:3637 Den kinesiske ambassadörens agerande 2020/21:3638 Vaccin 2020/21:3639 Lukasjenkos tillgång till 1 miljard dollar från IMF 2020/21:3640 Markering mot Irans idrottsminister 2020/21:3642 Kriminalitet på bostadsmarknaden Skriftliga svar på följande frågor hade kommit in: 2020/21:3535 Barns rätt till säkerställda skyddade boenden 2020/21:3537 Elbrist som hotar investeringar i Sverige 2020/21:3538 Åtgärder för att trygga boende', - 'sequence': '0', - 'id': 'i-2a00eff84ce04676-0', - 'speaker': 'Andreas Norlén', - 'speaker_gender': 'man', - 'role': 'Sveriges riksdags talman', - 'ministerial_role': None, - 'party': None, - 'speaker_birth_year': 1973, - 'speaker_death_year': None, - 'speaker_constituency': None, - 'speaker_id': 'Q4755577' - }, - ], - 'n_documents': 5, - }, - { - 'name': 'parliament-sweden-old', - 'docs': [{}] * 5 + [ - { - 'book_id': 'bn_1828-30_1__01', - 'book_label': 'Hederwärda bonde-ståndets protokoller wid lagtima riksdagen i Stockholm åren 1828 och 1829. Första bandet.', - 'country': 'Sweden', - 'era': 'Ståndsriksdagen', - 'chamber': 'Bönder', - 'date_earliest': '1828-01-01', - 'date_latest': '1828-12-31', - 'speech': '''Hederwärdo - -Bonde-Ständcts - -Protokoller - -wid - -LagMa Riksdagen i Stockhol». - -Ä«tt 1828 och I82t, - -första Lander. - -STOCKHOLM, - -Kongl. Ordens-Böktryckeriet, I8Z9.''', - 'page': '0', - 'sequence': 1, - 'url': 'https://weburn.kb.se/riks/ståndsriksdagen/pdf/bn_1828-30_1_/bn_1828-30_1__01.pdf', - 'url_xml': 'https://weburn.kb.se/riks/ståndsriksdagen/xml/bn_1828-30_1_/bn_1828-30_1__01.xml', - } - ], - 'n_documents': 10 - }, - { - 'name': 'parliament-denmark', - 'docs': [ - { - 'speech': """6546 F. t. beslutn. vedr. udbetaling af sygedagpenge - -Beslutningsforslag nr. B 142. Fremsat den 3. 
juni 2008 af Thomas Adelskov (S), Lennart Damsbo-Andersen (S), - -Egil Andersen (SF), Margrethe Vestager (RV), Morten Østergaard (RV) og Line Barfod (EL) - -Forslag til folketingsbeslutning - -om ophævelse af varighedsbegrænsningen for udbetaling af sygedagpenge - -Folketinget pålægger regeringen at fremsætte lovforslag, som ophæver varighedsbegrænsnin- gen for udbetaling af sygedagpenge, således at - -lovforslaget kan træde i kraft den 1. januar 2009.""", - 'page': '546', - 'date_earliest': '2007-01-01', - 'date_latest': '2007-12-31', - 'book_label': 'Folketingstidende 2007/8 (2. samling) Tillæg A side 6001 - 6565', - 'book_id': '20072A6546', - 'id': '20072A6546_546', - 'chamber': 'Folketinget', - 'country': 'Denmark', - 'sequence': 546, - } - ], - 'n_documents': 5, - }, { - 'name': 'parliament-denmark-new', - 'docs': [ - { - 'country': 'Denmark', - 'id': '20100128100025', - 'date': '2010-01-28', - 'speech': 'Mødet er åbnet. I dag er der følgende anmeldelser: Kirkeministeren (Birthe Rønn Hornbech): Lovforslag nr. L 115 (Forslag til lov om ændring af lov om udnævnelse af biskopper og om stiftsbåndsløsning og forskellige andre love.) og L 116 (Forslag til lov om ændring af lov om begravelse og ligbrænding og lov om folkekirkens økonomi.) Beskæftigelsesministeren (Inger Støjberg): Lovforslag nr. L 117 (Forslag til lov om ændring af lov om sygedagpenge, lov om ret til orlov og dagpenge ved barsel, lov om aktiv socialpolitik og lov om arbejdsløshedsforsikring m.v. Transportministeren (Lars Barfoed): Lovforslag nr. L 118 (Forslag til lov om ændring af lov om taxikørsel m.v.) Videnskabsministeren (Helge Sander): Lovforslag nr. L 119 (Forslag til lov om ændring af universitetsloven.) Titler på de fremsatte forslag vil fremgå af www.folketingstidende.dk (jf. ovenfor). Mens vi får de sidste medlemmer ind i salen, kan jeg lige oplyse, at vi er vidende om, at der er problemer med, hvordan urene går på Christiansborg. 
Det er et lidt større problem end som så blot at justere urene, for det er hele styringssystemet – det styres af 23 V strøm – der gør, at der er problemer med overhovedet at styre urene. Nogle er slidt ned, så man skal ikke regne med tiden. Min opfordring er, at man bruger soluret og kun tæller de lyse timer. Munterhed Men det afgørende er altså, at vi er opmærksomme på det og gør, hvad vi overhovedet kan for at udskifte, hvor der skal udskiftes, og i øvrigt at få et system, så urene altid går korrekt. Jeg går nemlig ud fra, at de, der kommer for sent, her nu hvor vi skal stemme, udelukkende gør det, fordi urene går forkert.', - 'speaker': 'Thor Pedersen', - 'speaker_gender': 'Male', - 'speaker_birth_year': 1945, - 'role': 'formand', - 'party': 'Venstre', - 'topic': 'Punkt 0', - 'subject': 'other', - 'sequence': '100025', - } - ], - 'n_documents': 4, - }, - { - 'name': 'parliament-norway', - 'docs': [ - { - 'speech': """KONGERIKET NORGES 149. STORTINGS FORHANDLINGER 2004 - 2005 - -9. del - -INNEHOLDENDE REGISTER TIL FORHANDLINGER I STORTINGET OG DETS AVDELINGER - -OSLO LOBO MEDIA AS 2005""", - 'page': '2', - 'book_id': 'digistorting_2004_part9_vol-a', - 'book_label': 'Stortingsforhandlinger; 2004/2005 Vol. 149 Nr. 9', - 'date_earliest': '2004-01-01', - 'date_latest': '2004-12-31', - 'sequence': '2', - 'chamber': 'Stortinget', - 'country': 'Norway', - } - ], - 'n_documents': 5, - }, - { - 'name': 'parliament-norway-new', - 'docs': [ - {}, {}, {}, { - 'subject': 'Statsbudsjettet', - }, # skip a few introductory speeches to one with more metadata - { - 'country': 'Norway', - 'chamber': 'Stortinget', - 'date': '1998-10-20', - 'debate_title': 'Sak nr. 
2', - 'debate_type': 'interpellasjon', - 'party': 'Høyre', - 'party_id': 'H', - 'party_role': 'Opposition', - 'role': 'Representant', - 'speaker': 'Sonja Irene Sjøli', - 'speaker_id': 'SONS', - 'speaker_gender': 'kvinne', - 'speaker_birth_year': 1949, - 'speaker_death_year': None, - 'speaker_constituency': 'Akershus', - 'speech': 'Det er en bred forståelse blant fagfolk og politikere om at norsk sykehusvesen ikke bare lider under mangel på ressurser, men at det først og fremst er behov for organisatoriske og strukturelle forandringer. Offentlige utredninger om eierskap, organisering og ledelse i sykehus viser at det er behov for en rekke endringer for å nå målet om et bedre og mer tilgjengelig helsetilbud til befolkningen. Erkjennelsen av at vi har brukt gamle og lite hensiktsmessige virkemidler i helsepolitikken, har også nådd Regjeringen. Helseministeren uttalte til Dagens Næringsliv i sommer at det ville tjene pasientene hvis vi kunne være mer dristig i bruken av etterspørselsteknikker og private bidrag innenfor sykehussektoren. Denne uttalte dristighet ser jeg fram til med spenning. Stortinget har i de siste år, etter sterkt påtrykk fra Høyre, vedtatt innsatsbasert finansiering og fritt sykehusvalg. Den naturlige konsekvens av dette er at sykehusene organiserer seg annerledes enn før. Vi er langt fra alene om disse tankene. En rekke svenske fagforbund krever en ny modell for det svenske helsevesenet. Den svenske legeforening og det svenske sykepleierforbundet har gått sammen og krever at markedet i større grad må styre helsetilbudet. De mener at fylkeskommunen har utspilt sin rolle i styringen av helsesektoren og krever en total omlegging av helsevesenet. Det er mulig at Norge har sterkere økonomi og bedre skiløpere enn svenskene, men helsedebatten i Sverige har i den senere tid vært langt mer dynamisk og spennende enn hos oss. 
Tankene om at sykehus ikke nødvendigvis må være eid og drevet av det offentlige, vinner terreng i stadig flere land og er allerede utviklet i flere miljøer også her i Norge. Til og med Jan Grund, Norges fremste helseøkonom, professor på BI og en svoren sosialdemokrat, mener at flertallet av norske politikere befinner seg i skyttergravene i debatten om private helsetjenester. Problemet er ifølge Grund at det ikke er definert hvilke grunnleggende helsetjenester vi har krav på, og hvilke tjenester som kan tilbys oss som forbrukere og kunder. Derfor er det så vanskelig å håndtere diskusjonen om privat kontra offentlig helsetilbud. Han uttrykker sterk støtte til å få private aktører inn i det offentlige helsevesen. Stiftelsen SINTEF Unimed er utpekt av Næringsdepartementet og Helsedepartementet til å lede næringsutvikling i helsesektoren. Lederen Paul Hellandsvik mener det er på høy tid å tenke nytt og utradisjonelt om hvordan det offentlige kan dra nytte av private aktører, og at det gjelder å komme i gang med noen prøveprosjekter. Erfaringer fra Sverige og andre land viser at en modell for helsevesenet hvor det offentlige drar nytte av private aktører til utbygging og drift av sykehus, gir store økonomiske gevinster og høy kvalitet på tjenestene. Forutsetningen for modellen er at det offentlige finansierer tjenestene, og at de fordeles etter behov i befolkningen. Den svenske sosialdemokratiske helseminister velsigner dette arbeidet og mener at det frigjør ressurser til å behandle enda flere pasienter, og at det gir bedre kvalitet på tjenestene. Og det er iallfall fem gode grunner til at vi bør se nærmere på disse ideene. For det første: Avstanden mellom befolkningens etterspørsel etter helsetjenester og det helsevesenet har kapasitet til å tilby, er økende. Lange helsekøer taler sitt tydelige språk. For det andre: De ideologiske motforestillingene er gledelig nok i ferd med å avta både i Arbeiderpartiet og i det såkalte sentrum. 
Som helseminister Høybråten uttrykte det i Dagens Næringsliv tidligere i sommer: «Spørsmålet om å bruke etterspørselsteknikker er … ikke først og fremst en ideologisk problemstilling, men heller et spørsmål om hvor mye og på hvilken måte det er hensiktsmessig å bruke teknikken.» Stadig flere mennesker har fått erfaring med private legesentre og private klinikker. Folk har forstått at helsepersonell som jobber i det private, er like opptatt av pasientenes beste og kvaliteten på behandlingen som helsepersonell i de offentlige sykehus. Det som måtte være igjen av ideologiske begrunnelser her i Norge, har mistet sin kraft, ikke minst fordi folk ser med egne øyne at det ikke er grunn til å frykte private tilbud som et supplement – tvert imot. I tillegg har betalingsviljen for mindre omfattende behandlingstilbud økt. For det tredje: Det offentlige har gjennom mange år brukt gamle og lite hensiktsmessige virkemidler i helsepolitikken. Offentlig monopol, hierarkiske styringssystemer, spillet mellom forvaltningsnivåene og manglende fokusering på service og kvalitet i behandlingen har skapt tillitskrise i helsevesenet, og – det må jeg si – med berettigelse. Ikke minst er inntrykket av uklare roller og uklar ansvarsfordeling mellom aktørene i helsevesenet frustrerende for pasientene. For det fjerde: Den demografiske utviklingen i den vestlige verden. Vi lever lenger, og presset på helsevesenet vil øke betraktelig i årene fremover. Teknologiutviklingen er en femte faktor. Sykehusene har nå, med den rette teknologi og de moderne medisiner, mulighet til å behandle sykdommer bedre og derigjennom gi pasienter lengre levetid og bedre livskvalitet. Jeg har registrert gjennom media i sommer at helseministeren er skeptisk til å skille mellom tilbyder- og etterspørselsrollen i helsevesenet. Han frykter at for mange private sykehus vil kanalisere tjenester og arbeidskraft bort fra de offentlige sykehusene, og at det vil bli ulik tilgang til helsetjenester. 
Men dersom ansvaret for funksjonsfordelingen mellom sykehusene ligger hos staten gjennom godkjenning av de regionale helseplaner, vil det bestemme hva som tilbys hvor. En nasjonal helseplan, slik Høyre ønsker, ville vært et enda bedre redskap. Dersom det offentlige har ansvar for finansieringen av tjenestene til den enkelte pasient, vil det sikre lik tilgang til tjenestene. Hvis pengene kunne følge pasienten direkte til sykehusene, slik Høyre vil, og slik Kristelig Folkeparti ville i opposisjon, ville vi unngå at fylkeskommunen tar deler av bevilgningen på veien. Sykehusene får klare insentiver til å behandle flere pasienter, og vi sikrer at pasientene settes først. En modell hvor man lar det offentlige og private konkurrere om å utføre tjenestene, er også den modell som best vil sikre pasientene en sterkere posisjon i forhold til sykehusvesenet. Når de politiske prioriteringer i helsesektoren, funksjonsfordelingen mellom sykehusene, kontrollsystemer og den offentlige finansieringen er på plass, blir det etter Høyres syn mindre viktig hvem som eier og driver sykehusene. Unntaket er universitets- og regionsykehusene, som etter Høyres oppfatning er i en spesiell situasjon. Private kan godt eie og ha driftsansvar for bygningene. Men selve sykehusdriften må være i offentlig regi, slik at man har en tilfredsstillende og god kontroll med universitetsfunksjonene. Vi er inne i en tid med stadig større ubalanse mellom tilbud og etterspørsel. Derfor må vi forholde oss til virkeligheten. Det er snart ingen grenser for hvilke tjenester helsevesenet skal utføre. I denne situasjonen må vi styre slik at vi får mest mulig ut av ressursene. Det offentlige må konsentrere seg om å sikre de grunnleggende helsetjenestene og lage spilleregler for de private aktørene. De bør også få en mulighet til å utføre oppgaver det offentlige definerer som «grunnleggende helsetjenester», slik man gjør i Sverige. 
Men det må, som jeg har sagt tidligere, være en forutsetning at det offentlige skal betale tjenestene, og at kontrollmekanismene er gode, slik at tjenestene holder kvalitetsmessige mål. Det viktigste er likevel at vi gir sykehusene frihet i forhold til det tungrodde politiske system, slik at det blir mulig å lede sykehusene mer profesjonelt og prøve ut ulike selskapsformer, slik en nå ser ut til å få politisk flertall for her i Oslo. Som politikere bør vi heller være opptatt av å fristille de offentlige sykehusene enn å begrense de private. Et samarbeid mellom det offentlige og det private helsevesen har vi tro på. Etter Høyres mening gjelder det å få i gang noen prøveprosjekter, for uten det tror jeg ikke vi kommer videre. Hvordan ser helseministeren på dette, og vil han ta initiativ og stimulere til et slikt samarbeid?', - 'topic': 'om en modell for helsevesenet hvor det offentlige drar nytte av _private aktører til utbygging og drift av sykehus_', - 'sequence': '4', - 'id': 'tale000004', - 'ministerial_role': None, - 'legislature': 'Bondevik I', - 'subject': None, - 'language': 'Norwegian (Bokmål)', - 'debate_id': 'Saker-og-publikasjoner/Publikasjoner/Referater/Stortinget/1998-1999/981020/2/' - }, - {}, {}, - { - # test special case of ministers answering questions - 'ministerial_role': 'helseministeren', - 'speaker': 'Presidenten', - 'speaker_id': 'DH', - 'party': None, - 'party_role': None, - 'speech': "Representanten Sjøli nevnte et forslag. Betyr det at hun tar opp dette forslaget?" - } - ], - 'n_documents': 10, - }, - { - 'name': 'parliament-finland', - 'docs': [ - { - 'country': 'Finland', - 'speech': 'Täysistunto alkaa. Toivotan kaikki tervetulleiksi tänne Sibelius-taloon Sibeliuksen juhlavuotena aloittamaan vastuullista työtämme isänmaan hyväksi. 
Iältäni vanhimpana eduskunnan jäsenenä johdan puhetta tässä valtiopäivien ensimmäisessä täysistunnossa, kunnes eduskunta on työjärjestyksen 4 §:n mukaan valinnut puhemiehen ja kaksi varapuhemiestä ja nämä ovat antaneet eduskunnalle juhlallisen vakuutuksen. Plenum börjar. Som den riksdagsledamot som är äldst till åren är det min uppgift att föra ordet vid första plenum under riksmötet till dess att riksdagen enligt 4 § i riksdagens arbets-ordning inom sig valt talman och två vice talmän och dessa har avgett högtidlig försäkran inför riksdagen.', - 'speaker_id': 'Pertti_Salolainen', - 'speaker': 'Pertti Salolainen', - 'role': 'Ikäpuhemies', - 'party_id': '#party.KOK', - 'party': 'KOK', - 'party_role': 'Hallituspuolue', - 'speaker_gender': 'Male', - 'speaker_birth_year': 1940, - 'speech_type': 'PuhemiesPuheenvuoro', - 'id': '2015_1_1', - 'url': 'https://www.eduskunta.fi/FI/vaski/PoytakirjaAsiakohta/Sivut/PTK_1+2015+1.aspx', - 'sequence': '1', - 'topic': 'Nimenhuuto', - 'debate_id': 'ptk_1_2015', - 'debate_title': 'PTK 1/2015', - 'date': '2015-04-28', - }, - ], - 'n_documents': 22, - }, - { - 'name': 'parliament-finland-old', - 'docs': [ - { - 'country': 'Finland', - 'speech': """FÖUDT HOS - -FINLANDS RIDDERSKAP OCR ADEL - -VID - -LANDTDAGEN ÅR 1877. - -TREDJE HÄFTET. - -Från den 1 till den 31 Oktober. 
- -FINSKA LITTERATUR-SÄLLBKAPETS TRYCKERl, - -1878.""", - 'id': 'Adeln_Prot_1877_III.pdf_0', - 'speech_type': 'minutes', - 'chamber': 'nobility', - 'date_earliest': '1877-01-01', - 'date_latest': '1877-12-31', - 'page': '0', - 'language': 'swe', - 'source_archive': 'Adeln_Prot_1877_III.pdf' - }, - ], - 'n_documents': 4, - }, - { - 'name': 'parliament-ireland', - 'end': datetime(1999, 12, 31), - 'docs': [ - { - 'country': 'Ireland', - 'id': '1', - 'date': '1919-01-21', - 'speaker': 'Count George Noble, Count Plunkett', - 'speaker_id': '977', - 'speaker_constituency': 'Roscommon North', - 'party': 'Sinn Féin', - 'party_id': '22', - 'speech': 'Molaimse don Dáil Cathal Brugha, an Teachta ó Dhéisibh Phortláirge do bheith mar Cheann Comhairle againn indiu.', - 'topic': '1. CEANN COMHAIRLE I gCOIR AN LAE.', - 'chamber': 'Dáil', - 'sequence': 1, - 'source_archive': '1919-2013', - 'url': None, - 'ministerial_role': None, - 'role': None, - 'debate_type': None, - 'committee': None, - }, - ], - 'n_documents': 5, - }, - { - 'name': 'parliament-ireland', - 'start': datetime(2000, 1, 1), - 'end': datetime(2013, 12, 31), - 'docs': [ - { - 'country': 'Ireland', - 'id': '3088872', - 'date': '2000-01-26', - 'speaker': 'Mr. Ruairí Quinn', - 'speaker_id': '985', - 'speaker_constituency': 'Dublin South-East', - 'party': 'The Labour Party', - 'party_id': '14', - 'speech': 'asked the Taoiseach if he will make a statement on his visit to South Africa and Lesotho.', - 'topic': 'Ceisteanna &ndash Questions. - Official Engagements.', - 'chamber': 'Dáil', - 'sequence': 3088872, - 'source_archive': '1919-2013', - 'url': None, - 'ministerial_role': None, - 'role': None, - 'debate_type': None, - 'committee': None, - }, - ] + - [ {} ] * 13 + # skip ahead to the first speech from a minister - [ - { - 'id': '3088886', - 'speaker_id': '5', - 'speaker': 'Mr. 
Bertie Ahern', - 'ministerial_role': 'Taoiseach, Minister for Foreign Affairs', - } - ] - , - 'n_documents': 15, - }, - { - 'name': 'parliament-ireland', - 'start': datetime(2014, 1, 1), - 'docs': [ - { - 'country': 'Ireland', - 'sequence': 1, - 'speaker_id': '#AndrewDoyle', - 'date': '2014-12-09', - 'topic': 'Vote 30 - Agriculture, Food and the Marine (Supplementary)', - 'speaker': 'Andrew Doyle', - 'chamber': 'Dáil', - 'url': 'https://data.oireachtas.ie/akn/ie/debateRecord/select_committee_on_agriculture_food_and_the_marine/2014-12-09/debate/mul@/main.xml', - 'source_archive': '2014-2020', - 'party': None, - 'party_id': None, - 'speaker_constituency': None, - 'role': 'Chair', - 'ministerial_role': None, - 'debate_type': 'committee', - 'committee': 'Select Committee on Agriculture, Food and the Marine', - 'id': 'debateRecord#select_committee_on_agriculture_food_and_the_marine#2014-12-09#debate#main#spk_1', - 'speech': '''As we have a quorum, we will commence in public session. All mobile phones should be switched off because they cause interference. I have apologies from Deputies Michael McNamara and Martin Heydon. This meeting has been convened to consider a Supplementary Estimate on Vote 30 - Agriculture, Food and the Marine, which was referred by the Dáil to the committee on 3 December with an instruction to report back to the Dáil not later than 11 December. -I welcome the Minister, Deputy Simon Coveney, and his officials. I thank them for the briefing material provided, which has been circulated to the members of the committee. 
I invite the Minister to make his opening statement.''' - }, { - 'speaker_id': '#SimonCoveney', - 'speaker': 'Simon Coveney', - 'role': None, - 'ministerial_role': 'Minister for Agriculture, Food and the Marine', - } - ], - 'n_documents': 25, - } -] +from corpora.parliament.conftest import CORPUS_TEST_DATA def corpus_test_name(corpus_spec): return corpus_spec['name'] @@ -645,7 +39,7 @@ def test_imports(parliament_corpora_settings, corpus_object): docs = get_documents(corpus, start, end) assert len(list(docs)) == corpus_object.get('n_documents') -def get_documents(corpus, start, end): +def get_documents(corpus: CorpusDefinition, start, end): sources = corpus.sources( start=start, end=end diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index 9365ba1b5..35dc4c651 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ -55,7 +55,7 @@ def committee(): description = 'Committee that held the debate', es_mapping = keyword_mapping(), search_filter = MultipleChoiceFilter( - description='Search only in debates from the selected chamber(s)', + description='Search only in debates from the selected committee(s)', ), visualizations = ['resultscount', 'termfrequency'] ) @@ -69,23 +69,23 @@ def country(): es_mapping=keyword_mapping(), ) -def date(): + +def date(min_date: datetime = MIN_DATE, max_date: datetime = MAX_DATE): "The date on which the debate took place." return FieldDefinition( - name='date', - display_name='Date', - description='The date on which the debate took place.', + name="date", + display_name="Date", + description="The date on which the debate took place.", es_mapping=date_mapping(), results_overview=True, search_filter=DateFilter( - MIN_DATE, - MAX_DATE, - description='Search only within this time range.' + min_date, max_date, description="Search only within this time range." 
), - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], csv_core=True, ) + def date_is_estimate(): """Wether the date field is an estimate. Boolean value.""" return FieldDefinition( diff --git a/backend/corpora/peaceportal/utils/field_defaults.py b/backend/corpora/peaceportal/utils/field_defaults.py index 1800e25eb..802d4260e 100644 --- a/backend/corpora/peaceportal/utils/field_defaults.py +++ b/backend/corpora/peaceportal/utils/field_defaults.py @@ -39,20 +39,20 @@ def url(): def year(min_year, max_year): return FieldDefinition( - name='year', - display_name='Year', - description='Year of origin of the inscription.', + name="year", + display_name="Year", + description="Year of origin of the inscription.", es_mapping=int_mapping(), search_filter=RangeFilter( - description='Restrict the years from which search results will be returned.', + description="Restrict the years from which search results will be returned.", lower=min_year, upper=max_year, ), csv_core=True, sortable=True, - visualization_type='term_frequency', - visualization_sort='key', - results_overview=True + visualizations=["resultscount"], + visualization_sort="key", + results_overview=True, ) @@ -70,7 +70,6 @@ def date(min_date, max_date): ) - def not_before(): return FieldDefinition( name='not_before', @@ -204,44 +203,41 @@ def sex(): def country(): return FieldDefinition( - name='country', - display_name='Country', - description='Country where the inscription was found.', + name="country", + display_name="Country", + description="Country where the inscription was found.", es_mapping=keyword_mapping(True), search_filter=MultipleChoiceFilter( - description='Search only within these countries.', - option_count=5 + description="Search only within these countries.", option_count=5 ), - visualization_type='term_frequency', - results_overview=True + visualizations=["resultscount"], + results_overview=True, ) def settlement(): return FieldDefinition( - 
name='settlement', - display_name='Settlement', - description='The settlement where the inscription was found.', + name="settlement", + display_name="Settlement", + description="The settlement where the inscription was found.", es_mapping=keyword_mapping(True), search_filter=MultipleChoiceFilter( - description='Search only within these settlements.', - option_count=29 + description="Search only within these settlements.", option_count=29 ), - visualization_type='term_frequency' + visualizations=["resultscount"], ) def region(): return FieldDefinition( - name='region', - display_name='Region', - description='The region where the inscription was found.', + name="region", + display_name="Region", + description="The region where the inscription was found.", es_mapping=keyword_mapping(True), search_filter=MultipleChoiceFilter( - description='Search only within these regions.', - option_count=29 + description="Search only within these regions.", option_count=29 ), - visualization_type='term_frequency' + visualizations=["resultscount"], ) @@ -256,15 +252,14 @@ def location_details(): def material(): return FieldDefinition( - name='material', - display_name='Material', - description='Type of material the inscription is written on.', + name="material", + display_name="Material", + description="Type of material the inscription is written on.", es_mapping=keyword_mapping(), search_filter=MultipleChoiceFilter( - description='Search only within these material types.', - option_count=39 + description="Search only within these material types.", option_count=39 ), - visualization_type='term_frequency' + visualization_type="resultscount", ) @@ -280,16 +275,15 @@ def material_details(): def language(): return FieldDefinition( - name='language', - display_name='Language', - description='Language of the inscription.', + name="language", + display_name="Language", + description="Language of the inscription.", es_mapping=keyword_mapping(), search_filter=MultipleChoiceFilter( - 
description='Search only within these languages.', - option_count=10 + description="Search only within these languages.", option_count=10 ), csv_core=True, - visualization_type='term_frequency' + visualizations=["resultscount"], ) diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index 24111c8a5..e6ab86e95 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -5,7 +5,7 @@ import logging logger = logging.getLogger(__name__) -from os.path import join, isfile, splitext +from os.path import join, isfile from datetime import datetime import re import openpyxl @@ -59,7 +59,8 @@ def sources(self, start=min_date, end=max_date): metadict['title'] = row[0] if row[1].startswith('['): date = row[1][1:-1] - else: date = row[1] + else: + date = row[1] metadict['date_full'] = date if date=='Date Unknown': metadict['date'] = None @@ -80,172 +81,172 @@ def sources(self, start=min_date, end=max_date): fields = [ FieldDefinition( - name='date', - display_name='Formatted Date', - description='Publication date, formatted from the full date', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Formatted Date", + description="Publication date, formatted from the full date", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, histogram=True, search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only articles with publication date in this range.' - ) + "Accept only articles with publication date in this range." 
+ ), ), - extractor=extract.Metadata('date'), + extractor=extract.Metadata("date"), csv_core=True, - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='date_pub', - display_name='Publication Date', - description='Publication date as full string, as found in source file', + name="date_pub", + display_name="Publication Date", + description="Publication date as full string, as found in source file", es_mapping=keyword_mapping(), results_overview=True, - extractor=extract.Metadata('date_full') + extractor=extract.Metadata("date_full"), ), FieldDefinition( - name='id', - display_name='ID', - description='Unique identifier of the entry.', + name="id", + display_name="ID", + description="Unique identifier of the entry.", es_mapping=keyword_mapping(), - extractor=extract.XML(attribute='id'), + extractor=extract.XML(attribute="id"), ), FieldDefinition( - name='issue', - display_name='Issue number', - description='Source issue number.', + name="issue", + display_name="Issue number", + description="Source issue number.", es_mapping=keyword_mapping(), results_overview=False, - extractor=extract.Metadata('issue_id'), + extractor=extract.Metadata("issue_id"), csv_core=False, ), FieldDefinition( - name='periodical', - display_name='Periodical name', + name="periodical", + display_name="Periodical name", histogram=True, results_overview=True, - es_mapping={'type': 'keyword'}, - description='Periodical name.', + es_mapping={"type": "keyword"}, + description="Periodical name.", search_filter=filters.MultipleChoiceFilter( - description='Search only within these periodicals.', - option_count=90 + description="Search only within these periodicals.", option_count=90 ), - extractor=extract.Metadata('title'), + extractor=extract.Metadata("title"), csv_core=True, - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='content', - 
display_name='Content', - display_type='text_content', - description='Text content.', - es_mapping=main_content_mapping(True, True, True, 'en'), + name="content", + display_name="Content", + display_type="text_content", + description="Text content.", + es_mapping=main_content_mapping(True, True, True, "en"), results_overview=True, - extractor=extract.XML(Tag('ocrText'), flatten=True), + extractor=extract.XML(Tag("ocrText"), flatten=True), search_field_core=True, - visualizations=["wordcloud"], - language='en', + visualizations=["wordcloud", "ngram"], + language="en", ), FieldDefinition( - name='ocr', - display_name='OCR confidence', - description='OCR confidence level.', - es_mapping={'type': 'float'}, - search_filter=filters.RangeFilter(0, 100, - description=( - 'Accept only articles for which the Opitical Character Recognition confidence ' - 'indicator is in this range.' - ) - ), + name="ocr", + display_name="OCR confidence", + description="OCR confidence level.", + es_mapping={"type": "float"}, + search_filter=filters.RangeFilter( + 0, + 100, + description=( + "Accept only articles for which the Opitical Character Recognition confidence " + "indicator is in this range." 
+ ), + ), extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('ocr'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("ocr"), ), - sortable=True + sortable=True, ), FieldDefinition( - name='title', - display_name='Article title', - description='Title of the article.', + name="title", + display_name="Article title", + description="Title of the article.", extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('ti'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("ti"), external_file=True, ), - visualizations=['wordcloud'] + visualizations=["wordcloud"], ), FieldDefinition( - name='start_column', - es_mapping={'type': 'keyword'}, - display_name='Starting column', - description='Which column the article starts in.', + name="start_column", + es_mapping={"type": "keyword"}, + display_name="Starting column", + description="Which column the article starts in.", extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('sc'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("sc"), external_file=True, - ) + ), ), FieldDefinition( - name='page_count', - display_name='Page count', - description='How many pages the article covers.', - es_mapping={'type': 'integer'}, + name="page_count", + display_name="Page count", + description="How many pages the article covers.", + es_mapping={"type": "integer"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('pc'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("pc"), external_file=True, - ) + ), ), FieldDefinition( - name='word_count', - display_name='Word count', - description='Number of words in the article.', - es_mapping={'type': 'integer'}, + name="word_count", + display_name="Word count", + description="Number of words in the article.", + es_mapping={"type": "integer"}, extractor=extract.XML( - lambda metadata: Tag('id', 
string=metadata['id']), - SiblingTag('wordCount'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("wordCount"), external_file=True, - ) + ), ), FieldDefinition( - name='category', + name="category", csv_core=True, - display_name='Category', - description='Article category.', - es_mapping={'type': 'keyword'}, + display_name="Category", + description="Article category.", + es_mapping={"type": "keyword"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), - SiblingTag('ct'), + lambda metadata: Tag("id", string=metadata["id"]), + SiblingTag("ct"), external_file=True, ), search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=26 + description="Accept only articles in these categories.", option_count=26 ), - visualizations=['resultscount', 'termfrequency'] + visualizations=["resultscount", "termfrequency"], ), FieldDefinition( - name='page_no', - display_name='Page number', - description='At which page the article starts.', - es_mapping={'type': 'integer'}, + name="page_no", + display_name="Page number", + description="At which page the article starts.", + es_mapping={"type": "integer"}, extractor=extract.XML( - lambda metadata: Tag('id', string=metadata['id']), + lambda metadata: Tag("id", string=metadata["id"]), ParentTag(2), - Tag('pa'), + Tag("pa"), external_file=True, - transform=lambda x: re.sub('[\[\]]', '', x) - ) + transform=lambda x: re.sub("[\[\]]", "", x), + ), ), FieldDefinition( - name='image_path', - display_name='Image path', - es_mapping={'type': 'keyword'}, - description='Path of scan.', - extractor=extract.Metadata('image_path'), + name="image_path", + display_name="Image path", + es_mapping={"type": "keyword"}, + description="Path of scan.", + extractor=extract.Metadata("image_path"), hidden=True, - downloadable=False + downloadable=False, ), ] diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py 
index fc46c2d39..683ae184f 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -36,7 +36,6 @@ def _rdf_description_extractor(tag: Tag, section='xml', **kwargs) -> extract.XML ) - class Rechtspraak(XMLCorpusDefinition): title = "Judicial system Netherlands" description = "Open data of (anonymised) court rulings of the Dutch judicial system" @@ -146,179 +145,173 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None fields = [ FieldDefinition( - name='id', - display_name='ID', - description='', + name="id", + display_name="ID", + description="", es_mapping=keyword_mapping(), - extractor=_rdf_description_extractor(Tag('dcterms:identifier')), + extractor=_rdf_description_extractor(Tag("dcterms:identifier")), csv_core=True, ), FieldDefinition( - name='has_content', - display_name='Has text content', - description='Document has available text content.', - es_mapping={'type': 'boolean'}, + name="has_content", + display_name="Has text content", + description="Document has available text content.", + es_mapping={"type": "boolean"}, extractor=extract.Backup( - extract.XML(Tag('uitspraak'), flatten=True), - extract.XML(Tag('conclusie'), flatten=True), + extract.XML(Tag("uitspraak"), flatten=True), + extract.XML(Tag("conclusie"), flatten=True), extract.Constant(False), - transform=bool + transform=bool, ), search_filter=filters.BooleanFilter( - true='has content', - false='does not have content', - description=( - 'Accept only articles that have available text content.' 
- ) + true="has content", + false="does not have content", + description=("Accept only articles that have available text content."), ), ), FieldDefinition( - name='year', - display_name='Year', - es_mapping={'type': 'integer'}, - extractor=extract.Metadata('year'), - search_filter=filters.RangeFilter(min_date.year, max_date.year) + name="year", + display_name="Year", + es_mapping={"type": "integer"}, + extractor=extract.Metadata("year"), + search_filter=filters.RangeFilter(min_date.year, max_date.year), ), FieldDefinition( - name='date', - display_name='Date', - extractor=_rdf_description_extractor(Tag('dcterms:date')), - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Date", + extractor=_rdf_description_extractor(Tag("dcterms:date")), + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, results_overview=True, csv_core=True, search_filter=filters.DateFilter( min_date, max_date, - description=( - 'Accept only rulings with date in this range.' - ) + description=("Accept only rulings with date in this range."), ), - ), FieldDefinition( - name='issued', - display_name='Publication Date', - extractor=_rdf_description_extractor(Tag('dcterms:issued')), - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="issued", + display_name="Publication Date", + extractor=_rdf_description_extractor(Tag("dcterms:issued")), + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only rulings with publication date in this range.' - ) + "Accept only rulings with publication date in this range." 
+ ), ), ), FieldDefinition( - name='publisher', - display_name='Publisher', - extractor=_rdf_description_extractor(Tag('dcterms:publisher')), - es_mapping={'type': 'keyword'}, - language='nl', + name="publisher", + display_name="Publisher", + extractor=_rdf_description_extractor(Tag("dcterms:publisher")), + es_mapping={"type": "keyword"}, + language="nl", ), FieldDefinition( - name='creator', - display_name='Court', - extractor=_rdf_description_extractor(Tag('dcterms:creator')), - es_mapping={'type': 'keyword'}, + name="creator", + display_name="Court", + extractor=_rdf_description_extractor(Tag("dcterms:creator")), + es_mapping={"type": "keyword"}, csv_core=True, results_overview=True, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings of selected courts.', - option_count=9999 + description="Accept only rulings of selected courts.", option_count=9999 ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='zaaknr', - display_name='Case Number', + name="zaaknr", + display_name="Case Number", es_mapping=keyword_mapping(), - extractor=_rdf_description_extractor(Tag('psi:zaaknummer')), + extractor=_rdf_description_extractor(Tag("psi:zaaknummer")), ), FieldDefinition( - name='type', - display_name='Type', - extractor=_rdf_description_extractor(Tag('dcterms:type')), - es_mapping={'type': 'keyword'}, + name="type", + display_name="Type", + extractor=_rdf_description_extractor(Tag("dcterms:type")), + es_mapping={"type": "keyword"}, csv_core=True, results_overview=True, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings of selected type.', - option_count=2 + description="Accept only rulings of selected type.", option_count=2 ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='procedure', - 
display_name='(type of) Procedure', - extractor=_rdf_description_extractor(Tag('psi:procedure')), + name="procedure", + display_name="(type of) Procedure", + extractor=_rdf_description_extractor(Tag("psi:procedure")), csv_core=True, - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings of selected procedure type.', - option_count=44 + description="Accept only rulings of selected procedure type.", + option_count=44, ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='spatial', - display_name='Location', + name="spatial", + display_name="Location", es_mapping=keyword_mapping(), - extractor=_rdf_description_extractor(Tag('dcterms:spatial')), - language='nl', + extractor=_rdf_description_extractor(Tag("dcterms:spatial")), + language="nl", ), FieldDefinition( - name='subject', - display_name='Area of law', - extractor=_rdf_description_extractor(Tag('dcterms:subject')), + name="subject", + display_name="Area of law", + extractor=_rdf_description_extractor(Tag("dcterms:subject")), csv_core=True, - es_mapping={'type': 'keyword'}, + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only rulings within this area of law.', - option_count=32 + description="Accept only rulings within this area of law.", + option_count=32, ), - visualizations=['resultscount', 'termfrequency'], - language='nl', + visualizations=["resultscount", "termfrequency"], + language="nl", ), FieldDefinition( - name='title', - display_name='Title', - extractor=_rdf_description_extractor( - Tag('dcterms:title'), section='html'), + name="title", + display_name="Title", + extractor=_rdf_description_extractor(Tag("dcterms:title"), section="html"), results_overview=True, search_field_core=True, - language='nl', + language="nl", ), FieldDefinition( - 
name='abstract', - display_name='Abstract', - extractor=extract.XML(Tag('inhoudsindicatie'), flatten=True), + name="abstract", + display_name="Abstract", + extractor=extract.XML(Tag("inhoudsindicatie"), flatten=True), results_overview=True, - language='nl', + language="nl", ), FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - es_mapping=main_content_mapping(True, True, True, 'nl'), + name="content", + display_name="Content", + display_type="text_content", + es_mapping=main_content_mapping(True, True, True, "nl"), extractor=extract.Backup( - extract.XML(Tag('uitspraak'), flatten=True), - extract.XML(Tag('conclusie'), flatten=True), - extract.Constant('Content not available') + extract.XML(Tag("uitspraak"), flatten=True), + extract.XML(Tag("conclusie"), flatten=True), + extract.Constant("Content not available"), ), csv_core=True, search_field_core=True, - language='nl', + language="nl", + visualizations=["ngram"], ), FieldDefinition( - name='url', - display_name='Source URL', - display_type='url', - description='URL of the case on rechtspraak.nl', + name="url", + display_name="Source URL", + display_type="url", + description="URL of the case on rechtspraak.nl", es_mapping=keyword_mapping(), extractor=_rdf_description_extractor( - Tag('dcterms:identifier'), section='html') - ) + Tag("dcterms:identifier"), section="html" + ), + ), ] diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index 5a0fbb954..65fbcbf09 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -40,6 +40,7 @@ class Times(XMLCorpusDefinition): citation_page = 'citation.md' languages = ['en'] category = 'periodical' + word_model_path = getattr(settings, "TIMES_WM", None) @property def es_settings(self): @@ -95,172 +96,151 @@ def sources(self, start=datetime.min, end=datetime.max): fields = [ FieldDefinition( - name='date', - display_name='Publication Date', - description='Publication date, parsed to 
yyyy-MM-dd format', - es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, + name="date", + display_name="Publication Date", + description="Publication date, parsed to yyyy-MM-dd format", + es_mapping={"type": "date", "format": "yyyy-MM-dd"}, hidden=True, - visualizations=['resultscount', 'termfrequency'], + visualizations=["resultscount", "termfrequency"], search_filter=filters.DateFilter( min_date, max_date, description=( - 'Accept only articles with publication date in this range.' - ) + "Accept only articles with publication date in this range." + ), + ), + extractor=extract.Metadata( + "date", transform=lambda x: x.strftime("%Y-%m-%d") ), - extractor=extract.Metadata('date', - transform=lambda x: x.strftime( - '%Y-%m-%d') - ) ), FieldDefinition( - name='source', - display_name='Source', - description='Library where the microfilm is sourced', + name="source", + display_name="Source", + description="Library where the microfilm is sourced", es_mapping=keyword_mapping(), extractor=extract.XML( - Tag('metadatainfo'), - Tag('sourceLibrary'), + Tag("metadatainfo"), + Tag("sourceLibrary"), toplevel=True, - applicable=after(1985) - ) + applicable=after(1985), + ), ), FieldDefinition( - name='edition', - display_name='Edition', + name="edition", + display_name="Edition", es_mapping=keyword_mapping(), extractor=extract.Choice( + extract.XML(Tag("ed"), toplevel=True, applicable=until(1985)), extract.XML( - Tag('ed'), - toplevel=True, - applicable=until(1985) + Tag("ed"), toplevel=True, multiple=True, applicable=after(1985) ), - extract.XML( - Tag('ed'), - toplevel=True, multiple=True, - applicable=after(1985) - ) ), - csv_core=True + csv_core=True, ), FieldDefinition( - name='issue', - display_name='Issue number', - es_mapping={'type': 'integer'}, - description='Source issue number.', + name="issue", + display_name="Issue number", + es_mapping={"type": "integer"}, + description="Source issue number.", extractor=extract.XML( - Tag('is'), + Tag("is"), toplevel=True, # 
Hardcoded to ignore one particular issue with source data - transform=lambda x: (62226 if x == "6222662226" else int(x)) + transform=lambda x: (62226 if x == "6222662226" else int(x)), ), sortable=True, - csv_core=True + csv_core=True, ), FieldDefinition( - name='volume', - display_name='Volume', - description='Volume number.', + name="volume", + display_name="Volume", + description="Volume number.", es_mapping=keyword_mapping(), - extractor=extract.XML( - Tag('volNum'), - toplevel=True, - applicable=after(1985) - ), - csv_core=True + extractor=extract.XML(Tag("volNum"), toplevel=True, applicable=after(1985)), + csv_core=True, ), FieldDefinition( - name='date-pub', - display_name='Publication Date', + name="date-pub", + display_name="Publication Date", es_mapping=keyword_mapping(), csv_core=True, results_overview=True, sortable=True, - description='Publication date as full string, as found in source file', - extractor=extract.XML( - Tag('da'), - toplevel=True - ) + description="Publication date as full string, as found in source file", + extractor=extract.XML(Tag("da"), toplevel=True), ), FieldDefinition( - name='ocr', - display_name='OCR confidence', - description='OCR confidence level.', - es_mapping={'type': 'float'}, - search_filter=filters.RangeFilter(0, 100, - description=( - 'Accept only articles for which the Opitical Character Recognition confidence ' - 'indicator is in this range.' - ) - ), - extractor=extract.XML(Tag('ocr'), transform=float), - sortable=True + name="ocr", + display_name="OCR confidence", + description="OCR confidence level.", + es_mapping={"type": "float"}, + search_filter=filters.RangeFilter( + 0, + 100, + description=( + "Accept only articles for which the Opitical Character Recognition confidence " + "indicator is in this range." 
+ ), + ), + extractor=extract.XML(Tag("ocr"), transform=float), + sortable=True, ), FieldDefinition( - name='date-end', - display_name='Ending date', + name="date-end", + display_name="Ending date", es_mapping=keyword_mapping(), description=( - 'Ending date of publication. ' - 'For issues that span more than 1 day.' + "Ending date of publication. " "For issues that span more than 1 day." ), - extractor=extract.XML( - Tag('tdate'), toplevel=True, - applicable=after(1985) - ) + extractor=extract.XML(Tag("tdate"), toplevel=True, applicable=after(1985)), ), FieldDefinition( - name='page-count', - display_name='Image count', - description='Page count: number of images present in the issue.', - es_mapping={'type': 'integer'}, - extractor=extract.XML( - Tag('ip'), toplevel=True, transform=int - ), - sortable=True + name="page-count", + display_name="Image count", + description="Page count: number of images present in the issue.", + es_mapping={"type": "integer"}, + extractor=extract.XML(Tag("ip"), toplevel=True, transform=int), + sortable=True, ), FieldDefinition( - name='page-type', - display_name='Page type', - description='Supplement in which article occurs.', - es_mapping={'type': 'keyword'}, + name="page-type", + display_name="Page type", + description="Supplement in which article occurs.", + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( description=( - 'Accept only articles that occur in the relevant ' - 'supplement. Only after 1985.' + "Accept only articles that occur in the relevant " + "supplement. Only after 1985." 
), - option_count=2 + option_count=2, ), extractor=extract.XML( - ParentTag(), - Tag('pageid'), - attribute='isPartOf', - applicable=after(1985) - ) + ParentTag(), Tag("pageid"), attribute="isPartOf", applicable=after(1985) + ), ), FieldDefinition( - name='supplement-title', - display_name='Supplement title', - description='Supplement title.', + name="supplement-title", + display_name="Supplement title", + description="Supplement title.", extractor=extract.XML( ParentTag(), - Tag('pageid'), - Tag('supptitle'), + Tag("pageid"), + Tag("supptitle"), multiple=True, - applicable=after(1985) + applicable=after(1985), ), ), FieldDefinition( - name='supplement-subtitle', - display_name='Supplement subtitle', - description='Supplement subtitle.', + name="supplement-subtitle", + display_name="Supplement subtitle", + description="Supplement subtitle.", extractor=extract.XML( ParentTag(), - Tag('pageid'), - Tag('suppsubtitle'), + Tag("pageid"), + Tag("suppsubtitle"), multiple=True, - applicable=after(1985) - ) + applicable=after(1985), + ), ), # There are no datapoints where this is True, hence the outcomment # FieldDefinition( @@ -283,183 +263,158 @@ def sources(self, start=datetime.min, end=datetime.max): # ) # ), FieldDefinition( - name='id', - display_name='ID', - description='Article identifier.', + name="id", + display_name="ID", + description="Article identifier.", es_mapping=keyword_mapping(), - extractor=extract.XML(Tag('id')) + extractor=extract.XML(Tag("id")), ), FieldDefinition( - name='ocr-relevant', - display_name='OCR relevant', - description='Whether OCR confidence level is relevant.', - es_mapping={'type': 'boolean'}, + name="ocr-relevant", + display_name="OCR relevant", + description="Whether OCR confidence level is relevant.", + es_mapping={"type": "boolean"}, extractor=extract.XML( - Tag('ocr'), attribute='relevant', + Tag("ocr"), + attribute="relevant", transform=string_contains("yes"), - ) + ), ), FieldDefinition( - name='column', - display_name='Column', 
+ name="column", + display_name="Column", description=( - 'Starting column: a string to label the column' - 'where article starts.' + "Starting column: a string to label the column" "where article starts." ), es_mapping=keyword_mapping(), - extractor=extract.XML(Tag('sc')) + extractor=extract.XML(Tag("sc")), ), FieldDefinition( - name='page', - display_name='Page', - description='Start page label, from source (1, 2, 17A, ...).', + name="page", + display_name="Page", + description="Start page label, from source (1, 2, 17A, ...).", es_mapping=keyword_mapping(), extractor=extract.Choice( - extract.XML(Tag('pa'), applicable=until(1985)), - extract.XML(ParentTag(), Tag('pa'), applicable=after(1985)) - ) + extract.XML(Tag("pa"), applicable=until(1985)), + extract.XML(ParentTag(), Tag("pa"), applicable=after(1985)), + ), ), FieldDefinition( - name='pages', - display_name='Page count', - es_mapping={'type': 'integer'}, + name="pages", + display_name="Page count", + es_mapping={"type": "integer"}, description=( - 'Page count: total number of pages containing sections ' - 'of the article.' + "Page count: total number of pages containing sections " + "of the article." 
), - extractor=extract.XML( - Tag('pc'), transform=int - ), - sortable=True + extractor=extract.XML(Tag("pc"), transform=int), + sortable=True, ), FieldDefinition( - name='title', - display_name='Title', + name="title", + display_name="Title", results_overview=True, search_field_core=True, - visualizations=['wordcloud'], - description='Article title.', - extractor=extract.XML(Tag('ti')) + visualizations=["wordcloud"], + description="Article title.", + extractor=extract.XML(Tag("ti")), ), FieldDefinition( - name='subtitle', - display_name='Subtitle', - description='Article subtitle.', - extractor=extract.XML(Tag('ta'), multiple=True), - search_field_core=True + name="subtitle", + display_name="Subtitle", + description="Article subtitle.", + extractor=extract.XML(Tag("ta"), multiple=True), + search_field_core=True, ), FieldDefinition( - name='subheader', - display_name='Subheader', - description='Article subheader (product dependent field).', + name="subheader", + display_name="Subheader", + description="Article subheader (product dependent field).", extractor=extract.XML( - Tag('subheader'), multiple=True, - applicable=after(1985) - ) + Tag("subheader"), multiple=True, applicable=after(1985) + ), ), FieldDefinition( - name='author', - display_name='Author', - description='Article author.', + name="author", + display_name="Author", + description="Article author.", es_mapping=keyword_mapping(True), extractor=extract.Choice( - extract.XML( - Tag('au'), multiple=True, - applicable=until(1985) - ), - extract.XML( - Tag('au_composed'), multiple=True, - applicable=after(1985) - ) + extract.XML(Tag("au"), multiple=True, applicable=until(1985)), + extract.XML(Tag("au_composed"), multiple=True, applicable=after(1985)), ), search_field_core=True, - csv_core=True + csv_core=True, ), FieldDefinition( - name='source-paper', - display_name='Source paper', - description='Credited as source.', + name="source-paper", + display_name="Source paper", + description="Credited as source.", 
es_mapping=keyword_mapping(True), - extractor=extract.XML( - Tag('altSource'), multiple=True - ) + extractor=extract.XML(Tag("altSource"), multiple=True), ), FieldDefinition( - name='category', - visualizations=['resultscount', 'termfrequency'], - display_name='Category', - description='Article subject categories.', - es_mapping={'type': 'keyword'}, + name="category", + visualizations=["resultscount", "termfrequency"], + display_name="Category", + description="Article subject categories.", + es_mapping={"type": "keyword"}, search_filter=filters.MultipleChoiceFilter( - description='Accept only articles in these categories.', - option_count=25 + description="Accept only articles in these categories.", option_count=25 ), - extractor=extract.XML(Tag('ct'), multiple=True), - csv_core=True + extractor=extract.XML(Tag("ct"), multiple=True), + csv_core=True, ), FieldDefinition( - name='illustration', - display_name='Illustration', - description=( - 'Tables and other illustrations associated with the article.' - ), - es_mapping={'type': 'keyword'}, - visualizations=['resultscount', 'termfrequency'], + name="illustration", + display_name="Illustration", + description=("Tables and other illustrations associated with the article."), + es_mapping={"type": "keyword"}, + visualizations=["resultscount", "termfrequency"], search_filter=filters.MultipleChoiceFilter( description=( - 'Accept only articles associated with these types ' - 'of illustrations.'), - option_count=7 + "Accept only articles associated with these types " + "of illustrations." 
+ ), + option_count=7, ), extractor=extract.Choice( + extract.XML(Tag("il"), multiple=True, applicable=until(1985)), extract.XML( - Tag('il'), multiple=True, - applicable=until(1985) + Tag("il"), attribute="type", multiple=True, applicable=after(1985) ), - extract.XML( - Tag('il'), attribute='type', multiple=True, - applicable=after(1985) - ) ), - csv_core=True + csv_core=True, ), FieldDefinition( - name='content-preamble', - display_name='Content preamble', - description='Raw OCR\'ed text (preamble).', - extractor=extract.XML( - Tag('text'), - Tag('text.preamble'), - flatten=True - ) + name="content-preamble", + display_name="Content preamble", + description="Raw OCR'ed text (preamble).", + extractor=extract.XML(Tag("text"), Tag("text.preamble"), flatten=True), ), FieldDefinition( - name='content-heading', - display_name='Content heading', - description='Raw OCR\'ed text (header).', - extractor=extract.XML( - Tag('text'), - Tag('text.title'), - flatten=True - ) + name="content-heading", + display_name="Content heading", + description="Raw OCR'ed text (header).", + extractor=extract.XML(Tag("text"), Tag("text.title"), flatten=True), ), FieldDefinition( - name='content', - display_name='Content', - display_type='text_content', - es_mapping=main_content_mapping(True, True, True, 'en'), - visualizations=['wordcloud'], - description='Raw OCR\'ed text (content).', + name="content", + display_name="Content", + display_type="text_content", + es_mapping=main_content_mapping(True, True, True, "en"), + visualizations=["wordcloud", "ngram"], + description="Raw OCR'ed text (content).", results_overview=True, search_field_core=True, extractor=extract.XML( - Tag('text'), - Tag('text.cr'), + Tag("text"), + Tag("text.cr"), multiple=True, flatten=True, - transform='\n'.join, + transform="\n".join, ), - language='en', + language="en", ), ] diff --git a/backend/corpora/troonredes/description/troonredes.md b/backend/corpora/troonredes/description/troonredes.md index 
65f35c747..39a624e14 100644 --- a/backend/corpora/troonredes/description/troonredes.md +++ b/backend/corpora/troonredes/description/troonredes.md @@ -3,3 +3,5 @@ Troonredes (throne speeches) are the speeches from the throne that formally mark Missing years: in 1940-1944 no speech was written. The transcripts are provided by [troonredes.nl](https://www.troonredes.nl). + +The transcripts were enriched with named entities using [the TextMiNER library](https://github.com/CentreForDigitalHumanities/TextMiNER). diff --git a/backend/es/es_alias.py b/backend/es/es_alias.py index 2d3e353bf..ad84470b5 100644 --- a/backend/es/es_alias.py +++ b/backend/es/es_alias.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import re -from addcorpus.models import Corpus +from addcorpus.models import Corpus, CorpusConfiguration from ianalyzer.elasticsearch import elasticsearch import logging @@ -51,6 +51,13 @@ def alias(corpus: Corpus, clean=False): logger.info('Done updating aliases') +def get_current_index_name(corpus: CorpusConfiguration, client) -> str: + """get the name of the current corpus' associated index""" + alias = corpus.es_alias or corpus.es_index + indices = client.indices.get(index="{}".format(alias)) + return max(sorted(indices.keys())) + + def get_new_version_number(client, alias, current_index=None): ''' Get version number for a new versioned index (e.g. `indexname-1`).
diff --git a/backend/es/es_index.py b/backend/es/es_index.py index d89fad177..efe20ff17 100644 --- a/backend/es/es_index.py +++ b/backend/es/es_index.py @@ -18,7 +18,7 @@ from addcorpus.python_corpora.load_corpus import load_corpus_definition from addcorpus.reader import make_reader from ianalyzer.elasticsearch import elasticsearch -from .es_alias import alias, get_new_version_number +from .es_alias import alias, get_current_index_name, get_new_version_number import datetime import logging @@ -50,7 +50,13 @@ def _make_es_mapping(corpus_configuration: CorpusConfiguration) -> Dict: } -def create(client: Elasticsearch, corpus: Corpus, add: bool = False, clear: bool = False, prod: bool = False): +def create( + client: Elasticsearch, + corpus: Corpus, + add: bool = False, + clear: bool = False, + prod: bool = False, +) -> str: ''' Initialise an ElasticSearch index. ''' @@ -59,8 +65,8 @@ def create(client: Elasticsearch, corpus: Corpus, add: bool = False, clear: bool es_mapping = _make_es_mapping(corpus_config) if add: - # we add document to existing index - skip creation. - return None + # we add document to existing index - skip creation, return current index + return get_current_index_name(corpus_config, client) if clear: logger.info('Attempting to clean old index...') @@ -93,6 +99,7 @@ def create(client: Elasticsearch, corpus: Corpus, add: bool = False, clear: bool settings=settings, mappings=es_mapping, ) + return index_name except RequestError as e: if 'already_exists' not in e.error: # ignore that the index already exist, @@ -100,13 +107,18 @@ def create(client: Elasticsearch, corpus: Corpus, add: bool = False, clear: bool raise -def populate(client: Elasticsearch, corpus: Corpus, start=None, end=None): +def populate( + client: Elasticsearch, + corpus: Corpus, + versioned_index_name: str, + start=None, + end=None, +): ''' Populate an ElasticSearch index from the corpus' source files. 
''' corpus_config = corpus.configuration corpus_name = corpus.name - index_name = corpus_config.es_index reader = make_reader(corpus) logger.info('Attempting to populate index...') @@ -121,11 +133,12 @@ def populate(client: Elasticsearch, corpus: Corpus, start=None, end=None): # can be sent to ElasticSearch in bulk actions = ( { - '_op_type': 'index', - '_index': index_name, - '_id': doc.get('id'), - '_source': doc - } for doc in docs + "_op_type": "index", + "_index": versioned_index_name, + "_id": doc.get("id"), + "_source": doc, + } + for doc in docs ) corpus_server = settings.SERVERS[ @@ -135,8 +148,10 @@ def populate(client: Elasticsearch, corpus: Corpus, start=None, end=None): for success, info in es_helpers.streaming_bulk( client, actions, - chunk_size=corpus_server['chunk_size'], - max_chunk_bytes=corpus_server['max_chunk_bytes'], + chunk_size=corpus_server["chunk_size"], + max_chunk_bytes=corpus_server["max_chunk_bytes"], + raise_on_exception=False, + raise_on_error=False, ): if not success: logger.error(f"FAILED INDEX: {info}") @@ -174,26 +189,23 @@ def perform_indexing( logger.info('retry on timeout: {}'.format( vars(client).get('_retry_on_timeout')) ) - create(client, corpus, add, clear, prod) + versioned_index_name = create(client, corpus, add, clear, prod) client.cluster.health(wait_for_status='yellow') if mappings_only: logger.info('Created index `{}` with mappings only.'.format(index_name)) return - populate(client, corpus, start=start, end=end) + populate(client, corpus, versioned_index_name, start=start, end=end) logger.info('Finished indexing `{}` to index `{}`.'.format( corpus_name, index_name)) if prod: - logger.info('Updating settings for index `{}`'.format( - index_name)) + logger.info("Updating settings for index `{}`".format(versioned_index_name)) client.indices.put_settings( - settings={'number_of_replicas': 1}, - index=index_name + settings={"number_of_replicas": 1}, index=versioned_index_name ) if rollover: - logger.info('Adjusting alias 
for index `{}`'.format( - index_name)) + logger.info("Adjusting alias for index `{}`".format(versioned_index_name)) alias(corpus) # not deleting old index, so we can roll back diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 71ff4585f..8ad14ddea 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -66,3 +66,18 @@ def test_mismatch_corpus_index_names(mock_corpus, corpus_definition, es_index_cl def test_db_only_corpus(json_mock_corpus, es_client, index_json_mock_corpus): res = es_client.count(index=json_mock_corpus.configuration.es_index) assert res.get('count') == 10 + + +def test_indexing_with_version(mock_corpus, corpus_definition, es_index_client): + corpus = Corpus.objects.get(name=mock_corpus) + perform_indexing( + corpus, + START, + END, + mappings_only=False, + add=False, + clear=False, + prod=True, + rollover=True, + ) + assert es_index_client.indices.exists(index="times-test-1") == True diff --git a/backend/ianalyzer/common_settings.py b/backend/ianalyzer/common_settings.py index d9991c2d0..8ec4f48de 100644 --- a/backend/ianalyzer/common_settings.py +++ b/backend/ianalyzer/common_settings.py @@ -98,6 +98,7 @@ }, ] +ACCOUNT_USER_DISPLAY = lambda user: user.username.replace(".", "\u2024") # Internationalization # https://docs.djangoproject.com/en/3.0/topics/i18n/ diff --git a/backend/ianalyzer/settings.py b/backend/ianalyzer/settings.py index 214ad7623..a59f8a458 100644 --- a/backend/ianalyzer/settings.py +++ b/backend/ianalyzer/settings.py @@ -48,9 +48,6 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' CSRF_TRUSTED_ORIGINS = ['http://localhost:8000'] - -# ACCOUNT_ADAPTER = 'users.adapters.CustomAccountAdapter' - SITE_NAME = 'IANALYZER' HOST = 'localhost:8000' diff --git a/backend/requirements.txt b/backend/requirements.txt index de7dcd3ce..d2743de8b 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -132,7 +132,7 @@ h11==0.14.0 # wsproto 
humanize==4.9.0 # via flower -ianalyzer-readers==0.2.0 +ianalyzer-readers==0.2.1 # via -r requirements.in idna==3.4 # via diff --git a/documentation/Named-entities.md b/documentation/Named-entities.md new file mode 100644 index 000000000..552d2adfe --- /dev/null +++ b/documentation/Named-entities.md @@ -0,0 +1,25 @@ +# Named Entities +I-Analyzer has the capacity to display named entities. + +## Prerequisites +In order to display a corpus enriched with named entities, install the Annotated Text plugin of Elasticsearch, following the instructions [here](https://www.elastic.co/guide/en/elasticsearch/plugins/8.6/mapper-annotated-text.html). + +### Named entity fields +To determine whether named entities are available for a given corpus, the application checks whether any document in the corpus index has a value in one of the `ner:*` keyword fields listed below. + +If the main content field is called `speech`, the field containing named entity annotations should be called `speech:ner`. This field should have the following Elasticsearch mapping: +```python +{ + 'type': 'annotated_text' +} +``` + +Moreover, an enriched corpus should contain the following keyword fields: +- `ner:person` +- `ner:location` +- `ner:organization` +- `ner:miscellaneous` +These can be used to search or filter (to be implemented). + +## Enriching a corpus with named entities +To enrich a corpus with named entities, we recommend using the [TextMiNER](https://github.com/CentreForDigitalHumanities/TextMiNER) library. This library will read from an existing index and a specified field name. The content of the field is analyzed with the BERT-based models for named entity recognition provided by [flair](https://github.com/flairNLP/flair). The library then adds named entities to the `annotated_text` field and the keyword fields, as outlined above.
diff --git a/frontend/karma.conf.js b/frontend/karma.conf.js index ef4935f03..988c29b95 100644 --- a/frontend/karma.conf.js +++ b/frontend/karma.conf.js @@ -19,7 +19,7 @@ module.exports = function (config) { dir: require('path').join(__dirname, 'coverage'), reports: [ 'html', 'lcovonly' ], fixWebpackSourcePaths: true }, - + reporters: ['progress', 'kjhtml'], port: 9876, colors: true, @@ -32,7 +32,7 @@ module.exports = function (config) { base: 'Chrome', flags: [ '--headless', - // '--disable-gpu', this might not be needed http://cvuorinen.net/2017/05/running-angular-tests-in-headless-chrome/ + '--disable-gpu', // Without a remote debugging port, Google Chrome exits immediately. '--remote-debugging-port=9222', '--no-sandbox' diff --git a/frontend/src/app/document-page/document-page.component.html b/frontend/src/app/document-page/document-page.component.html index 8c2635901..9fbd74496 100644 --- a/frontend/src/app/document-page/document-page.component.html +++ b/frontend/src/app/document-page/document-page.component.html @@ -2,9 +2,10 @@
- Show named entities
-
+
+ +
diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html index 16772e50f..dde7c8274 100644 --- a/frontend/src/app/document-view/document-view.component.html +++ b/frontend/src/app/document-view/document-view.component.html @@ -45,16 +45,8 @@
-
- - - {{textSegment.text}} - - - - {{textSegment.text | paragraph}} - - +
diff --git a/frontend/src/app/document/document-popup/document-popup.component.html b/frontend/src/app/document/document-popup/document-popup.component.html index 705199abb..8f79a8f3d 100644 --- a/frontend/src/app/document/document-popup/document-popup.component.html +++ b/frontend/src/app/document/document-popup/document-popup.component.html @@ -3,7 +3,7 @@ [responsive]="true" [maximizable]="true" [dismissableMask]="true" [draggable]="true" [resizable]="false" [blockScroll]="true"> Document {{document.position}} of {{page.total}} - Show named entities + diff --git a/frontend/src/app/document/document-popup/document-popup.component.spec.ts b/frontend/src/app/document/document-popup/document-popup.component.spec.ts index d8040c8fd..856c169ef 100644 --- a/frontend/src/app/document/document-popup/document-popup.component.spec.ts +++ b/frontend/src/app/document/document-popup/document-popup.component.spec.ts @@ -1,4 +1,4 @@ -import { ComponentFixture, TestBed, fakeAsync, tick, waitForAsync } from '@angular/core/testing'; +import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { By } from '@angular/platform-browser'; import { DocumentPopupComponent } from './document-popup.component'; @@ -7,7 +7,6 @@ import { makeDocument } from '../../../mock-data/constructor-helpers'; import { mockCorpus, mockCorpus2, mockField } from '../../../mock-data/corpus'; import { DocumentPage } from '@models/document-page'; import { QueryModel } from '@models'; -import { query } from '@angular/animations'; describe('DocumentPopupComponent', () => { @@ -33,7 +32,7 @@ describe('DocumentPopupComponent', () => { }); it('does not show the NER toggle for corpora without named entities', () => { - expect(fixture.debugElement.query(By.css('ia-toggle'))).toBeFalsy(); + expect(fixture.debugElement.query(By.css('ia-entity-toggle'))).toBeFalsy(); }); it('shows the NER toggle for corpora with named entities', () => { @@ -42,6 +41,6 @@ describe('DocumentPopupComponent', () 
=> { component.queryModel = queryModel; component.ngOnChanges({queryModel: {previousValue: setModel, currentValue: queryModel, firstChange: false, isFirstChange: null}}); fixture.detectChanges(); - expect(fixture.debugElement.query(By.css('ia-toggle'))).toBeTruthy(); + expect(fixture.debugElement.query(By.css('ia-entity-toggle'))).toBeTruthy(); }); }); diff --git a/frontend/src/app/document/document-popup/document-popup.component.ts b/frontend/src/app/document/document-popup/document-popup.component.ts index 8f3ef12c6..dd2cd02b6 100644 --- a/frontend/src/app/document/document-popup/document-popup.component.ts +++ b/frontend/src/app/document/document-popup/document-popup.component.ts @@ -8,7 +8,7 @@ import { takeUntil } from 'rxjs/operators'; import * as _ from 'lodash'; import { FoundDocument, QueryModel } from '@models'; import { Subject } from 'rxjs'; -import { documentIcons, actionIcons, corpusIcons } from '@shared/icons'; +import { actionIcons, documentIcons } from '../../shared/icons'; @Component({ selector: 'ia-document-popup', diff --git a/frontend/src/app/document/document.module.ts b/frontend/src/app/document/document.module.ts index 6bc0a9ef8..303d13a6d 100644 --- a/frontend/src/app/document/document.module.ts +++ b/frontend/src/app/document/document.module.ts @@ -10,12 +10,8 @@ import { DocumentPopupComponent } from './document-popup/document-popup.componen import { DialogModule } from 'primeng/dialog'; import { DocumentPreviewComponent } from './document-preview/document-preview.component'; import { EntityLegendComponent } from './entity-legend/entity-legend.component'; -import { - ElasticsearchHighlightPipe, - GeoDataPipe, - ParagraphPipe, - SnippetPipe, -} from '@shared/pipes'; +import { EntityToggleComponent } from './entity-toggle/entity-toggle.component'; +import { ElasticsearchHighlightPipe, EntityPipe, GeoDataPipe, ParagraphPipe, SnippetPipe } from '../shared/pipes'; @NgModule({ declarations: [ @@ -25,7 +21,9 @@ import { DocumentPopupComponent, 
DocumentPreviewComponent, EntityLegendComponent, + EntityToggleComponent, ElasticsearchHighlightPipe, + EntityPipe, GeoDataPipe, ParagraphPipe, SnippetPipe @@ -42,6 +40,7 @@ import { DocumentPageComponent, DocumentPopupComponent, EntityLegendComponent, + EntityToggleComponent, SearchRelevanceComponent, ] }) diff --git a/frontend/src/app/document/entity-legend/entity-legend.component.html b/frontend/src/app/document/entity-legend/entity-legend.component.html index 41f9622f6..5fedf0bfe 100644 --- a/frontend/src/app/document/entity-legend/entity-legend.component.html +++ b/frontend/src/app/document/entity-legend/entity-legend.component.html @@ -8,6 +8,6 @@ - No named entities were found in this text. + No named entities found. diff --git a/frontend/src/app/document/entity-toggle/entity-toggle.component.html b/frontend/src/app/document/entity-toggle/entity-toggle.component.html new file mode 100644 index 000000000..6e99326ac --- /dev/null +++ b/frontend/src/app/document/entity-toggle/entity-toggle.component.html @@ -0,0 +1,6 @@ + +Show named entities diff --git a/frontend/src/app/document/entity-toggle/entity-toggle.component.scss b/frontend/src/app/document/entity-toggle/entity-toggle.component.scss new file mode 100644 index 000000000..a14bc8c19 --- /dev/null +++ b/frontend/src/app/document/entity-toggle/entity-toggle.component.scss @@ -0,0 +1,8 @@ +button { + float: left; + margin-right: 1em; +} +em { + position: absolute; + margin-top: .3em; +} diff --git a/frontend/src/app/document/entity-toggle/entity-toggle.component.spec.ts b/frontend/src/app/document/entity-toggle/entity-toggle.component.spec.ts new file mode 100644 index 000000000..20bc7fb44 --- /dev/null +++ b/frontend/src/app/document/entity-toggle/entity-toggle.component.spec.ts @@ -0,0 +1,21 @@ +import { ComponentFixture, TestBed } from '@angular/core/testing'; + +import { commonTestBed } from '../../common-test-bed'; +import { EntityToggleComponent } from './entity-toggle.component'; + 
+describe('EntityToggleComponent', () => { + let component: EntityToggleComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + await commonTestBed().testingModule.compileComponents(); + + fixture = TestBed.createComponent(EntityToggleComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/document/entity-toggle/entity-toggle.component.ts b/frontend/src/app/document/entity-toggle/entity-toggle.component.ts new file mode 100644 index 000000000..a1bf02802 --- /dev/null +++ b/frontend/src/app/document/entity-toggle/entity-toggle.component.ts @@ -0,0 +1,23 @@ +import { Component, output } from '@angular/core'; + +import { actionIcons } from '../../shared/icons'; +import { DialogService } from '../../services'; + +@Component({ + selector: 'ia-entity-toggle', + templateUrl: './entity-toggle.component.html', + styleUrl: './entity-toggle.component.scss' +}) +export class EntityToggleComponent { + actionIcons = actionIcons; + toggleNER = output(); + toggleLabel: string; + + constructor(private dialogService: DialogService) { + this.toggleLabel = 'ner-toggle'; + } + + public showNamedEntityDocumentation() { + this.dialogService.showManualPage('namedentities'); + } +} diff --git a/frontend/src/app/login/registration/registration.component.html b/frontend/src/app/login/registration/registration.component.html index 514bf1a34..b5b02ef4b 100644 --- a/frontend/src/app/login/registration/registration.component.html +++ b/frontend/src/app/login/registration/registration.component.html @@ -24,7 +24,7 @@

I-Analyzer sign-up

- + @@ -32,7 +32,7 @@

I-Analyzer sign-up

Email is required.
-
+
Please enter a valid email address.
diff --git a/frontend/src/app/login/reset-password/request-reset.component.html b/frontend/src/app/login/reset-password/request-reset.component.html index e4704f287..f58382f95 100644 --- a/frontend/src/app/login/reset-password/request-reset.component.html +++ b/frontend/src/app/login/reset-password/request-reset.component.html @@ -17,7 +17,7 @@

Reset password

Email is required.
-
+
Please enter a valid email address.
{{message}}
diff --git a/frontend/src/app/shared/pipes/entity.pipe.spec.ts b/frontend/src/app/shared/pipes/entity.pipe.spec.ts new file mode 100644 index 000000000..bf1b2ddda --- /dev/null +++ b/frontend/src/app/shared/pipes/entity.pipe.spec.ts @@ -0,0 +1,36 @@ +import { FieldEntities } from '../../models'; +import { EntityPipe } from './entity.pipe'; + +describe('EntityPipe', () => { + const mockInput: Array = [ + {text: 'Nobody expects the ', entity: 'flat'}, + {text: 'Spanish Inquisition', entity: 'organization'}, + {text: '!', entity: 'flat'} + ]; + + it('creates an instance', () => { + const pipe = new EntityPipe(); + expect(pipe).toBeTruthy(); + }); + + it('adds mark tags to named entity annotations', ()=> { + const pipe = new EntityPipe(); + const output = pipe.transform(mockInput.slice(1,2)); + expect(output).toContain(''); + expect(output).toContain(''); + }); + + it('does not change Field Entities of `flat` type', () => { + const pipe = new EntityPipe(); + const output = pipe.transform(mockInput.slice(0,1)); + expect(output).toEqual(mockInput[0].text); + }) + + it('concatenates highlighted and non-annotated text', () => { + const pipe = new EntityPipe(); + const output = pipe.transform(mockInput); + expect(typeof output).toBe('string'); + }) +}); diff --git a/frontend/src/app/shared/pipes/entity.pipe.ts b/frontend/src/app/shared/pipes/entity.pipe.ts new file mode 100644 index 000000000..750d76348 --- /dev/null +++ b/frontend/src/app/shared/pipes/entity.pipe.ts @@ -0,0 +1,33 @@ +import { Pipe, PipeTransform } from '@angular/core'; +import { icon } from '@fortawesome/fontawesome-svg-core'; + +import { entityIcons } from '../icons'; +import { FieldEntities } from '../../models'; + +@Pipe({ + name: 'entity' +}) +export class EntityPipe implements PipeTransform { + /** + * a pipe to transform a list of FieldEntities into flat text and entities + * wrapped in tags, with icons indicating the type of named entity. 
+ * Note that this pipe needs to be followed by the | paragraph or | safeHtml pipe; + * otherwise, the icons will be removed due to sanitization + * @param entityArray: list of FieldEntities + * @returns string of mixed text and html. + */ + + transform(entityArray: Array): string { + const output = entityArray.map(ent => { + if (ent.entity === 'flat') { + return ent.text + } + else { + const iconName = entityIcons[ent.entity]; + return `${ent.text} ${icon(iconName as any).html}` + } + }) + return output.join(''); + } + +} diff --git a/frontend/src/app/shared/pipes/geo-data.pipe.ts b/frontend/src/app/shared/pipes/geo-data.pipe.ts index 64726ee7e..9d32a9ce0 100644 --- a/frontend/src/app/shared/pipes/geo-data.pipe.ts +++ b/frontend/src/app/shared/pipes/geo-data.pipe.ts @@ -1,12 +1,9 @@ import { Pipe, PipeTransform } from '@angular/core'; -import { DomSanitizer } from '@angular/platform-browser'; -import { CorpusField, FoundDocument } from '@models'; +import { CorpusField, FoundDocument } from '../../models'; @Pipe({ name: 'geoData' }) export class GeoDataPipe implements PipeTransform { - constructor(private sanitizer: DomSanitizer) { - } /** * Transforms GeoJSON data diff --git a/frontend/src/app/shared/pipes/index.ts b/frontend/src/app/shared/pipes/index.ts index 0bf4c0f5b..b090beb70 100644 --- a/frontend/src/app/shared/pipes/index.ts +++ b/frontend/src/app/shared/pipes/index.ts @@ -1,4 +1,5 @@ export * from './elasticsearch-highlight.pipe'; +export * from './entity.pipe'; export * from './geo-data.pipe'; export * from './paragraph.pipe'; export * from './regex-highlight.pipe'; diff --git a/frontend/src/app/shared/pipes/paragraph.pipe.spec.ts b/frontend/src/app/shared/pipes/paragraph.pipe.spec.ts index 10f3ec4c0..2bf443f1d 100644 --- a/frontend/src/app/shared/pipes/paragraph.pipe.spec.ts +++ b/frontend/src/app/shared/pipes/paragraph.pipe.spec.ts @@ -1,8 +1,47 @@ +import { TestBed } from '@angular/core/testing'; +import { DomSanitizer } from 
'@angular/platform-browser'; + + import { ParagraphPipe } from './paragraph.pipe'; describe('ParagraphPipe', () => { - it('create an instance', () => { - const pipe = new ParagraphPipe(); - expect(pipe).toBeTruthy(); - }); + let pipe: ParagraphPipe; + + beforeEach(() => { + TestBed.configureTestingModule({ + providers: [ + ParagraphPipe, + { provide: DomSanitizer, useValue: { + bypassSecurityTrustHtml: (input) => input + } + } + ] + }); + pipe = TestBed.inject(ParagraphPipe); + }) + + it('creates an instance', () => { + expect(pipe).toBeTruthy(); + }); + + it('does not alter text without linebreaks', () => { + const input = 'Some text. And some more text. And even more.'; + const output = pipe.transform(input); + expect(output).toEqual(input); + }); + + it('wraps text with linebreaks in paragraph tags', () => { + const input = 'Some text.\nAnd some more text.\nAnd even more.'; + const output = pipe.transform(input); + const expected = '

Some text.

And some more text.

And even more.

' + expect(output).toEqual(expected); + }); + + it('ignores multiple linebreaks', () => { + const input = '\nSome text.\n\n\nAnd some more text.\n\n'; + const output = pipe.transform(input); + const expected = '

Some text.

And some more text.

' + expect(output).toEqual(expected); + }); + }); diff --git a/frontend/src/app/shared/pipes/paragraph.pipe.ts b/frontend/src/app/shared/pipes/paragraph.pipe.ts index eebafc51a..42992fcbe 100644 --- a/frontend/src/app/shared/pipes/paragraph.pipe.ts +++ b/frontend/src/app/shared/pipes/paragraph.pipe.ts @@ -1,22 +1,25 @@ import { Pipe, PipeTransform } from '@angular/core'; - +import { DomSanitizer, SafeHtml } from '@angular/platform-browser'; @Pipe({ name: 'paragraph', }) export class ParagraphPipe implements PipeTransform { + constructor(private domSanitizer: DomSanitizer) {} transform(content: string | string[]): unknown { - const splitText = this.addParagraphTags(content); + const splitText = this.addParagraphBreaks(content); return splitText; } - addParagraphTags(content: string | string[]) { + addParagraphBreaks(content: string | string[]): SafeHtml { const paragraphs = typeof content === 'string' ? content.split('\n') : content; if (!paragraphs || paragraphs.length === 1) { - return content; + return content as string; } - return paragraphs.map(p => `

${p}

`).join(' '); + const cleanedParagraphs = paragraphs.filter(p => p !== '') + const wrapped = cleanedParagraphs.join('

') + return this.domSanitizer.bypassSecurityTrustHtml(`

${wrapped}

`); } diff --git a/frontend/src/app/shared/toggle/toggle.component.html b/frontend/src/app/shared/toggle/toggle.component.html index 8b855a255..312aaf0d9 100644 --- a/frontend/src/app/shared/toggle/toggle.component.html +++ b/frontend/src/app/shared/toggle/toggle.component.html @@ -1,4 +1,4 @@
- - + +
diff --git a/frontend/src/app/shared/toggle/toggle.component.scss b/frontend/src/app/shared/toggle/toggle.component.scss index 7ca30a115..902a65f38 100644 --- a/frontend/src/app/shared/toggle/toggle.component.scss +++ b/frontend/src/app/shared/toggle/toggle.component.scss @@ -60,9 +60,9 @@ input:checked { -ms-transform: translateX(2rem); -o-transform: translateX(2rem); } - + + .slider:after { left: calc(100% - 5px); transform: translateX(-100%); } -} \ No newline at end of file +} diff --git a/frontend/src/app/shared/toggle/toggle.component.ts b/frontend/src/app/shared/toggle/toggle.component.ts index 13194c3c8..57997a34d 100644 --- a/frontend/src/app/shared/toggle/toggle.component.ts +++ b/frontend/src/app/shared/toggle/toggle.component.ts @@ -1,22 +1,20 @@ -import { Component, EventEmitter, OnInit, Output } from '@angular/core'; +import { Component, EventEmitter, Input, Output } from '@angular/core'; @Component({ selector: 'ia-toggle', templateUrl: './toggle.component.html', styleUrls: ['./toggle.component.scss'] }) -export class ToggleComponent implements OnInit { - @Output() toggled = new EventEmitter(); - active = false; +export class ToggleComponent { + @Input() toggleLabel: string; + @Output() toggled = new EventEmitter(); + active = false; - constructor() { } + constructor() { } - ngOnInit(): void { - } - - public toggleButton() { - this.active = !this.active; - this.toggled.emit(this.active); - } + public toggleButton() { + this.active = !this.active; + this.toggled.emit(this.active); + } } diff --git a/frontend/src/assets/manual/en-GB/manifest.json b/frontend/src/assets/manual/en-GB/manifest.json index 71db94acd..318291f36 100644 --- a/frontend/src/assets/manual/en-GB/manifest.json +++ b/frontend/src/assets/manual/en-GB/manifest.json @@ -20,6 +20,10 @@ { "id": "citation", "title": "Citing I-analyzer" + }, + { + "id": "namedentities", + "title": "Viewing named entities" } ] }, diff --git a/frontend/src/assets/manual/en-GB/namedentities.md 
b/frontend/src/assets/manual/en-GB/namedentities.md new file mode 100644 index 000000000..64d6c25f7 --- /dev/null +++ b/frontend/src/assets/manual/en-GB/namedentities.md @@ -0,0 +1,12 @@ +In some corpora, you have the option to view named entities. If this option is available, a toggle button labeled *Show named entities* is visible in the document overview, above the document content and metadata. + +Activating the toggle will show entities of four types: +- Persons +- Locations +- Organizations +- Miscellaneous + +## Disclaimer +Named entities are automatically assigned using machine learning. As such, the annotations can include mistakes, and should not be taken at face value. + +If a corpus has named entity annotations, the info page for the corpus will provide more information on how the models were trained. We recommend being critical about the accuracy and possible biases of this process if you intend to use named entities in your research.