diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml index 2b87893d4..15b83ca73 100644 --- a/.github/workflows/backend-test.yml +++ b/.github/workflows/backend-test.yml @@ -1,4 +1,4 @@ -# This workflow will run backend tests on the Python version defined in the Dockerfiles +# This workflow will run backend tests on the Python version defined in the backend/Dockerfile name: Backend unit tests @@ -13,15 +13,45 @@ on: - 'hotfix/**' - 'release/**' - 'dependabot/**' - paths-ignore: - - 'frontend/**' - - '**.md' + paths: + - 'backend/**' + - '.github/workflows/backend*' + - 'docker-compose.yaml' jobs: backend-test: name: Test Backend runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push Elasticsearch image + uses: docker/build-push-action@v6 + with: + context: . + file: DockerfileElastic + push: true + tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest + cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest + cache-to: type=inline + - name: Build and push Backend + uses: docker/build-push-action@v6 + with: + context: backend/. + push: true + tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest + cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest + cache-to: type=inline - name: Run backend tests - run: sudo mkdir -p /ci-data && sudo docker-compose --env-file .env-ci run backend pytest + run: | + sudo mkdir -p /ci-data + docker compose pull elasticsearch + docker compose pull backend + docker compose --env-file .env-ci run --rm backend pytest diff --git a/.github/workflows/frontend-test.yml b/.github/workflows/frontend-test.yml index fdb14f20e..0e19cb73a 100644 --- a/.github/workflows/frontend-test.yml +++ b/.github/workflows/frontend-test.yml @@ -13,15 +13,34 @@ on: - 'hotfix/**' - 'release/**' - 'dependabot/**' - paths-ignore: - - 'backend/**' - - '**.md' + paths: + - 'frontend/**' + - '.github/workflows/frontend*' + - 'docker-compose.yaml' jobs: frontend-test: name: Test Frontend runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Run frontend tests - run: sudo docker-compose --env-file .env-ci run frontend yarn test + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build frontend image, using cache from Github registry + uses: docker/build-push-action@v6 + with: + context: frontend/. 
+ push: true + tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest + cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest + cache-to: type=inline + - name: Run frontend unit tests + run: | + docker compose pull frontend + docker compose --env-file .env-ci run --rm frontend yarn test diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index ed2bc732e..000000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,25 +0,0 @@ -# This action will update the CITATION.cff file for new release or hotfix branches - -name: Release - -on: - push: - branches: - - 'release/**' - - 'hotfix/**' - -jobs: - citation-update: - name: Update CITATION.cff - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Autoformat CITATION.cff - run: | - version=`grep -o '\d\+\.\d\+\.\d\+' package.json` - today=`date +"%Y-%m-%d"` - sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff - sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff - bash ./update-citation.sh - git commit -a -m "update version and date in CITATION.cff" - diff --git a/.vscode/launch.json b/.vscode/launch.json index 786b5f0a5..953445716 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -61,6 +61,16 @@ } }, { + "name": "Python: Debug Tests", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "purpose": [ + "debug-test" + ], + "console": "internalConsole", + "justMyCode": false + }, { "name": "celery", "type": "debugpy", "request": "launch", diff --git a/CITATION.cff b/CITATION.cff index 69b99051a..00046e52f 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -35,5 +35,5 @@ keywords: - elasticsearch - natural language processing license: MIT -version: 5.9.0 -date-released: '2024-07-05' +version: 5.11.0 +date-released: '2024-08-08' diff --git a/backend/Dockerfile b/backend/Dockerfile index 2c58b766e..aefd4cd76 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -7,7 +7,6 @@ RUN apt-get -y update && apt-get -y upgrade RUN apt-get install -y pkg-config libxml2-dev libxmlsec1-dev libxmlsec1-openssl default-libmysqlclient-dev RUN pip install --upgrade pip -RUN pip install pip-tools # make a directory in the container WORKDIR /backend # copy requirements from the host system to the directory in the container diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py index c0f19e5d2..12fb38edb 100644 --- a/backend/addcorpus/constants.py +++ b/backend/addcorpus/constants.py @@ -49,9 +49,18 @@ class VisualizationType(Enum): 'scan', 'tab-scan' 'p', + 'tags', + 'context', + 'tab', ] ''' -Field names that cannot be used because they are also query parameters in frontend routes. +Field names that cannot be used because they interfere with other functionality. -Using them would make routing ambiguous. +This is usually because they are also query parameters in frontend routes, and using them +would make routing ambiguous. + +`query` is also forbidden because it is a reserved column in CSV downloads. Likewise, +`context` is forbidden because it's used in download requests. + +`scan` and `tab-scan` are added because they interfere with element IDs in the DOM. 
''' diff --git a/backend/addcorpus/reader.py b/backend/addcorpus/reader.py index 2566f6fa6..ca95eea13 100644 --- a/backend/addcorpus/reader.py +++ b/backend/addcorpus/reader.py @@ -36,8 +36,6 @@ class NewReader(CSVReader): for f in corpus.configuration.fields.all()] def sources(self, *args, **kwargs): - return ( - (fn, {}) for fn in glob.glob(f'{self.data_directory}/**/*.csv', recursive=True) - ) + return glob.glob(f'{self.data_directory}/**/*.csv', recursive=True) return NewReader() diff --git a/backend/corpora/dbnl/dbnl.py b/backend/corpora/dbnl/dbnl.py index b588d3bb2..76fcb4ed5 100644 --- a/backend/corpora/dbnl/dbnl.py +++ b/backend/corpora/dbnl/dbnl.py @@ -2,6 +2,7 @@ import os import re from tqdm import tqdm +from ianalyzer_readers.xml_tag import Tag, CurrentTag, TransformTag from django.conf import settings from addcorpus.python_corpora.corpus import XMLCorpusDefinition, FieldDefinition @@ -25,8 +26,8 @@ class DBNL(XMLCorpusDefinition): languages = ['nl', 'dum', 'fr', 'la', 'fy', 'lat', 'en', 'nds', 'de', 'af'] category = 'book' - tag_toplevel = 'TEI.2' - tag_entry = { 'name': 'div', 'attrs': {'type': 'chapter'} } + tag_toplevel = Tag('TEI.2') + tag_entry = Tag('div', type='chapter') document_context = { 'context_fields': ['title_id'], @@ -261,18 +262,18 @@ def _xml_files(self): Pass( Backup( XML( # get the language on chapter-level if available + CurrentTag(), attribute='lang', transform=lambda value: [value] if value else None, ), XML( # look for section-level codes - {'name': 'div', 'attrs': {'type': 'section'}}, + Tag('div', type='section'), attribute='lang', multiple=True, ), XML( # look in the top-level metadata - 'language', + Tag('language'), toplevel=True, - recursive=True, multiple=True, attribute='id' ), @@ -298,17 +299,17 @@ def _xml_files(self): extractor=Pass( Backup( XML( # get the language on chapter-level if available + CurrentTag(), attribute='lang', ), XML( # look for section-level code - {'name': 'div', 'attrs': {'type': 'section'}}, + Tag('div', type='section'), attribute='lang' ), XML( #otherwise, get the (first) language for the book - 'language', + Tag('language'), attribute='id', toplevel=True, - recursive=True, ), transform=utils.single_language_code, ), @@ -322,13 +323,11 @@ def _xml_files(self): display_name='Chapter', extractor=Backup( XML( - tag='head', - recursive=True, + Tag('head'), flatten=True, ), XML( - tag=utils.LINE_TAG, - recursive=True, + Tag(utils.LINE_TAG), flatten=True, ) ), @@ -359,11 +358,11 @@ def _xml_files(self): search_field_core=True, csv_core=True, extractor=XML( - tag=utils.LINE_TAG, - recursive=True, + Tag(utils.LINE_TAG), + TransformTag(utils.pad_content), multiple=True, flatten=True, - transform_soup_func=utils.pad_content, + transform=lambda lines: '\n'.join(lines).strip() if lines else None, ), es_mapping=main_content_mapping(token_counts=True), visualizations=['wordcloud'], diff --git a/backend/corpora/dbnl/tests/test_dbnl_extraction.py b/backend/corpora/dbnl/tests/test_dbnl_extraction.py index 2c6d976c4..bc6ed6063 100644 --- a/backend/corpora/dbnl/tests/test_dbnl_extraction.py +++ b/backend/corpora/dbnl/tests/test_dbnl_extraction.py @@ -145,12 +145,12 @@ def test_append_to_tag(xml, tag, padding, original_output, new_output): 'content': '\n'.join([ 'Register der Liedekens.', 'A.', - 'ACh gesalfde van den Heer. Pag. 30 ', - 'Als Saul, en david den vyant in\'t velt. 41 ', - 'Als ick de Son verhoogen sie. 184 ', - 'Als hem de Son begeeft. 189 ', - 'Als ick den Herfst aenschou. 
194 ', - 'Als in koelt, de nacht komt overkleeden 208 ', + 'ACh gesalfde van den Heer. Pag. 30', + 'Als Saul, en david den vyant in\'t velt. 41', + 'Als ick de Son verhoogen sie. 184', + 'Als hem de Son begeeft. 189', + 'Als ick den Herfst aenschou. 194', + 'Als in koelt, de nacht komt overkleeden 208', 'Als van der meer op Eng\'le-vleug\'len vloog. 232', ]) }, { # metadata-only book @@ -194,6 +194,8 @@ def test_dbnl_extraction(dbnl_corpus): for actual, expected in zip(docs, expected_docs): # assert that actual is a superset of expected for key in expected: + if expected[key] != actual[key]: + print(key) assert expected[key] == actual[key] assert expected.items() <= actual.items() diff --git a/backend/corpora/dbnl/utils.py b/backend/corpora/dbnl/utils.py index 6a819425f..029f7388b 100644 --- a/backend/corpora/dbnl/utils.py +++ b/backend/corpora/dbnl/utils.py @@ -183,7 +183,8 @@ def append_to_tag(soup, tag, padding): def pad_content(node): pad_cells = lambda n: append_to_tag(n, 'cell', ' ') pad_linebreaks = lambda n: append_to_tag(n, 'lb', '\n') - return pad_cells(pad_linebreaks(node)) + pad_cells(pad_linebreaks(node)) + return [node] def standardize_language_code(code): if code: diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py index e9d4993c5..32ca73396 100644 --- a/backend/corpora/dutchannualreports/dutchannualreports.py +++ b/backend/corpora/dutchannualreports/dutchannualreports.py @@ -4,6 +4,7 @@ import os.path as op import logging from datetime import datetime +from ianalyzer_readers.xml_tag import Tag from django.conf import settings @@ -20,7 +21,6 @@ class DutchAnnualReports(XMLCorpusDefinition): """ Alto XML corpus of Dutch annual reports. """ - # Data overrides from .common.Corpus (fields at bottom of class) title = "Dutch Annual Reports" description = "Annual reports of Dutch financial and non-financial institutes" min_date = datetime(year=1957, month=1, day=1) @@ -38,9 +38,8 @@ class DutchAnnualReports(XMLCorpusDefinition): mimetype = 'application/pdf' - # Data overrides from .common.XMLCorpus - tag_toplevel = 'alto' - tag_entry = 'Page' + tag_toplevel = Tag('alto') + tag_entry = Tag('Page') # New data members non_xml_msg = 'Skipping non-XML file {}' @@ -187,9 +186,8 @@ def sources(self, start=min_date, end=max_date): description='Text content of the page.', results_overview=True, extractor=XML( - tag='String', + Tag('String'), attribute='CONTENT', - recursive=True, multiple=True, transform=lambda x: ' '.join(x), ), diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index 19e5c4377..9d295f82b 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -7,6 +7,7 @@ from datetime import datetime from os.path import join, split, splitext import os +from ianalyzer_readers.xml_tag import Tag, SiblingTag from django.conf import settings @@ -43,8 +44,9 @@ class DutchNewspapersPublic(XMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - tag_toplevel = 'text' - tag_entry = 'p' + tag_toplevel = Tag('text') + tag_entry = Tag('p') + external_file_tag_toplevel = Tag('DIDL') # New data members definition_pattern = re.compile(r'didl') @@ -137,18 +139,10 @@ def fields(self): description="Link to record on Delpher", display_type='url', es_mapping=keyword_mapping(), - 
extractor=XML(tag='identifier', - toplevel=True, - recursive=True, - multiple=False, - secondary_tag={ - 'tag': 'recordIdentifier', - 'match': 'id' - }, - external_file={ - 'xml_tag_toplevel': 'DIDL', - 'xml_tag_entry': 'dcx' - } + extractor=XML( + lambda metadata: Tag('recordIdentifier', string=metadata['id']), + SiblingTag('identifier'), + external_file=True ) ), FieldDefinition( @@ -179,13 +173,9 @@ def fields(self): 'indicator is in this range.' ) ), - extractor=XML(tag='OCRConfidencelevel', - toplevel=True, - recursive=True, - external_file={ - 'xml_tag_toplevel': 'DIDL', - 'xml_tag_entry': 'dcx' - }, + extractor=XML( + Tag('OCRConfidencelevel'), + external_file=True, transform=lambda x: float(x)*100 ), sortable=True @@ -225,19 +215,11 @@ def fields(self): description='Whether the item is an article, advertisment, etc.', csv_core=True, es_mapping={'type': 'keyword'}, - extractor=XML(tag='subject', - toplevel=True, - recursive=True, - multiple=False, - secondary_tag={ - 'tag': 'recordIdentifier', - 'match': 'id' - }, - external_file={ - 'xml_tag_toplevel': 'DIDL', - 'xml_tag_entry': 'dcx' - } - ), + extractor=XML( + lambda metadata: Tag('recordIdentifier', string=metadata['id']), + SiblingTag('subject'), + external_file=True + ), search_filter=filters.MultipleChoiceFilter( description='Accept only articles in these categories.', option_count=2, @@ -276,7 +258,7 @@ def fields(self): description='Article title', results_overview=True, search_field_core=True, - extractor=XML(tag='title', flatten=True, toplevel=True) + extractor=XML(Tag('title'), flatten=True, toplevel=True) ), FieldDefinition( name='id', @@ -320,8 +302,13 @@ def fields(self): es_mapping=main_content_mapping(True, True, True, 'nl'), results_overview=True, search_field_core=True, - extractor=XML(tag='p', multiple=True, - flatten=True, toplevel=True), + extractor=XML( + Tag('p'), + multiple=True, + flatten=True, + toplevel=True, + transform='\n'.join, + ), visualizations=["wordcloud"], language='nl', ), diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index 863c08433..54e095aaf 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -7,6 +7,7 @@ from datetime import datetime import logging import re +from ianalyzer_readers.xml_tag import Tag from django.conf import settings @@ -37,8 +38,8 @@ class Ecco(XMLCorpusDefinition): languages = ['en', 'fr', 'la', 'grc', 'de', 'it', 'cy', 'ga', 'gd'] category = 'book' - tag_toplevel = 'pageContent' - tag_entry = 'page' + tag_toplevel = Tag('pageContent') + tag_entry = Tag('page') meta_pattern = re.compile('^\d+\_DocMetadata\.xml$') @@ -153,8 +154,7 @@ def fields(self): description='Text content.', results_overview=True, search_field_core=True, - extractor=XML(tag='ocrText', - flatten=True), + extractor=XML(Tag('ocrText'), flatten=True), visualizations=['wordcloud'] ), FieldDefinition( diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index 54737e274..be6f0f658 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -11,6 +11,7 @@ from datetime import datetime from zipfile import ZipFile from io import BytesIO +from ianalyzer_readers.xml_tag import Tag from django.conf import settings @@ -46,7 +47,7 @@ class GuardianObserver(XMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - tag_toplevel = 'Record' + tag_toplevel = 
Tag('Record') def sources(self, start=datetime.min, end=datetime.max): ''' @@ -84,7 +85,7 @@ def sources(self, start=datetime.min, end=datetime.max): ) ), extractor=extract.XML( - tag='NumericPubDate', toplevel=True, + Tag('NumericPubDate'), transform=lambda x: '{y}-{m}-{d}'.format(y=x[:4],m=x[4:6],d=x[6:]) ), sortable=True, @@ -96,30 +97,28 @@ def sources(self, start=datetime.min, end=datetime.max): csv_core=True, results_overview=True, description='Publication date as full string, as found in source file', - extractor=extract.XML( - tag='AlphaPubDate', toplevel=True - ) + extractor=extract.XML(Tag('AlphaPubDate')) ), FieldDefinition( name='id', es_mapping=keyword_mapping(), display_name='ID', description='Article identifier.', - extractor=extract.XML(tag='RecordID', toplevel=True) + extractor=extract.XML(Tag('RecordID')), ), FieldDefinition( name='pub_id', es_mapping=keyword_mapping(), display_name='Publication ID', description='Publication identifier', - extractor=extract.XML(tag='PublicationID', toplevel=True, recursive=True) + extractor=extract.XML(Tag('PublicationID')) ), FieldDefinition( name='page', es_mapping=keyword_mapping(), display_name='Page', description='Start page label, from source (1, 2, 17A, ...).', - extractor=extract.XML(tag='StartPage', toplevel=True) + extractor=extract.XML(Tag('StartPage')) ), FieldDefinition( name='title', @@ -127,14 +126,14 @@ def sources(self, start=datetime.min, end=datetime.max): search_field_core=True, visualizations=['wordcloud'], description='Article title.', - extractor=extract.XML(tag='RecordTitle', toplevel=True) + extractor=extract.XML(Tag('RecordTitle')) ), FieldDefinition( name='source-paper', es_mapping=keyword_mapping(True), display_name='Source paper', description='Credited as source.', - extractor=extract.XML(tag='Title', toplevel=True, recursive=True), + extractor=extract.XML(Tag('Title')), search_filter=filters.MultipleChoiceFilter( description='Accept only articles from these source papers.', option_count=5 @@ -145,14 +144,14 @@ def sources(self, start=datetime.min, end=datetime.max): mapping=keyword_mapping(True), display_name='Place', description='Place in which the article was published', - extractor=extract.XML(tag='Qualifier', toplevel=True, recursive=True) + extractor=extract.XML(Tag('Qualifier')) ), FieldDefinition( name='author', mapping=keyword_mapping(True), display_name='Author', description='Article author', - extractor=extract.XML(tag='PersonName', toplevel=True, recursive=True) + extractor=extract.XML(Tag('PersonName')) ), FieldDefinition( name='category', @@ -164,7 +163,7 @@ def sources(self, start=datetime.min, end=datetime.max): description='Accept only articles in these categories.', option_count=19 ), - extractor=extract.XML(tag='ObjectType', toplevel=True), + extractor=extract.XML(Tag('ObjectType')), csv_core=True ), FieldDefinition( @@ -176,7 +175,7 @@ def sources(self, start=datetime.min, end=datetime.max): description='Raw OCR\'ed text (content).', results_overview=True, search_field_core=True, - extractor=extract.XML(tag='FullText', toplevel=True, flatten=True), + extractor=extract.XML(Tag('FullText'), flatten=True), language='en', ) ] diff --git a/backend/corpora/jewishinscriptions/jewishinscriptions.py b/backend/corpora/jewishinscriptions/jewishinscriptions.py index 29371c3fe..8d307a269 100644 --- a/backend/corpora/jewishinscriptions/jewishinscriptions.py +++ b/backend/corpora/jewishinscriptions/jewishinscriptions.py @@ -3,6 +3,7 @@ import os.path as op import logging from datetime import datetime 
+from ianalyzer_readers.xml_tag import Tag, CurrentTag from django.conf import settings @@ -15,7 +16,6 @@ class JewishInscriptions(XMLCorpusDefinition): """ Alto XML corpus of Jewish funerary inscriptions. """ - # Data overrides from .common.Corpus (fields at bottom of class) title = "Jewish Funerary Inscriptions" description = "A collection of inscriptions on Jewish burial sites" min_date = datetime(year=769, month=1, day=1) @@ -27,9 +27,9 @@ class JewishInscriptions(XMLCorpusDefinition): languages = ['heb', 'lat'] category = 'inscription' - # Data overrides from .common.XMLCorpus - tag_toplevel = '' - tag_entry = 'TEI' + + tag_toplevel = CurrentTag() + tag_entry = Tag('TEI') # New data members filename_pattern = re.compile('\d+') @@ -60,8 +60,10 @@ def sources(self, start=min_date, end=max_date): display_name='ID', description='ID of the inscription entry.', extractor=XML( - tag=['teiHeader', 'fileDesc', 'titleStmt', 'title'], - toplevel=False, + Tag('teiHeader'), + Tag('fileDesc'), + Tag('titleStmt'), + Tag('title'), ), es_mapping=keyword_mapping() ), @@ -76,8 +78,13 @@ def sources(self, start=min_date, end=max_date): upper=max_date.year, ), extractor=XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'history', 'origin', 'origDate'], - toplevel=False, + Tag('teiHeader'), + Tag('fileDesc'), + Tag('sourceDesc'), + Tag('msDesc'), + Tag('history'), + Tag('origin'), + Tag('origDate'), ), csv_core=True, sortable=True, @@ -90,8 +97,13 @@ def sources(self, start=min_date, end=max_date): display_name='Date comments', description='Additional comments on the year.', extractor=XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'history', 'origin', 'remarksOnDate'], - toplevel=False, + Tag('teiHeader'), + Tag('fileDesc'), + Tag('sourceDesc'), + Tag('msDesc'), + Tag('history'), + Tag('origin'), + Tag('remarksOnDate'), ), ), FieldDefinition( @@ -99,8 +111,9 @@ def sources(self, start=min_date, end=max_date): display_name='Transcription', description='Text content of the inscription.', extractor=XML( - tag=['text', 'body', 'transcription'], - toplevel=False, + Tag('text'), + Tag('body'), + Tag('transcription'), flatten=True ), search_field_core=True, @@ -118,30 +131,21 @@ def sources(self, start=min_date, end=max_date): description='Search only within these incipit types.', option_count=8 ), - extractor=XML( - tag=['text', 'body', 'incipit'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('incipit')), visualizations=['resultscount', 'termfrequency'] ), FieldDefinition( name='names', display_name='Names', description='Names of the buried persons.', - extractor=XML( - tag=['text', 'body', 'namesMentioned'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('namesMentioned')), search_field_core=True ), FieldDefinition( name='names_hebrew', display_name='Names (Hebrew)', description='Names in Hebrew of the buried persons.', - extractor=XML( - tag=['text', 'body', 'namesMentionedHebrew'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('namesMentionedHebrew')), ), FieldDefinition( name='sex', @@ -152,10 +156,7 @@ def sources(self, start=min_date, end=max_date): description='Search only within these genders.', option_count=3, ), - extractor=XML( - tag=['text', 'body', 'sex'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('sex')), csv_core=True ), FieldDefinition( @@ -168,10 +169,7 @@ def sources(self, start=min_date, end=max_date): lower=0, upper=100, ), - extractor=XML( - tag=['text', 'body', 'age'], - 
toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('age')), csv_core=True, sortable=True ), @@ -179,10 +177,7 @@ def sources(self, start=min_date, end=max_date): name='age_remarks', display_name='Age remarks', description='Additional comments on the age.', - extractor=XML( - tag=['text', 'body', 'remarksOnAge'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('remarksOnAge')), ), FieldDefinition( name='provenance', @@ -194,8 +189,13 @@ def sources(self, start=min_date, end=max_date): option_count = 8 ), extractor=XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'history', 'origin', 'provenance'], - toplevel=False, + Tag('teiHeader'), + Tag('fileDesc'), + Tag('sourceDesc'), + Tag('msDesc'), + Tag('history'), + Tag('origin'), + Tag('provenance'), ), visualizations=['resultscount', 'termfrequency'] ), @@ -204,10 +204,7 @@ def sources(self, start=min_date, end=max_date): display_name='Inscription type', description='Type of inscription found.', es_mapping={'type': 'keyword'}, - extractor=XML( - tag=['text', 'body', 'inscriptionType'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('inscriptionType')), csv_core=True ), FieldDefinition( @@ -219,10 +216,7 @@ def sources(self, start=min_date, end=max_date): description='Search only within these iconography types.', option_count=8 ), - extractor=XML( - tag=['text', 'body', 'iconographyType'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('iconographyType')), csv_core=True, visualizations=['resultscount', 'termfrequency'] ), @@ -230,10 +224,7 @@ def sources(self, start=min_date, end=max_date): name='iconography_desc', display_name='Iconography description', description='Description of the iconography on the inscription.', - extractor=XML( - tag=['text', 'body', 'iconographyDescription'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('iconographyDescription')), ), FieldDefinition( name='material', @@ -244,10 +235,7 @@ def sources(self, start=min_date, end=max_date): description='Search only within these material types.', option_count=8 ), - extractor=XML( - tag=['text', 'body', 'material'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('material')), csv_core=True, visualizations=['resultscount', 'termfrequency'] ), @@ -260,10 +248,7 @@ def sources(self, start=min_date, end=max_date): description='Search only within these languages.', option_count = 3 ), - extractor=XML( - tag=['text', 'body', 'language'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('language')), csv_core=True, visualizations=['resultscount', 'termfrequency'] ), @@ -278,10 +263,7 @@ def sources(self, start=min_date, end=max_date): # lower=0, # upper=100, # ), - extractor=XML( - tag=['text', 'body', 'numberOfLinesSurviving'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('numberOfLinesSurviving')), csv_core=True ), FieldDefinition( @@ -290,8 +272,12 @@ def sources(self, start=min_date, end=max_date): description='Storage location of the published work.', es_mapping=keyword_mapping(), extractor=XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'location'], - toplevel=False, + Tag('teiHeader'), + Tag('fileDesc'), + Tag('sourceDesc'), + Tag('msDesc'), + Tag('msIdentifier'), + Tag('location'), ), csv_core=True, results_overview=True @@ -301,8 +287,12 @@ def sources(self, start=min_date, end=max_date): display_name='Publication', description='Article or book where inscription is 
published.', extractor=XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'publication'], - toplevel=False, + Tag('teiHeader'), + Tag('fileDesc'), + Tag('sourceDesc'), + Tag('msDesc'), + Tag('msIdentifier'), + Tag('publication'), ), es_mapping=keyword_mapping(True) ), @@ -310,40 +300,28 @@ def sources(self, start=min_date, end=max_date): name='facsimile', display_name='Facsimile', description='Photo or facsimile of publication.', - extractor=XML( - tag=['facsimile', 'photoFacsimile'], - toplevel=False, - ), + extractor=XML(Tag('facsimile'), Tag('photoFacsimile')), es_mapping=keyword_mapping() ), FieldDefinition( name='photos_leonard', display_name='Photos (Leonard)', description='Photos by Leonard.', - extractor=XML( - tag=['facsimile', 'photosLeonard'], - toplevel=False, - ), + extractor=XML(Tag('facsimile'), Tag('photosLeonard')), es_mapping=keyword_mapping() ), FieldDefinition( name='3D_image', display_name='3D image', description='3D image of inscription.', - extractor=XML( - tag=['facsimile', 'image3D'], - toplevel=False, - ), + extractor=XML(Tag('facsimile'), Tag('image3D')), es_mapping=keyword_mapping() ), FieldDefinition( name='commentary', display_name='Commentary', description='Extra comments, questions or remarks on this inscription.', - extractor=XML( - tag=['text', 'body', 'commentary'], - toplevel=False, - ), + extractor=XML(Tag('text'), Tag('body'), Tag('commentary')), search_field_core=True, ) ] diff --git a/backend/corpora/jewishmigration/test_jewishmigration.py b/backend/corpora/jewishmigration/test_jewishmigration.py index d1dc7acdd..ef6d56fb4 100644 --- a/backend/corpora/jewishmigration/test_jewishmigration.py +++ b/backend/corpora/jewishmigration/test_jewishmigration.py @@ -136,7 +136,7 @@ def jm_corpus_settings(settings): settings.CORPORA = { 'jewishmigration': os.path.join(here, 'jewishmigration.py') } - settings.JMIG_DATA_DIR = '/corpora' + settings.JMIG_DATA_DIR = None settings.JMIG_DATA = None settings.JMIG_DATA_URL = 'http://www.example.com' settings.JMIG_INDEX = 'test-jewishmigration' diff --git a/backend/corpora/parliament/finland.py b/backend/corpora/parliament/finland.py index ad3524572..8be053707 100644 --- a/backend/corpora/parliament/finland.py +++ b/backend/corpora/parliament/finland.py @@ -1,5 +1,6 @@ from datetime import datetime from glob import glob +from ianalyzer_readers.xml_tag import Tag, FindParentTag, PreviousSiblingTag, ParentTag from addcorpus.python_corpora.corpus import XMLCorpusDefinition from addcorpus.python_corpora.extract import XML, Combined, Constant, Metadata @@ -17,24 +18,6 @@ def format_role(values): clean_id = id.replace('#', '') return roles.get(clean_id, clean_id) -def speech_metadata(speech_node): - """Gets the `note` sibling to the speech.""" - return speech_node.find_previous_sibling('note') - -def find_topic(speech_node): - return speech_node.parent.find_previous_sibling('head') - -def find_debate_node(speech_node): - return speech_node.find_parent('TEI') - -def find_debate_title(speech_node): - debate_node = find_debate_node(speech_node) - return debate_node.teiHeader.find('title') - -def find_date(speech_node): - debate_node = find_debate_node(speech_node) - return debate_node.teiHeader.find('date') - class ParliamentFinland(Parliament, XMLCorpusDefinition): title = 'People and Parliament (Finland, 1907-)' @@ -67,27 +50,31 @@ def sources(self, start, end): document_context = document_context() - tag_toplevel = 'teiCorpus' - tag_entry = 'u' + tag_toplevel = Tag('teiCorpus') + tag_entry = Tag('u') 
country = field_defaults.country() country.extractor = Constant('Finland') date = field_defaults.date() date.extractor = XML( - transform_soup_func = find_date, - attribute = 'when' + FindParentTag('TEI'), + Tag('teiHeader', recursive=False), + Tag('date'), + attribute='when' ) debate_id = field_defaults.debate_id() debate_id.extractor = XML( - transform_soup_func = find_debate_node, - attribute = 'xml:id' + FindParentTag('TEI'), + attribute='xml:id' ) debate_title = field_defaults.debate_title() debate_title.extractor = XML( - transform_soup_func = find_debate_title, + FindParentTag('TEI'), + Tag('teiHeader', recursive=False), + Tag('title'), transform = clean_value, ) @@ -104,7 +91,7 @@ def sources(self, start, end): role = field_defaults.parliamentary_role() role.extractor = Combined( - XML(attribute = 'ana'), + XML(attribute='ana'), Metadata('roles'), transform = format_role, ) @@ -125,26 +112,25 @@ def sources(self, start, end): speech.extractor = XML(transform = clean_value) speech_id = field_defaults.speech_id() - speech_id.extractor = XML( - attribute = 'xml:id' - ) + speech_id.extractor = XML(attribute='xml:id') speech_type = field_defaults.speech_type() speech_type.extractor = XML( - transform_soup_func = speech_metadata, + PreviousSiblingTag('note'), attribute = 'speechType' ) speech_type.language = 'fi' topic = field_defaults.topic() topic.extractor = XML( - transform_soup_func = find_topic, + ParentTag(), + PreviousSiblingTag('head'), transform = clean_value, ) url = field_defaults.url() url.extractor = XML( - transform_soup_func = speech_metadata, + PreviousSiblingTag('note'), attribute = 'link' ) diff --git a/backend/corpora/parliament/ireland.py b/backend/corpora/parliament/ireland.py index e2b3279b2..3c06238a4 100644 --- a/backend/corpora/parliament/ireland.py +++ b/backend/corpora/parliament/ireland.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup import json import csv +from ianalyzer_readers.xml_tag import Tag, PreviousSiblingTag from addcorpus.python_corpora.corpus import CorpusDefinition, CSVCorpusDefinition, XMLCorpusDefinition from addcorpus.python_corpora.extract import Constant, CSV, XML, Metadata, Combined, Backup @@ -246,8 +247,6 @@ def extract_number_from_id(id): if match: return int(match.group(0)) -def find_topic_heading(speech_node): - return speech_node.find_previous_sibling('heading') def get_debate_id(filename): name, _ = os.path.splitext(filename) @@ -319,8 +318,8 @@ class ParliamentIrelandNew(XMLCorpusDefinition): min_date = datetime(year=2014, month=1, day=1) max_date = datetime(year=2020, month=12, day=31) - tag_toplevel = 'debate' - tag_entry = 'speech' + tag_toplevel = Tag('debate') + tag_entry = Tag('speech') def sources(self, start, end): if in_date_range(self, start, end): @@ -359,9 +358,8 @@ def sources(self, start, end): date = field_defaults.date() date.extractor = XML( - tag = 'docDate', + Tag('docDate'), attribute = 'date', - recursive = True, toplevel = True, ) @@ -388,7 +386,7 @@ def sources(self, start, end): speech = field_defaults.speech() speech.extractor = XML( - 'p', + Tag('p'), multiple = True, transform = strip_and_join_paragraphs, ) @@ -402,7 +400,7 @@ def sources(self, start, end): topic = field_defaults.topic() topic.extractor = XML( - transform_soup_func = find_topic_heading, + PreviousSiblingTag('heading'), extract_soup_func = lambda node : node.text, ) diff --git a/backend/corpora/parliament/netherlands.py b/backend/corpora/parliament/netherlands.py index 09a21dc5e..06f088d51 100644 --- 
a/backend/corpora/parliament/netherlands.py +++ b/backend/corpora/parliament/netherlands.py @@ -4,10 +4,11 @@ from bs4 import BeautifulSoup from os.path import join from django.conf import settings +from ianalyzer_readers.xml_tag import Tag, FindParentTag, PreviousTag, TransformTag import bs4 from addcorpus.python_corpora.corpus import XMLCorpusDefinition -from addcorpus.python_corpora.extract import XML, Constant, Combined, Choice +from addcorpus.python_corpora.extract import XML, Constant, Combined, Choice, Order from corpora.parliament.utils.parlamint import extract_all_party_data, extract_people_data, extract_role_data, party_attribute_extractor, person_attribute_extractor from corpora.utils.formatting import format_page_numbers from corpora.parliament.parliament import Parliament @@ -29,8 +30,7 @@ def format_role(role): else: return role.title() if type(role) == str else role -def find_topic(speech): - return speech.find_parent('topic') + def format_house(house): if house == 'senate': @@ -53,21 +53,6 @@ def format_house_recent(url): else: return 'Tweede Kamer' -def find_last_pagebreak(node): - "find the last pagebreak node before the start of the current node" - is_tag = lambda x : type(x) == bs4.element.Tag - - #look for pagebreaks in previous nodes - for prev_node in node.previous_siblings: - if is_tag(prev_node): - breaks = prev_node.find_all('pagebreak') - if breaks: - return breaks[-1] - - #if none was found, go up a level - parent = node.parent - if parent: - return find_last_pagebreak(parent) def format_pages(pages): topic_start, topic_end, prev_break, last_break = pages @@ -90,10 +75,10 @@ def format_party(data): def get_party_full(speech_node): party_ref = speech_node.attrs.get(':party-ref') if not party_ref: - return None + return [] parents = list(speech_node.parents) party_node = parents[-1].find('organization', attrs={'pm:ref':party_ref}) - return party_node + return [party_node] def get_source(meta_node): if type(meta_node) == bs4.element.Tag: @@ -103,9 +88,6 @@ def get_source(meta_node): return '' -def get_sequence(node, tag_entry): - previous = node.find_all_previous(tag_entry) - return len(previous) + 1 # start from 1 def is_old(metadata): return metadata['dataset'] == 'old' @@ -134,8 +116,8 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition): image = 'netherlands.jpg' description_page = 'netherlands.md' citation_page = 'netherlands.md' - tag_toplevel = lambda _, metadata: 'root' if is_old(metadata) else 'TEI' - tag_entry = lambda _, metadata: 'speech' if is_old(metadata) else 'u' + tag_toplevel = lambda metadata: Tag('root') if is_old(metadata) else Tag('TEI') + tag_entry = lambda metadata: Tag('speech') if is_old(metadata) else Tag('u') languages = ['nl'] category = 'parliament' @@ -183,12 +165,17 @@ def sources(self, start, end): date = field_defaults.date() date.extractor = Choice( XML( - tag=['meta','dc:date'], + Tag('meta'), + Tag('dc:date'), toplevel=True, applicable=is_old ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc','bibl', 'date'], + Tag('teiHeader'), + Tag('fileDesc'), + Tag('sourceDesc'), + Tag('bibl'), + Tag('date'), toplevel=True ) ) @@ -197,14 +184,20 @@ def sources(self, start, end): chamber = field_defaults.chamber() chamber.extractor = Choice( XML( - tag=['meta','dc:subject', 'pm:house'], + Tag('meta'), + Tag('dc:subject'), + Tag('pm:house'), attribute='pm:house', toplevel=True, transform=format_house, applicable=is_old ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc','bibl','idno'], + Tag('teiHeader'), + Tag('fileDesc'), + 
Tag('sourceDesc'), + Tag('bibl'), + Tag('idno'), toplevel=True, transform=format_house_recent ) @@ -214,12 +207,16 @@ def sources(self, start, end): debate_title = field_defaults.debate_title() debate_title.extractor = Choice( XML( - tag=['meta', 'dc:title'], + Tag('meta'), + Tag('dc:title'), toplevel=True, applicable=is_old ), XML( - tag=['teiHeader', 'fileDesc', 'titleStmt', 'title'], + Tag('teiHeader'), + Tag('fileDesc'), + Tag('titleStmt'), + Tag('title'), multiple=True, toplevel=True, transform=lambda titles: titles[-2] if len(titles) else titles @@ -230,12 +227,12 @@ def sources(self, start, end): debate_id = field_defaults.debate_id() debate_id.extractor = Choice( XML( - tag=['meta', 'dc:identifier'], + Tag('meta'), + Tag('dc:identifier'), toplevel=True, applicable=is_old ), XML( - tag=None, attribute='xml:id', toplevel=True, ) @@ -244,14 +241,13 @@ def sources(self, start, end): topic = field_defaults.topic() topic.extractor = Choice( XML( - transform_soup_func = find_topic, + FindParentTag('topic'), attribute='title', applicable=is_old, ), XML( - tag=['note'], + Tag('note'), toplevel=True, - recursive=True ) ) topic.language = 'nl' @@ -259,16 +255,17 @@ def sources(self, start, end): speech = field_defaults.speech(language='nl') speech.extractor = Choice( XML( - tag='p', + Tag('p'), multiple=True, flatten=True, - applicable=is_old + applicable=is_old, ), XML( - tag=['seg'], + Tag('seg'), multiple=True, flatten=True, - ) + ), + transform='\n'.join, ) speech_id = field_defaults.speech_id() @@ -278,7 +275,6 @@ def sources(self, start, end): applicable=is_old ), XML( - tag=None, attribute='xml:id' ) ) @@ -317,10 +313,9 @@ def sources(self, start, end): XML( attribute='role', transform=format_role, - applicable = is_old, + applicable=is_old, ), XML( - tag=None, attribute='ana', transform=lambda x: x[1:].title() ) @@ -351,8 +346,8 @@ def sources(self, start, end): party_full = field_defaults.party_full() party_full.extractor = Choice( XML( + TransformTag(get_party_full), attribute='pm:name', - transform_soup_func=get_party_full, applicable = is_old, ), party_attribute_extractor('full_name') @@ -362,16 +357,18 @@ def sources(self, start, end): page = field_defaults.page() page.extractor = Choice( Combined( - XML(transform_soup_func=find_topic, + XML(FindParentTag('topic'), attribute='source-start-page' ), - XML(transform_soup_func=find_topic, + XML(FindParentTag('topic'), attribute='source-end-page' ), - XML(transform_soup_func=find_last_pagebreak, + XML(PreviousTag('pagebreak'), attribute='originalpagenr', ), - XML(tag=['stage-direction', 'pagebreak'], + XML( + Tag('stage-direction'), + Tag('pagebreak'), attribute='originalpagenr', multiple=True, transform=lambda pages : pages[-1] if pages else pages @@ -383,25 +380,16 @@ def sources(self, start, end): url = field_defaults.url() url.extractor = XML( - tag=['meta', 'dc:source'], - transform_soup_func=get_source, + Tag('meta'), + Tag('dc:source'), + Tag('pm:link'), toplevel=True, attribute='pm:source', applicable = is_old, ) sequence = field_defaults.sequence() - sequence.extractor = Choice( - XML( - extract_soup_func = lambda node : get_sequence(node, 'speech'), - applicable = is_old - ), - XML( - tag=None, - attribute='xml:id', - transform = get_sequence_recent, - ) - ) + sequence.extractor = Order(transform=lambda value: value + 1) source_archive = field_defaults.source_archive() source_archive.extractor = Choice( diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 8bc5ce6c4..5c30f63b3 
100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -2,6 +2,7 @@ import os import os.path as op import logging +from ianalyzer_readers.xml_tag import Tag from django.conf import settings @@ -46,8 +47,7 @@ def __init__(self): ) self._id.extractor = XML( - tag=['teiHeader', 'fileDesc', 'titleStmt', 'title'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('titleStmt'), Tag('title'), ) self.url.extractor = Constant( @@ -63,30 +63,27 @@ def __init__(self): # ) self.transcription.extractor = XML( - tag=['text', 'body', 'transcription'], - toplevel=False, + Tag('text'), Tag('body'), Tag('transcription'), flatten=True ) self.names.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson', 'person'], + Tag('teiHeader'), Tag('profileDesc'), Tag('particDesc'), Tag('listPerson'), + Tag('person'), flatten=True, multiple=True, - toplevel=False, + transform=lambda result: ' '.join(result).strip(), ) self.sex.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson', 'person'], + Tag('teiHeader'), Tag('profileDesc'), Tag('particDesc'), Tag('listPerson'), + Tag('person'), attribute='sex', multiple=True, - toplevel=False, ) self.age.extractor = XML( - tag=['text', 'body', 'age'], - toplevel=False, + Tag('text'), Tag('body'), Tag('age'), transform=lambda age: transform_age_integer(age) ) @@ -95,47 +92,41 @@ def __init__(self): ) self.settlement.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'history', 'origin', 'provenance'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('provenance'), ) self.material.extractor = XML( - tag=['text', 'body', 'material'], - toplevel=False, + Tag('text'), Tag('body'), Tag('material'), transform=lambda x: categorize_material(x) ) self.material_details = XML( - tag=['text', 'body', 'material'], - toplevel=False, + Tag('text'), Tag('body'), Tag('material'), ) self.language.extractor = XML( - tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], - toplevel=False, + Tag('teiHeader'), Tag('profileDesc'), Tag('langUsage'), Tag('language'), multiple=True, transform=lambda x: normalize_language(x) ) self.comments.extractor = Combined( XML( - tag=['text', 'body', 'commentary'], - toplevel=False, + Tag('text'), Tag('body'), Tag('commentary'), + transform=str.strip, ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'history', 'origin', 'remarksOnDate'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('remarksOnDate'), transform=lambda x: 'DATE:\n{}\n'.format(x) if x else x ), XML( - tag=['text', 'body', 'ageComments'], - toplevel=False, + Tag('text'), Tag('body'), Tag('ageComments'), transform=lambda x: 'AGE:\n{}\n'.format(x) if x else x ), XML( - tag=['text', 'body', 'iconographyDescription'], - toplevel=False, + Tag('text'), Tag('body'), Tag('iconographyDescription'), transform=lambda x: 'ICONOGRAPHY:\n{}\n'.format(x) if x else x ), transform=lambda x: join_commentaries(x) @@ -143,19 +134,18 @@ def __init__(self): self.bibliography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'publications', 'publication'], - toplevel=False, - multiple=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('publications'), Tag('publication'), + multiple=True, ) self.location_details.extractor = XML( - 
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'location'], - toplevel=False + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('location'), ) self.iconography.extractor = XML( - tag=['text', 'body', 'iconographyType'], - toplevel=False + Tag('text'), Tag('body'), Tag('iconographyType'), ) self.transcription_hebrew.extractor = Combined( diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index ef741b50c..394b45a88 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -1,11 +1,14 @@ import re from copy import copy +from ianalyzer_readers.xml_tag import Tag, TransformTag +from typing import Iterable, Optional +import bs4 from django.conf import settings from addcorpus.python_corpora.corpus import XMLCorpusDefinition from addcorpus.es_mappings import date_mapping -from addcorpus.python_corpora.extract import XML, Constant, Combined, FilterAttribute +from addcorpus.python_corpora.extract import XML, Constant, Combined, Pass from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, \ clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language, \ not_before_extractor @@ -28,28 +31,19 @@ def __init__(self): ) self._id.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'msIdentifier', 'idno'], - multiple=False, - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('idno'), flatten=True ) - self.url.extractor = FilterAttribute( - tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], - multiple=False, - toplevel=False, + self.url.extractor = XML( + Tag('teiHeader'), Tag('fileDesc'), Tag('publicationStmt'), Tag('idno', type='url'), flatten=True, - attribute_filter={ - 'attribute': 'type', - 'value': 'url' - } ) self.year.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origDate', 'date'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origDate'), Tag('date'), transform=lambda x: get_year(x), ) @@ -58,157 +52,147 @@ def __init__(self): # the dataset of the Steinheim institute is from the 19th/20th century and has accurate dates self.date.es_mapping = date_mapping() self.date.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origDate', 'date'], - toplevel=False + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origDate'), Tag('date'), ) - self.transcription.extractor = XML( - tag=['text', 'body', 'div'], - toplevel=False, - multiple=False, - flatten=True, + self.transcription.extractor = Pass( + XML( + Tag('text'), Tag('body'), Tag('div', type='edition'), Tag('ab'), + multiple=True, + flatten=True, + transform='\n'.join, + ), transform=lambda x: clean_newline_characters(x), - transform_soup_func=extract_transcript ) - self.transcription_german.extractor = XML( - tag=['text', 'body', ], - toplevel=False, - multiple=False, - flatten=True, + self.transcription_german.extractor = Pass( + XML( + Tag('text'), Tag('body'), Tag('div', type='translation'), Tag('ab'), + multiple=True, + flatten=True, + transform='\n'.join + ), transform=lambda x: clean_newline_characters(x), - transform_soup_func=extract_translation ) self.names.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 
'listPerson', 'person'], + Tag('teiHeader'), Tag('profileDesc'), Tag('particDesc'), + Tag('listPerson'), Tag('person'), flatten=True, multiple=True, - toplevel=False, + transform=' '.join, ) self.sex.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson', 'person'], + Tag('teiHeader'), Tag('profileDesc'), Tag('particDesc'), + Tag('listPerson'), Tag('person'), attribute='sex', multiple=True, - toplevel=False, transform=lambda x: convert_sex(x) ) self.dates_of_death.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson'], - transform_soup_func=extract_death, + Tag('teiHeader'), Tag('profileDesc'), Tag('particDesc'), + Tag('listPerson'), Tag('death'), attribute='when', - multiple=False, - toplevel=False, + multiple=True, ) self.country.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'country'], - toplevel=False, - transform_soup_func=extract_country, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('country'), + TransformTag(_extract_country), transform=lambda x: clean_country(x), flatten=True, ) self.region.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'country', 'region'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('country'), + Tag('region'), flatten=True ) self.settlement.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'settlement'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('settlement'), + TransformTag(_extract_settlement), flatten=True, - transform_soup_func=extract_settlement, ) self.location_details.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'settlement', 'geogName'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('settlement'), + Tag('geogName'), TransformTag(_extract_location_details), flatten=True, - transform_soup_func=extract_location_details, ) self.material.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p', 'material'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), Tag('material'), flatten=True, transform=lambda x: categorize_material(x) ) self.material_details.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p', 'material'], - toplevel=False, - flatten=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), Tag('material'), + flatten=True, ) self.language.extractor = XML( - tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], - toplevel=False, + Tag('teiHeader'), Tag('profileDesc'), Tag('langUsage'), Tag('language'), multiple=True, transform=lambda x: get_language(x) ) self.comments.extractor = Combined( XML( - tag=['text', 'body'], - toplevel=False, - transform_soup_func=extract_commentary, + Tag('text'), 
Tag('body'), Tag('div', type='commentary'), + multiple=True, + extract_soup_func=_extract_commentary, + transform=lambda found: "\n".join(found) if len(found) > 1 else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'condition'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('condition'), flatten=True, - transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x + transform=lambda x: f'CONDITION:\n{x}\n' if x else x ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p'], - toplevel=False, - transform_soup_func=extract_support_comments, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), + extract_soup_func=_extract_support_comments, ), transform=lambda x: join_commentaries(x) ) self.images.extractor = XML( - tag=['facsimile', 'graphic'], + Tag('facsimile'), Tag('graphic'), multiple=True, attribute='url', - toplevel=False ) self.coordinates.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'], - toplevel=False, - multiple=False, - flatten=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('settlement'), + Tag('geogName'), Tag('geo'), + flatten=True, ) self.iconography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], - toplevel=False, - multiple=False + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('decoDesc'), Tag('decoNote'), ) self.bibliography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'msIdentifier', 'publications', 'publication'], - toplevel=False, - multiple=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('publications'), Tag('publication'), + multiple=True, + transform=lambda x: x if x else None ) self.transcription_hebrew.extractor = Combined( @@ -270,68 +254,27 @@ def get_language(values): return values -def extract_transcript(soup): - ''' - Helper function to ensure correct extraction of the transcripts. - Note that there are multiple formats in which these are stored, - but the text that we need is always in the `` children of - `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one). - ''' - if not soup: - return - return soup.find_all('ab') - - -def extract_translation(soup): - ''' - Helper function to extract translation from the tag - ''' - if not soup: - return - translation = soup.find('div', {'type': 'translation'}) - if translation: - return translation.find_all('ab') - else: - return - - -def extract_commentary(soup): +def _extract_commentary(commentary: bs4.PageElement) -> Optional[str]: ''' Helper function to extract all commentaries from the tag. A single element will be returned with the commentaries found as text content. 
''' - if not soup: - return - found = [] - commentaries = soup.find_all('div', {'type': 'commentary'}) - - for commentary in commentaries: - if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: - p = commentary.find('p') - if p: - text = p.get_text() - if text: - text = clean_commentary(text) - found.append('{}:\n{}\n'.format( - commentary['subtype'].strip().upper(), text)) - - if len(found) > 1: - cloned_soup = copy(soup) - cloned_soup.clear() - cloned_soup.string = "\n".join(found) - return cloned_soup - else: - return None - - -def extract_support_comments(soup): - if not soup: - return + if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: + p = commentary.find('p') + if p: + text = p.get_text() + if text: + text = clean_commentary(text) + return '{}:\n{}\n'.format( + commentary['subtype'].strip().upper(), text) + + +def _extract_support_comments(soup: bs4.PageElement) -> str: cloned_soup = copy(soup) cloned_soup.clear() - commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS') - commentaries = add_support_comment( + commentaries = _add_support_comment(soup, '', 'dim', 'DIMENSIONS') + commentaries = _add_support_comment( soup, commentaries, 'objectType', 'OBJECTTYPE') # add any additional text from the
element, @@ -342,11 +285,10 @@ def extract_support_comments(soup): text = clean_commentary(text) commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text) - cloned_soup.string = commentaries - return cloned_soup + return commentaries -def add_support_comment(soup, existing_commentaries, elem_name, commentary_name): +def _add_support_comment(soup: bs4.PageElement, existing_commentaries: str, elem_name, commentary_name) -> str: elem = soup.find(elem_name) if elem: text = elem.get_text() @@ -356,45 +298,35 @@ def add_support_comment(soup, existing_commentaries, elem_name, commentary_name) return existing_commentaries -def extract_death(soup): - ''' - Helper function to extract date of death from multiple person tags. - ''' - if not soup: - return - return soup.find_all('death') - -def extract_country(soup): +def _extract_country(soup) -> Iterable[bs4.PageElement]: ''' Helper function to extract country. This is needed because the output of `flatten` would otherwise include the text contents of the ``. ''' - return clone_soup_extract_child(soup, 'region') + return _clone_soup_extract_child(soup, 'region') -def extract_settlement(soup): - return clone_soup_extract_child(soup, 'geogName') +def _extract_settlement(soup) -> Iterable[bs4.PageElement]: + return _clone_soup_extract_child(soup, 'geogName') -def extract_location_details(soup): - return clone_soup_extract_child(soup, 'geo') +def _extract_location_details(soup) -> Iterable[bs4.PageElement]: + return _clone_soup_extract_child(soup, 'geo') -def clone_soup_extract_child(soup, to_extract): +def _clone_soup_extract_child(soup, to_extract) -> Iterable[bs4.PageElement]: ''' Helper function to clone the soup and extract a child element. This is useful when the output of `flatten` would otherwise include the text contents of the child. 
''' - if not soup: - return cloned_soup = copy(soup) child = cloned_soup.find(to_extract) if child: child.extract() - return cloned_soup + return [cloned_soup] # TODO: add field diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index 8243fd7dc..18824db19 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -1,10 +1,11 @@ from copy import copy from os.path import join, split - +from ianalyzer_readers.xml_tag import Tag +from typing import Optional from django.conf import settings from addcorpus.python_corpora.corpus import XMLCorpusDefinition -from addcorpus.python_corpora.extract import Combined, Constant, ExternalFile, FilterAttribute, XML +from addcorpus.python_corpora.extract import Combined, Constant, ExternalFile, XML from addcorpus.serializers import LanguageField from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, \ clean_commentary, join_commentaries, get_text_in_language, \ @@ -29,19 +30,15 @@ def __init__(self): ) self._id.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'msIdentifier', 'idno'], - multiple=False, - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('idno'), flatten=True, transform=lambda x: ''.join(x.lower().split()) ) - self.url.extractor = FilterAttribute( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'msIdentifier', 'idno'], - multiple=False, - toplevel=False, + self.url.extractor = XML( + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('idno'), flatten=True, transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format( ''.join(x.lower().split())) @@ -49,10 +46,9 @@ def __init__(self): # quick and dirty for now: extract value for 'notBefore' self.year.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'date'], - toplevel=False, - attribute='notBefore' + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('date'), + attribute='notBefore', ) self.not_before.extractor = not_before_extractor() @@ -66,19 +62,13 @@ def __init__(self): ) self.transcription.extractor = ExternalFile( - stream_handler=extract_transcript + stream_handler=_extract_transcript ) - self.transcription_english.extractor = FilterAttribute( - tag=['div'], + self.transcription_english.extractor = XML( + Tag('div', type='translation'), Tag('p', limit=1), toplevel=True, - multiple=False, flatten=True, - attribute_filter={ - 'attribute': 'type', - 'value': 'translation' - }, - transform_soup_func=extract_paragraph, transform=lambda x: ' '.join(x.split()) if x else None ) @@ -92,11 +82,11 @@ def __init__(self): # ) self.iconography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('decoDesc'), Tag('decoNote'), multiple=True, - flatten=True + flatten=True, + transform='\n'.join, ) # is not present in IIS data @@ -109,147 +99,125 @@ def __init__(self): ) self.region.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'placeName', 'region'], - toplevel=False, - flatten=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('placeName'), Tag('region'), + 
flatten=True, ) self.settlement.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'placeName', 'settlement'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('placeName'), Tag('settlement'), flatten=True ) self.location_details.extractor = Combined( XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'placeName'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('placeName'), flatten=True ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'p'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('p'), flatten=True ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'provenance'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('provenance'), flatten=True ) ) self.material.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), attribute='ana', - toplevel=False, flatten=True, transform=lambda x: categorize_material(x) ) self.material_details.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), attribute='ana', - toplevel=False, flatten=True ) self.language.extractor = Combined( XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', - 'textLang'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msContents'), Tag('textLang'), attribute='mainLang', - toplevel=False, transform=lambda x: normalize_language(x) ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', - 'textLang'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msContents'), Tag('textLang'), attribute='otherLangs', - toplevel=False, transform=lambda x: normalize_language(x) ) ) self.language_code.extractor = Combined( XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', - 'textLang'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msContents'), Tag('textLang'), attribute='mainLang', - toplevel=False ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', - 'textLang'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msContents'), Tag('textLang'), attribute='otherLangs', - toplevel=False ) ) self.comments.extractor = Combined( XML( - tag=['text'], - toplevel=False, - multiple=False, + Tag('text'), Tag('div', type='commentary'), Tag('p', limit=1), flatten=True, - transform_soup_func=extract_comments, transform=lambda x: clean_commentary(x) if x else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'condition'], - toplevel=False, - transform_soup_func=extract_condition + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('condition'), + extract_soup_func=_extract_condition, ), XML( - 
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'layoutDesc', 'layout', 'p'], + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('layoutDesc'), Tag('layout'), Tag('p'), toplevel=False, - transform=lambda x: 'LAYOUT:\n{}\n\n'.format( - clean_commentary(x)) if x else None + transform=lambda x: f'LAYOUT:\n{clean_commentary(x)}\n\n' if x else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), attribute='ana', - transform=lambda x: 'OBJECTTYPE:\n{}\n\n'.format( - x[1:]) if x else None + transform=lambda x: f'OBJECTTYPE:\n{x[1:]}\n\n' if x else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'dimensions'], - toplevel=False, - transform_soup_func=extract_dimensions, - transform=lambda x: 'DIMENSIONS:\n{}\n\n'.format( - x) if x else None + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('dimensions'), + extract_soup_func=_extract_dimensions, + transform=lambda x: f'DIMENSIONS:\n{x}\n\n' if x else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), flatten=True, - transform=lambda x: 'SUPPORT:\n{}\n\n'.format( - clean_commentary(x)) if x else None + transform=lambda x: f'SUPPORT:\n{clean_commentary(x)}\n\n' if x else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'physDesc', 'handDesc', 'handNote'], - toplevel=False, - transform_soup_func=extract_handnotes + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('handDesc'), Tag('handNote'), + extract_soup_func=_extract_handnotes ), transform=lambda x: join_commentaries(x) ) self.bibliography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'msIdentifier', 'publications', 'publication'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('publications'), Tag('publication'), multiple=True ) @@ -274,7 +242,7 @@ def __init__(self): self.fields = exclude_fields_without_extractor(self.fields) -def extract_transcript(filestream): +def _extract_transcript(filestream): text = filestream.read().strip() filestream.close() # remove the tabs and spaces inherited from xml @@ -284,31 +252,11 @@ def extract_transcript(filestream): return text -def extract_paragraph(soup): - ''' - Extract first
<p>
element from `soup`, ignore the rest. - Ideal for ignoring

headers in the HTML versions of the body. - ''' - if not soup: - return - return soup.find('p') - - -def extract_comments(soup): - ''' - Helper function to extract the commentary from either or (siblings under ) - ''' - if not soup: - return - commentary_div = soup.find('div', {'type': 'commentary'}) - return extract_paragraph(commentary_div) - - -def extract_attribute_and_child_p(soup, field_header): +def _extract_attribute_and_child_p(soup, field_header) -> Optional[str]: ''' Extract value for 'ana' attribute from soup, as well as the text from a
<p>
child. Will be returned - in a new soup, i.e. a single element with text content + as a string in the following format `textcontent (attributevalue)` ''' result = '' @@ -316,7 +264,7 @@ def extract_attribute_and_child_p(soup, field_header): ana = None if 'ana' in soup.attrs: ana = soup['ana'] - p = extract_paragraph(soup) + p = soup.find('p') if p: text = p.get_text() if text: @@ -327,21 +275,18 @@ def extract_attribute_and_child_p(soup, field_header): if result: cloned_soup = copy(soup) cloned_soup.clear() - cloned_soup.string = '{}:\n{}\n\n'.format(field_header, result) - return cloned_soup + return '{}:\n{}\n\n'.format(field_header, result) -def extract_condition(soup): - return extract_attribute_and_child_p(soup, 'CONDITION') +def _extract_condition(soup): + return _extract_attribute_and_child_p(soup, 'CONDITION') -def extract_handnotes(soup): - if not soup: - return - return extract_attribute_and_child_p(soup, 'HANDNOTES') +def _extract_handnotes(soup): + return _extract_attribute_and_child_p(soup, 'HANDNOTES') -def extract_dimensions(soup): +def _extract_dimensions(soup) -> str: result = '' height_elem = soup.find('height') if height_elem: @@ -361,10 +306,7 @@ def extract_dimensions(soup): if depth: result = "{} D: {}".format(result, depth) - cloned_soup = copy(soup) - cloned_soup.clear() - cloned_soup.string = result - return cloned_soup + return result def normalize_language(text): @@ -381,9 +323,8 @@ def normalize_language(text): def not_after_extractor(): ''' iis misses the enclosing tag ''' return XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'date'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('date'), attribute='notAfter', transform=lambda x: transform_to_date(x, 'upper') ) @@ -392,9 +333,8 @@ def not_after_extractor(): def not_before_extractor(): ''' iis misses the enclosing tag ''' return XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'date'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('date'), attribute='notBefore', transform=lambda x: transform_to_date(x, 'lower') ) diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index f278de6fd..72f9134bf 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -2,6 +2,7 @@ import datetime from langdetect import detect from langdetect.lang_detect_exception import LangDetectException +from ianalyzer_readers.xml_tag import Tag from django.conf import settings @@ -34,12 +35,8 @@ class PeacePortal(ParentCorpusDefinition): min_date = datetime.datetime(year=746, month=1, day=1) category = 'inscription' - # Data overrides from .common.XMLCorpus - tag_entry = 'TEI' + tag_entry = Tag('TEI') - # New data members - non_xml_msg = 'Skipping non-XML file {}' - non_match_msg = 'Skipping XML file with nonmatching name {}' # overwrite below in child class if you need to extract the (converted) transcription # from external files. See README.
# el stands for modern Greek (1500-) @@ -158,15 +155,12 @@ def clean_commentary(commentary): return ' '.join(commentary.split()) -def join_commentaries(commentaries): +def join_commentaries(commentaries) -> str: ''' Helper function to join the result of a Combined extractor into one string, separating items by a newline ''' - results = [] - for comm in commentaries: - if comm: - results.append(comm) + results = filter(None, commentaries) return "\n".join(results) @@ -303,9 +297,8 @@ def transform_to_date_range(earliest, latest): def not_after_extractor(transform=True): ''' extractor for standard epidat format ''' return XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origDate', 'date'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origDate'), Tag('date'), attribute='notAfter', transform=lambda x: transform_to_date(x, 'upper') if transform else x ) @@ -314,9 +307,8 @@ def not_after_extractor(transform=True): def not_before_extractor(transform=True): ''' extractor for standard epidat format ''' return XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origDate', 'date'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origDate'), Tag('date'), attribute='notBefore', transform=lambda x: transform_to_date(x, 'lower') if transform else x ) diff --git a/backend/corpora/peaceportal/tests/test_peace.py b/backend/corpora/peaceportal/tests/test_peace.py index 968996702..26030962b 100644 --- a/backend/corpora/peaceportal/tests/test_peace.py +++ b/backend/corpora/peaceportal/tests/test_peace.py @@ -157,8 +157,12 @@ "bibliography": [ "Noy 1995, p. 69-70 (83)" ], - "comments": """DATE: + "comments": """Found on the 3rd of December 1904 in Cub.XL. The lower third of the plaque was left unused. There are poits between the syllables. Ferrua thought it might be pagan. 
+DATE: Uncertain + +AGE: +not mentioned """, "transcription_he": "", "transcription_la": "", @@ -265,6 +269,9 @@ def test_peace_imports(peace_test_settings, corpus_object): for key in target: tested_fields.add(key) assert key in doc + if doc[key] != target[key]: + compare = doc[key], target[key] + print(key) assert doc[key] == target[key] for key in doc: diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py index 6393bc191..773c29808 100644 --- a/backend/corpora/peaceportal/tol.py +++ b/backend/corpora/peaceportal/tol.py @@ -1,10 +1,12 @@ import re from copy import copy - +from ianalyzer_readers.xml_tag import Tag, TransformTag +from typing import Optional +import bs4 from django.conf import settings from addcorpus.python_corpora.corpus import XMLCorpusDefinition -from addcorpus.python_corpora.extract import XML, Constant, Combined, FilterAttribute +from addcorpus.python_corpora.extract import XML, Constant, Combined from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, \ clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language, \ transform_to_date_range, not_before_extractor, not_after_extractor @@ -24,28 +26,19 @@ def __init__(self): ) self._id.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'msIdentifier', 'idno'], - multiple=False, - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), + Tag('msDesc'), Tag('msIdentifier'), Tag('idno'), flatten=True ) - self.url.extractor = FilterAttribute( - tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], - multiple=False, - toplevel=False, + self.url.extractor = XML( + Tag('teiHeader'), Tag('fileDesc'), Tag('publicationStmt'), Tag('idno', type='url'), flatten=True, - attribute_filter={ - 'attribute': 'type', - 'value': 'url' - } ) self.year.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origDate', 'date'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origDate'), Tag('date'), transform=lambda x: get_year(x), ) @@ -55,147 +48,136 @@ def __init__(self): self.date.extractor = Combined( not_before_extractor(), not_after_extractor(), - transform=transform_to_date_range + transform=lambda dates: transform_to_date_range(*dates), ) self.transcription.extractor = XML( - tag=['text', 'body', 'div'], - toplevel=False, - multiple=False, + Tag('text'), Tag('body'), Tag('div', type='edition'), + Tag('ab'), flatten=True, transform=lambda x: clean_newline_characters(x), - transform_soup_func=extract_transcript ) self.names.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson', 'person'], + Tag('teiHeader'), Tag('profileDesc'), + Tag('particDesc'), Tag('listPerson'), Tag('person'), flatten=True, multiple=True, - toplevel=False, + transform=lambda names: ' '.join(names) if names else None, ) self.sex.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson', 'person'], + Tag('teiHeader'), Tag('profileDesc'), + Tag('particDesc'), Tag('listPerson'), Tag('person'), attribute='sex', multiple=True, - toplevel=False, transform=lambda x: convert_sex(x) ) self.dates_of_death.extractor = XML( - tag=['teiHeader', 'profileDesc', - 'particDesc', 'listPerson'], - transform_soup_func=extract_death, + Tag('teiHeader'), Tag('profileDesc'), + Tag('particDesc'), Tag('listPerson'), + Tag('death'), + multiple=True, attribute='when', - multiple=False, - toplevel=False, + 
transform=lambda x: x if x else None ) self.country.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'country'], - toplevel=False, - transform_soup_func=extract_country, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('country'), + TransformTag(extract_country), transform=lambda x: clean_country(x), flatten=True, ) self.region.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'country', 'region'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('country'), Tag('region'), flatten=True ) self.settlement.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'settlement'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('settlement'), + TransformTag(extract_settlement), flatten=True, - transform_soup_func=extract_settlement, ) self.location_details.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'settlement', 'geogName'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('settlement'), + Tag('geogName'), TransformTag(extract_location_details), flatten=True, - transform_soup_func=extract_location_details, ) self.material.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p', 'material'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), Tag('material'), flatten=True, transform=lambda x: categorize_material(x) ) self.material_details.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p', 'material'], - toplevel=False, - flatten=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), Tag('material'), + flatten=True, ) self.language.extractor = XML( - tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], - toplevel=False, + Tag('teiHeader'), Tag('profileDesc'), Tag('langUsage'), Tag('language'), multiple=True, transform=lambda x: get_language(x) ) self.comments.extractor = Combined( XML( - tag=['text', 'body'], - toplevel=False, - transform_soup_func=extract_commentary, + Tag('text'), Tag('body'), Tag('div', type='commentary'), + multiple=True, + extract_soup_func=_extract_commentary, + transform=lambda x: '\n'.join(filter(None, x)) if x else None ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'condition'], - toplevel=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('condition'), flatten=True, - transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x + transform=lambda x: f'CONDITION:\n{x}\n' if x else x ), XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', - 'objectDesc', 'supportDesc', 'support', 'p'], - toplevel=False, - 
transform_soup_func=extract_support_comments, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('objectDesc'), Tag('supportDesc'), Tag('support'), + Tag('p'), + extract_soup_func=_extract_support_comments, ), transform=lambda x: join_commentaries(x) ) self.images.extractor = XML( - tag=['facsimile', 'graphic'], + Tag('facsimile'), Tag('graphic'), multiple=True, attribute='url', - toplevel=False + transform=lambda x: x if x else None ) self.coordinates.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'], - toplevel=False, - multiple=False, + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('history'), Tag('origin'), Tag('origPlace'), Tag('settlement'), + Tag('geogName'), Tag('geo'), flatten=True ) self.iconography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', - 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], - toplevel=False, - multiple=False + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('physDesc'), Tag('decoDesc'), Tag('decoNote'), ) self.bibliography.extractor = XML( - tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', - 'msIdentifier', 'publications', 'publication'], - toplevel=False, - multiple=True + Tag('teiHeader'), Tag('fileDesc'), Tag('sourceDesc'), Tag('msDesc'), + Tag('msIdentifier'), Tag('publications'), Tag('publication'), + multiple=True, + transform=lambda x: x if x else None ) self.transcription_hebrew.extractor = Combined( @@ -257,24 +239,10 @@ def get_language(values): return values -def extract_transcript(soup): - ''' - Helper function to ensure correct extraction of the transcripts. - Note that there are multiple formats in which these are stored, - but the text that we need is always in the `` children of - `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one). - ''' - if not soup: - return - return soup.find_all('ab') - - def extract_translation(soup): ''' Helper function to extract translation from the tag ''' - if not soup: - return translation = soup.find('div', {'type': 'translation'}) if translation: return translation.find_all('ab') @@ -282,43 +250,25 @@ def extract_translation(soup): return -def extract_commentary(soup): +def _extract_commentary(commentary: bs4.PageElement) -> Optional[str]: ''' - Helper function to extract all commentaries from the tag. + Helper function to extract all commentaries from

1: - cloned_soup = copy(soup) - cloned_soup.clear() - cloned_soup.string = "\n".join(found) - return cloned_soup - else: - return None + if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: + p = commentary.find('p') + if p: + text = p.get_text() + if text: + subtype = commentary['subtype'] + text = clean_commentary(text) + return f'{subtype}:\n{text}\n' -def extract_support_comments(soup): - if not soup: - return - cloned_soup = copy(soup) - cloned_soup.clear() - commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS') - commentaries = add_support_comment( +def _extract_support_comments(soup: bs4.PageElement) -> str: + commentaries = _add_support_comment(soup, '', 'dim', 'DIMENSIONS') + commentaries = _add_support_comment( soup, commentaries, 'objectType', 'OBJECTTYPE') # add any additional text from the
<support>
element, @@ -327,13 +277,12 @@ def extract_support_comments(soup): text = contents[len(contents) - 1].strip() if text: text = clean_commentary(text) - commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text) + commentaries = f'{commentaries}SUPPORT:\n{text}\n' - cloned_soup.string = commentaries - return cloned_soup + return commentaries -def add_support_comment(soup, existing_commentaries, elem_name, commentary_name): +def _add_support_comment(soup: bs4.PageElement, existing_commentaries: str, elem_name, commentary_name) -> str: elem = soup.find(elem_name) if elem: text = elem.get_text() @@ -343,15 +292,6 @@ def add_support_comment(soup, existing_commentaries, elem_name, commentary_name) return existing_commentaries -def extract_death(soup): - ''' - Helper function to extract date of death from multiple person tags. - ''' - if not soup: - return - return soup.find_all('death') - - def extract_country(soup): ''' Helper function to extract country. @@ -375,13 +315,11 @@ def clone_soup_extract_child(soup, to_extract): This is useful when the output of `flatten` would otherwise include the text contents of the child. ''' - if not soup: - return cloned_soup = copy(soup) child = cloned_soup.find(to_extract) if child: - child.extract() - return cloned_soup + [child.extract()] + return [cloned_soup] # TODO: add field diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index 9b8b017de..24111c8a5 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -9,6 +9,7 @@ from datetime import datetime import re import openpyxl +from ianalyzer_readers.xml_tag import Tag, SiblingTag, ParentTag from django.conf import settings @@ -40,13 +41,9 @@ class Periodicals(XMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - tag_toplevel = 'articles' - tag_entry = 'artInfo' - - # New data members - filename_pattern = re.compile('[a-zA-z]+_(\d+)_(\d+)') - non_xml_msg = 'Skipping non-XML file {}' - non_match_msg = 'Skipping XML file with nonmatching name {}' + tag_toplevel = Tag('articles') + tag_entry = Tag('artInfo') + external_file_tag_toplevel = Tag('issue') mimetype = 'image/jpeg' @@ -112,9 +109,7 @@ def sources(self, start=min_date, end=max_date): display_name='ID', description='Unique identifier of the entry.', es_mapping=keyword_mapping(), - extractor=extract.XML(tag=None, - toplevel=False, - attribute='id'), + extractor=extract.XML(attribute='id'), ), FieldDefinition( name='issue', @@ -147,7 +142,7 @@ def sources(self, start=min_date, end=max_date): description='Text content.', es_mapping=main_content_mapping(True, True, True, 'en'), results_overview=True, - extractor=extract.XML(tag='ocrText', flatten=True), + extractor=extract.XML(Tag('ocrText'), flatten=True), search_field_core=True, visualizations=["wordcloud"], language='en', @@ -163,15 +158,9 @@ def sources(self, start=min_date, end=max_date): 'indicator is in this range.' 
) ), - extractor=extract.XML(tag='ocr', - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - } + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + SiblingTag('ocr'), ), sortable=True ), @@ -179,15 +168,10 @@ def sources(self, start=min_date, end=max_date): name='title', display_name='Article title', description='Title of the article.', - extractor=extract.XML(tag='ti', - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - } + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + SiblingTag('ti'), + external_file=True, ), visualizations=['wordcloud'] ), @@ -196,15 +180,10 @@ def sources(self, start=min_date, end=max_date): es_mapping={'type': 'keyword'}, display_name='Starting column', description='Which column the article starts in.', - extractor=extract.XML(tag='sc', - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - } + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + SiblingTag('sc'), + external_file=True, ) ), FieldDefinition( @@ -212,15 +191,10 @@ def sources(self, start=min_date, end=max_date): display_name='Page count', description='How many pages the article covers.', es_mapping={'type': 'integer'}, - extractor=extract.XML(tag='pc', - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - } + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + SiblingTag('pc'), + external_file=True, ) ), FieldDefinition( @@ -228,15 +202,10 @@ def sources(self, start=min_date, end=max_date): display_name='Word count', description='Number of words in the article.', es_mapping={'type': 'integer'}, - extractor=extract.XML(tag='wordCount', - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - } + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + SiblingTag('wordCount'), + external_file=True, ) ), FieldDefinition( @@ -245,15 +214,10 @@ def sources(self, start=min_date, end=max_date): display_name='Category', description='Article category.', es_mapping={'type': 'keyword'}, - extractor=extract.XML(tag='ct', - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - } + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + SiblingTag('ct'), + external_file=True, ), search_filter=filters.MultipleChoiceFilter( description='Accept only articles in these categories.', @@ -266,16 +230,11 @@ def sources(self, start=min_date, end=max_date): display_name='Page number', description='At which page the article starts.', es_mapping={'type': 'integer'}, - extractor=extract.XML(tag='pa', - parent_level=1, - external_file={ - 'xml_tag_toplevel': 'issue', - 'xml_tag_entry': 'article' - }, - secondary_tag = { - 'tag': 'id', - 'match': 'id' - }, + extractor=extract.XML( + lambda metadata: Tag('id', string=metadata['id']), + ParentTag(2), + Tag('pa'), + external_file=True, transform=lambda x: re.sub('[\[\]]', '', x) ) ), diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index 3f07a3488..fc46c2d39 100644 --- 
a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -5,6 +5,7 @@ from os import makedirs, remove from typing import Optional from zipfile import ZipFile, BadZipFile +from ianalyzer_readers.xml_tag import Tag, ParentTag from django.conf import settings @@ -17,20 +18,25 @@ logger = logging.getLogger('indexing') -def rdf_description_extractor(tag, section='xml', **kwargs): - '''rdf:Description extractor +def _rdf_description_extractor(tag: Tag, section='xml', **kwargs) -> extract.XML: + ''' + Extracts a child of the rdf:Description tag + There are two rdf:Description tags available in the data: - description about the open data enrichment - description about the source There is only deterministic way to select the right one: - - check the dcterms:format sibling tag''' + - check the dcterms:format sibling tag + ''' return extract.XML( - tag=tag, - secondary_tag={'tag': 'dcterms:format', 'exact': f'text/{section}'}, + Tag('dcterms:format', string=f'text/{section}'), + ParentTag(1), + tag, **kwargs ) + class Rechtspraak(XMLCorpusDefinition): title = "Judicial system Netherlands" description = "Open data of (anonymised) court rulings of the Dutch judicial system" @@ -49,7 +55,7 @@ class Rechtspraak(XMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - tag_toplevel = 'open-rechtspraak' + tag_toplevel = Tag('open-rechtspraak') def unpack(self, min_year: Optional[int] = None, @@ -144,7 +150,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None display_name='ID', description='', es_mapping=keyword_mapping(), - extractor=rdf_description_extractor('dcterms:identifier'), + extractor=_rdf_description_extractor(Tag('dcterms:identifier')), csv_core=True, ), FieldDefinition( @@ -153,8 +159,8 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None description='Document has available text content.', es_mapping={'type': 'boolean'}, extractor=extract.Backup( - extract.XML('uitspraak', flatten=True), - extract.XML('conclusie', flatten=True), + extract.XML(Tag('uitspraak'), flatten=True), + extract.XML(Tag('conclusie'), flatten=True), extract.Constant(False), transform=bool ), @@ -176,7 +182,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='date', display_name='Date', - extractor=rdf_description_extractor('dcterms:date'), + extractor=_rdf_description_extractor(Tag('dcterms:date')), es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, results_overview=True, csv_core=True, @@ -192,7 +198,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='issued', display_name='Publication Date', - extractor=rdf_description_extractor('dcterms:issued'), + extractor=_rdf_description_extractor(Tag('dcterms:issued')), es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, search_filter=filters.DateFilter( min_date, @@ -205,14 +211,14 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='publisher', display_name='Publisher', - extractor=rdf_description_extractor('dcterms:publisher'), + extractor=_rdf_description_extractor(Tag('dcterms:publisher')), es_mapping={'type': 'keyword'}, language='nl', ), FieldDefinition( name='creator', display_name='Court', - extractor=rdf_description_extractor('dcterms:creator'), + extractor=_rdf_description_extractor(Tag('dcterms:creator')), es_mapping={'type': 
'keyword'}, csv_core=True, results_overview=True, @@ -227,12 +233,12 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None name='zaaknr', display_name='Case Number', es_mapping=keyword_mapping(), - extractor=rdf_description_extractor('psi:zaaknummer') + extractor=_rdf_description_extractor(Tag('psi:zaaknummer')), ), FieldDefinition( name='type', display_name='Type', - extractor=rdf_description_extractor('dcterms:type'), + extractor=_rdf_description_extractor(Tag('dcterms:type')), es_mapping={'type': 'keyword'}, csv_core=True, results_overview=True, @@ -246,7 +252,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='procedure', display_name='(type of) Procedure', - extractor=rdf_description_extractor('psi:procedure'), + extractor=_rdf_description_extractor(Tag('psi:procedure')), csv_core=True, es_mapping={'type': 'keyword'}, search_filter=filters.MultipleChoiceFilter( @@ -260,13 +266,13 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None name='spatial', display_name='Location', es_mapping=keyword_mapping(), - extractor=rdf_description_extractor('dcterms:spatial'), + extractor=_rdf_description_extractor(Tag('dcterms:spatial')), language='nl', ), FieldDefinition( name='subject', display_name='Area of law', - extractor=rdf_description_extractor('dcterms:subject'), + extractor=_rdf_description_extractor(Tag('dcterms:subject')), csv_core=True, es_mapping={'type': 'keyword'}, search_filter=filters.MultipleChoiceFilter( @@ -279,8 +285,8 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='title', display_name='Title', - extractor=rdf_description_extractor( - 'dcterms:title', section='html'), + extractor=_rdf_description_extractor( + Tag('dcterms:title'), section='html'), results_overview=True, search_field_core=True, language='nl', @@ -288,7 +294,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None FieldDefinition( name='abstract', display_name='Abstract', - extractor=extract.XML(tag='inhoudsindicatie', flatten=True), + extractor=extract.XML(Tag('inhoudsindicatie'), flatten=True), results_overview=True, language='nl', ), @@ -298,8 +304,8 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None display_type='text_content', es_mapping=main_content_mapping(True, True, True, 'nl'), extractor=extract.Backup( - extract.XML('uitspraak', flatten=True), - extract.XML('conclusie', flatten=True), + extract.XML(Tag('uitspraak'), flatten=True), + extract.XML(Tag('conclusie'), flatten=True), extract.Constant('Content not available') ), csv_core=True, @@ -312,7 +318,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None display_type='url', description='URL of the case on rechtspraak.nl', es_mapping=keyword_mapping(), - extractor=rdf_description_extractor( - 'dcterms:identifier', section='html') + extractor=_rdf_description_extractor( + Tag('dcterms:identifier'), section='html') ) ] diff --git a/backend/corpora/times/images/times.jpg b/backend/corpora/times/images/times.jpg index 6acbd5524..4bc3d3608 100644 Binary files a/backend/corpora/times/images/times.jpg and b/backend/corpora/times/images/times.jpg differ diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index c05ee90a4..5a0fbb954 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -22,6 +22,8 @@ from django.conf import settings from 
media.media_url import media_url +from ianalyzer_readers.xml_tag import Tag, ParentTag + logger = logging.getLogger(__name__) @@ -43,8 +45,8 @@ class Times(XMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - tag_toplevel = 'issue' - tag_entry = 'article' + tag_toplevel = Tag('issue') + tag_entry = Tag('article') def sources(self, start=datetime.min, end=datetime.max): ''' @@ -117,7 +119,9 @@ def sources(self, start=datetime.min, end=datetime.max): description='Library where the microfilm is sourced', es_mapping=keyword_mapping(), extractor=extract.XML( - tag=['metadatainfo', 'sourceLibrary'], toplevel=True, + Tag('metadatainfo'), + Tag('sourceLibrary'), + toplevel=True, applicable=after(1985) ) ), @@ -127,11 +131,13 @@ def sources(self, start=datetime.min, end=datetime.max): es_mapping=keyword_mapping(), extractor=extract.Choice( extract.XML( - tag='ed', toplevel=True, + Tag('ed'), + toplevel=True, applicable=until(1985) ), extract.XML( - tag='ed', toplevel=True, multiple=True, + Tag('ed'), + toplevel=True, multiple=True, applicable=after(1985) ) ), @@ -143,7 +149,8 @@ def sources(self, start=datetime.min, end=datetime.max): es_mapping={'type': 'integer'}, description='Source issue number.', extractor=extract.XML( - tag='is', toplevel=True, + Tag('is'), + toplevel=True, # Hardcoded to ignore one particular issue with source data transform=lambda x: (62226 if x == "6222662226" else int(x)) ), @@ -156,7 +163,8 @@ def sources(self, start=datetime.min, end=datetime.max): description='Volume number.', es_mapping=keyword_mapping(), extractor=extract.XML( - tag='volNum', toplevel=True, + Tag('volNum'), + toplevel=True, applicable=after(1985) ), csv_core=True @@ -170,7 +178,8 @@ def sources(self, start=datetime.min, end=datetime.max): sortable=True, description='Publication date as full string, as found in source file', extractor=extract.XML( - tag='da', toplevel=True + Tag('da'), + toplevel=True ) ), FieldDefinition( @@ -184,7 +193,7 @@ def sources(self, start=datetime.min, end=datetime.max): 'indicator is in this range.' ) ), - extractor=extract.XML(tag='ocr', transform=float), + extractor=extract.XML(Tag('ocr'), transform=float), sortable=True ), FieldDefinition( @@ -196,7 +205,7 @@ def sources(self, start=datetime.min, end=datetime.max): 'For issues that span more than 1 day.' 
), extractor=extract.XML( - tag='tdate', toplevel=True, + Tag('tdate'), toplevel=True, applicable=after(1985) ) ), @@ -206,7 +215,7 @@ def sources(self, start=datetime.min, end=datetime.max): description='Page count: number of images present in the issue.', es_mapping={'type': 'integer'}, extractor=extract.XML( - tag='ip', toplevel=True, transform=int + Tag('ip'), toplevel=True, transform=int ), sortable=True ), @@ -223,7 +232,9 @@ def sources(self, start=datetime.min, end=datetime.max): option_count=2 ), extractor=extract.XML( - tag=['..', 'pageid'], attribute='isPartOf', + ParentTag(), + Tag('pageid'), + attribute='isPartOf', applicable=after(1985) ) ), @@ -232,7 +243,10 @@ def sources(self, start=datetime.min, end=datetime.max): display_name='Supplement title', description='Supplement title.', extractor=extract.XML( - tag=['..', 'pageid', 'supptitle'], multiple=True, + ParentTag(), + Tag('pageid'), + Tag('supptitle'), + multiple=True, applicable=after(1985) ), ), @@ -241,7 +255,10 @@ def sources(self, start=datetime.min, end=datetime.max): display_name='Supplement subtitle', description='Supplement subtitle.', extractor=extract.XML( - tag=['..', 'pageid', 'suppsubtitle'], multiple=True, + ParentTag(), + Tag('pageid'), + Tag('suppsubtitle'), + multiple=True, applicable=after(1985) ) ), @@ -270,7 +287,7 @@ def sources(self, start=datetime.min, end=datetime.max): display_name='ID', description='Article identifier.', es_mapping=keyword_mapping(), - extractor=extract.XML(tag='id') + extractor=extract.XML(Tag('id')) ), FieldDefinition( name='ocr-relevant', @@ -278,7 +295,7 @@ def sources(self, start=datetime.min, end=datetime.max): description='Whether OCR confidence level is relevant.', es_mapping={'type': 'boolean'}, extractor=extract.XML( - tag='ocr', attribute='relevant', + Tag('ocr'), attribute='relevant', transform=string_contains("yes"), ) ), @@ -290,7 +307,7 @@ def sources(self, start=datetime.min, end=datetime.max): 'where article starts.' ), es_mapping=keyword_mapping(), - extractor=extract.XML(tag='sc') + extractor=extract.XML(Tag('sc')) ), FieldDefinition( name='page', @@ -298,8 +315,8 @@ def sources(self, start=datetime.min, end=datetime.max): description='Start page label, from source (1, 2, 17A, ...).', es_mapping=keyword_mapping(), extractor=extract.Choice( - extract.XML(tag='pa', applicable=until(1985)), - extract.XML(tag=['..', 'pa'], applicable=after(1985)) + extract.XML(Tag('pa'), applicable=until(1985)), + extract.XML(ParentTag(), Tag('pa'), applicable=after(1985)) ) ), FieldDefinition( @@ -311,7 +328,7 @@ def sources(self, start=datetime.min, end=datetime.max): 'of the article.' 
), extractor=extract.XML( - tag='pc', transform=int + Tag('pc'), transform=int ), sortable=True ), @@ -322,13 +339,13 @@ def sources(self, start=datetime.min, end=datetime.max): search_field_core=True, visualizations=['wordcloud'], description='Article title.', - extractor=extract.XML(tag='ti') + extractor=extract.XML(Tag('ti')) ), FieldDefinition( name='subtitle', display_name='Subtitle', description='Article subtitle.', - extractor=extract.XML(tag='ta', multiple=True), + extractor=extract.XML(Tag('ta'), multiple=True), search_field_core=True ), FieldDefinition( @@ -336,7 +353,7 @@ def sources(self, start=datetime.min, end=datetime.max): display_name='Subheader', description='Article subheader (product dependent field).', extractor=extract.XML( - tag='subheader', multiple=True, + Tag('subheader'), multiple=True, applicable=after(1985) ) ), @@ -347,11 +364,11 @@ def sources(self, start=datetime.min, end=datetime.max): es_mapping=keyword_mapping(True), extractor=extract.Choice( extract.XML( - tag='au', multiple=True, + Tag('au'), multiple=True, applicable=until(1985) ), extract.XML( - tag='au_composed', multiple=True, + Tag('au_composed'), multiple=True, applicable=after(1985) ) ), @@ -364,7 +381,7 @@ def sources(self, start=datetime.min, end=datetime.max): description='Credited as source.', es_mapping=keyword_mapping(True), extractor=extract.XML( - tag='altSource', multiple=True + Tag('altSource'), multiple=True ) ), FieldDefinition( @@ -377,7 +394,7 @@ def sources(self, start=datetime.min, end=datetime.max): description='Accept only articles in these categories.', option_count=25 ), - extractor=extract.XML(tag='ct', multiple=True), + extractor=extract.XML(Tag('ct'), multiple=True), csv_core=True ), FieldDefinition( @@ -396,11 +413,11 @@ def sources(self, start=datetime.min, end=datetime.max): ), extractor=extract.Choice( extract.XML( - tag='il', multiple=True, + Tag('il'), multiple=True, applicable=until(1985) ), extract.XML( - tag='il', attribute='type', multiple=True, + Tag('il'), attribute='type', multiple=True, applicable=after(1985) ) ), @@ -411,7 +428,8 @@ def sources(self, start=datetime.min, end=datetime.max): display_name='Content preamble', description='Raw OCR\'ed text (preamble).', extractor=extract.XML( - tag=['text', 'text.preamble'], + Tag('text'), + Tag('text.preamble'), flatten=True ) ), @@ -420,7 +438,8 @@ def sources(self, start=datetime.min, end=datetime.max): display_name='Content heading', description='Raw OCR\'ed text (header).', extractor=extract.XML( - tag=['text', 'text.title'], + Tag('text'), + Tag('text.title'), flatten=True ) ), @@ -434,8 +453,11 @@ def sources(self, start=datetime.min, end=datetime.max): results_overview=True, search_field_core=True, extractor=extract.XML( - tag=['text', 'text.cr'], multiple=True, - flatten=True + Tag('text'), + Tag('text.cr'), + multiple=True, + flatten=True, + transform='\n'.join, ), language='en', ), diff --git a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py index 0fa5cdc01..02ad45ab0 100644 --- a/backend/corpora/troonredes/troonredes.py +++ b/backend/corpora/troonredes/troonredes.py @@ -8,6 +8,7 @@ import os from os.path import join, splitext from datetime import datetime +from ianalyzer_readers.xml_tag import Tag from django.conf import settings @@ -45,11 +46,10 @@ class Troonredes(XMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - tag_toplevel = 'doc' - tag_entry = 'entry' + tag_toplevel = Tag('doc') 
+ tag_entry = Tag('entry') non_xml_msg = 'Skipping non-XML file {}' - non_match_msg = 'Skipping XML file with nonmatching name {}' def sources(self, start=min_date, end=max_date): logger = logging.getLogger(__name__) @@ -68,7 +68,7 @@ def sources(self, start=min_date, end=max_date): name='date', display_name='Date', description='Date of the speech', - extractor=extract.XML(tag='date'), + extractor=extract.XML(Tag('date')), es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'}, results_overview=True, csv_core=True, @@ -90,7 +90,7 @@ def sources(self, start=min_date, end=max_date): name='title', display_name='Title', description='title.', - extractor=extract.XML(tag='title'), + extractor=extract.XML(Tag('title')), results_overview=True, search_field_core=True, language='nl', @@ -99,7 +99,7 @@ def sources(self, start=min_date, end=max_date): name='monarch', display_name='Monarch', description='Monarch that gave the speech.', - extractor=extract.XML(tag='monarch'), + extractor=extract.XML(Tag('monarch')), es_mapping={'type': 'keyword'}, results_overview=True, csv_core=True, @@ -116,7 +116,7 @@ def sources(self, start=min_date, end=max_date): name='speech_type', display_name='Speech type', description='Type of speech.', - extractor=extract.XML(tag='speech_type'), + extractor=extract.XML(Tag('speech_type')), es_mapping={'type': 'keyword'}, results_overview=True, csv_core=True, @@ -138,7 +138,7 @@ def sources(self, start=min_date, end=max_date): results_overview=True, search_field_core=True, visualizations=['wordcloud', 'ngram'], - extractor=extract.XML(tag='content'), + extractor=extract.XML(Tag('content')), language='nl', ), ] diff --git a/backend/corpora/ublad/ublad.py b/backend/corpora/ublad/ublad.py index ec340eb9c..5b0c8949e 100644 --- a/backend/corpora/ublad/ublad.py +++ b/backend/corpora/ublad/ublad.py @@ -1,23 +1,17 @@ from datetime import datetime import os -from os.path import join, splitext -import locale +from os.path import join import logging from django.conf import settings from addcorpus.python_corpora.corpus import HTMLCorpusDefinition, FieldDefinition -from addcorpus.python_corpora.extract import FilterAttribute +from ianalyzer_readers.extract import XML +from ianalyzer_readers.xml_tag import Tag from addcorpus.es_mappings import * from addcorpus.python_corpora.filters import DateFilter from addcorpus.es_settings import es_settings -from ianalyzer_readers.readers.html import HTMLReader -from ianalyzer_readers.readers.core import Field -from ianalyzer_readers.extract import html, Constant - -from bs4 import BeautifulSoup, Tag - def transform_content(soup): """ Transforms the text contents of a page node (soup) into a string consisting @@ -39,13 +33,18 @@ def transform_content(soup): page_text += paragraph_text + '\n\n' return page_text +months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', + 'september', 'oktober', 'november', 'december'] + def transform_date(date_string): + day_string, month_string, year_string = date_string.split() try: - locale.setlocale(locale.LC_ALL, 'nl_NL.UTF-8') - date = datetime.strptime(date_string, '%d %B %Y').strftime('%Y-%m-%d') - locale.setlocale(locale.LC_ALL, '') - return date - except ValueError: + day = int(day_string) + month = next(i + 1 for i, month in enumerate(months) if month == month_string) + year = int(year_string) + date = datetime(year=year, month=month, day=day) + return date.strftime('%Y-%m-%d') + except: logger.error("Unable to get date from {}".format(date_string)) return None @@ -79,7 +78,7 @@ class 
UBlad(HTMLCorpusDefinition): def es_settings(self): return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) - def sources(self, start=min_date, end=max_date): + def sources(self, **kwargs): for directory, _, filenames in os.walk(self.data_directory): _body, tail = os.path.split(directory) if '.snapshot' in _: @@ -102,15 +101,10 @@ def sources(self, start=min_date, end=max_date): search_field_core=True, visualizations=['ngram', 'wordcloud'], es_mapping = main_content_mapping(True, True, True, 'nl'), - extractor= FilterAttribute(tag='div', - recursive=True, - multiple=False, - flatten=False, - extract_soup_func=transform_content, - attribute_filter={ - 'attribute': 'class', - 'value': 'ocr_page' - }) + extractor=XML( + Tag('div', attrs={'class': 'ocr_page'}), + extract_soup_func=transform_content, + ) ), FieldDefinition( name='pagenum', @@ -118,21 +112,19 @@ def sources(self, start=min_date, end=max_date): description='Page number', csv_core=True, es_mapping = int_mapping(), - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'pagenum' - } + extractor = XML( + Tag('meta', attrs={'name': 'pagenum'}), + attribute='content' ) ), FieldDefinition( name='journal_title', display_name='Publication Title', description='Title of the publication', - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'journal_title' - } - ) + extractor=XML( + Tag('meta', attrs={'name': 'journal_title'}), + attribute='content', + ), ), FieldDefinition( name='volume_id', @@ -140,21 +132,19 @@ def sources(self, start=min_date, end=max_date): description='Unique identifier for this volume', hidden=True, es_mapping=keyword_mapping(), - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'identifier_ocn' - } - ) + extractor=XML( + Tag('meta', attrs={'name': 'identifier_ocn'}), + attribute='content', + ), ), FieldDefinition( name='id', display_name='Page ID', description='Unique identifier for this page', hidden=True, - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'identifier_indexid' - } + extractor=XML( + Tag('meta', attrs={'name': 'identifier_indexid'}), + attribute='content', ) ), FieldDefinition( @@ -163,10 +153,9 @@ def sources(self, start=min_date, end=max_date): description='The number of the edition in this volume. Every year starts at 1.', sortable=True, es_mapping = keyword_mapping(), - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'aflevering' - } + extractor=XML( + Tag('meta', attrs={'name': 'aflevering'}), + attribute='content', ) ), FieldDefinition( @@ -177,10 +166,9 @@ def sources(self, start=min_date, end=max_date): csv_core=True, description='The volume number of this publication. There is one volume per year.', es_mapping=keyword_mapping(), - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'yearstring' - } + extractor=XML( + Tag('meta', attrs={'name': 'yearstring'}), + attribute='content', ), ), FieldDefinition( @@ -198,12 +186,12 @@ def sources(self, start=min_date, end=max_date): 'Accept only articles with publication date in this range.' 
) ), - extractor = FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'datestring', - }, - transform=transform_date - ) + extractor=XML( + Tag('meta', attrs={'name': 'datestring'}), + attribute='content', + transform=transform_date, + + ), ), FieldDefinition( name='repo_url', @@ -212,11 +200,10 @@ def sources(self, start=min_date, end=max_date): es_mapping=keyword_mapping(), display_type='url', searchable=False, - extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'link_repository' - } - ) + extractor=XML( + Tag('meta', attrs={'name': 'link_repository'}), + attribute='content', + ), ), FieldDefinition( name='reader_url', @@ -225,11 +212,10 @@ def sources(self, start=min_date, end=max_date): es_mapping=keyword_mapping(), display_type='url', searchable=False, - extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'link_objects_image' - } - ) + extractor=XML( + Tag('meta', attrs={'name': 'link_objects_image'}), + attribute='content', + ), ), FieldDefinition( name='jpg_url', @@ -238,11 +224,10 @@ def sources(self, start=min_date, end=max_date): es_mapping=keyword_mapping(), display_type='url', searchable=False, - extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'link_objects_jpg' - } - ) + extractor=XML( + Tag('meta', attrs={'name': 'link_objects_jpg'}), + attribute='content', + ), ), FieldDefinition( name='worldcat_url', @@ -251,11 +236,10 @@ def sources(self, start=min_date, end=max_date): es_mapping=keyword_mapping(), display_type='url', searchable=False, - extractor=FilterAttribute(tag='meta', attribute='content', attribute_filter={ - 'attribute': 'name', - 'value': 'link_worldcat' - } - ) + extractor=XML( + Tag('meta', attrs={'name': 'link_worldcat'}), + attribute='content', + ), ) ] diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py index 7cbbd981c..0aec685ad 100644 --- a/backend/download/tests/test_download_views.py +++ b/backend/download/tests/test_download_views.py @@ -265,3 +265,22 @@ def test_query_text_in_csv(db, client, basic_mock_corpus, basic_corpus_public, i reader = csv.DictReader(stream, delimiter=';') row = next(reader) assert row['query'] == 'ghost' + +@pytest.mark.xfail(reason='query in context download does not work') +def test_download_with_query_in_context( + db, admin_client, small_mock_corpus, index_small_mock_corpus +): + es_query = query.set_query_text(query.MATCH_ALL, 'the') + es_query['highlight'] = { 'fragment_size': 200, 'fields': { 'content': {} } } + es_query['size'] = 3 + request_json = { + 'corpus': small_mock_corpus, + 'es_query': es_query, + 'fields': ['date', 'content', 'context'], + 'route': f"/search/{small_mock_corpus}?query=the&highlight=200", + 'encoding': 'utf-8' + } + response = admin_client.post( + '/api/download/search_results', request_json, content_type='application/json' + ) + assert status.is_success(response.status_code) diff --git a/backend/es/conftest.py b/backend/es/conftest.py index 40d462e05..8f60cd588 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -2,6 +2,7 @@ from time import sleep from django.contrib.auth.models import Group +import elasticsearch from addcorpus.python_corpora.load_corpus import load_corpus_definition from addcorpus.models import Corpus @@ -27,8 +28,11 @@ def es_ner_search_client(es_client, basic_mock_corpus, 
basic_corpus_public, inde """ # add data from mock corpus corpus = Corpus.objects.get(name=basic_mock_corpus) - es_client.indices.put_mapping(index=corpus.configuration.es_index, properties={ - "content:ner": {"type": "annotated_text"}}) + try: + es_client.indices.put_mapping(index=corpus.configuration.es_index, properties={ + "content:ner": {"type": "annotated_text"}}) + except elasticsearch.BadRequestError: + pytest.skip('Annotated text plugin not installed') es_client.index(index=corpus.configuration.es_index, document={ 'id': 'my_identifier', diff --git a/backend/requirements.txt b/backend/requirements.txt index d850969c1..4f21d7295 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile @@ -32,7 +32,7 @@ celery==5.3.1 # -r requirements.in # flower # pytest-celery -certifi==2023.7.22 +certifi==2024.7.4 # via # elastic-transport # requests @@ -73,9 +73,9 @@ defusedxml==0.7.1 # djangosaml2 # pysaml2 # python3-openid -dj-rest-auth[with-social,with_social]==4.0.1 +dj-rest-auth[with_social]==4.0.1 # via -r requirements.in -django==4.2.11 +django==4.2.14 # via # -r requirements.in # dj-rest-auth @@ -90,7 +90,7 @@ django-livereload-server==0.4 # via -r requirements.in django-revproxy @ git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a # via -r requirements.in -djangorestframework==3.14.0 +djangorestframework==3.15.2 # via # -r requirements.in # dj-rest-auth @@ -132,7 +132,7 @@ h11==0.14.0 # wsproto humanize==4.9.0 # via flower -ianalyzer-readers==0.1.0 +ianalyzer-readers==0.2.0 # via -r requirements.in idna==3.4 # via @@ -143,6 +143,8 @@ iniconfig==2.0.0 # via # pytest # seleniumbase +isodate==0.6.1 + # via rdflib joblib==1.3.2 # via # nltk @@ -254,6 +256,8 @@ pyopenssl==23.2.0 # via pysaml2 pyotp==2.9.0 # via seleniumbase +pyparsing==3.1.2 + # via rdflib pypdf2==3.0.1 # via -r requirements.in pysaml2==7.3.1 @@ -297,10 +301,11 @@ python3-openid==3.2.0 # via django-allauth pytz==2023.3 # via - # djangorestframework # flower # pandas # pysaml2 +rdflib==7.0.0 + # via ianalyzer-readers redis==5.0.0 # via -r requirements.in referencing==0.33.0 @@ -325,7 +330,7 @@ rpds-py==0.18.0 # referencing sbvirtualdisplay==1.2.0 # via seleniumbase -scikit-learn==1.3.0 +scikit-learn==1.5.0 # via -r requirements.in scipy==1.10.1 # via @@ -347,6 +352,7 @@ six==1.16.0 # via # behave # django-livereload-server + # isodate # langdetect # parse-type # python-dateutil @@ -395,7 +401,10 @@ trio-websocket==0.10.3 # selenium # seleniumbase typing-extensions==4.7.1 - # via asgiref + # via + # asgiref + # kombu + # pypdf2 tzdata==2023.3 # via # celery diff --git a/docker-compose.yaml b/docker-compose.yaml index f3f4d63b3..90f5481c7 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -13,6 +13,7 @@ services: volumes: - ianalyzer-db:/var/lib/postgresql/data/ backend: + image: ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest build: context: ./backend depends_on: @@ -39,6 +40,7 @@ services: target: /corpora command: bash -c "python manage.py migrate && python manage.py loadcorpora && python manage.py runserver 0.0.0.0:8000" frontend: + image: ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest build: context: ./frontend ports: @@ -52,6 +54,7 @@ services: target: /frontend/build command: sh -c "yarn prebuild && yarn start-docker" elasticsearch: + image: 
ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest
     build:
       context: .
       dockerfile: DockerfileElastic
@@ -79,8 +82,7 @@ services:
     image: redis:latest
     restart: unless-stopped
   celery:
-    build:
-      context: ./backend
+    image: ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest
     environment:
       CELERY_BROKER: $CELERY_BROKER
       SQL_DATABASE: $SQL_DATABASE
diff --git a/documentation/Making-a-release.md b/documentation/Making-a-release.md
index 33be0298e..e06d0a1dc 100644
--- a/documentation/Making-a-release.md
+++ b/documentation/Making-a-release.md
@@ -23,13 +23,14 @@ Determine if your release is a major, minor, or patch release to figure out the
 
 Start a new branch for your releases. Use `git flow release start x.x.x` or `git flow hotfix start x.x.x`.
 
-Update the version number in `package.json`.
+
+Use the `yarn [major|minor|patch]` command to update the version number in `package.json`. This also updates the `CITATION.cff` file with the new version number and release date.
 
 ## Check if everything works
 
 In your local environment, start up elasticsearch and run backend tests with `yarn test-back`. Run frontend tests with `yarn test-front`.
 
-Publish the release branch with `git flow release publish x.x.x`. The push will trigger the [release workflow](https://github.com/UUDigitalHumanitieslab/I-analyzer/blob/develop/.github/workflows/release.yaml) to update the version number and release date in `CITATION.cff`. Deploy on the test or acc server. Check that everything works as intended.
+Publish the release branch with `git flow release publish x.x.x`. Deploy on the test or acc server. Check that everything works as intended.
 
 ## Publish the release
 
diff --git a/frontend/src/app/download/download.component.html b/frontend/src/app/download/download.component.html
index 485160d46..5dd0fc22b 100644
--- a/frontend/src/app/download/download.component.html
+++ b/frontend/src/app/download/download.component.html
@@ -1,22 +1,99 @@
-

 [The markup of the rewritten download dialog could not be recovered from this copy of the diff; only the visible text of the new template survives:]
   {{total}} results.
   You can download your search results as a CSV file. View the manual for more information.
   Only the first {{downloadLimit}} results will be included in the file.
   Select which fields should be included as columns in the CSV file.
   Sort results
   File encoding
   We recommend using utf-8 encoding for most applications, including Python and R.
   For importing files in Microsoft Excel, we recommend utf-16.
   Your download contains too many documents to be immediately available.
   You can request the download now, and receive an email when it's ready.
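
The encoding choice offered in the dialog follows the advice quoted above: utf-8 for most tools, utf-16 for Microsoft Excel. A minimal, standalone illustration of the difference (not part of this diff; the file names and sample row are invented):

    import csv

    rows = [{'date': '1970-01-01', 'content': 'voorbeeldtekst'}]
    fieldnames = ['date', 'content']

    # utf-8: readable by Python, R, pandas, and most other tools
    with open('results-utf8.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=';')
        writer.writeheader()
        writer.writerows(rows)

    # utf-16 (Python writes a BOM): the encoding Excel tends to detect reliably
    with open('results-utf16.csv', 'w', encoding='utf-16', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=';')
        writer.writeheader()
        writer.writerows(rows)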
+ diff --git a/frontend/src/app/download/download.component.spec.ts b/frontend/src/app/download/download.component.spec.ts index 6d802b880..718a6fbcf 100644 --- a/frontend/src/app/download/download.component.spec.ts +++ b/frontend/src/app/download/download.component.spec.ts @@ -5,6 +5,7 @@ import { commonTestBed } from '../common-test-bed'; import { QueryModel } from '../models'; import { DownloadComponent } from './download.component'; +import { SimpleChange } from '@angular/core'; describe('DownloadComponent', () => { let component: DownloadComponent; @@ -19,7 +20,9 @@ describe('DownloadComponent', () => { component = fixture.componentInstance; component.corpus = mockCorpus; component.queryModel = new QueryModel(mockCorpus); - component.ngOnChanges(); + component.ngOnChanges({ + queryModel: new SimpleChange(undefined, component.queryModel, true) + }); fixture.detectChanges(); }); @@ -29,16 +32,15 @@ describe('DownloadComponent', () => { it('should respond to field selection', () => { // Start with a single field - expect(component['getCsvFields']()).toEqual(mockCorpus.fields); + expect(component['getColumnNames']()).toEqual(['great_field', 'speech']); // Deselect all - component.selectCsvFields([]); - expect(component['getCsvFields']()).toEqual([]); + component.selectedCsvFields = []; + expect(component['getColumnNames']()).toEqual([]); // Select two - component.selectCsvFields([mockField, mockField2]); - const expected_fields = [mockField, mockField2]; - expect(component['getCsvFields']()).toEqual(expected_fields); - expect(component.selectedCsvFields).toEqual(expected_fields); + component.selectedCsvFields = [mockField, mockField2]; + const expected_fields = ['great_field', 'speech']; + expect(component['getColumnNames']()).toEqual(expected_fields); }); }); diff --git a/frontend/src/app/download/download.component.ts b/frontend/src/app/download/download.component.ts index 62c1db767..d7cfb7a4a 100644 --- a/frontend/src/app/download/download.component.ts +++ b/frontend/src/app/download/download.component.ts @@ -1,10 +1,17 @@ -import { Component, Input, OnChanges } from '@angular/core'; +import { Component, Input, OnChanges, SimpleChanges } from '@angular/core'; import * as _ from 'lodash'; import { environment } from '../../environments/environment'; -import { DownloadService, NotificationService } from '../services/index'; -import { Corpus, CorpusField, DownloadOptions, PendingDownload, QueryModel, ResultOverview } from '../models/index'; +import { AuthService, DownloadService, NotificationService, SearchService } from '../services/index'; +import { Corpus, CorpusField, PendingDownload, QueryModel, SortState } from '../models/index'; import { actionIcons } from '../shared/icons'; +import { TotalResults } from '../models/total-results'; +import { SimpleStore } from '../store/simple-store'; +import { Observable, map } from 'rxjs'; +import { Router } from '@angular/router'; +import { pageResultsParametersToParams } from '../utils/params'; +import { DEFAULT_HIGHLIGHT_SIZE, PageResults, PageResultsParameters } from '../models/page-results'; + @Component({ selector: 'ia-download', @@ -14,11 +21,6 @@ import { actionIcons } from '../shared/icons'; export class DownloadComponent implements OnChanges { @Input() public corpus: Corpus; @Input() public queryModel: QueryModel; - @Input() public resultOverview: ResultOverview; - @Input() public hasLimitedResults: boolean; - // download limit is either the user's download limit, or (for unauthenticated users) the corpus' direct download limit - 
@Input() public downloadLimit: number; - @Input() public route: string; public selectedCsvFields: CorpusField[]; public availableCsvFields: CorpusField[]; @@ -29,9 +31,22 @@ export class DownloadComponent implements OnChanges { public pendingDownload: PendingDownload; + resultsConfig: PageResults; + actionIcons = actionIcons; - directDownloadLimit = environment.directDownloadLimit; + downloadLimit: number; + + canDownloadDirectly$: Observable; + + encodingOptions = ['utf-8', 'utf-16']; + encoding: 'utf-8' | 'utf-16' = 'utf-8'; + + totalResults: TotalResults; + downloadDisabled$: Observable; + + private directDownloadLimit: number = environment.directDownloadLimit; + private userDownloadLimit: number; private downloadsPageLink = { text: 'view downloads', @@ -40,66 +55,61 @@ export class DownloadComponent implements OnChanges { constructor( private downloadService: DownloadService, - private notificationService: NotificationService - ) {} - - get downloadDisabled(): boolean { - return !this.resultOverview || this.resultOverview.resultsCount === 0; + private notificationService: NotificationService, + private searchService: SearchService, + private authService: AuthService, + private router: Router, + ) { + this.userDownloadLimit = this.authService.getCurrentUser()?.downloadLimit; + this.downloadLimit = this.userDownloadLimit || this.directDownloadLimit; } - ngOnChanges() { - this.availableCsvFields = _.filter(this.corpus?.fields, 'downloadable'); - const highlight = this.resultOverview?.highlight; - // 'Query in context' becomes an extra option if any field in the corpus has been marked as highlightable - if (highlight !== undefined) { - this.availableCsvFields.push({ - name: 'context', - description: `Query surrounded by ${highlight} characters`, - displayName: 'Query in context', - displayType: 'text_content', - csvCore: false, - hidden: false, - sortable: false, - searchable: false, - downloadable: true, - filterOptions: null, - mappingType: null, - } as unknown as CorpusField); + ngOnChanges(changes: SimpleChanges): void { + if (changes.corpus) { + this.availableCsvFields = _.filter(this.corpus?.fields, 'downloadable'); + this.selectedCsvFields = _.filter(this.corpus?.fields, 'csvCore'); + } + if (changes.queryModel) { + this.totalResults?.complete(); + this.resultsConfig?.complete(); + this.totalResults = new TotalResults( + new SimpleStore(), this.searchService, this.queryModel + ); + this.downloadDisabled$ = this.totalResults.result$.pipe( + map(result => result > 0) + ); + this.canDownloadDirectly$ = this.totalResults.result$.pipe( + map(this.enableDirectDownload.bind(this)) + ); + this.resultsConfig = new PageResults( + new SimpleStore(), this.searchService, this.queryModel + ); } } - /** - * called by download csv button. 
Large files are rendered in backend via Celery async task, - * and an email is sent with download link from backend - */ - public chooseDownloadMethod() { - if ( - this.resultOverview.resultsCount < this.directDownloadLimit || - this.downloadLimit === undefined - ) { - this.directDownload(); + onHighlightChange(event): void { + if (event.target.checked) { + this.resultsConfig.setParams({ highlight: DEFAULT_HIGHLIGHT_SIZE }); } else { - this.longDownload(); + this.resultsConfig.setParams({ highlight: null }); } } /** download short file directly */ - public confirmDirectDownload(options: DownloadOptions) { - const nDocuments = Math.min( - this.resultOverview.resultsCount, - this.directDownloadLimit - ); + public confirmDirectDownload(): void { + const sort = this.resultsConfig.state$.value.sort; + const highlight = this.resultsConfig.state$.value.highlight; this.isDownloading = true; this.downloadService .download( this.corpus, this.queryModel, - this.getCsvFields(), - nDocuments, - this.route, - this.resultOverview.sort, - this.resultOverview.highlight, - options + this.getColumnNames(), + this.directDownloadLimit, + this.resultsRoute(this.queryModel, sort, highlight), + sort, + highlight, + { encoding: this.encoding } ) .catch((error) => { this.notificationService.showMessage(error); @@ -110,25 +120,19 @@ export class DownloadComponent implements OnChanges { }); } - public selectCsvFields(selection: CorpusField[]) { - this.selectedCsvFields = selection; - } - - /** results can be downloaded directly: show menu to pick file options */ - private directDownload() { - this.pendingDownload = { download_type: 'search_results' }; - } /** start backend task to create csv file */ - private longDownload() { + longDownload(): void { + const sort = this.resultsConfig.state$.value.sort; + const highlight = this.resultsConfig.state$.value.highlight; this.downloadService .downloadTask( this.corpus, this.queryModel, - this.getCsvFields(), - this.route, - this.resultOverview.sort, - this.resultOverview.highlight + this.getColumnNames(), + this.resultsRoute(this.queryModel, sort, highlight), + sort, + highlight, ) .then((results) => { this.notificationService.showMessage( @@ -142,11 +146,40 @@ export class DownloadComponent implements OnChanges { }); } - private getCsvFields(): CorpusField[] { + private enableDirectDownload(totalResults: number): boolean { + const totalToDownload = _.min([totalResults, this.downloadLimit]); + return totalToDownload <= this.directDownloadLimit; + } + + private getColumnNames(): string[] { + let selectedFields: CorpusField[]; if (this.selectedCsvFields === undefined) { - return this.corpus.fields.filter((field) => field.csvCore); + selectedFields = this.corpus.fields.filter((field) => field.csvCore); } else { - return this.selectedCsvFields; + selectedFields = this.selectedCsvFields; + } + const selected = _.map(selectedFields, 'name'); + if (this.resultsConfig.state$.value.highlight) { + selected.push('context'); } + return selected; + } + + /** + * Generate URL to view these results in the web interface + */ + private resultsRoute( + queryModel: QueryModel, sort: SortState, highlight?: number + ): string { + const resultsParameters: PageResultsParameters = {sort, from: 0, size: 20, highlight }; + const queryParams = { + ...queryModel.toQueryParams(), + ...pageResultsParametersToParams(resultsParameters, queryModel.corpus) + }; + const tree = this.router.createUrlTree( + ['/search', queryModel.corpus.name], + { queryParams } + ); + return tree.toString(); } } diff --git 
a/frontend/src/app/download/download.module.ts b/frontend/src/app/download/download.module.ts index fe3de9d65..df3fa2bec 100644 --- a/frontend/src/app/download/download.module.ts +++ b/frontend/src/app/download/download.module.ts @@ -2,9 +2,9 @@ import { NgModule } from '@angular/core'; import { DownloadComponent } from './download.component'; import { DownloadOptionsComponent } from './download-options/download-options.component'; import { DownloadService } from '../services'; -import { SelectFieldComponent } from '../select-field/select-field.component'; import { MultiSelectModule } from 'primeng/multiselect'; import { SharedModule } from '../shared/shared.module'; +import { ResultsSortModule } from '../search/results-sort/results-sort.module'; @@ -15,16 +15,15 @@ import { SharedModule } from '../shared/shared.module'; declarations: [ DownloadComponent, DownloadOptionsComponent, - SelectFieldComponent, ], imports: [ SharedModule, MultiSelectModule, + ResultsSortModule, ], exports: [ DownloadComponent, DownloadOptionsComponent, - SelectFieldComponent, ] }) export class DownloadModule { } diff --git a/frontend/src/app/history/download-history/download-history.component.ts b/frontend/src/app/history/download-history/download-history.component.ts index eadc44b8b..442aa2b0f 100644 --- a/frontend/src/app/history/download-history/download-history.component.ts +++ b/frontend/src/app/history/download-history/download-history.component.ts @@ -72,8 +72,8 @@ export class DownloadHistoryComponent extends HistoryDirective implements OnInit parameters.fields : [parameters[0].field_name]; const corpus = findByName(this.corpora, download.corpus); const fields = fieldNames.map(fieldName => - findByName(corpus.fields, fieldName).displayName - ); + findByName(corpus.fields, fieldName)?.displayName + ).filter(_.negate(_.isUndefined)); return _.join(fields, ', '); } diff --git a/frontend/src/app/models/page-results.ts b/frontend/src/app/models/page-results.ts index 3da10f6fc..65e4ec28b 100644 --- a/frontend/src/app/models/page-results.ts +++ b/frontend/src/app/models/page-results.ts @@ -11,6 +11,7 @@ import { Store } from '../store/types'; import { pageResultsParametersFromParams, pageResultsParametersToParams } from '../utils/params'; export const RESULTS_PER_PAGE = 20; +export const DEFAULT_HIGHLIGHT_SIZE = 200; export interface PageParameters { from: number; diff --git a/frontend/src/app/models/search-results.ts b/frontend/src/app/models/search-results.ts index 1eac5b1ff..3c4b28644 100644 --- a/frontend/src/app/models/search-results.ts +++ b/frontend/src/app/models/search-results.ts @@ -3,7 +3,6 @@ import { HttpErrorResponse } from '@angular/common/http'; import { CorpusField } from './corpus'; import { FoundDocument } from './found-document'; import { APIQuery } from './search-requests'; -import { SortState } from './sort'; import { AggregateTermFrequencyParameters, DateTermFrequencyParameters, @@ -20,13 +19,6 @@ export interface SearchResults { }; } -export interface ResultOverview { - queryText: string; - highlight?: number; - sort: SortState; - resultsCount: number; -}; - export interface MostFrequentWordsResult { key: string; doc_count: number; diff --git a/frontend/src/app/models/total-results.ts b/frontend/src/app/models/total-results.ts new file mode 100644 index 000000000..f989c09c3 --- /dev/null +++ b/frontend/src/app/models/total-results.ts @@ -0,0 +1,44 @@ +import { Observable, from, map } from 'rxjs'; +import { SearchService } from '../services'; +import { Store } from '../store/types'; 
+import { PageResultsParameters } from './page-results'; +import { QueryModel } from './query'; +import { Results } from './results'; +import { Params } from '@angular/router'; + +type Empty = Record + +/** + * fetches the total number of search results. + */ +export class TotalResults extends Results { + constructor( + store: Store, + private searchService: SearchService, + query: QueryModel, + ) { + super(store, query, []); + this.connectToStore(); + this.getResults(); + } + + fetch(): Observable { + const params: PageResultsParameters = { + size: 0, + sort: [undefined, 'asc'], + from: 0, + } + const results = this.searchService.loadResults(this.query, params); + return from(results).pipe( + map(result => result.total.value) + ); + } + + protected stateToStore(state: Empty): Params { + return {} + } + + protected storeToState(params: Params): Empty { + return {} + } +} diff --git a/frontend/src/app/search/highlight-selector.component.ts b/frontend/src/app/search/highlight-selector.component.ts index a961c8e8f..1d5665618 100644 --- a/frontend/src/app/search/highlight-selector.component.ts +++ b/frontend/src/app/search/highlight-selector.component.ts @@ -1,6 +1,6 @@ import { Component, Input } from '@angular/core'; import { actionIcons } from '../shared/icons'; -import { PageResults } from '../models/page-results'; +import { DEFAULT_HIGHLIGHT_SIZE, PageResults } from '../models/page-results'; import { Observable } from 'rxjs'; import * as _ from 'lodash'; @@ -27,10 +27,10 @@ export class HighlightSelectorComponent { } updateHighlightSize(instruction?: string) { - const currentValue = this.pageResults.state$.value.highlight || 200; + const currentValue = this.pageResults.state$.value.highlight || DEFAULT_HIGHLIGHT_SIZE; let newValue: number|undefined; if (instruction === 'on') { - newValue = 200; + newValue = DEFAULT_HIGHLIGHT_SIZE; } else if (instruction === 'more' && currentValue < 800) { newValue = currentValue + 200; } else if (instruction === 'less' && currentValue > 200) { diff --git a/frontend/src/app/search/index.ts b/frontend/src/app/search/index.ts index 40905695a..2f9140f19 100644 --- a/frontend/src/app/search/index.ts +++ b/frontend/src/app/search/index.ts @@ -1,5 +1,5 @@ export * from './search.component'; export * from './search-relevance.component'; export * from './search-results.component'; -export * from './search-sorting.component'; +export * from './results-sort/search-sorting.component'; export * from '../download/download.component'; diff --git a/frontend/src/app/search/results-sort/results-sort.module.ts b/frontend/src/app/search/results-sort/results-sort.module.ts new file mode 100644 index 000000000..f0d0b47cf --- /dev/null +++ b/frontend/src/app/search/results-sort/results-sort.module.ts @@ -0,0 +1,18 @@ +import { NgModule } from '@angular/core'; +import { SearchSortingComponent } from './search-sorting.component'; +import { SharedModule } from '../../shared/shared.module'; + + + +@NgModule({ + declarations: [ + SearchSortingComponent, + ], + imports: [ + SharedModule + ], + exports: [ + SearchSortingComponent, + ] +}) +export class ResultsSortModule { } diff --git a/frontend/src/app/search/search-sorting.component.html b/frontend/src/app/search/results-sort/search-sorting.component.html similarity index 87% rename from frontend/src/app/search/search-sorting.component.html rename to frontend/src/app/search/results-sort/search-sorting.component.html index 7fce81798..d1f2045ac 100644 --- a/frontend/src/app/search/search-sorting.component.html +++ 
b/frontend/src/app/search/results-sort/search-sorting.component.html
@@ -11,7 +11,7 @@
 [The markup of the relocated sorting template could not be recovered from this copy of the diff; only empty tag fragments survive for this hunk and for @@ -53,14 +42,19 @@.]
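
The TotalResults model added earlier in this diff fetches only the hit count by requesting zero documents. The same trick, expressed directly against Elasticsearch with the Python client (a sketch, not part of this diff; the index name and client setup are illustrative):

    from elasticsearch import Elasticsearch

    es = Elasticsearch('http://localhost:9200')
    response = es.search(
        index='times',              # illustrative index name
        query={'match_all': {}},
        size=0,                     # return no documents, only metadata
        track_total_hits=True,      # exact count instead of the default 10,000 cap
    )
    print(response['hits']['total']['value'])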
diff --git a/frontend/src/app/search/search.component.ts b/frontend/src/app/search/search.component.ts index 6f813421b..8d439be9e 100644 --- a/frontend/src/app/search/search.component.ts +++ b/frontend/src/app/search/search.component.ts @@ -1,23 +1,23 @@ -import { Component, ElementRef, HostListener, ViewChild } from '@angular/core'; -import { ActivatedRoute, ParamMap, Router } from '@angular/router'; +import { Component, ElementRef, HostListener, OnDestroy, OnInit, ViewChild } from '@angular/core'; import * as _ from 'lodash'; import { Subscription } from 'rxjs'; -import { Corpus, CorpusField, ResultOverview, QueryModel, User } from '../models/index'; -import { CorpusService, DialogService, ParamService, } from '../services/index'; -import { ParamDirective } from '../param/param-directive'; +import { Corpus, CorpusField, QueryModel, User } from '../models/index'; +import { CorpusService, DialogService, } from '../services/index'; + import { AuthService } from '../services/auth.service'; import { distinct, filter } from 'rxjs/operators'; import { actionIcons, searchIcons } from '../shared/icons'; import { RouterStoreService } from '../store/router-store.service'; import { Title } from '@angular/platform-browser'; +import { SearchTab, SearchTabs } from './search-tabs'; @Component({ selector: 'ia-search', templateUrl: './search.component.html', styleUrls: ['./search.component.scss'], }) -export class SearchComponent extends ParamDirective { +export class SearchComponent implements OnInit, OnDestroy { @ViewChild('searchSection', { static: false }) public searchSection: ElementRef; @@ -25,37 +25,23 @@ export class SearchComponent extends ParamDirective { public corpus: Corpus; - /** - * The filters have been modified. - */ - public isSearching: boolean; - public hasSearched: boolean; - /** - * Whether the total number of hits exceeds the download limit. - */ - public hasLimitedResults = false; - public user: User; searchIcons = searchIcons; actionIcons = actionIcons; - activeTab: string; - public queryModel: QueryModel; /** * This is the query text currently entered in the interface. */ public queryText: string; - resultOverview: ResultOverview; - public filterFields: CorpusField[] = []; - public showVisualization: boolean; - public nullableParameters = []; + tabs: SearchTabs; + protected corpusSubscription: Subscription; /** @@ -68,14 +54,10 @@ export class SearchComponent extends ParamDirective { private authService: AuthService, private corpusService: CorpusService, private dialogService: DialogService, - paramService: ParamService, - route: ActivatedRoute, - router: Router, private routerStoreService: RouterStoreService, private title: Title, ) { - super(route, router, paramService); - + this.tabs = new SearchTabs(this.routerStoreService); } @HostListener('window:scroll', []) @@ -85,8 +67,8 @@ export class SearchComponent extends ParamDirective { this.searchSection.nativeElement.getBoundingClientRect().y === 0; } - async initialize(): Promise { - this.user = await this.authService.getCurrentUserPromise(); + ngOnInit() { + this.authService.getCurrentUserPromise().then(user => this.user = user); this.corpusSubscription = this.corpusService.currentCorpus .pipe( filter((corpus) => !!corpus), @@ -102,29 +84,12 @@ export class SearchComponent extends ParamDirective { } } - teardown() { + ngOnDestroy() { this.user = undefined; this.corpusSubscription.unsubscribe(); this.queryModel.complete(); } - setStateFromParams(params: ParamMap) { - this.showVisualization = params.has('visualize') ? 
true : false; - } - - /** - * Event triggered from search-results.component - * - * @param input - */ - public onSearched(input: ResultOverview) { - this.isSearching = false; - this.hasSearched = true; - this.resultOverview = input; - this.hasLimitedResults = - this.user? input.resultsCount > this.user.downloadLimit : true; - } - public showQueryDocumentation() { this.dialogService.showManualPage('query'); } @@ -133,6 +98,10 @@ export class SearchComponent extends ParamDirective { this.queryModel.setQueryText(this.queryText); } + onTabChange(tab: SearchTab) { + this.tabs.setParams({tab}); + } + private setCorpus(corpus: Corpus) { this.corpus = corpus; this.setQueryModel(); diff --git a/frontend/src/app/search/search.module.ts b/frontend/src/app/search/search.module.ts index b1f72d613..60e6e4fed 100644 --- a/frontend/src/app/search/search.module.ts +++ b/frontend/src/app/search/search.module.ts @@ -6,11 +6,13 @@ import { SearchResultsComponent } from './search-results.component'; import { SearchComponent } from './search.component'; import { DocumentModule } from '../document/document.module'; import { CorpusModule } from '../corpus-header/corpus.module'; -import { SearchSortingComponent } from './search-sorting.component'; import { FilterModule } from '../filter/filter.module'; import { DownloadModule } from '../download/download.module'; import { QueryService, SearchService } from '../services'; import { VisualizationModule } from '../visualization/visualization.module'; +import { ResultsSortModule } from './results-sort/results-sort.module'; +import { SelectFieldComponent } from '../select-field/select-field.component'; +import { MultiSelectModule } from 'primeng/multiselect'; @@ -24,7 +26,7 @@ import { VisualizationModule } from '../visualization/visualization.module'; PaginationComponent, SearchComponent, SearchResultsComponent, - SearchSortingComponent, + SelectFieldComponent, ], imports: [ CorpusModule, @@ -33,6 +35,8 @@ import { VisualizationModule } from '../visualization/visualization.module'; FilterModule, SharedModule, VisualizationModule, + ResultsSortModule, + MultiSelectModule, ], exports: [ SearchComponent, diff --git a/frontend/src/app/services/download.service.spec.ts b/frontend/src/app/services/download.service.spec.ts index 308c65640..a6ecc7a66 100644 --- a/frontend/src/app/services/download.service.spec.ts +++ b/frontend/src/app/services/download.service.spec.ts @@ -41,7 +41,8 @@ describe('DownloadService', () => { }; spyOn(apiService, 'download').and.returnValue(Promise.resolve({})); - service.download(query.corpus, query, query.corpus.fields, size, route, sort, highlight, options); + const fieldNames = query.corpus.fields.map(field => field.name) + service.download(query.corpus, query, fieldNames, size, route, sort, highlight, options); const expectedBody: LimitedResultsDownloadParameters = { corpus: mockCorpus.name, fields: ['great_field', 'speech'], diff --git a/frontend/src/app/services/download.service.ts b/frontend/src/app/services/download.service.ts index 59ece2cb2..39434a775 100644 --- a/frontend/src/app/services/download.service.ts +++ b/frontend/src/app/services/download.service.ts @@ -19,7 +19,7 @@ export class DownloadService { public async download( corpus: Corpus, queryModel: QueryModel, - fields: CorpusField[], + fieldNames: string[], requestedResults: number, route: string, sort: SortState, @@ -38,7 +38,7 @@ export class DownloadService { { ...query, corpus: corpus.name, - fields: fields.map((field) => field.name), + fields: fieldNames, route, }, 
fileOptions @@ -68,14 +68,13 @@ export class DownloadService { public async downloadTask( corpus: Corpus, queryModel: QueryModel, - fields: - CorpusField[], + fields: string[], route: string, sort: SortState, highlightFragmentSize: number ) { const query = queryModel.toAPIQuery(); - return this.apiService.downloadTask({ corpus: corpus.name, ...query, fields: fields.map(field => field.name), route }) + return this.apiService.downloadTask({ corpus: corpus.name, ...query, fields, route }) .then(result => result) .catch(error => { throw new Error(error.headers.message[0]); diff --git a/frontend/src/app/shared/icons.ts b/frontend/src/app/shared/icons.ts index 21b16626d..65803f01d 100644 --- a/frontend/src/app/shared/icons.ts +++ b/frontend/src/app/shared/icons.ts @@ -1,15 +1,18 @@ import { IconDefinition as RegularIconDefinition, + faClock, faNewspaper, } from '@fortawesome/free-regular-svg-icons'; import { IconDefinition as SolidIconDefinition, - faAngleDown, faAngleUp, faArrowLeft, faArrowRight, faAt, faBook, faBookmark, faBookOpen, faBuilding, faChartColumn, - faCheck, faChevronLeft, faChevronRight, faCog, faCogs, faDatabase, faDiagramProject, - faDownload, faEnvelope, faEye, faFilter, faHistory, faImage, faInfo, faInfoCircle, faLink, faList, faLocationDot, faLock, - faMinus, faPalette, faPencil, faPlus, faQuestionCircle, faSearch, faSearchMinus, faSearchPlus, faSignOut, - faSortAlphaAsc, faSortAlphaDesc, faSortNumericAsc, faSortNumericDesc, faSquare, - faTable, faTags, faTimes, faTrashCan, faUndo, faUpload, faUser + faAngleDown, faAngleUp, faArrowLeft, faArrowRight, faAt, faBook, faBookmark, + faBookOpen, faBuilding, faChartColumn, faCheck, faChevronLeft, faChevronRight, faCog, + faCogs, faDatabase, faDiagramProject, faDownload, faEnvelope, faEye, faFilter, + faHistory, faImage, faInfo, faInfoCircle, faLink, faList, faLocationDot, faLock, + faMinus, faPalette, faPencil, faPlus, faQuestionCircle, faSearch, faSearchMinus, + faSearchPlus, faSignOut, faSortAlphaAsc, faSortAlphaDesc, faSortNumericAsc, + faSortNumericDesc, faSquare, faTable, faTags, faTimes, faTrashCan, faUndo, faUpload, + faUser } from '@fortawesome/free-solid-svg-icons'; type IconDefinition = SolidIconDefinition | RegularIconDefinition; @@ -55,6 +58,7 @@ export const actionIcons: Icons = { delete: faTrashCan, edit: faPencil, view: faEye, + wait: faClock, }; export const formIcons: Icons = { diff --git a/package.json b/package.json index 937dc3e55..ce53cef0b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "i-analyzer", - "version": "5.9.0", + "version": "5.11.0", "license": "MIT", "scripts": { "postinstall": "yarn install-back && yarn install-front", @@ -31,7 +31,11 @@ "watch-front-p": "yarn front yarn watch", "start-back-p": "cd backend && python manage.py runserver --settings production --insecure --pythonpath ..", "start-p": "yarn static-p && yarn watch-front-p & yarn start-back-p", - "celery": "yarn back celery -A ianalyzer.celery" + "celery": "yarn back celery -A ianalyzer.celery", + "patch": "yarn version --patch --no-commit-hooks --no-git-tag-version && yarn update-citation && yarn fyarn prebuild", + "minor": "yarn version --minor --no-commit-hooks --no-git-tag-version && yarn update-citation && yarn fyarn prebuild", + "major": "yarn version --major --no-commit-hooks --no-git-tag-version && yarn update-citation && yarn fyarn prebuild", + "update-citation": "python $PWD/update_citation.py" }, "private": true, "devDependencies": {}, diff --git a/update_citation.py b/update_citation.py new file mode 
100644
index 000000000..b965035b7
--- /dev/null
+++ b/update_citation.py
@@ -0,0 +1,35 @@
+'''Updates the CITATION.cff file:
+    - Sets the date-released to the current date
+    - Sets the version from toplevel package.json
+'''
+
+from datetime import datetime
+import json
+import re
+
+CITATION_FILE = 'CITATION.cff'
+PACKAGE_FILE = 'package.json'
+VERSION_PATTERN = r'^version:\s+.*$'
+DATE_RELEASED_PATTERN = r'^date-released:.*$'
+VERSION = None
+TODAY = datetime.today().strftime('%Y-%m-%d')
+
+with open(PACKAGE_FILE, 'r') as package_file:
+    package_json = json.load(package_file)
+    VERSION = package_json.get('version')
+
+
+with open(CITATION_FILE) as citation_file:
+    citation_in = citation_file.readlines()
+    citation_out = []
+
+    for line in citation_in:
+        if re.match(VERSION_PATTERN, line):
+            citation_out.append(f'version: {VERSION}\n')
+        elif re.match(DATE_RELEASED_PATTERN, line):
+            citation_out.append(f"date-released: '{TODAY}'\n")
+        else:
+            citation_out.append(line)
+
+with open(CITATION_FILE, 'w') as citation_file:
+    citation_file.writelines(citation_out)
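
The substitution logic of this script can be sanity-checked in isolation; the sample lines and values below are taken from the CITATION.cff change in this release:

    import re

    VERSION_PATTERN = r'^version:\s+.*$'
    DATE_RELEASED_PATTERN = r'^date-released:.*$'

    sample = ['cff-version: 1.2.0\n', 'version: 5.9.0\n', "date-released: '2024-07-05'\n"]
    updated = []
    for line in sample:
        if re.match(VERSION_PATTERN, line):
            updated.append('version: 5.11.0\n')
        elif re.match(DATE_RELEASED_PATTERN, line):
            updated.append("date-released: '2024-08-08'\n")
        else:
            updated.append(line)

    # 'cff-version' is left untouched because re.match anchors at the start of the line
    assert updated == ['cff-version: 1.2.0\n', 'version: 5.11.0\n', "date-released: '2024-08-08'\n"]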