diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 5a8565682..000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -name: Bug report -about: Let us know about something that isn't working right -title: '' -labels: bug -assignees: '' - ---- - -### What went wrong? - -Describe what happened. - -### Expected behavior - -What did you expect to happen? - -### Screenshots - -If applicable, please add a screenshot of the problem! - -### Which version? - -Please specify where you encountered the issue: - -- [ ] https://ianalyzer.hum.uu.nl -- [ ] https://peopleandparliament.hum.uu.nl -- [ ] https://peace.sites.uu.nl/ -- [ ] a server hosted elsewhere (i.e. not by the research software lab) -- [ ] a local server - -If this happened on local or third-party server, it helps if you can be more specific about the version. Please include the version number (e.g. "3.2.4") or a commit hash if you know it! - -### To reproduce - -How can a developer replicate the issue? Please provide any information you can. For example: "I went to https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then clicked on *Download CSV*. I pressed *cancel* and then I clicked *Download CSV* again." diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 000000000..82fed0b25 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,66 @@ +--- +name: Bug report +description: Let us know that something isn't working right +labels: + - bug +body: + - type: markdown + attributes: + value: | + Thank you for making a bug report! Please fill in this information so we can get to the + bottom of your issue. + - type: textarea + id: what-happened + attributes: + label: What went wrong? + description: Please describe what happened. + validations: + required: true + - type: textarea + id: expected + attributes: + label: What did you expect to happen? + validations: + required: true + - type: textarea + id: screenshot + attributes: + label: Screenshot + description: If you can make a screenshot of the issue, please include it! + validations: + required: false + - type: checkboxes + id: instance + attributes: + label: Where did you find the bug? + description: Please add where you found the bug. + options: + - label: https://ianalyzer.hum.uu.nl + - label: https://peopleandparliament.hum.uu.nl + - label: https://peace.sites.uu.nl + - label: a server hosted elsewhere (i.e. not by the research software lab) + - label: a local server + validations: + required: true + - type: input + id: version + attributes: + label: Version + description: | + For third-party and local servers, please add information about the version of the + software, if you know it. A version number (e.g "1.2.3") is great. For a pre-release + build, you can provide the branch or commit hash. + validations: + required: false + - type: textarea + id: to-reproduce + attributes: + label: Steps to reproduce + description: | + How can a developer replicate the issue? Please provide any information you can. For + example: "I went to + https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then + clicked on Download CSV. I pressed cancel and then I clicked Download CSV again." 
+ validations: + required: true +--- diff --git a/backend/Dockerfile b/backend/Dockerfile index a8442a733..726934cd0 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,5 +1,5 @@ # Base image -FROM python:3.8-buster +FROM docker.io/library/python:3.8-buster # Setting this means stdout and stderr streams are sent to terminal in real time ENV PYTHONUNBUFFERED 1 # Get required libraries for xmlsec diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index 810fc4c70..76b05e81b 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -11,9 +11,6 @@ from os.path import isdir from django.conf import settings -from langcodes import Language, standardize_tag - -from addcorpus.constants import CATEGORIES import logging @@ -33,37 +30,37 @@ class CorpusDefinition(object): @property def title(self): ''' - Path to source data directory. + Title of the corpus ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing title') @property def description(self): ''' Short description of the corpus ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing description') @property def data_directory(self): ''' Path to source data directory. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing data_directory') @property def min_date(self): ''' Minimum timestamp for data files. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing min_date') @property def max_date(self): ''' Maximum timestamp for data files. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing max_date') ''' @@ -81,14 +78,14 @@ def category(self): See addcorpus.constants.CATEGORIES for options ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing category') @property def es_index(self): ''' ElasticSearch index name. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing category') ''' Elasticsearch alias. Defaults to None. @@ -111,7 +108,7 @@ def fields(self): the `Field` class, containing information about each attribute. MUST include a field with `name='id'`. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing fields') ''' @@ -139,7 +136,7 @@ def image(self): Name of the corpus image. Should be relative path from a directory 'images' in the same directory as the corpus definition file. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing image') ''' MIME type of scanned documents (images) @@ -241,7 +238,7 @@ def sources(self, start=datetime.min, end=datetime.max): empty or contains only a timestamp; but any data that is to be extracted without reading the file itself can be specified there. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing sources') def source2dicts(self, sources): ''' @@ -249,7 +246,7 @@ def source2dicts(self, sources): The dictionaries are created from this corpus' `Field`s. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing source2dicts') def documents(self, sources=None): ''' @@ -274,6 +271,31 @@ def _reject_extractors(self, *inapplicable_extractors): if isinstance(field.extractor, inapplicable_extractors): raise RuntimeError( "Specified extractor method cannot be used with this type of data") + +class ParentCorpusDefinition(CorpusDefinition): + ''' A class from which other corpus definitions can inherit. 
+ This class is in charge of setting fields, usually without defining an extractor. + The subclassed CorpusDefinitions will set extractors on the fields - + this way, CorpusDefinitions can share the same mappings and filters, + while the logic to collect sources and populate the fields can be different. + The ParentCorpusDefinition can also be used to allow cross-corpus search and filtering. + ''' + #define fields property so it can be set in __init__ + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, value): + self._fields = value + + def __init__(self): + ''' Specify a list of fields which all subclasses share + A subclass of ParentCorpusDefinition will provide extractors for the fields, + and potentially prune done the list of fields to those which have an extractor + ''' + self.fields = [] + class XMLCorpusDefinition(CorpusDefinition): ''' @@ -309,7 +331,7 @@ def source2dicts(self, source): default implementation for XML layouts; may be subclassed if more ''' # Make sure that extractors are sensible - self._reject_extractors(extract.HTML, extract.CSV) + self._reject_extractors(extract.CSV) # extract information from external xml files first, if applicable metadata = {} @@ -519,7 +541,7 @@ def source2dicts(self, source): ''' (filename, metadata) = source - self._reject_extractors(extract.XML, extract.CSV) + self._reject_extractors(extract.CSV) # Loading HTML logger.info('Reading HTML file {} ...'.format(filename)) @@ -594,7 +616,7 @@ class CSVCorpusDefinition(CorpusDefinition): def source2dicts(self, source): # make sure the field size is as big as the system permits csv.field_size_limit(sys.maxsize) - self._reject_extractors(extract.XML, extract.HTML) + self._reject_extractors(extract.XML, extract.FilterAttribute) if isinstance(source, str): filename = source @@ -693,6 +715,7 @@ def __init__(self, visualizations=[], visualization_sort=None, es_mapping={'type': 'text'}, + language=None, search_filter=None, extractor=extract.Constant(None), sortable=None, @@ -716,6 +739,7 @@ def __init__(self, self.visualizations = visualizations self.visualization_sort = visualization_sort self.es_mapping = es_mapping + self.language = language self.indexed = indexed self.hidden = not indexed or hidden self.extractor = extractor diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index 54bddbaa0..72662dfa8 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -1,11 +1,13 @@ -def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True): +from addcorpus.es_settings import add_language_string + +def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True): ''' Mapping for the main content field. Options: - `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies. - - `stopword_analysis`: enables analysis using stopword removal. Requires setting a `clean` analyser in the `es_settings` of the corpus. - - `stemming_analysis`: enables analysis using stemming. Requires a `stemmed` analyser in the `es_settings` for the corpus. - - 'updated_highlighting': enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'. + - `stopword_analysis`: enables analysis using stopword removal. + - `stemming_analysis`: enables analysis using stemming. 
+ - `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'. ''' mapping = { @@ -27,13 +29,13 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an if stopword_analysis: multifields['clean'] = { "type": "text", - "analyzer": "clean", + "analyzer": add_language_string('clean', language), "term_vector": "with_positions_offsets" # include character positions for highlighting } if stemming_analysis: multifields['stemmed'] = { "type": "text", - "analyzer": "stemmed", + "analyzer": add_language_string('stemmed', language), "term_vector": "with_positions_offsets", } mapping['fields'] = multifields diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 3bf3c25f5..4268665b3 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -1,9 +1,17 @@ -import nltk import os + +from django.conf import settings from langcodes import Language +import nltk -HERE = os.path.abspath(os.path.dirname(__file__)) -NLTK_DATA_PATH = os.path.join(HERE, 'nltk_data') +# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html] +AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian', + 'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch', + 'english', 'estonian', 'finnish', 'french', 'galician', + 'german', 'greek', 'hindi', 'hungarian', 'indonesian', + 'irish', 'italian', 'latvian', 'lithuanian', 'norwegian', + 'persian', 'portuguese', 'romanian', 'russian', 'sorani', + 'spanish', 'swedish', 'turkish', 'thai'] def get_language_key(language_code): ''' @@ -15,8 +23,8 @@ def get_language_key(language_code): return Language.make(language_code).display_name().lower() def get_nltk_stopwords(language_code): - nltk.download('stopwords', NLTK_DATA_PATH) - stopwords_dir = os.path.join(NLTK_DATA_PATH, 'corpora', 'stopwords') + nltk.download('stopwords', settings.NLTK_DATA_PATH) + stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords') languages = os.listdir(stopwords_dir) language = get_language_key(language_code) @@ -28,31 +36,48 @@ def get_nltk_stopwords(language_code): else: raise NotImplementedError('language {} has no nltk stopwords list'.format(language)) +def add_language_string(name, language): + return '{}_{}'.format(name, language) if language else name -def es_settings(language=None, stopword_analyzer=False, stemming_analyzer=False): +def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False): ''' Make elasticsearch settings json for a corpus index. Options: - - `language`: string with the language code. See addcorpus.constants for options, and which languages support stopwords/stemming - - `stopword_analyzer`: define an analyser that removes stopwords. - - `stemming_analyzer`: define an analyser that removes stopwords and performs stemming. + - `languages`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming + - `stopword_analysis`: set to True to add an analyzer that removes stopwords. + - `stemming_analysis`: set to True to add an analyzer that removes stopwords and performs stemming. 
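A minimal usage sketch of the new signature (a sketch only; it assumes NLTK stopword lists are available for the requested languages, as exercised in the new test_es_settings.py further down):

from addcorpus.es_settings import es_settings

# builds one analyzer pair per language: clean_en / stemmed_en and clean_de / stemmed_de,
# each wired to its own stopwords_<lang> and stemmer_<lang> token filter
settings = es_settings(['en', 'de'], stopword_analysis=True, stemming_analysis=True)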
''' settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 1}} - - if stopword_analyzer or stemming_analyzer: - settings["analysis"] = { - "analyzer": {}, - "char_filter":{ "number_filter": number_filter() }, - 'filter': { - "stopwords": make_stopword_filter(language) - } - } - - if stopword_analyzer: - settings["analysis"]['analyzer']['clean'] = make_stopword_analyzer() - - if stemming_analyzer: - settings['analysis']['filter']['stemmer'] = make_stemmer_filter(language) - settings["analysis"]['analyzer']['stemmed'] = make_stemmed_analyzer() + stopword_filter_name = 'stopwords' + clean_analyzer_name = 'clean' + stemmer_filter_name = 'stemmer' + stemmed_analyzer_name = 'stemmed' + + set_char_filter(settings) + + for language in languages: + # do not attach language isocodes if there is just one language + + if stopword_analysis or stemming_analysis: + if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language), language): + continue # skip languages for which we do not have a stopword list + + if stopword_analysis: + set_clean_analyzer( + settings, + add_language_string(stopword_filter_name, language), + add_language_string(clean_analyzer_name, language), + ) + if stemming_analysis: + if not get_language_key(language) in AVAILABLE_ES_STEMMERS: + raise UserWarning('You specified `stemming_analysis=True`, but \ + there is no stemmer available for this language') + set_stemmed_analyzer( + settings, + add_language_string(stopword_filter_name, language), + add_language_string(stemmer_filter_name, language), + add_language_string(stemmed_analyzer_name, language), + language + ) return settings @@ -64,17 +89,20 @@ def number_filter(): } def make_stopword_filter(language): - stopwords = get_nltk_stopwords(language) - return { - "type": "stop", - "stopwords": stopwords - } + try: + stopwords = get_nltk_stopwords(language) + return { + "type": "stop", + 'stopwords': stopwords + } + except: + return None -def make_stopword_analyzer(): +def make_clean_analyzer(stopword_filter_name): return { "tokenizer": "standard", "char_filter": ["number_filter"], - "filter": ["lowercase", "stopwords"] + "filter": ["lowercase", stopword_filter_name] } def make_stemmer_filter(language): @@ -84,18 +112,49 @@ def make_stemmer_filter(language): "language": stemmer_language } -def make_stemmed_analyzer(): +def make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name): return { "tokenizer": "standard", "char_filter": ["number_filter"], - "filter": ["lowercase", "stopwords", "stemmer"] + "filter": ["lowercase", stopword_filter_name, stemmer_filter_name] } -def get_stopwords_from_settings(es_settings): +def get_stopwords_from_settings(es_settings, analyzer): try: - token_filter = es_settings["analysis"]['filter']['stopwords'] - stopwords = token_filter['stopwords'] + # the name of the stopword filter is second in the list, after "lowercase" + stopword_filter_name = es_settings['analysis']['analyzer'].get( + analyzer).get('filter')[-1] + token_filter = es_settings["analysis"]['filter'][stopword_filter_name] + return token_filter['stopwords'] except: - stopwords = None + return [] + +def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language): + filters = settings['analysis'].get('filter', {}) + filters.update({stemmer_filter_name: make_stemmer_filter(language)}) + settings['analysis']['filter'] = filters + analyzers = settings['analysis'].get('analyzer') + analyzers.update({stemmed_analyzer_name: 
make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name)}) + settings['analysis']['analyzer'] = analyzers + +def set_char_filter(settings): + settings["analysis"] = { + "char_filter": { "number_filter": number_filter() } + } - return stopwords +def set_stopword_filter(settings, stopword_filter_name, language): + stopword_filter = make_stopword_filter(language) + if not stopword_filter: + return False + filters = settings['analysis'].get('filter', {}) + filters.update({ + stopword_filter_name: stopword_filter + }) + settings['analysis']['filter'] = filters + return True + +def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name): + clean_analyzer = make_clean_analyzer(stopword_filter_name) + analyzers = settings['analysis'].get('analyzer', {}) + analyzers.update({clean_analyzer_name: clean_analyzer}) + settings["analysis"]['analyzer'] = analyzers \ No newline at end of file diff --git a/backend/addcorpus/extract.py b/backend/addcorpus/extract.py index fba462923..4f4b38187 100644 --- a/backend/addcorpus/extract.py +++ b/backend/addcorpus/extract.py @@ -320,14 +320,14 @@ def _attr(self, soup): ] -class HTML(XML): +class FilterAttribute(XML): ''' This extractor extracts attributes or contents from a BeautifulSoup node. - It is an extension of XML class + It is an extension of the XML extractor ''' def __init__(self, - attribute_filter={ # Whether to search other xml files for this field, and the file tag these files should have + attribute_filter={ # Specify an attribute / value pair by which to select content 'attribute': None, 'value': None}, *nargs, diff --git a/backend/addcorpus/save_corpus.py b/backend/addcorpus/save_corpus.py index 1c9010754..82a0db368 100644 --- a/backend/addcorpus/save_corpus.py +++ b/backend/addcorpus/save_corpus.py @@ -48,7 +48,10 @@ def _copy_corpus_attributes(corpus_definition: CorpusDefinition, configuration: 'word_models_present', ] - defined = get_defined_attributes(corpus_definition, attributes_to_copy) + try: + defined = get_defined_attributes(corpus_definition, attributes_to_copy) + except Exception as e: + raise e for attr, value in defined.items(): configuration.__setattr__(attr, value) diff --git a/backend/addcorpus/tests/test_es_settings.py b/backend/addcorpus/tests/test_es_settings.py new file mode 100644 index 000000000..0f178f321 --- /dev/null +++ b/backend/addcorpus/tests/test_es_settings.py @@ -0,0 +1,70 @@ +import pytest + +from addcorpus.es_settings import es_settings + +char_filter_tokenizer = {'char_filter': ['number_filter'], 'tokenizer': 'standard'} + +test_cases = { + 'single_language': { + 'languages': ['en'], + 'stopword': True, + 'stemming': True, + 'expected': { + 'filter': { + 'stemmer_en': {'type': 'stemmer', 'language': 'english'}, + 'stopwords_en': {'type': 'stop', 'stopwords': list()}, + }, + 'analyzer': { + 'clean_en': { + 'filter': ['lowercase', 'stopwords_en'], + **char_filter_tokenizer + }, + 'stemmed_en': { + 'filter': ['lowercase', 'stopwords_en', 'stemmer_en'], + **char_filter_tokenizer + } + } + } + }, + 'multiple_languages': { + 'languages': ['en', 'de'], + 'stopword': True, + 'stemming': True, + 'expected': { + 'filter': { + 'stemmer_de': {'type': 'stemmer', 'language': 'german'}, + 'stopwords_de': {'type': 'stop', 'stopwords': list()}, + 'stemmer_en': {'type': 'stemmer', 'language': 'english'}, + 'stopwords_en': {'type': 'stop', 'stopwords': list()}, + }, + 'analyzer': { + 'clean_de': { + 'filter': ['lowercase', 'stopwords_de'], + **char_filter_tokenizer + }, + 'stemmed_de': { + 'filter': 
['lowercase', 'stopwords_de', 'stemmer_de'], + **char_filter_tokenizer + }, + 'clean_en': { + 'filter': ['lowercase', 'stopwords_en'], + **char_filter_tokenizer + }, + 'stemmed_en': { + 'filter': ['lowercase', 'stopwords_en', 'stemmer_en'], + **char_filter_tokenizer + } + } + } + } +} + +@pytest.mark.parametrize('test_config', list(test_cases.values())) +def test_es_settings(test_config): + settings = es_settings(test_config['languages'], test_config['stopword'], test_config['stemming']) + assert settings['analysis']['filter'].keys() == test_config['expected']['filter'].keys() + assert settings['analysis']['analyzer'].keys() == test_config['expected']['analyzer'].keys() + for analyzer in settings['analysis']['analyzer'].keys(): + assert settings['analysis']['analyzer'][analyzer]['filter'][1] in settings['analysis']['filter'] + if analyzer.startswith('stemmed'): + assert settings['analysis']['analyzer'][analyzer]['filter'][2] in settings['analysis']['filter'] \ No newline at end of file diff --git a/backend/addcorpus/tests/test_times.py b/backend/addcorpus/tests/test_times.py index 5d232ab03..6a2790250 100644 --- a/backend/addcorpus/tests/test_times.py +++ b/backend/addcorpus/tests/test_times.py @@ -1,6 +1,5 @@ -from os.path import expanduser, realpath, join, dirname, relpath, abspath +from os.path import join, dirname, abspath from datetime import datetime -from importlib import reload import pytest diff --git a/backend/conftest.py b/backend/conftest.py index 47b7e430f..09a24fd5c 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -1,6 +1,9 @@ +from time import sleep + import pytest +import requests from allauth.account.models import EmailAddress -from time import sleep + from ianalyzer.elasticsearch import elasticsearch from addcorpus.load_corpus import load_corpus_definition from addcorpus.save_corpus import load_and_save_all_corpora @@ -60,8 +63,18 @@ def admin_client(client, admin_user, admin_credentials): yield client client.logout() -# elasticsearch +@pytest.fixture(scope='session') +def connected_to_internet(): + """ + Check if there is internet connection. Skip if no connection can be made. 
+ """ + try: + requests.get("https://1.1.1.1") + except: + pytest.skip('Cannot connect to internet') + +# elasticsearch @pytest.fixture(scope='session') def es_client(): """ @@ -78,7 +91,6 @@ def es_client(): return client # mock corpora - @pytest.fixture(autouse=True) def add_mock_corpora_to_db(db): #add mock corpora to the database at the start of each test diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py index 6a7c89168..02d92c432 100644 --- a/backend/corpora/dutchannualreports/dutchannualreports.py +++ b/backend/corpora/dutchannualreports/dutchannualreports.py @@ -50,7 +50,7 @@ class DutchAnnualReports(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f: reader = csv.DictReader(f) @@ -180,7 +180,7 @@ def sources(self, start=min_date, end=max_date): ), FieldDefinition( name='content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), display_name='Content', display_type='text_content', visualizations=['wordcloud'], diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index f326ced2a..ba366c289 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -40,7 +40,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'text' tag_entry = 'p' @@ -315,7 +315,7 @@ def fields(self): display_name='Content', display_type='text_content', description='Text content.', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), results_overview=True, search_field_core=True, extractor=XML(tag='p', multiple=True, diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index a6b517dac..00c96593f 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -29,6 +29,7 @@ class Ecco(XMLCorpusDefinition): description_page = 'ecco.md' min_date = datetime(year=1700, month=1, day=1) max_date = datetime(year=1800, month=12, day=31) + data_directory = settings.ECCO_DATA es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco') image = 'ecco.jpg' @@ -44,7 +45,7 @@ class Ecco(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) def sources(self, start=min_date, end=max_date): logging.basicConfig(filename='ecco.log', level=logging.INFO) @@ -149,7 +150,7 @@ def fields(self): name='content', display_name='Content', display_type='text_content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), description='Text content.', results_overview=True, search_field_core=True, diff --git a/backend/corpora/goodreads/goodreads.py b/backend/corpora/goodreads/goodreads.py index dfdd07259..db5d0561c 100644 --- 
a/backend/corpora/goodreads/goodreads.py +++ b/backend/corpora/goodreads/goodreads.py @@ -13,7 +13,6 @@ from addcorpus.corpus import CSVCorpusDefinition, FieldDefinition from addcorpus.es_mappings import main_content_mapping -from addcorpus.es_settings import es_settings logger = logging.getLogger('indexing') diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index b700e82c1..5d08bf104 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -46,7 +46,7 @@ class GuardianObserver(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'Record' @@ -170,7 +170,7 @@ def sources(self, start=datetime.min, end=datetime.max): ), FieldDefinition( name='content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), display_name='Content', display_type='text_content', visualizations=['wordcloud'], diff --git a/backend/corpora/parliament/parliament.py b/backend/corpora/parliament/parliament.py index 94a557b5d..6f3be976f 100644 --- a/backend/corpora/parliament/parliament.py +++ b/backend/corpora/parliament/parliament.py @@ -38,7 +38,7 @@ class Parliament(CorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) # overwrite below in child class if you need to extract the (converted) transcription diff --git a/backend/corpora/parliament/tests/test_es_settings.py b/backend/corpora/parliament/tests/test_es_settings.py index 11d446416..f554fae15 100644 --- a/backend/corpora/parliament/tests/test_es_settings.py +++ b/backend/corpora/parliament/tests/test_es_settings.py @@ -3,11 +3,11 @@ import os import shutil -def test_stopwords(clean_nltk_data_directory): +def test_stopwords(clean_nltk_data_directory, settings, connected_to_internet): """ Check that stopwords results are valid and all languages are included """ - + settings.NLTK_DATA_PATH = clean_nltk_data_directory cases = [ { 'language': 'en', @@ -50,30 +50,14 @@ def test_stopwords(clean_nltk_data_directory): @pytest.fixture -def clean_nltk_data_directory(): +def clean_nltk_data_directory(settings): """ Temporarily move already downloaded nltk_data if it was already downloaded, and restore the nltk_data directory after testing. If no nltk_data folder existed, data downloaded during testing will also be removed when done. 
""" - data_path = es_settings.NLTK_DATA_PATH - - if os.path.isdir(data_path): - # remove already downloaded data - temp_path = os.path.join(es_settings.HERE, '_nltk_data_temp') - shutil.move(data_path, temp_path) - - yield data_path - - # clear test data - if os.path.exists(data_path): - shutil.rmtree(data_path) - - # move the old data back - shutil.move(temp_path, data_path) - else: - yield data_path + here = os.path.dirname(os.path.abspath(__file__)) + data_path = os.path.join(here, '_nltk_data_temp') + yield data_path - # clear test data - if os.path.isdir(data_path): - shutil.rmtree(data_path) + shutil.rmtree(data_path) diff --git a/backend/corpora/parliament/tests/test_import.py b/backend/corpora/parliament/tests/test_import.py index fd1b907c6..398e85c6b 100644 --- a/backend/corpora/parliament/tests/test_import.py +++ b/backend/corpora/parliament/tests/test_import.py @@ -16,6 +16,7 @@ 'debate_id': 'ca.proc.d.2015-02-02', 'chamber': 'House of Commons', 'party': 'New Democratic Party', + 'role': 'Interjection', 'speaker': 'Jack Harris', 'speaker_id': 'c846297d-8bc7-4e69-b6eb-31d0e19f7ec1', 'speaker_constituency': 'St. John\'s East', @@ -198,6 +199,7 @@ 'Boorsma, wegens verblijf buitenslands.', ]), 'id': 'nl.proc.ob.d.h-ek-19992000-493-493.1.5.1', + 'source_archive': 'PoliticalMashup', 'speaker': 'De voorzitter Jurgens', 'speaker_id': 'nl.m.01992', 'speaker_gender': None, @@ -282,6 +284,7 @@ 'date': '2021-09-14', 'date_is_estimate': None, 'chamber': 'Riksdag', + 'country': 'Sweden', 'speech': 'Ärade ledamöter! Varmt välkomna tillbaka till riksdagen! Det känns stort att få välkomna er här på tröskeln till det fjärde riksmötet den här mandatperioden. Vi har ännu ett mycket speciellt arbetsår bakom oss, till stor del präglat av pandemin. Även om vi visste att det inte var helt över för ett år sedan tror jag att vi var många som hoppades att en tydligare vändning var på väg. Så blev det inte. I stället fick vi ytterligare ett riksdagsår med ett reducerat antal ledamöter vid voteringar och utskottsarbete till stor del på distans. Men förhoppningsvis börjar vi nu gå tillbaka mot mer normala arbetsformer. Ett tydligt tecken på detta är att alla 349 ledamöter kommer att vara med vid riksmötets öppnande i eftermiddag. Jag tycker att det är angeläget att riksdagens och regeringens alla ledamöter kan vara på plats vid denna högtidliga och viktiga ceremoni, särskilt som detta är det sista öppnandet under den här mandatperioden. Däremot genomförs inget upprop nu på förmiddagen, och vi vidtar den försiktighetsåtgärden att drygt en tredjedel av ledamöterna och statsråden får sitta på läktaren under ceremonin. Formerna beslutades av mig efter diskussion med gruppledarna och de vice talmännen redan i början av augusti, alltså långt innan det blev bestämt att alla ledamöter får delta i voteringar efter riksmötets öppnande. Jag såg inget skäl att med kort varsel börja ändra i planeringen för riksmötets öppnande, så just denna speciella dag får inte alla ledamöter sitta nere på golvet här i kammaren . M en från och med riksmötets första votering sitter var och en på sin plats och röstar igen på vanligt sätt. Även om pandemin inte är över är situationen i Sverige ändå en helt annan nu än för ett år sedan. Därför har vi – talmanspresidiet och gruppledarna – gjort bedömningen att det är möjligt att samla fler personer än förra året men ändå långt färre än ett vanligt år. 
Vi har försökt finna en så god balans som möjligt mellan nödvändiga säkerhetsåtgärder, riksdagsordningens bestämmelser och respekt för traditionen. Den sedvanliga mottagningen i Sammanbindningsbanan är som bekant inställd, och det genomförs heller inte någon konsert i Konserthuset. Jag är glad över att vi också kommer att få hjälp att minnas dessa föregångare och förebilder genom att de får en permanent plats på Riksplan i form av en staty. Här tillkommer det att det i trapphallen i Östra riksdagshuset kommer att invigas en tavla som föreställer de här fem pionjärerna. Statyn dröjer ett tag – den kommer att invigas nästa år – men redan i kväll vill riksdagen på dagen för riksmötets öppnande, denna demokratins högtidsdag, uppmärksamma demokratijubileet med att lysa upp Stockholmsnatten med ett ljusspel. Jag kommer att tända en fasadbelysning på Östra riksdagshuset vid en webbsänd ceremoni klockan 20. Ljusspelet kan sedan ses varje kväll till och med den 20 september. Men demokratifirandet tar inte slut där. Vad passar väl bättre på FN:s demokratidag den 15 september än att fira med ett seminarium? I morgon anordnar riksdag och regering seminariet 100 år av demokrati – vilka lärdomar tar vi med oss? Se det gärna på riksdagen.se! Efter riksmötets öppnande tror jag att vi alla ser fram emot ett nytt arbetsår i riksdagen under något mer normala former. Jag har ju, som ni alla vet, tillsammans med gruppledarna slutit en ny överenskommelse om arbetsformerna under hösten, och gruppledarna har också beslutat att inte förlänga överenskommelsen om 55 närvarande ledamöter vid voteringar. Alla ledamöter kan alltså delta vid voteringarna, men vi behåller möjligheten att delta på distans vid utskottens sammanträden. Varje utskott avgör när det är motiverat att hålla fysiska sammanträden, och när man deltar fysiskt planerar vi för att det ska gå att hålla avstånd. Vi ska däremot fortsätta hjälpas åt att hålla antalet externa besök i riksdagens hus nere. Externa åhörare vid olika arrangemang bör undvikas liksom guidade visningar och mingelsituationer. Pandemin är inte över. Vi fortsätter att anpassa verksamheten när och om det behövs, men förhoppningsvis går vi mot ett mer normalt läge. Ärade ledamöter! Det här har varit en mandatperiod som ingen annan. Jag tror inte att någon hade kunnat förutse de många olika, oväntade och delvis dramatiska händelser som har inträffat. Jag tänker naturligtvis i första hand på pandemin och alla dess konsekvenser men även på de två regeringsbildningarna. Och då är det ändå ett helt år kvar av mandatperio ­ den. Jag tror att vi alla kan se fram emot ännu ett händelserikt och spännan ­ de riksdagsår fram till valet. Vi vet i alla fall att det i början av november blir den tredje regeringsbildningen under den här mandatperioden. Oavsett hur man ser på det politiska läget vill jag framhålla, apropå just demokratijubileet, att regeringsbildningarna inte har inneburit någon kris för demokratin. Svensk demokrati står stark, och den är värd att fira. Alla aktörer har i regeringsbildningsprocesserna använt de olika verktyg som finns i den demokratiska, parlamentariska verktygslådan. Misstroendeomröstning, beslut att inte utlysa extraval och talmansrundor – allt sådant följer av de lagar som vi har skapat för vår demokrati. Skeendet må vara turbulent i vissa stycken, men det följer demokratins spelregler. Ärade ledamöter! Jag vill avsluta med några rader ut dikten Sommaren i Sverige av Werner Aspenström. 
Den skildrar på ett fint sätt vemodet och skönheten när sommaren går mot sitt slut. Då landar på min hand den förgänglighetens tanke som vi kallar trollslända. Ett gult löv lösgör sig och faller klingande mot marken. Sommaren måste hastigt bärgas. … Ty hösten närmar sig med toppeld i asparna. Låt mig nu önska er en fin höst och ett produktivt arbetsår. På återseende här i kammaren klockan 14! Stockholms kommun Stockholms län Södermanlands län Jönköpings län Kronobergs län Blekinge län Hallands län Göteborgs kommun Värmlands län Jämtlands län Norrbottens län EU-dokument Åttaveckorsfristen för att avge ett motiverat yttrande skulle gå ut den 5 november . EU-dokument Följande frågor för skriftliga svar hade framställts: 2020/21:3636 Amorteringskravet och ojämställd bostadsmarknad 2020/21:3637 Den kinesiske ambassadörens agerande 2020/21:3638 Vaccin 2020/21:3639 Lukasjenkos tillgång till 1 miljard dollar från IMF 2020/21:3640 Markering mot Irans idrottsminister 2020/21:3642 Kriminalitet på bostadsmarknaden Skriftliga svar på följande frågor hade kommit in: 2020/21:3535 Barns rätt till säkerställda skyddade boenden 2020/21:3537 Elbrist som hotar investeringar i Sverige 2020/21:3538 Åtgärder för att trygga boende', 'sequence': '0', 'id': 'i-2a00eff84ce04676-0', @@ -304,6 +307,7 @@ { 'book_id': 'bn_1828-30_1__01', 'book_label': 'Hederwärda bonde-ståndets protokoller wid lagtima riksdagen i Stockholm åren 1828 och 1829. Första bandet.', + 'country': 'Sweden', 'era': 'Ståndsriksdagen', 'chamber': 'Bönder', 'date_earliest': '1828-01-01', @@ -497,6 +501,7 @@ 1878.""", 'id': 'Adeln_Prot_1877_III.pdf_0', + 'speech_type': 'minutes', 'chamber': 'nobility', 'date_earliest': '1877-01-01', 'date_latest': '1877-12-31', @@ -634,7 +639,7 @@ def test_imports(parliament_corpora_settings, corpus_object): for key in resulted_fields: if not key in tested_fields: - message = 'Key "{}" is included the result for {} but has no specification'.format(key, corpus_object.get('name')) + message = 'Key "{}" is included in the result for {} but has no specification'.format(key, corpus_object.get('name')) warnings.warn(message) docs = get_documents(corpus, start, end) diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index cf74a6c10..eb8da607e 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ -284,7 +284,7 @@ def speech(): display_name='Speech', description='The transcribed speech', # each index has its own definition of the 'clean' and 'stemmed' analyzer, based on language - es_mapping = main_content_mapping(token_counts=True, stopword_analysis=True, stemming_analysis=True, updated_highlighting=True), + es_mapping = main_content_mapping(token_counts=True, stopword_analysis=True, stemming_analysis=True, language='en', updated_highlighting=True), results_overview=True, search_field_core=True, display_type='text_content', diff --git a/backend/corpora/peaceportal/FIJI/XMLtemplate.j2 b/backend/corpora/peaceportal/FIJI/XMLtemplate.j2 new file mode 100644 index 000000000..702a8d0ef --- /dev/null +++ b/backend/corpora/peaceportal/FIJI/XMLtemplate.j2 @@ -0,0 +1,71 @@ + + + + + {{ title }} + + + + + + + + + {{ presentLocation }} + {%- if publications %} + + {%- for publication in publications %} + {{ publication }} + {%- endfor %} + + {% endif -%} + + + + {{ provenance }} + {{ date }} + {{ remarksOnDate }} + + + + + + + {%- if persons %} + + + {%- for person in persons %} + + {{ person.name }} + 
+ {%- endfor %} + + + {% endif -%} + + {%- for language in languages %} + {{ language }} + {%- endfor %} + + + + + {{ facsimile }} + {{ photosLeonard }} + {{ image3D }} + + + + {{ transcription }} + {{ inscriptionType }} + {{ iconographyType }} + {{ iconographyDescription }} + {{ material }} + {{ incipit }} + {{ age }} + {{ ageComments }} + {{ commentary }} + + + diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py new file mode 100644 index 000000000..6aafed152 --- /dev/null +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -0,0 +1,281 @@ +import re +import os +import os.path as op +import logging + +from django.conf import settings + +from addcorpus.corpus import XMLCorpusDefinition +from addcorpus.extract import XML, Constant, Combined +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor + +class PeaceportalFIJI(PeacePortal, XMLCorpusDefinition): + ''' + This is a fresh version of Ortal-Paz Saar's 'Funerary Inscriptions of Jews from Italy' corpus, + updated to align with the PEACE portal index. This mostly implies that there are fewer fields + than in the earlier version (i.e. the one under corpora/jewishinscriptions). + ''' + + data_directory = settings.PEACEPORTAL_FIJI_DATA + es_index = getattr(settings, 'PEACEPORTAL_FIJI_ES_INDEX', 'peaceportal-fiji') + filename_pattern = re.compile(r'\d+') + + def sources(self, start, end): + logger = logging.getLogger(__name__) + for directory, _, filenames in os.walk(self.data_directory): + for filename in filenames: + name, extension = op.splitext(filename) + full_path = op.join(directory, filename) + if extension != '.xml': + logger.debug(self.non_xml_msg.format(full_path)) + continue + match = self.filename_pattern.match(name) + if not match: + logger.warning(self.non_match_msg.format(full_path)) + continue + inscriptionID = match.groups() + yield full_path, { + 'inscriptionID': inscriptionID + } + + def __init__(self): + super().__init__() + self.source_database.extractor = Constant( + value='Funerary Inscriptions of Jews from Italy (Utrecht University)' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'titleStmt', 'title'], + toplevel=False, + ) + + self.url.extractor = Constant( + value=None + ) + + # the year is commented out: need to have not before / not after fields + # this is advisable since often we only roughly know the century + # self.year.extractor = XML( + # tag=['teiHeader', 'fileDesc', 'sourceDesc', + # 'msDesc', 'history', 'origin', 'origDate'], + # toplevel=False + # ) + + self.transcription.extractor = XML( + tag=['text', 'body', 'transcription'], + toplevel=False, + flatten=True + ) + + self.names.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + flatten=True, + multiple=True, + toplevel=False, + ) + + self.sex.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + attribute='sex', + multiple=True, + toplevel=False, + ) + + self.age.extractor = XML( + tag=['text', 'body', 'age'], + toplevel=False, + transform=lambda age: transform_age_integer(age) + ) + + self.country.extractor = Constant( + value='Italy' + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'history', 'origin', 'provenance'], + toplevel=False, + ) + + self.material.extractor = XML( + tag=['text', 'body', 'material'], + 
toplevel=False, + transform=lambda x: categorize_material(x) + ) + + self.material_details = XML( + tag=['text', 'body', 'material'], + toplevel=False, + ) + + self.language.extractor = XML( + tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], + toplevel=False, + multiple=True, + transform=lambda x: normalize_language(x) + ) + + self.comments.extractor = Combined( + XML( + tag=['text', 'body', 'commentary'], + toplevel=False, + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'history', 'origin', 'remarksOnDate'], + toplevel=False, + transform=lambda x: 'DATE:\n{}\n'.format(x) if x else x + ), + XML( + tag=['text', 'body', 'ageComments'], + toplevel=False, + transform=lambda x: 'AGE:\n{}\n'.format(x) if x else x + ), + XML( + tag=['text', 'body', 'iconographyDescription'], + toplevel=False, + transform=lambda x: 'ICONOGRAPHY:\n{}\n'.format(x) if x else x + ), + transform=lambda x: join_commentaries(x) + ) + + + self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.location_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'location'], + toplevel=False + ) + + self.iconography.extractor = XML( + tag=['text', 'body', 'iconographyType'], + toplevel=False + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_latin.extractor = Combined( + self.transcription.extractor, + Constant('la'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_greek.extractor = Combined( + self.transcription.extractor, + Constant('el'), + transform=lambda x: get_text_in_language(x) + ) + + self.fields = exclude_fields_without_extractor(self.fields) + + +def transform_age_integer(age): + try: + return int(age) + except: + return None + + +def normalize_language(languages): + results = [] + for lang in languages: + if not lang: + results.append('Unknown') + continue + + ltext = lang.lower().strip() + if 'greek' in ltext or 'greeek' in ltext: + results.append(select_greek(lang)) + if 'latin' in ltext: + results.append(select_latin(lang)) + if 'hebrew' in ltext: + results.append(select_hebrew(lang)) + if ltext == 'aramaic' or ltext == 'samaritan': + return lang + if '?' 
in ltext or ltext == 'x' or ltext == 'none': + results.append('Unknown') + return results + + +def select_greek(text): + text = text.strip() + if text in [ + "Greek", "Greek (?)", "Greeek", + "Greek (some Latin characters)", + "Latin (some Greek characters)", + "Greek or Latin", "Latin and Greek (?)", + "Latin in Greek characters" + "Greek (transliterated Latin?)", + "Greek with transliterated Latin (?)", + "Greek with transliterated Latin formula", + ]: + return 'Greek' + if text in [ + "Greek (in Hebrew characters)", + "Greek in Latin characters (?)", + "Latin (including transliterated Greek)", + "transliterated Greek" + ]: + return 'Greek (transliterated)' + +def select_latin(text): + text = text.strip() + if text in [ + "Latin", "Latin (?)", + "Greek (some Latin characters)", + "Latin (some Greek characters)", + "Latin (including transliterated Greek)", + "Greek or Latin", "Latin and Greek (?)", + "Latin (transliterated Hebrew)" + ]: + return "Latin" + + if text in [ + "Latin in Greek characters", + "Greek (transliterated Latin?)", + "Greek with transliterated Latin (?)", + "Greek with transliterated Latin formula", + ]: + return "Latin (transliterated)" + + +def select_hebrew(text): + text = text.strip() + + if text in [ + "Hebrew", "Hebrew (?)" + ]: + return "Hebrew" + + if text in [ + "Latin (transliterated Hebrew)", + "Hebrew (transliterated)", + ]: + return "Hebrew (transliterated)" + + + + + # TODO: new fields + + # TODO: move to a comments field: + + + + # excluded (for now): + # 3D_image + # inscription_type + + # TODO: discuss + # fascimile + # photos_leonard diff --git a/backend/corpora/peaceportal/FIJI/fiji_converter.py b/backend/corpora/peaceportal/FIJI/fiji_converter.py new file mode 100644 index 000000000..3fe108f39 --- /dev/null +++ b/backend/corpora/peaceportal/FIJI/fiji_converter.py @@ -0,0 +1,277 @@ +''' +This script is based on the convertDatabase.py Jelmer van Nuss wrote to extract +FIJI data from Ortal-Paz Saar's excelsheet. As opposed to that script (which seemed to have +worked only with a manually edited source file), it is explicit in the changes required +to extract the data. This hopefully secures that the script can be re-used when Ortal-Paz +sends us a updated excelsheet (e.g. with translations added). 
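A typical invocation, as a sketch: run it from the script's own directory (XMLtemplate.j2 is opened with a relative path) and point --input at the Excel export, which is read with openpyxl even though the default filename ends in .csv:

    python fiji_converter.py --input FIJI_full.csv --out_folder FIJI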
+''' +import os +import sys +import openpyxl +import argparse +from jinja2 import Template + + +def main(sys_args): + args = parse_arguments(sys_args) + out_folder = args.out_folder + + if not os.path.exists(out_folder): + os.makedirs(out_folder) + + wb = openpyxl.load_workbook(args.input) + sheet = wb['Sheet1'] + headers = list(list(sheet.values)[0]) + preprocess_headers(headers) + for row in sheet.values: + row_dict = {headers[i]: row[i] for i in range(len(row))} + record = extract_record(row_dict) + if record: + export(out_folder, record) + + +def preprocess_headers(headers): + for index, header in enumerate(headers): + if header == 'Date (add 68 to the year of Temple destruction)': + headers[index] = 'Date' + if header == 'Sex ': + headers[index] = 'Sex' + if header == 'Iconography': + headers[index] = 'Iconography type' + if header == 'Iconography details': + headers[index] = 'Iconography description' + + +def extract_record(row): + if not row['Inscription no.']: + return None + return dict( + title=row["Inscription no."], + date=row["Date"], + remarksOnDate=preprocess_text(row["Remarks on date"]), + provenance=row["Provenance"], + presentLocation=row["Present location"], + publications=get_publications(row), + facsimile=row["Photo / Facsimile from publication"], + photosLeonard=row["Photos by Leonard"], + image3D=row["3D image"], + transcription=get_transcription(row), + inscriptionType=row["Inscription type"], + persons=get_persons(row), + age=row['Age'], + ageComments=preprocess_text(row["Remarks on age"]), + iconographyType=row["Iconography type"], + iconographyDescription=preprocess_text(row["Iconography description"]), + material=row["Material"], + languages=get_languages(row), + incipit=row["Incipit"], + commentary=get_commentary(row) + ) + + +def export(out_folder, record): + export_path = os.path.join(out_folder, '{}.xml'.format(record['title'])) + with open('XMLtemplate.j2') as file_: + template = Template(file_.read()) + + with open(export_path, 'w+', encoding='utf-8') as xmlFile: + xmlFile.write(template.render(record)) + + +def get_publications(row): + results = [] + publication_nos = str(row["No. in publication"]).split(';') + publications = row["Publication"] + if not publications: + return results + publications = publications.split(';') + + for index, pub in enumerate(publications): + publication = pub.replace('\n', '') + try: + publication_no = publication_nos[index].replace('\n', '').strip() + publication = "{} ({})".format(publication, publication_no) + except IndexError: + pass # ignore adding pub_no if it doesn't exist + results.append(publication) + return results + + +def get_transcription(row): + transcription = preprocess_text(row["Transcription"]) + return transcription.replace('\n', '\n\n') + + +def get_languages(row): + value = row["Language"] + if not value: + return "" + langs = value.split(',') + if len(langs) > 1: + cleaned = [] + for lang in langs: + cleaned.append(lang.strip()) + return cleaned + else: + return langs + + +def get_commentary(row): + commentary = row["Open questions / Remarks"] + # add number of lines surviving (if it exists) + # Note that at the time of writing, there is only 1 (!) record + # that has data in this field + additional = row['Number of lines (s=surviving, o=original)'] + if additional: + period = commentary.endswith('.') + commentary = '{}{} There are {} surviving lines.'.format( + commentary, '.' 
if not period else '', additional + ) + if commentary: + return commentary + else: + return "" + + +def preprocess_text(text): + ''' + Preprocess a text field. + For now replaces < and > with html entities. + ''' + if not text: + return "" + return text.replace('<', '<').replace('>', '>') + + +def get_persons(row): + persons = [] + inscription_id = row['Inscription no.'] + names = get_names_from_field(row, "Names mentioned") + namesHebrew = get_names_from_field(row, "Names mentioned (original language)") + sexes = get_sexes(row) + + if len(names) == 1 and len(namesHebrew) > 1 and len(sexes) == 1: + # if we have multiple Hebrew names, simply join them together + # TODO: check with Ortal-Paz if this is ok + persons.append(create_person( + names[0], " ".join(namesHebrew), sexes[0])) + elif len(names) == 1 and len(namesHebrew) == 1 and len(sexes) > 1 or inscription_id == '368': + # if we have multiple sexes, store name(s) once and create a person entry to record each sex + # also handles one special case (ID 368) + for index, sex in enumerate(sexes): + if index == 0: + persons.append(create_person( + names[0], namesHebrew[0], sexes[0])) + else: + persons.append(create_person('', '', sexes[index])) + elif len(names) > 1 or len(namesHebrew) > 1 or len(sexes) > 1: + # TODO: discuss the three remaining cases with Ortal-Paz + # custom cases for some rows + # if row['Inscription no.'] == 33: + # persons.append(create_person(" ".join(names), + # " ".join(namesHebrew), sexes[0])) + # else: + # pass + # print(row['Inscription no.']) + # print(names, namesHebrew, sexes) + pass + elif len(names) > 1 and len(namesHebrew) > 1 and len(sexes) > 1: + # if we get here there are multiple people and we assume they are complete + for index, name in enumerate(names): + persons.append(create_person( + name, namesHebrew[index], sexes[index])) + else: + # simple case of a single person + name = first_or_empty(names) + nameHebrew = first_or_empty(namesHebrew) + sex = sexes[0] + persons.append(create_person(name, nameHebrew, sex)) + + return persons + + +def first_or_empty(_list): + if len(_list) > 0: + return _list[0] + else: + return '' + + +def get_names_from_field(row, field): + results = [] + names_raw = extract_multifield(row, field, '\n') + for name in names_raw: + if name == 'X' or name == 'Χ': + # Note that the second character is not a 'X', but one copy-pasted from the commandline (and which looks a lot like one) + results.append('') + else: + results.append(name) + return results + + +def get_sexes(row): + results = [] + sexes_raw = extract_multifield(row, "Sex", '\n') + for sex in sexes_raw: + if '?' in sex: + results.append('Unknown') + elif 'M' in sex and 'F' in sex: + results.append('M') + results.append('F') + else: + results.append(sex) + return results + + +def create_person(name, nameHebrew, sex): + if not name: + return { + 'name': '', 'sex': sex + } + else: + return { + 'name': "{} ({})".format(name, preprocess_text(nameHebrew)), 'sex': sex + } + + +def extract_multifield(row, fieldname, splitter): + ''' + Extract the values from a single field that (might) contains multiple values. + Returns an array that will not contain empty strings or None. + ''' + results = [] + content = row[fieldname] + if not content: + return results + values = content.split(splitter) + for value in values: + if value: + results.append(value) + return results + + +def parse_arguments(sys_args): + ''' + Parse the supplied arguments. 
+ ''' + parser = argparse.ArgumentParser( + description='Preprocess FIJI csv (from excelsheet)') + + parser.add_argument( + '--input', '-in', dest='input', required=False, default='FIJI_full.csv', + help='Path to the CSV file that contains the data. Defaults to \'FIJI_full.csv\' (i.e. in the script\'s folder') + + parser.add_argument( + '--delimiter', '-d', dest='delimiter', required=False, default=';', + help='Character that delimits fields in the CSV. Defaults to \';\'') + + parser.add_argument( + '--out_folder', '-out', dest='out_folder', required=False, default="FIJI", + help='''Path to the folder where the output should end up. + Will be created if it doesn\'t exist. Defaults to \'FIJI\' (i.e. in the script\'s folder)''') + + parsedArgs = parser.parse_args() + return parsedArgs + +if __name__ == "__main__": + main(sys.argv) diff --git a/backend/corpora/peaceportal/__init__.py b/backend/corpora/peaceportal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/corpora/peaceportal/conftest.py b/backend/corpora/peaceportal/conftest.py new file mode 100644 index 000000000..15c2e1626 --- /dev/null +++ b/backend/corpora/peaceportal/conftest.py @@ -0,0 +1,21 @@ +import pytest +import os + +here = os.path.abspath(os.path.dirname(__file__)) + +@pytest.fixture() +def peace_test_settings(settings): + settings.CORPORA = { + 'peaceportal': os.path.join(here, 'peaceportal.py'), + 'peaceportal-epidat': os.path.join(here, 'epidat.py'), + 'peaceportal-fiji': os.path.join(here, 'FIJI', 'fiji.py'), + 'peaceportal-iis': os.path.join(here, 'iis.py'), + 'peaceportal-tol': os.path.join(here, 'tol.py'), + } + + settings.PEACEPORTAL_EPIDAT_DATA= os.path.join(here, 'tests', 'data', 'epidat') + settings.PEACEPORTAL_FIJI_DATA= os.path.join(here, 'tests', 'data', 'fiji') + settings.PEACEPORTAL_IIS_DATA = os.path.join(here, 'tests', 'data', 'iis', 'xml') + settings.PEACEPORTAL_IIS_TXT_DATA = os.path.join(here, 'tests', 'data', 'iis', 'transcription_txts') + settings.PEACEPORTAL_TOL_DATA = os.path.join(here, 'tests', 'data', 'tol') + settings.PEACEPORTAL_ALIAS = 'peaceportal' diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py new file mode 100644 index 000000000..a0f1e8b53 --- /dev/null +++ b/backend/corpora/peaceportal/epidat.py @@ -0,0 +1,405 @@ +import re +from copy import copy + +from django.conf import settings + +from addcorpus.corpus import XMLCorpusDefinition +from addcorpus.extract import XML, Constant, Combined, FilterAttribute +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor + + +class PeaceportalEpidat(PeacePortal, XMLCorpusDefinition): + + data_directory = settings.PEACEPORTAL_EPIDAT_DATA + es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 'peaceportal-epidat') + + languages = ['de', 'he', 'en', 'nl'] + + def __init__(self): + super().__init__() + self.source_database.extractor = Constant( + value='Epidat (Steinheim Institute)' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True + ) + + self.url.extractor = FilterAttribute( + tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + attribute_filter={ + 'attribute': 'type', + 'value': 'url' + } + ) + + self.year.extractor = 
XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + transform=lambda x: get_year(x), + ) + + self.not_before.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notBefore', + transform=lambda x: get_year(x), + ) + + self.not_after.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notAfter', + transform=lambda x: get_year(x), + ) + + self.transcription.extractor = XML( + tag=['text', 'body', 'div'], + toplevel=False, + multiple=False, + flatten=True, + transform=lambda x: clean_newline_characters(x), + transform_soup_func=extract_transcript + ) + + self.transcription_german.extractor = XML( + tag=['text', 'body', ], + toplevel=False, + multiple=False, + flatten=True, + transform=lambda x: clean_newline_characters(x), + transform_soup_func=extract_translation + ) + + self.names.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + flatten=True, + multiple=True, + toplevel=False, + ) + + self.sex.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + attribute='sex', + multiple=True, + toplevel=False, + transform=lambda x: convert_sex(x) + ) + + self.dates_of_death.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson'], + transform_soup_func=extract_death, + attribute='when', + multiple=False, + toplevel=False, + ) + + self.country.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country'], + toplevel=False, + transform_soup_func=extract_country, + transform=lambda x: clean_country(x), + flatten=True, + ) + + self.region.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country', 'region'], + toplevel=False, + flatten=True + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement'], + toplevel=False, + flatten=True, + transform_soup_func=extract_settlement, + ) + + self.location_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName'], + toplevel=False, + flatten=True, + transform_soup_func=extract_location_details, + ) + + self.material.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True, + transform=lambda x: categorize_material(x) + ) + + self.material_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True + ) + + self.language.extractor = XML( + tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], + toplevel=False, + multiple=True, + transform=lambda x: get_language(x) + ) + + self.comments.extractor = Combined( + XML( + tag=['text', 'body'], + toplevel=False, + transform_soup_func=extract_commentary, + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'condition'], + toplevel=False, + flatten=True, + transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x + ), + XML( + tag=['teiHeader', 'fileDesc', 
'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p'], + toplevel=False, + transform_soup_func=extract_support_comments, + ), + transform=lambda x: join_commentaries(x) + ) + + self.images.extractor = XML( + tag=['facsimile', 'graphic'], + multiple=True, + attribute='url', + toplevel=False + ) + + self.coordinates.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'], + toplevel=False, + multiple=False, + flatten=True + ) + + self.iconography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], + toplevel=False, + multiple=False + ) + + self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_english.extractor = Combined( + self.transcription.extractor, + Constant('en'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_dutch.extractor = Combined( + self.transcription.extractor, + Constant('nl'), + transform=lambda x: get_text_in_language(x) + ) + + self.fields = exclude_fields_without_extractor(self.fields) + + +def convert_sex(values): + if not values: + return ['Unknown'] + result = [] + for value in values: + if value == '1': + result.append('M') + elif value == '2': + result.append('F') + else: + result.append('Unknown') + return result + + +def clean_country(text): + if not text: + return 'Unknown' + if text.lower().strip() == 'tobedone': + return 'Unknown' + return text + + +def get_year(text): + if not text or text == '--': + return + matches = re.search('[1-2]{0,1}[0-9]{3}', text) + if matches: + return matches[0] + + +def get_language(values): + if not values: + return ['Unknown'] + if 'German in Hebrew letters' in values: + return ['German (transliterated)', 'Hebrew'] + return values + + +def extract_transcript(soup): + ''' + Helper function to ensure correct extraction of the transcripts. + Note that there are multiple formats in which these are stored, + but the text that we need is always in the `` children of + `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one). + ''' + if not soup: + return + return soup.find_all('ab') + + +def extract_translation(soup): + ''' + Helper function to extract translation from the tag + ''' + if not soup: + return + translation = soup.find('div', {'type': 'translation'}) + if translation: + return translation.find_all('ab') + else: + return + + +def extract_commentary(soup): + ''' + Helper function to extract all commentaries from the tag. + A single element will be returned with the commentaries found as text content. 
+ ''' + if not soup: return + found = [] + commentaries = soup.find_all('div', {'type': 'commentary'}) + + for commentary in commentaries: + if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: + p = commentary.find('p') + if p: + text = p.get_text() + if text: + text = clean_commentary(text) + found.append('{}:\n{}\n'.format(commentary['subtype'].strip().upper(), text)) + + if len(found) > 1: + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = "\n".join(found) + return cloned_soup + else: + return None + +def extract_support_comments(soup): + if not soup: return + cloned_soup = copy(soup) + cloned_soup.clear() + + commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS') + commentaries = add_support_comment(soup, commentaries, 'objectType', 'OBJECTTYPE') + + # add any additional text from the

element, + # i.e. if there is text it is the very last node + contents = soup.contents + text = contents[len(contents) - 1].strip() + if text: + text = clean_commentary(text) + commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text) + + cloned_soup.string = commentaries + return cloned_soup + + +def add_support_comment(soup, existing_commentaries, elem_name, commentary_name): + elem = soup.find(elem_name) + if elem: + text = elem.get_text() + if text: + text = clean_commentary(text) + return '{}{}:\n{}\n\n'.format(existing_commentaries, commentary_name, text) + return existing_commentaries + + +def extract_death(soup): + ''' + Helper function to extract date of death from multiple person tags. + ''' + if not soup: + return + return soup.find_all('death') + + +def extract_country(soup): + ''' + Helper function to extract country. + This is needed because the output of `flatten` would otherwise include the text contents + of the ``. + ''' + return clone_soup_extract_child(soup, 'region') + + +def extract_settlement(soup): + return clone_soup_extract_child(soup, 'geogName') + + +def extract_location_details(soup): + return clone_soup_extract_child(soup, 'geo') + + +def clone_soup_extract_child(soup, to_extract): + ''' + Helper function to clone the soup and extract a child element. + This is useful when the output of `flatten` would otherwise include the text contents + of the child. + ''' + if not soup: + return + cloned_soup = copy(soup) + child = cloned_soup.find(to_extract) + if child: + child.extract() + return cloned_soup + + # TODO: add field + + # TODO: move to a comments field: + + # excluded (for now): + # title + # organization (incl details, e.g. address) + # licence + # taxonomy (i.e. things like foto1, foto2 -> no working links to actual images) + diff --git a/backend/corpora/peaceportal/fiji_separate.py b/backend/corpora/peaceportal/fiji_separate.py new file mode 100644 index 000000000..e2b3f564f --- /dev/null +++ b/backend/corpora/peaceportal/fiji_separate.py @@ -0,0 +1,17 @@ +from django.conf import settings + +from corpora.peaceportal.peaceportal import PeacePortal + +class FIJISEPARATE(PeacePortal): + + es_index = settings.FIJI_ALIAS + + # all fields listed here will be ignored if they are + # in the PeacePortal base class definition. 
Ideal for excluding + # filters that are irrelevant + redundant_fields = ['source_database', 'region'] + + def __init__(self): + for field in self.fields: + if field.name in self.redundant_fields: + self.fields.remove(field) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py new file mode 100644 index 000000000..e9cd78a84 --- /dev/null +++ b/backend/corpora/peaceportal/iis.py @@ -0,0 +1,376 @@ +from copy import copy +from os.path import join + +from django.conf import settings + +from addcorpus.corpus import XMLCorpusDefinition +from addcorpus.extract import Combined, Constant, ExternalFile, FilterAttribute, XML +from addcorpus.serializers import LanguageField +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor + +class PeaceportalIIS(PeacePortal, XMLCorpusDefinition): + data_directory = settings.PEACEPORTAL_IIS_DATA + es_index = getattr(settings, 'PEACEPORTAL_IIS_ES_INDEX', 'peaceportal-iis') + + def add_metadata(self, filename): + external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA + return { + 'associated_file': join(external_file_folder, filename) + } + + def __init__(self): + super().__init__() + self.external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA + self.source_database.extractor = Constant( + value='Inscriptions of Israel/Palestine (Brown University)' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + transform=lambda x: ''.join(x.lower().split()) + ) + + self.url.extractor = FilterAttribute( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format( + ''.join(x.lower().split())) + ) + + # quick and dirty for now: extract value for 'notBefore' + self.year.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'date'], + toplevel=False, + attribute='notBefore' + ) + + self.not_before.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'date'], + toplevel=False, + attribute='notBefore' + ) + + self.not_after.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'date'], + toplevel=False, + attribute='notAfter', + ) + + self.transcription.extractor = ExternalFile( + stream_handler=extract_transcript + ) + + self.transcription_english.extractor = FilterAttribute( + tag=['div'], + toplevel=True, + multiple=False, + flatten=True, + attribute_filter={ + 'attribute': 'type', + 'value': 'translation' + }, + transform_soup_func=extract_paragraph, + transform=lambda x: ' '.join(x.split()) if x else None + ) + + # is not present in IIS data + # self.names.extractor = XML( + # tag=['teiHeader', 'profileDesc', + # 'particDesc', 'listPerson', 'person'], + # flatten=True, + # multiple=True, + # toplevel=False, + # ) + + self.iconography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], + toplevel=False, + multiple=True, + flatten=True + ) + + # is not present in IIS data + self.sex.extractor = Constant( + value='Unknown' + ) + + self.country.extractor = Constant( + value='Israel/Palestine' + ) + + 
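+        # A minimal sketch of how the pieces above are assumed to fit together (illustrative
+        # only, not used by the corpus itself): `sources()` in the PeacePortal base class
+        # yields (path, metadata) pairs, `add_metadata` above puts the converted transcription's
+        # path under 'associated_file', and the ExternalFile extractor passes that file's
+        # stream to `extract_transcript`. The helper name below is made up.
+        def _sketch_external_transcription(metadata):
+            # `metadata` is assumed to be the dict returned by add_metadata(filename)
+            with open(metadata['associated_file'], encoding='utf-8') as stream:
+                return extract_transcript(stream)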
self.region.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'placeName', 'region'], + toplevel=False, + flatten=True + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'placeName', 'settlement'], + toplevel=False, + flatten=True + ) + + self.location_details.extractor = Combined( + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'placeName'], + toplevel=False, + flatten=True + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'p'], + toplevel=False, + flatten=True + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'provenance'], + toplevel=False, + flatten=True + ) + ) + + self.material.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc'], + attribute='ana', + toplevel=False, + flatten=True, + transform=lambda x: categorize_material(x) + ) + + self.material_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc'], + attribute='ana', + toplevel=False, + flatten=True + ) + + self.language.extractor = Combined( + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='mainLang', + toplevel=False, + transform=lambda x: normalize_language(x) + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='otherLangs', + toplevel=False, + transform=lambda x: normalize_language(x) + ) + ) + self.language_code.extractor = Combined( + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='mainLang', + toplevel=False + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='otherLangs', + toplevel=False + ) + ) + + self.comments.extractor = Combined( + XML( + tag=['text'], + toplevel=False, + multiple=False, + flatten=True, + transform_soup_func=extract_comments, + transform=lambda x: clean_commentary(x) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'condition'], + toplevel=False, + transform_soup_func=extract_condition + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'layoutDesc', 'layout', 'p'], + toplevel=False, + transform=lambda x: 'LAYOUT:\n{}\n\n'.format(clean_commentary(x)) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc'], + toplevel=False, + attribute='ana', + transform=lambda x: 'OBJECTTYPE:\n{}\n\n'.format(x[1:]) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'dimensions'], + toplevel=False, + transform_soup_func=extract_dimensions, + transform=lambda x: 'DIMENSIONS:\n{}\n\n'.format( + x) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p'], + toplevel=False, + flatten=True, + transform=lambda x: 'SUPPORT:\n{}\n\n'.format( + clean_commentary(x)) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', 'handDesc', 'handNote'], + toplevel=False, + transform_soup_func=extract_handnotes + ), + transform=lambda x: join_commentaries(x) + ) + + 
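+        # A small illustration of what the Combined extractor above is assumed to produce:
+        # each sub-extractor yields one labelled block (or None), and `join_commentaries`
+        # drops the empty values and joins the rest with newlines. The values below are made up.
+        _example_parts = ('CONDITION:\ncomplete\n', None, 'LAYOUT:\nsingle column\n\n')
+        _example_comment = join_commentaries(_example_parts)
+        # _example_comment == 'CONDITION:\ncomplete\n\nLAYOUT:\nsingle column\n\n'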
self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_latin.extractor = Combined( + self.transcription.extractor, + Constant('la'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_greek.extractor = Combined( + self.transcription.extractor, + Constant('el'), + transform=lambda x: get_text_in_language(x) + ) + + self.fields = exclude_fields_without_extractor(self.fields) + + +def extract_transcript(filestream): + text = filestream.read().strip() + filestream.close() + # remove the tabs and spaces inherited from xml + text = clean_newline_characters(text) + if text: + text = text.replace('\t', '') + return text + + +def extract_paragraph(soup): + ''' + Extract first
<p>
element from `soup`, ignore the rest. + Ideal for ignoring

headers in the HTML versions of the body. + ''' + if not soup: + return + return soup.find('p') + + +def extract_comments(soup): + ''' + Helper function to extract the commentary from either or (siblings under ) + ''' + if not soup: + return + commentary_div = soup.find('div', {'type': 'commentary'}) + return extract_paragraph(commentary_div) + + +def extract_attribute_and_child_p(soup, field_header): + ''' + Extract value for 'ana' attribute from soup, + as well as the text from a
<p>
child. Will be returned + in a new soup, i.e. a single element with text content + in the following format `textcontent (attrivubtevalue)` + ''' + result = '' + text = '' + ana = None + if 'ana' in soup.attrs: + ana = soup['ana'] + p = extract_paragraph(soup) + if p: + text = p.get_text() + if text: + result = clean_commentary(text) + if ana: + result = '{} ({})'.format(result, ana) + + if result: + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = '{}:\n{}\n\n'.format(field_header, result) + return cloned_soup + + +def extract_condition(soup): + return extract_attribute_and_child_p(soup, 'CONDITION') + + +def extract_handnotes(soup): + if not soup: return + return extract_attribute_and_child_p(soup, 'HANDNOTES') + + +def extract_dimensions(soup): + result = '' + height_elem = soup.find('height') + if height_elem: + height = height_elem.get_text() + if height: + result = "H: {} ".format(height) + + width_elem = soup.find('width') + if width_elem: + width = width_elem.get_text() + if width: + result = "{}W: {}".format(result, width) + + depth_elem = soup.find('depth') + if depth_elem: + depth = depth_elem.get_text() + if depth: + result = "{} D: {}".format(result, depth) + + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = result + return cloned_soup + + +def normalize_language(text): + serializer = LanguageField() + return serializer.to_representation(text) + + # excluded (for now): + # revision history + + # MISSING (i.e. present in Epidat and Fiji) + # person(s) - names (profileDesc is completely missing) diff --git a/backend/corpora/peaceportal/iis_corpus_preprocessor.py b/backend/corpora/peaceportal/iis_corpus_preprocessor.py new file mode 100644 index 000000000..9be08fa47 --- /dev/null +++ b/backend/corpora/peaceportal/iis_corpus_preprocessor.py @@ -0,0 +1,100 @@ +import os +import sys +import glob +import argparse +from bs4 import BeautifulSoup + + +def main(sys_args): + args = parse_arguments(sys_args) + prepare_out_folder(args.out_folder) + preprocess(args.xml_folder, args.out_folder) + +def prepare_out_folder(out_folder): + if not os.path.exists(out_folder): + os.makedirs(out_folder) + else: + files = glob.glob('{}/*'.format(out_folder)) + for f in files: + os.remove(f) + +def preprocess(in_folder, out_folder): + + for filepath in glob.iglob('{}/*.xml'.format(in_folder)): + with open(filepath, 'r') as xml: + soup = BeautifulSoup(xml.read(), 'xml') + + filename = os.path.basename(filepath) + keep_only_transcription(filename, soup, out_folder) + # TODO: add extraction of foreigns + + +def keep_only_transcription(filename, soup, out_folder): + out_file = os.path.join(get_subfolder(out_folder, 'tei_with_transcription_only'), filename) + + text_tag = soup.find('text') + transcription = get_transcription(filename, text_tag) + text_tag.clear() + if transcription: + text_tag.append(transcription) + + with open(out_file, 'w') as f_out: + f_out.write(str(soup)) + + +## TODO: extract foreign and export them to separate file. 
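+# One possible shape for that TODO (a sketch under assumptions, not wired into main()):
+# collect the <foreign> elements from each document's transcription and write their text
+# to a separate file per input document. The 'foreigns' subfolder name is made up.
+def export_foreigns(filename, soup, out_folder):
+    text_tag = soup.find('text')
+    foreigns = text_tag.find_all('foreign') if text_tag else []
+    if not foreigns:
+        return
+    out_file = os.path.join(get_subfolder(out_folder, 'foreigns'), filename.replace('.xml', '.txt'))
+    with open(out_file, 'w') as f_out:
+        for foreign in foreigns:
+            f_out.write(foreign.get_text().strip() + '\n')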
+# def do_something_with_foreign(filename, soup): +# text_tag = soup.find('text') + # transcription = get_transcription(filename, text_tag) + # if transcription: + # foreigns = text_tag.find_all('foreign') + # # print(foreigns) + + # for f in foreigns: + # if f.findChild(): + # print(f) + + +def get_transcription(filename, text_tag): + transcription = text_tag.find('div', { 'subtype': 'transcription'}) + + # if there is no transcription, fallback to diplomatic + if not transcription: + transcription = text_tag.find('div', { 'subtype': 'diplomatic'}) + + if not transcription: + print('No transcription found in {}'.format(filename)) + return transcription + + +def get_subfolder(folder, subfoldername): + ''' + Get a subfolder with `subfoldername` in `folder`. + Will be created if it doesn't exist. + ''' + path = os.path.join(folder, subfoldername) + if not os.path.exists(path): + os.makedirs(path) + return path + + +def parse_arguments(sys_args): + ''' + Parse the supplied arguments. + ''' + parser = argparse.ArgumentParser( + description='Preprocess EpiDoc scrapes, i.e. extract Leiden') + + parser.add_argument( + '--xml_folder', '-xml', dest='xml_folder', required=True, + help='Path to the folder where the .xml files reside.') + + parser.add_argument( + '--out_folder', '-out', dest='out_folder', required=True, + help='Path to the folder where the output should end up. Will be created if it doesn\'t exist or emptied out if it does.') + + parsedArgs = parser.parse_args() + return parsedArgs + +if __name__ == "__main__": + main(sys.argv) diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py new file mode 100644 index 000000000..da8653927 --- /dev/null +++ b/backend/corpora/peaceportal/peaceportal.py @@ -0,0 +1,522 @@ +import os +import os.path as op +import logging +from datetime import datetime +from langdetect import detect +from langdetect.lang_detect_exception import LangDetectException + +from django.conf import settings + +from addcorpus.corpus import ParentCorpusDefinition, FieldDefinition +from addcorpus.es_mappings import int_mapping, keyword_mapping, main_content_mapping, text_mapping +from addcorpus.es_settings import es_settings +from addcorpus.extract import Constant +from addcorpus.filters import MultipleChoiceFilter, RangeFilter + +class PeacePortal(ParentCorpusDefinition): + ''' + Base class for corpora in the PEACE portal. + + This supplies the frontend with the information it needs. + Child corpora should only provide extractors for each field. + Consequently, create indices (with alias 'peaceportal') from + the corpora specific definitions, and point the application + to this base corpus. 
+ ''' + + title = "PEACE Portal" + description = "A collection of inscriptions on Jewish burial sites" + # store min_year as int, since datetime does not support BCE dates + min_year = -530 + max_date = datetime(year=1950, month=12, day=31) + visualize = [] + es_index = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal') + es_alias = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal') + scan_image_type = 'image/png' + # fields below are required by code but not actually used + min_date = datetime(year=746, month=1, day=1) + image = 'bogus.jpg' + category = 'inscription' + data_directory = 'bogus' + + # Data overrides from .common.XMLCorpus + tag_entry = 'TEI' + + # New data members + non_xml_msg = 'Skipping non-XML file {}' + non_match_msg = 'Skipping XML file with nonmatching name {}' + # overwrite below in child class if you need to extract the (converted) transcription + # from external files. See README. + languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-) + + @property + def es_settings(self): + return es_settings(self.languages, stopword_analysis=True, stemming_analysis=True) + + def sources(self, start, end): + for directory, _, filenames in os.walk(self.data_directory): + for filename in sorted(filenames): + name, extension = op.splitext(filename) + full_path = op.join(directory, filename) + if not self.validate_extension(extension, full_path): + continue + metadata = self.add_metadata(filename) + yield full_path, metadata + + def add_metadata(self, filename): + return {} + + def validate_extension(self, extension, full_path): + ''' + Check that the file is valid for this corpus. + So far, all PeacePortal corpora are XML, but may include CSV corpora in the future + ''' + logger = logging.getLogger(__name__) + if extension == '.xml': + return True + logger.debug(self.non_xml_msg.format(full_path)) + + def request_media(self, document): + images = document['fieldValues']['images'] + if not images: + images = [] + return { 'media': images } + + source_database = FieldDefinition( + name='source_database', + display_name='Source database', + description='The database a record originates from.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these databases.', + option_count=4, + ), + csv_core=True + ) + + _id = FieldDefinition( + name='id', + display_name='ID', + description='ID of the inscription entry.', + csv_core=True, + es_mapping=keyword_mapping(), + search_field_core=True + ) + + url = FieldDefinition( + name='url', + display_name='URL', + description='URL of the inscription entry.', + es_mapping=keyword_mapping(), + search_field_core=True + ) + + year = FieldDefinition( + name='year', + display_name='Year', + description='Year of origin of the inscription.', + es_mapping=int_mapping(), + search_filter=RangeFilter( + description='Restrict the years from which search results will be returned.', + lower=min_year, + upper=max_date.year, + ), + csv_core=True, + sortable=True, + visualization_type='term_frequency', + visualization_sort='key', + results_overview=True + ) + + not_before = FieldDefinition( + name='not_before', + display_name='Not before', + description='Inscription is dated not earlier than this year.', + es_mapping=int_mapping(), + hidden=True + ) + + not_after = FieldDefinition( + name='not_after', + display_name='Not after', + description='Inscription is dated not later than this year.', + es_mapping=int_mapping(), + hidden=True + ) + + transcription = FieldDefinition( + 
name='transcription', + es_mapping=main_content_mapping(), + display_name='Transcription', + description='Text content of the inscription.', + search_field_core=True, + results_overview=True, + display_type='text_content' + ) + + transcription_german = FieldDefinition( + name='transcription_de', + es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='de'), + language='de', + hidden=True + ) + + transcription_english = FieldDefinition( + name='transcription_en', + es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='en'), + language='en', + hidden=True + ) + + transcription_hebrew = FieldDefinition( + name='transcription_he', # no stemmers available + es_mapping=main_content_mapping(stopword_analysis=True, language='he'), + language='he', + hidden=True + ) + + transcription_latin = FieldDefinition( + name='transcription_la', + es_mapping={'type': 'text'}, # no stopwords / stemmers available + language='la', + hidden=True + ) + + transcription_greek = FieldDefinition( + name='transcription_el', + es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='el'), + language='el', + hidden=True + ) + + transcription_dutch = FieldDefinition( + name='transcription_nl', + es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='nl'), + language='nl', + hidden=True + ) + + age = FieldDefinition( + name='age', + display_name='Age', + description='Age of the buried person(s)', + es_mapping=int_mapping(), + search_filter=RangeFilter( + description='Filter by age of the buried persons.', + lower=0, + upper=100, + ), + extractor=Constant( + value=None + ) + ) + + # A string with all the names occuring in the source + names = FieldDefinition( + name='names', + es_mapping=text_mapping(), + display_name='Names', + description='Names of the buried persons.', + search_field_core=True + ) + + # Should be an array with potentially multiple values from these: 'M', 'F', or None. + sex = FieldDefinition( + name='sex', + display_name='Sex', + description='Gender(s) of the buried person(s). 
None if the sex is unknown.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these genders.', + option_count=3, + ), + csv_core=True + ) + + country = FieldDefinition( + name='country', + display_name='Country', + description='Country where the inscription was found.', + es_mapping=keyword_mapping(True), + search_filter=MultipleChoiceFilter( + description='Search only within these countries.', + option_count=5 + ), + visualization_type='term_frequency', + results_overview=True + ) + + settlement = FieldDefinition( + name='settlement', + display_name='Settlement', + description='The settlement where the inscription was found.', + es_mapping=keyword_mapping(True), + search_filter=MultipleChoiceFilter( + description='Search only within these settlements.', + option_count=29 + ), + visualization_type='term_frequency' + ) + + region = FieldDefinition( + name='region', + display_name='Region', + description='The region where the inscription was found.', + es_mapping=keyword_mapping(True), + search_filter=MultipleChoiceFilter( + description='Search only within these regions.', + option_count=29 + ), + visualization_type='term_frequency' + ) + + location_details = FieldDefinition( + name='location_details', + display_name='Location details', + description='Details about the location of the inscription', + es_mapping=text_mapping() + ) + + material = FieldDefinition( + name='material', + display_name='Material', + description='Type of material the inscription is written on.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these material types.', + option_count=39 + ), + visualization_type='term_frequency' + ) + + material_details = FieldDefinition( + name='material_details', + display_name='Material details', + description='Details about the material the inscription is written on.', + es_mapping=text_mapping(), + search_field_core=True + ) + + language = FieldDefinition( + name='language', + display_name='Language', + description='Language of the inscription.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these languages.', + option_count=10 + ), + csv_core=True, + visualization_type='term_frequency' + ) + + language_code = FieldDefinition( + name='language_code', + display_name='Language code', + description='ISO 639 code for the language of the inscription.', + es_mapping=keyword_mapping() + ) + + bibliography = FieldDefinition( + name='bibliography', + es_mapping=keyword_mapping(), + display_name='Bibliography', + description='Reference(s) to who edited and published this funerary inscription.' + ) + + comments = FieldDefinition( + name='comments', + es_mapping=text_mapping(), + display_name='Commentary', + description='Extra comments, questions or remarks on this inscription.', + search_field_core=True, + ) + + images = FieldDefinition( + name='images', + es_mapping=keyword_mapping(), + display_name='Images', + description='Links to image(s) of the inscription.', + hidden=True + ) + + coordinates = FieldDefinition( + name='coordinates', + es_mapping=keyword_mapping(), + display_name='Coordinates', + description='GIS coordinates for the inscription.' 
+ ) + + iconography = FieldDefinition( + name='iconography', + es_mapping=text_mapping(), + display_name='Iconography', + description='Description of the icons used in the inscription.', + search_field_core=True + ) + + dates_of_death = FieldDefinition( + name='dates_of_death', + es_mapping=keyword_mapping(), + display_name='Date of death', + ) + + def __init__(self): + self.fields = [ + self._id, + self.url, + self.year, + self.not_before, + self.not_after, + self.source_database, + self.transcription, + self.names, + self.sex, + self.dates_of_death, + self.age, + self.country, + self.region, + self.settlement, + self.location_details, + self.language, + self.language_code, + self.iconography, + self.images, + self.coordinates, + self.material, + self.material_details, + self.bibliography, + self.comments, + self.transcription_german, + self.transcription_hebrew, + self.transcription_latin, + self.transcription_greek, + self.transcription_english, + self.transcription_dutch + ] + + +def clean_newline_characters(text): + ''' + Remove all spaces surrounding newlines in `text`. + Also removes multiple newline characters in a row. + ''' + if not text: return + parts = text.split('\n') + cleaned = [] + for part in parts: + if not '\n' in part: + stripped = part.strip() + if stripped: + cleaned.append(part.strip()) + return '\n'.join(cleaned) + + +def clean_commentary(commentary): + ''' + Clean a commentary by removing all whitespaces characters between words, + except for one space. + ''' + return ' '.join(commentary.split()) + +def join_commentaries(commentaries): + ''' + Helper function to join the result of a Combined extractor + into one string, separating items by a newline + ''' + results = [] + for comm in commentaries: + if comm: + results.append(comm) + return "\n".join(results) + +def categorize_material(text): + ''' + Helper function to (significantly) reduce the material field to a set of categories. + The Epidat corpus in particular has mainly descriptions of the material. + Returns a list of categories, i.e. those that appear in `text`. + ''' + if not text: return ['Unknown'] + + categories = ['Sandstein', 'Kalkstein', 'Stein', 'Granit', 'Kunststein', + 'Lavatuff', 'Marmor', 'Kalk', 'Syenit', 'Labrador', 'Basalt', 'Beton', + 'Glas', 'Rosenquarz', 'Gabbro', 'Diorit', 'Bronze', + # below from FIJI and IIS + 'Limestone', 'Stone', 'Clay', 'Plaster', 'Glass', 'Kurkar', 'Granite', + 'Marble', 'Metal', 'Bone', 'Lead' ] + result = [] + ltext = text.lower() + + for c in categories: + if c.lower() in ltext: + result.append(translate_category(c)) + + if len(result) == 0: + # reduce unknown, other and ? to Unknown + # 'schrifttafel' removes some clutter from Epidat + if 'unknown' in ltext or 'other' in ltext or '?' 
in ltext or 'schrifttafel': + result.append('Unknown') + else: + result.append(text) + + return result + +def translate_category(category): + ''' + Helper function to translate non-English categories of material into English + ''' + pairs = { + 'Sandstein': 'Sandstone', + 'Kalkstein': 'Limestone', + 'Stein': 'Stone', + 'Granit': 'Granite', + 'Kunststein': 'Artificial stone', + 'Lavatuff': 'Tufa', + 'Marmor': 'Marble', + 'Kalk': 'Limestone', + 'Syenit': 'Syenite', + 'Labrador': 'Labradorite', + 'Beton': 'Concrete', + 'Glas': 'Glass', + 'Rosenquarz': 'Rose quartz', + 'Diorit': 'Diorite' + } + + for original, translation in pairs.items(): + if category == original: + return translation + return category + + +def get_text_in_language(_input): + ''' + Get all the lines from a transcription that are in a certain language + (according to the `langdetect` package). Note that `transcription` will + be split on newlines to create lines that will be fed to langdetect one by one. + All lines that are in `language_code` will be collected and returned as one string, + i.e. they will be joined with a space (no newlines!). + + Parameters: + _input -- A tuple or list with (transcription, language_code). Will typically be the output + of a Combined extractor, i.e. one for the transcript and a Constant extractor with the language code. + For a list of language codes detected by langdetect, see https://pypi.org/project/langdetect/ + ''' + results = [] + if len(_input) != 2 or not _input[0]: + return results + lines = _input[0].split('\n') + language_code = _input[1] + + for line in lines: + if not line: continue + detected_code = None + try: + # note that Aramaic is detected as Hebrew + detected_code = detect(line) + except LangDetectException: + # sometimes langdetect isn't happy with some stuff like + # very short strings with mainly numbers in it + pass + if detected_code and detected_code == language_code: + results.append(line) + return ' '.join(results) diff --git a/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml b/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml new file mode 100644 index 000000000..90136bb1c --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml @@ -0,0 +1,216 @@ + + + + + +epidat, blr-4 + + + + + + + + + + Salomon Ludwig Steinheim-Institut +

+Edmund-Körner-Platz 2 +D-45127 Essen +
+ + +blr-4 +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4 + + +Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4 +

+
+
+ + + + + +born digital + + +epidat +blr-4 + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4 + + + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4-t + + + + + + + +

+stone +sepulchral monument +

+
+
+ + + +
+
+ + + +1865-02-28 + + + + Germany + Thuringa + + + Bleicherode + + Jewish Cemetery + 51.434387 10.571183 + + + + + +
+
+ + + +EpiDoc: TEI XML for epigraphic Documents Schema + + + + +Julia Buchmann, Nicola Wiemann, Maike Schlotterhose; Bleicherode + + + +World Geodetic System + + + + + +Natan Schönfeld (Nathan Schönfeld) + + + + + + +Hebrew +German + + + + + + + + + + + + recto + + + + + + + Detail + + + + + + + verso + + + + + +
+Edition +
+ + + Hier ruhet + + der Kaufmann + + Nathan Schönfeld + + geb. d. 4. April 1812 + + gest. d. [28.] Februar 1865 + +
+
+ + + ‎‏פ״נ‏‎ + + ‎‏איש חמדות יקר רוח אוהב‏‎ + + ‎‏צדק ופועל טוב כ״ה נתן‏‎ + + ‎‏שאנפעלד נולד ח׳ של פסח‏‎ + + ‎‏תקע״ב ונפטר בשם טוב יום ג׳‏‎ + + ‎‏ב׳ אדר תרכ״ה לפ״ק‏‎ + + ‎‏תנצב״ה‏‎ + +
+
+
+Übersetzung +
+ +
+ + Hier ist begraben + + #.:ein werter Mann#.;, #.:edelmütig#.;, Wohltat + + liebend und Gutes wirkend, der geehrte Herr Natan + + Schönfeld, geboren 8. (Tag) von Pessach 572 + + und verschieden #.:mit gutem Namen#.; Tag 3, + + 2. Adar 625 der kleinen Zählung. + + Seine Seele sei eingebunden in das Bündel des Lebens +
+
+Zitate +

Zl 7: Dan 10,11 | Zl 7: Spr 17,27

+

Zl 10: bBer 17a

+
+
+Prosopographie +
+
+Bibliographie +
+ +
+ \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml b/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml new file mode 100644 index 000000000..63a21a51d --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml @@ -0,0 +1,302 @@ + + + + + +epidat, hlh-12 + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+Edmund-Körner-Platz 2 +D-45127 Essen +
+
+
+hlh-12 +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12 + + +Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12 +

+
+
+
+ + + + +born digital + + +epidat +hlh-12 + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12 + + + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12-t + + +Stadt Mülheim an der Ruhr, Sterberegister Broich 1891 (1196/5/14), Nr. 247.Kaufhold, Barbara, Jüdischen Leben in Mülheim an der Ruhr, Essen 2004. + + + + +

+stone +sepulchral monument +

+
+ +

+2013 + Der Zustand des Steins hat sich seit 1986 kaum verändert +

+
+
+ + + +
+ + +sechzackiger Stern + +
+ + + +1891-12-06 + + + + Germany + North Rhine-Westphalia + + + Kettwig (Neuer Friedhof in Heiligenhaus) + + Jewish Cemetery + 51.346014 6.924709 + + + + + +
+
+
+ + +EpiDoc: TEI XML for epigraphic Documents Schema + + + + + + Epigraphisches Bildarchiv, + Steinheim-Institut + + + +Nathanja Hüttenmeister, Carmen Wedemeyer + + + +World Geodetic System + + + + + +Gitle bat Mosche (Clara Leffmann) + + + + + + + + + +Hebrew +German + + + + + +
+ + + + + + recto + + + + + + + recto + + + + + + + recto + + + + + + + Detail + + + + + + + Detail + + + + + + + Detail + + + + + + + + + + + + + + + + + + + + + + + + + +
+Edition +
+ + + ‎‏פ״ט‏‎ + + ‎‏הבתולה צנועה וחמודה‏‎ + + ‎‏מ׳ גיטלא בת משה‏‎ + + ‎‏ה״ה ראשנה שנקברה בבית‏‎ + + ‎‏החיים החדשה בק״ק‏‎ + + ‎‏קעטטוויג ומתה בשם ט׳‏‎ + + ‎‏ביום א׳ ה׳ כסלו תרנ״ב ל׳‏‎ + + ‎‏תנצב״ה‏‎ + + Hier ruht die Jungfrau + + Clara Leffmann + + Sie starb erst 19 + + Jahre alt, gottergeben und + + tief betrauert von den ihrigen, + + den 8. Dezbr. 1891 + +
+
+ + + Friede ihrer Asche. + +
+
+
+Übersetzung +
+ + + Hier ist geborgen + + die züchtige und liebliche Jungfrau, + + Frau Gitle, Tochter des Mosche, + + sie ist die Erste, die begraben wurde auf dem neuen + + Friedhof der heiligen Gemeinde + + Kettwig, und sie starb #.:mit gutem Namen#.; + + am Tag 1, 5. Kislev 652 der Zählung. + + Ihre Seele sei eingebunden in das Bündel des Lebens + +
+
+ +
+
+
+Zitate +

Zl 6: bBer 17a

+
+
+Zeilenkommentar +

Zl 5: Friedhof, wörtl. "Haus des Lebens".

+
+
+Endkommentar +

Vermutlich handelt es sich bei der Angabe des Sterbedatums in der deutschen Inschrift um das Begräbnisdatum. Dieser Stein ist der erste des Friedhofes am Görscheider Weg.

+

Zwischen den jüdischen Familien aus Kettwig vor der Brücke und Saarn gab es verwandtschaftliche Verhältnisse, so stammte die Familie Leffmann, deren Angehörige z. T. hier bestattet sind, aus Saarn (Kaufhold, Jüdisches Leben in Mülheim a. d. R., S. ).

+
+
+Prosopographie +

Clara Leffmann war die Tochter des Saarner Metzgers Moritz Leffmann und seiner Frau Sara geb. Herz. Der Bruder Artur fiel 1917 im Krieg (Engelhardt, Chronik, S. 81).

+
+
+Bibliographie + +Systematische bauhistorische Beschreibung + durch + Bau- und Stadtbaugeschichte, Fakultät 6, Institut für Architektur, TU Berlin + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/fiji/299.xml b/backend/corpora/peaceportal/tests/data/fiji/299.xml new file mode 100644 index 000000000..622c94738 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/fiji/299.xml @@ -0,0 +1,64 @@ + + + + + 299 + + + + + + + + + Museo Vaticano, lapidario ebraico ex-Lateranense; inv.no.30762 + + Noy 1995, p. 69-70 (83) + + + + + Rome, Monteverde + 3rd-4th century + Uncertain + + + + + + + + + + Felicissima ( the commemorator) Emarantus ( the decaesed) (Φη<λ>ικίσσιμα Ἠμαράντῳ) + + + + + + + + Greek + + + + + CIJ i 1936, 266 no.339 + None + None + + + + Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν. + Epitaph + none + + Stone (white marble plaque) + Φη<λ>ικίσσιμα + not mentioned + not mentioned + Found on the 3rd of December 1904 in Cub.XL. The lower third of the plaque was left unused. There are poits between the syllables. Ferrua thought it might be pagan. + + + \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/fiji/687.xml b/backend/corpora/peaceportal/tests/data/fiji/687.xml new file mode 100644 index 000000000..d860857cb --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/fiji/687.xml @@ -0,0 +1,61 @@ + + + + + 687 + + + + + + + + + In the catacomb + + Noy 1995, p. 351 (417) + + + + + Rome, Villa Torlonia (lower cat.) + 3rd- 4th century + + + + + + + + + + + + + + + + Greek + + + + + not available + None + None + + + + ἐνθάδε [κεῖται--] + Εpitaph + ? + + Stone (marble fragment) + ἐνθάδε [κεῖται--] + not mentioned or lost + not mentioned or lost + + + + \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/fiji/759.xml b/backend/corpora/peaceportal/tests/data/fiji/759.xml new file mode 100644 index 000000000..74441bf40 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/fiji/759.xml @@ -0,0 +1,65 @@ + + + + + 759 + + + + + + + + + Formerly in Villa Torlonia stables + + Noy 1995, p. 390-1 (489) + + + + + Rome, Villa Torlonia (lower cat.) + 3rd- 4th century + + + + + + + + + + + Irene (Εἰρήνη) + + + + + Greek + + + + + CIJ i 1936, p. 19-20 no.21 + None + None + + + + Εἰρήνη τρεζπτὴ προσήλυτος πατρὸς καὶ μητρὸς Εἰουδε͂α + +Ἰσδραηλίτης ἔζησεν ἤτ(η) γ΄ μ(ῆνας) ζ΄ vac.ἡμ(έ)ρ(αν) α΄. + + Εpitaph + none + + Stone (grey-blue marble plaque) + Εἰρήνη + 3 + The precise age was 3 years, 7 months and 1 day. 
+ + + + \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml new file mode 100644 index 000000000..de749a662 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml @@ -0,0 +1,5 @@ + + + + + Χάρητος Χάρητος Χάρητος Χάρητος \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml new file mode 100644 index 000000000..235b943e8 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml @@ -0,0 +1,5 @@ + + + + + Ἀβρᾶ καὶ Σαμῆ \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml new file mode 100644 index 000000000..b4ac3b202 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml @@ -0,0 +1,5 @@ + + + + + אמא \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml b/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml new file mode 100644 index 000000000..5f7921f49 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml @@ -0,0 +1,196 @@ + + + + + + +Inscriptions of Israel/Palestine + +Prinicipal Investigator +Michael Satlow + + + + + + +

ERROR-could not find publication information which should appear in this space.

+
+
+ +
+ + + + + +Akld 0002 +Shadmi, T. (1996). The Ossuaries and the Sarcophagus. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 41–55). Jerusalem: Israel Antiquities Authority. (page 52)Ilan, T. (1996). The Ossuary and Sarcophagus Inscriptions. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 57–72). Jerusalem: Israel Antiquities Authority. (page 58) + + + + + + +

Jerusalem Akeldama Caves confluence of Kidron and Hinnom Valleys, + First century CE. Ossuary. Funerary.

+
+
+ + + + + + +64 +29 +35 + + + +

+ + + + +

once on each side

+ + +
+ + + +

+ + + + + +Painted Red + + + + + +

+ + + +First century CE + +Judaea +Jerusalem +Akeldama +Cave 2 chamber B + + +

+ + + + + + + + + + + + +

Taxonomies for IIP controlled values

+ + + + + + + +Initial Entry +Normalized objectDesc/@ana +Adding Pleiades IDs to origin/placenames + + adding period attribute to date element, with Periodo value. + + +
+ + + + + + + + + + +
+

ΧΑΡΗΤΟϹΧΑ ΡΗ ΤΟ Ϲ ΧΑΡΗΤΟϹΧΑΡΗΤΟϹ

+
+
+

ΧάρητοςΧάρητοςΧάρητοςΧάρητος

+
+
+

of Chares

+
+
+

+

+ + + +
+ + + +52 + + + +58 + + +
+
+
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml b/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml new file mode 100644 index 000000000..f61d5a5d2 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml @@ -0,0 +1,143 @@ + + + + + +Inscriptions of Israel/Palestine + +Prinicipal Investigator +Michael Satlow + + + + + +

ERROR-could not find publication information which should appear in this space.

+
+
+beth0042 + +
+ + + + +Beth 0042 +Frey, J. B. (1952). Corpus Inscriptionum Iudaicarum (Vol. II (Asie-Afrique)). Roma: Pontificio Istituto di Archeologia Cristiana. (insc)Schwabe, M., & Lifshitz, B. (1974). Beth She’arim. Vol. 2, The Greek Inscriptions. Massada Press on behalf of the Israel Exploration Society. (page 25-26) + + + + +

Galilee. Beth Shearim. 250 CE to 350 CE. Red painted wall of arcosolium. Funerary.

+
+
+ + + + + + +60 + + + +

+ + + + + +

+ + + + + + + + + + + + + + + + + + + + +250 CE to 350 CE + +Galilee +Beth Shearim + + +

+ + + + + + + + + + + + + +

ERROR: could not find taxonomies file, which should appear in this space.

+ + + + + + +Creation +Revision +Changed graphic element to facsimile and kept existing url +Adding Pleiades IDs to origin/placenames + + adding period attribute to date element, with Periodo value. + + +
+ + + + + + + + + +
+

Catacomb 1, Hall G, room IV, arcosolium 1

+
+
+

Ἀβρᾶ καὶ Σαμῆ

+
+
+

+Abra and Same +

+
+ + +
+ + + + + + + +25-26 + + +
+
+
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml b/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml new file mode 100644 index 000000000..d188209a8 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml @@ -0,0 +1,140 @@ + + + + + +Inscriptions of Israel/Palestine + +Prinicipal Investigator +Michael Satlow + + + + + +

ERROR-could not find publication information which should appear in this space.

+
+
+jeru0014 + +
+ + + + +jeru0014 +Rahmani, L. Y. (1994). A Catalogue of Jewish Ossuaries in the Collections of the State of Israel. (A. Sussmann, Ed.). Israel Antiquities Authority: Israel Academy of Sciences and Humanities. (page 80, plate 4, fig. 21) + + + + +

Judaea. Jerusalem. 20 BCE to 70 CE. Soft limestone ossuary. Funerary.

+
+
+ + + + + + +29.5 +52 +23 + + + +

+ + + + + +

+ + + + + + + + + + + + + + + + + + +20 BCE to 70 CE + +Judaea +Jerusalem +Kidron Valley +southeast of 'En Rogel + +

Judaea. Jerusalem. Kidron Valley, southeast of Ἑn Rogel.

+ + + + + + +
+
+
+ + + + +

ERROR: could not find taxonomies file, which should appear in this space.

+
+
+
+ + + +Creation +Normalized objectDesc/@ana +Adding Pleiades IDs to origin/placenames + + adding period attribute to date element, with Periodo value. + + +
+ + + + + + + + +
+

אמא

+
+
+

mother (or Imma)

+
+ + +
+

The ossuary has an inner ledge on three sides and a flat, sliding lid with a small fingergrip on its outer edge. The word אמא could be a name or the word meaning "mother." Several examples of a name occuring along with this word support the second interpretation.

+
+
+ + + +80 + + + +plate 4, fig. 21 + + +
+
+
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/safed/safed.csv b/backend/corpora/peaceportal/tests/data/safed/safed.csv new file mode 100644 index 000000000..769adb10d --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/safed/safed.csv @@ -0,0 +1,10 @@ +MISPAR;ADDITION;Map;Px;Who helped;First Name;First Name (hebrew);Middle Name;Middle Name (hebrew);Title;Parent / First;Parent / First (hebrew);Parent Middle Name;Parent Middle Name (hebrew);Family Name;Family Name (hebrew);City;City (hebrew);CHELKA;AREA;NOTES;YOM;CHODESH;SHANA;DAY;MONTH;YEAR; +1;;;;;Avraham;אברהם;;;;;;;;Harbon;חרבון;;;א;A;החכם הרופא;ה;;רכו;;;1466; +1;A;;;;Lechichl;לחיחל;;;;;;;;;;;;א;A;;י;שבט;תשי;28;1;1950; +2;;;;;Pinchas;פנחס;;;;Zvi;צבי;;;;;;;א;A;;כט;טבת;תשכב;05;01;1962; +3;;;;;Melech;מלך;;;;Meir;מאיר; Yisrael; ישראל;;;;;א;A;;ט;טבת;תשכב;16;12;1961; +4;;;;;Rachel;רחל;;;;;;;;Negrenik Bahagen;נגריניק בהגן;;;א;A;;טו;טבת;תשכא;03;01;1961; +5;;m;px;;Eliyahu;אליהו;Manig;מאניג;;Zev;זאב;;;Katz;כץ;;;א;A;age 68;א;ניסן;תשכ;29;03;1960; +5;A;m;p-x;;Yitzhak;יצחק;;;;Moshe;משה ;David;דוד;Rozenthal HaCohen;רוזנטל הכהן;;;א;A;age 73;כח;חשון;תשכא;;;1960; +6;;m;px;;Dvasi;דוואסי;;;;Zvi;צבי;;;Masiroka ?;מסירוקא ?;Siruka;סירוקא;א;A;above Mik-Ari Path;א;אייר;תשכ;28;04;1960; +7;;m;px;;Sima;סימה;;;;Avraham;אברהם;;;Reuven;רובין;Batshan;באטשן;א;A;above Mik-Ari Path;כג;שבט;תשכ;;;1960; \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-11.xml b/backend/corpora/peaceportal/tests/data/tol/tol-11.xml new file mode 100644 index 000000000..9259da682 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/tol/tol-11.xml @@ -0,0 +1,214 @@ + + + + + + + + epidat, tol-11 + + edited by + Elíshabá Mata + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+ Edmund-Körner-Platz 2 + D-45127 Essen +
+
+
+ tol-11 + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11 + + + Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11 +

+
+
+
+ + + + + born digital + + + epidat + tol-11 + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11 + + + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11-t + + + + + + + +

+ stone (material not specified) + sepulchral monument +

+
+
+ + + +
+
+ + + + + Spain + + Toledo + + Jewish Cemetery + 39.871036 -4.022968 + + + + + +
+
+
+ + + + + + Israel + Moshe + Israel + + Hypothetical date + Other transcription: YIŚRA#[2019]EL BEN MOŠEH BEN YIŚRA#[2019]EL #[000D]#[000A]Young murdered person + + + + + Hebrew + + + + + +
+ + +
+ Edition +
+ + + ‎‏מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר‏‎ + + ‎‏לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר‏‎ + + ‎‏עַל מוֹת לַבֵּן בָּחוּר וָטוֹב‏‎ + + ‎‏כְּגַן רָטוֹב‏‎ + + ‎‏קָם עָלָיו כַּזְּדוֹנִים‏‎ + + ‎‏גּוֹי עַז פָּנִים‏‎ + + ‎‏הִשְׁקֵהוּ מֵי רוֹשׁ‏‎ + + ‎‏בָּא עַד הָרֹאשׁ‏‎ + + ‎‏וַיַּכֵּהוּ בִצְדִיָּה‏‎ + + ‎‏מַכָּה טְרִיָּה‏‎ + + ‎‏לָאָרֶץ חַיְתוֹ דִכָּה‏‎ + + ‎‏וַיִּצֶק דַּם הַמַּכָּה‏‎ + + ‎‏נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל‏‎ + + ‎‏נַעַר יִשְׂרָאֵל‏‎ + + ‎‏הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה‏‎ + + ‎‏בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה‏‎ + + ‎‏הַצְּבִי יִשְׂרָאֵל חָלָל‏‎ + + ‎‏בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל‏‎ + + ‎‏אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]‏‎ + + ‎‏וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ‏‎ + + ‎‏עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ‏‎ + + ‎‏בְּפֶסַח וַיָּמֶת אוֹתוֹ‏‎ + + ‎‏תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ‏‎ + + ‎‏וַיֵּאָסֵף אֶל עַמּוֹ‏‎ + + ‎‏תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים‏‎ + + ‎‏צְרוּרָה בִּצְרוֹר הַחַיִּים‏‎ + + ‎‏יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל‏‎ + + ‎‏אֱלֹהֵי יִשְׂרָאֵל‏‎ + + +
+
+
+ Übersetzung +
+ + + + +
+
+
+ Prosopographie +
+
+ Bibliographie + + + 61-62 + 62 + + + + + 174-175 + 17 + + + + + 83-84 + 41 + + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-27.xml b/backend/corpora/peaceportal/tests/data/tol/tol-27.xml new file mode 100644 index 000000000..0c710ec92 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/tol/tol-27.xml @@ -0,0 +1,189 @@ + + + + + + + + epidat, tol-27 + + edited by + Elíshabá Mata + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+ Edmund-Körner-Platz 2 + D-45127 Essen +
+
+
+ tol-27 + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27 + + + Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27 +

+
+
+
+ + + + + born digital + + + epidat + tol-27 + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27 + + + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27-t + + + + + + + +

+ stone (material not specified) + sepulchral monument +

+
+
+ + + +
+
+ + + + + Spain + + Toledo + + Jewish Cemetery + 39.871036 -4.022968 + + + + + +
+
+
+ + + + + + Moshe + Yizhaq ben Elfats + + + Other transcription of the name: MOŠEH BEN YIṢḤAQ BEN #[2019]ELFAṬS#[000D]#[000A]Young man + + + + + Hebrew + + + + + +
+ + +
+ Edition +
+ + + ‎‏בְּקֶבֶר זֶה נִטְמָן‏‎ + + ‎‏בָּחוּר נֶטַע נַעֲמָן‏‎ + + ‎‏לְדֵרֶךְ מוּסָר סָר‏‎ + + ‎‏וּמִדֵּרֶךְ יָשָׁר לֹא סָר‏‎ + + ‎‏ז״ךְ שָׁנִים חָיָה‏‎ + + ‎‏וְזַךְ לֵבָב הָיָה‏‎ + + ‎‏וּבז״ךְ בְּמַרְחֶשׁוָן פָּנָה‏‎ + + ‎‏וְעָזַב אֶת אָבִיו בֶּן שִׁבְעִים שָׁנָה‏‎ + + ‎‏נֶאֱנַח מַשְׁ#[05בּצּ]מִים‏‎ + + ‎‏כִּי אָרְכוּ לוֹ אַחֲרָיו הַיָּמִים‏‎ + + ‎‏וּבִשְׁנַת חֲמֵשֶׁת אֲלָפִים וְתִשִׁעִים וְשָׁלֹש‏‎ + + ‎‏נִלְכַּד בְּפַח וּפַחַת‏‎ + + ‎‏וּמִבֵּין רֵעָיו נֶאֱסַף וְנִכְתַּשׁ בְּתֹךְ מַכְתֵּשׁ‏‎ + + ‎‏הוּא מֹשֶה נ״ע בר׳ יִצְחָק נ״ע בֶּן אֵלְפַטְשׂ‏‎ + + ‎‏אֱלֹהָיו יְרַחֵם עָלָיו‏‎ + + ‎‏וְיָנוּחַ וְיַעֲמוֹד לְקֵץ הַיָּמִין לְגוֹרָלוֹ‏‎ + + +
+
+
+ Übersetzung +
+ + + + +
+
+
+ Prosopographie +
+
+ Bibliographie + + + 41-42 + 39 + + + + + 182-183 + 25 + + + + + 94-95 + 49 + + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-36.xml b/backend/corpora/peaceportal/tests/data/tol/tol-36.xml new file mode 100644 index 000000000..b8d7a8be5 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/tol/tol-36.xml @@ -0,0 +1,197 @@ + + + + + + + + epidat, tol-36 + + edited by + Elíshabá Mata + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+ Edmund-Körner-Platz 2 + D-45127 Essen +
+
+
+ tol-36 + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36 + + + Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36 +

+
+
+
+ + + + + born digital + + + epidat + tol-36 + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36 + + + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36-t + + + + + + + +

+ stone (material not specified) + sepulchral monument +

+
+
+ + + +
+
+ + + + + Spain + + Toledo + + Jewish Cemetery + 39.871036 -4.022968 + + + + + +
+
+
+ + + + + + Yaakov + Yizhaq + + + Other transcription of the name: YA#[2018]ĂQŌḆ BEN YIṢḤAQ BEN AL-SARAQOSTAN#[000D]#[000A]Occupation: Physician and counselor#[000D]#[000A]Death in the Black Death + + + + + Hebrew + + + + + +
+ + +
+ Edition +
+ + + ‎‏בְּקֶבֶר זֶה נִקְבַּר‏‎ + + ‎‏אִישׁ שֵׂכֶל וּנְבוֹן דָּבָר‏‎ + + ‎‏נְקִי כַפָיִם וּבַר‏‎ + + ‎‏מָלֵא הוֹד וְחָכְמָה‏‎ + + ‎‏וְדַעַת וּמְזִמָּה‏‎ + + ‎‏יוֹעֵץ וַחֲכָם חֲרָשִׁים‏‎ + + ‎‏טוֹב עִם ה׳ וְעִם אֲנָשִׁים‏‎ + + ‎‏רוֹפֵא מַחֲלִים הַנְפָשִׁים‏‎ + + ‎‏וּמִזְּרַע קְדוֹשִׁים‏‎ + + ‎‏שְׁמוֹ ר׳ יַעֲקֹב בר׳ יִצְחָק נ׳ע ן׳ אַלְסָארַקֹסְטַן‏‎ + + ‎‏נָתַן כָּל־יָמָיו אֶל לִבּוֹ‏‎ + + ‎‏לֶאֱהוֹב אֶת ה׳ וּלְדָבְקָה בוֹ‏‎ + + ‎‏וְכַאֲשֶׁר בָּאָרֶץ פָּרַץ פֶּרֶץ‏‎ + + ‎‏בִּקְדוֹשִׂים אֲשֶׁר בָּאָרֶץ‏‎ + + ‎‏וַתִּפְרֹץ בָּם הַמַּגֵּפָה‏‎ + + ‎‏נֶאֱסַף אֶל עַמּוֹ‏‎ + + ‎‏וְעָזַב אֶת הָאָרֶץ וְעָלָה לִשְׁכוֹן מְרוֹמוֹ‏‎ + + ‎‏ובי׳׳ב בְּתַמּוּז שְׁנַת מְנוּחָה הָיְתָה יַד אֱלֹהָיו עָלָיו‏‎ + + ‎‏לְשׁוֹבֵב יַעֲקֹב אֵלָיו‏‎ + + ‎‏לָתֵּת לוֹ יָד בֵּין חֲסִידָיו וּלַעֲבוֹד בְּרֹאשָׁם‏‎ + + ‎‏וַיֹּאמֶר ה׳ אֶל יַעֲקֹב קוּם עֲלֵה בֵית אֵל וְשֶׁב שָׁם‏‎ + + ‎‏וְיַעֲקֹב הָלַךְ לְדַרְכּוֹ לִרְאוֹת פְּנֵי דָר נְגָהִים‏‎ + + ‎‏וַיִּפְגְּעוּ בוֹ מַלְאֲכֵי אֱלֹהִים‏‎ + + +
+
+
+ Übersetzung +
+ + + + +
+
+
+ Prosopographie +
+
+ Bibliographie + + + 65-66 + 70 + + + + + 209-210;C/M (82),135-138 + 58 + + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/test_peace.py b/backend/corpora/peaceportal/tests/test_peace.py new file mode 100644 index 000000000..54db50ef3 --- /dev/null +++ b/backend/corpora/peaceportal/tests/test_peace.py @@ -0,0 +1,280 @@ +import os +import pytest + +from addcorpus.load_corpus import load_corpus_definition +from addcorpus.save_corpus import load_and_save_all_corpora +from addcorpus.models import Corpus + +CORPUS_TEST_DATA = [ + { + 'name': 'peaceportal-epidat', + 'docs': [{ + "id": "blr-4", + "url": "http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4", + "year": "1865", + "not_before": "1865", + "not_after": None, + "source_database": "Epidat (Steinheim Institute)", + "transcription": """Hier ruhet +der Kaufmann +Nathan Schönfeld +geb. d. 4. April 1812 +gest. d. [28.] Februar 1865 +‎‏פ״נ‏‎ +‎‏איש חמדות יקר רוח אוהב‏‎ +‎‏צדק ופועל טוב כ״ה נתן‏‎ +‎‏שאנפעלד נולד ח׳ של פסח‏‎ +‎‏תקע״ב ונפטר בשם טוב יום ג׳‏‎ +‎‏ב׳ אדר תרכ״ה לפ״ק‏‎ +‎‏תנצב״ה‏‎""", + "names": "Natan Schönfeld (Nathan Schönfeld)", + "sex": [ + "M" + ], + "dates_of_death": [ + "1865-02-28" + ], + "country": "Germany", + "region": "Thuringa", + "settlement": "Bleicherode", + "location_details": "Jewish Cemetery", + "language": [ + "Hebrew", + "German" + ], + "iconography": None, + "images": [ + "http://steinheim-institut.de/daten/picsblr/xl/0004_blr_2012.jpg", + "http://steinheim-institut.de/daten/picsblr/xl/0004rblr_2012.jpg", + "http://steinheim-institut.de/daten/picsblr/xl/0004dblr_2012.jpg" + ], + "coordinates": "51.434387 10.571183", + "material": [ + "Stone" + ], + "material_details": "stone", + "bibliography": None, + "comments": """OBJECTTYPE: +sepulchral monument + +""", + "transcription_de": None, + "transcription_he": "‎‏פ״נ‏‎ ‎‏איש חמדות יקר רוח אוהב‏‎ ‎‏צדק ופועל טוב כ״ה נתן‏‎ ‎‏שאנפעלד נולד ח׳ של פסח‏‎ ‎‏תקע״ב ונפטר בשם טוב יום ג׳‏‎ ‎‏ב׳ אדר תרכ״ה לפ״ק‏‎ ‎‏תנצב״ה‏‎", + "transcription_en": "", + "transcription_nl": "Hier ruhet" + }], + 'n_documents': 2 + }, + { + 'name': 'peaceportal-iis', + 'docs': [{ + "id": "akld0002", + "url": "https://library.brown.edu/iip/viewinscr/akld0002", + "year": "0001", + "not_before": "0001", + "not_after": "0100", + "source_database": "Inscriptions of Israel/Palestine (Brown University)", + "transcription": """Χάρητος +Χάρητος +Χάρητος +Χάρητος""", + "sex": "Unknown", + "country": "Israel/Palestine", + "region": "Judaea", + "settlement": "Jerusalem", + "location_details": ( + "Judaea Jerusalem Akeldama Cave 2 chamber B", + "", + "" + ), + "language": ( + "Ancient Greek", + "Unknown" + ), + "iconography": "Painted Red", + "material": [ + "Limestone", + "Stone" + ], + "material_details": "#limestone", + "bibliography": [ + "Shadmi, T. (1996). The Ossuaries and the Sarcophagus. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 41–55). Jerusalem: Israel Antiquities Authority. (page 52)", + "Ilan, T. (1996). The Ossuary and Sarcophagus Inscriptions. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 57–72). Jerusalem: Israel Antiquities Authority. 
(page 58)" + ], + "comments": """CONDITION: + (#complete.intact) + + +LAYOUT: +once on each side + + +OBJECTTYPE: +ossuary + + +DIMENSIONS: +H: 64 W: 29 D: 35 + + +HANDNOTES: + (#impressed.inscribed) + +""", + "transcription_he": "", + "transcription_la": "", + "transcription_el": "Χάρητος Χάρητος Χάρητος Χάρητος", + "transcription_en": "of Chares" + }], + 'n_documents': 3 + }, + { + 'name': 'peaceportal-fiji', + 'docs': [{ + "id": "299", + "source_database": "Funerary Inscriptions of Jews from Italy (Utrecht University)", + "transcription": "Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν.", + "names": "Felicissima ( the commemorator) Emarantus ( the decaesed) (Φη<λ>ικίσσιμα Ἠμαράντῳ)", + "sex": [ + "M", + "F" + ], + "age": None, + "country": "Italy", + "settlement": "Rome, Monteverde", + "location_details": "Museo Vaticano, lapidario ebraico ex-Lateranense; inv.no.30762", + "language": [ + "Greek" + ], + "iconography": "none", + "material": [ + "Stone", + "Marble" + ], + "bibliography": [ + "Noy 1995, p. 69-70 (83)" + ], + "comments": """DATE: +Uncertain +""", + "transcription_he": "", + "transcription_la": "", + "transcription_el": "Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν." + }], + 'n_documents': 3 + }, + { + 'name': 'peaceportal-tol', + 'docs': [{ + "id": "tol-11", + "url": "http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11", + "year": None, + "not_before": None, + "not_after": None, + "source_database": "Medieval funerary inscriptions from Toledo", + "transcription": """‎‏מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר‏‎ +‎‏לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר‏‎ +‎‏עַל מוֹת לַבֵּן בָּחוּר וָטוֹב‏‎ +‎‏כְּגַן רָטוֹב‏‎ +‎‏קָם עָלָיו כַּזְּדוֹנִים‏‎ +‎‏גּוֹי עַז פָּנִים‏‎ +‎‏הִשְׁקֵהוּ מֵי רוֹשׁ‏‎ +‎‏בָּא עַד הָרֹאשׁ‏‎ +‎‏וַיַּכֵּהוּ בִצְדִיָּה‏‎ +‎‏מַכָּה טְרִיָּה‏‎ +‎‏לָאָרֶץ חַיְתוֹ דִכָּה‏‎ +‎‏וַיִּצֶק דַּם הַמַּכָּה‏‎ +‎‏נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל‏‎ +‎‏נַעַר יִשְׂרָאֵל‏‎ +‎‏הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה‏‎ +‎‏בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה‏‎ +‎‏הַצְּבִי יִשְׂרָאֵל חָלָל‏‎ +‎‏בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל‏‎ +‎‏אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]‏‎ +‎‏וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ‏‎ +‎‏עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ‏‎ +‎‏בְּפֶסַח וַיָּמֶת אוֹתוֹ‏‎ +‎‏תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ‏‎ +‎‏וַיֵּאָסֵף אֶל עַמּוֹ‏‎ +‎‏תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים‏‎ +‎‏צְרוּרָה בִּצְרוֹר הַחַיִּים‏‎ +‎‏יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל‏‎ +‎‏אֱלֹהֵי יִשְׂרָאֵל‏‎""", + "names": None, + "sex": [ + "Unknown" + ], + "dates_of_death": None, + "country": "Spain", + "region": None, + "settlement": "Toledo", + "location_details": "Jewish Cemetery", + "language": [ + "Hebrew" + ], + "iconography": None, + "images": None, + "coordinates": "39.871036 -4.022968", + "material": [ + "Stone" + ], + "material_details": "stone (material not specified)", + "bibliography": None, + "comments": """OBJECTTYPE: +sepulchral monument + +""", + "transcription_he": "‎‏מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר‏‎ ‎‏לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר‏‎ ‎‏עַל מוֹת לַבֵּן בָּחוּר וָטוֹב‏‎ ‎‏כְּגַן רָטוֹב‏‎ ‎‏קָם עָלָיו כַּזְּדוֹנִים‏‎ ‎‏גּוֹי עַז פָּנִים‏‎ ‎‏הִשְׁקֵהוּ מֵי רוֹשׁ‏‎ ‎‏בָּא עַד הָרֹאשׁ‏‎ ‎‏וַיַּכֵּהוּ בִצְדִיָּה‏‎ ‎‏מַכָּה טְרִיָּה‏‎ ‎‏לָאָרֶץ חַיְתוֹ דִכָּה‏‎ ‎‏וַיִּצֶק דַּם הַמַּכָּה‏‎ ‎‏נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל‏‎ ‎‏נַעַר יִשְׂרָאֵל‏‎ ‎‏הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה‏‎ ‎‏בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה‏‎ ‎‏הַצְּבִי יִשְׂרָאֵל חָלָל‏‎ ‎‏בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל‏‎ ‎‏אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]‏‎ 
‎‏וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ‏‎ ‎‏עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ‏‎ ‎‏בְּפֶסַח וַיָּמֶת אוֹתוֹ‏‎ ‎‏תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ‏‎ ‎‏וַיֵּאָסֵף אֶל עַמּוֹ‏‎ ‎‏תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים‏‎ ‎‏צְרוּרָה בִּצְרוֹר הַחַיִּים‏‎ ‎‏יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל‏‎ ‎‏אֱלֹהֵי יִשְׂרָאֵל‏‎", + "transcription_en": "", + "transcription_nl": "" + }], + 'n_documents': 3 + } +] + +def corpus_test_name(corpus_spec): + return corpus_spec['name'] + +@pytest.mark.parametrize("corpus_object", CORPUS_TEST_DATA, ids=corpus_test_name) +def test_imports(peace_test_settings, corpus_object): + parent_corpus = load_corpus_definition('peaceportal') + corpus = load_corpus_definition(corpus_object.get('name')) + assert len(os.listdir(os.path.abspath(corpus.data_directory))) != 0 + fully_specified = ['peaceportal-iis', 'peaceportal-tol'] + if corpus_object.get('name') not in fully_specified: + # only IIS / TOL have all fields + assert len(corpus.fields) != len(parent_corpus.fields) + + start = corpus_object['start'] if 'start' in corpus_object else corpus.min_date + end = corpus_object['end'] if 'end' in corpus_object else corpus.max_date + + tested_fields = set() + resulted_fields = set() + + docs = get_documents(corpus, start, end) + for target in corpus_object.get('docs'): + doc = next(docs) + for key in target: + tested_fields.add(key) + assert key in doc + assert doc[key] == target[key] + + for key in doc: + resulted_fields.add(key) + + docs = get_documents(corpus, start, end) + assert len(list(docs)) == corpus_object.get('n_documents') + +def get_documents(corpus, start, end): + sources = corpus.sources( + start=start, + end=end + ) + return corpus.documents(sources) + +def test_peaceportal_validation(db, peace_test_settings): + load_and_save_all_corpora() + corpus_names = [case['name'] for case in CORPUS_TEST_DATA] + for corpus_name in corpus_names: + corpus = Corpus.objects.get(name=corpus_name) + assert corpus.active \ No newline at end of file diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py new file mode 100644 index 000000000..4d75f4cd4 --- /dev/null +++ b/backend/corpora/peaceportal/tol.py @@ -0,0 +1,394 @@ +import re +from copy import copy + +from django.conf import settings + +from addcorpus.corpus import XMLCorpusDefinition +from addcorpus.extract import XML, Constant, Combined, FilterAttribute +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor + +class PeaceportalTOL(PeacePortal, XMLCorpusDefinition): + data_directory = settings.PEACEPORTAL_TOL_DATA + es_index = getattr(settings, 'PEACEPORTAL_TOL_ES_INDEX', 'peaceportal-tol') + + languages = ['en', 'nl', 'he'] + + def __init__(self): + super().__init__() + self.source_database.extractor = Constant( + value='Medieval funerary inscriptions from Toledo' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True + ) + + self.url.extractor = FilterAttribute( + tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + attribute_filter={ + 'attribute': 'type', + 'value': 'url' + } + ) + + self.year.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + transform=lambda x: 
get_year(x), + ) + + self.not_before.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notBefore', + transform=lambda x: get_year(x), + ) + + self.not_after.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notAfter', + transform=lambda x: get_year(x), + ) + + self.transcription.extractor = XML( + tag=['text', 'body', 'div'], + toplevel=False, + multiple=False, + flatten=True, + transform=lambda x: clean_newline_characters(x), + transform_soup_func=extract_transcript + ) + + self.names.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + flatten=True, + multiple=True, + toplevel=False, + ) + + self.sex.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + attribute='sex', + multiple=True, + toplevel=False, + transform=lambda x: convert_sex(x) + ) + + self.dates_of_death.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson'], + transform_soup_func=extract_death, + attribute='when', + multiple=False, + toplevel=False, + ) + + self.country.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country'], + toplevel=False, + transform_soup_func=extract_country, + transform=lambda x: clean_country(x), + flatten=True, + ) + + self.region.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country', 'region'], + toplevel=False, + flatten=True + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement'], + toplevel=False, + flatten=True, + transform_soup_func=extract_settlement, + ) + + self.location_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName'], + toplevel=False, + flatten=True, + transform_soup_func=extract_location_details, + ) + + self.material.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True, + transform=lambda x: categorize_material(x) + ) + + self.material_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True + ) + + self.language.extractor = XML( + tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], + toplevel=False, + multiple=True, + transform=lambda x: get_language(x) + ) + + self.comments.extractor = Combined( + XML( + tag=['text', 'body'], + toplevel=False, + transform_soup_func=extract_commentary, + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'condition'], + toplevel=False, + flatten=True, + transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p'], + toplevel=False, + transform_soup_func=extract_support_comments, + ), + transform=lambda x: join_commentaries(x) + ) + + self.images.extractor = XML( + tag=['facsimile', 'graphic'], + multiple=True, + attribute='url', + toplevel=False + ) + + self.coordinates.extractor = XML( + 
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'], + toplevel=False, + multiple=False, + flatten=True + ) + + self.iconography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], + toplevel=False, + multiple=False + ) + + self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_english.extractor = Combined( + self.transcription.extractor, + Constant('en'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_dutch.extractor = Combined( + self.transcription.extractor, + Constant('nl'), + transform=lambda x: get_text_in_language(x) + ) + + self.fields = exclude_fields_without_extractor(self.fields) + + +def convert_sex(values): + if not values: + return ['Unknown'] + result = [] + for value in values: + if value == '1': + result.append('M') + elif value == '2': + result.append('F') + else: + result.append('Unknown') + return result + + +def clean_country(text): + if not text: + return 'Unknown' + if text.lower().strip() == 'tobedone': + return 'Unknown' + return text + + +def get_year(text): + if not text or text == '--': + return + matches = re.search('[1-2]{0,1}[0-9]{3}', text) + if matches: + return matches[0] + + +def get_language(values): + if not values: + return ['Unknown'] + if 'German in Hebrew letters' in values: + return ['German (transliterated)', 'Hebrew'] + return values + + +def extract_transcript(soup): + ''' + Helper function to ensure correct extraction of the transcripts. + Note that there are multiple formats in which these are stored, + but the text that we need is always in the `` children of + `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one). + ''' + if not soup: + return + return soup.find_all('ab') + + +def extract_translation(soup): + ''' + Helper function to extract translation from the tag + ''' + if not soup: + return + translation = soup.find('div', {'type': 'translation'}) + if translation: + return translation.find_all('ab') + else: + return + + +def extract_commentary(soup): + ''' + Helper function to extract all commentaries from the tag. + A single element will be returned with the commentaries found as text content. + ''' + if not soup: return + found = [] + commentaries = soup.find_all('div', {'type': 'commentary'}) + + for commentary in commentaries: + if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: + p = commentary.find('p') + if p: + text = p.get_text() + if text: + text = clean_commentary(text) + found.append('{}:\n{}\n'.format(commentary['subtype'].strip().upper(), text)) + + if len(found) > 1: + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = "\n".join(found) + return cloned_soup + else: + return None + +def extract_support_comments(soup): + if not soup: return + cloned_soup = copy(soup) + cloned_soup.clear() + + commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS') + commentaries = add_support_comment(soup, commentaries, 'objectType', 'OBJECTTYPE') + + # add any additional text from the
<support>
element, + # i.e. if there is text it is the very last node + contents = soup.contents + text = contents[len(contents) - 1].strip() + if text: + text = clean_commentary(text) + commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text) + + cloned_soup.string = commentaries + return cloned_soup + + +def add_support_comment(soup, existing_commentaries, elem_name, commentary_name): + elem = soup.find(elem_name) + if elem: + text = elem.get_text() + if text: + text = clean_commentary(text) + return '{}{}:\n{}\n\n'.format(existing_commentaries, commentary_name, text) + return existing_commentaries + + +def extract_death(soup): + ''' + Helper function to extract date of death from multiple person tags. + ''' + if not soup: + return + return soup.find_all('death') + + +def extract_country(soup): + ''' + Helper function to extract country. + This is needed because the output of `flatten` would otherwise include the text contents + of the ``. + ''' + return clone_soup_extract_child(soup, 'region') + + +def extract_settlement(soup): + return clone_soup_extract_child(soup, 'geogName') + + +def extract_location_details(soup): + return clone_soup_extract_child(soup, 'geo') + + +def clone_soup_extract_child(soup, to_extract): + ''' + Helper function to clone the soup and extract a child element. + This is useful when the output of `flatten` would otherwise include the text contents + of the child. + ''' + if not soup: + return + cloned_soup = copy(soup) + child = cloned_soup.find(to_extract) + if child: + child.extract() + return cloned_soup + + # TODO: add field + + # TODO: move to a comments field: + + # excluded (for now): + # title + # organization (incl details, e.g. address) + # licence + # taxonomy (i.e. things like foto1, foto2 -> no working links to actual images) + diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index 3b905c4d7..72882bc41 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -38,7 +38,7 @@ class Periodicals(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'articles' tag_entry = 'artInfo' @@ -145,7 +145,7 @@ def sources(self, start=min_date, end=max_date): display_name='Content', display_type='text_content', description='Text content.', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), results_overview=True, extractor=extract.XML(tag='ocrText', flatten=True), search_field_core=True, diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index b8b6d0892..2404ee06b 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -45,7 +45,7 @@ class Rechtspraak(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'open-rechtspraak' @@ -287,7 +287,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None name='content', display_name='Content', display_type='text_content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), extractor=extract.Backup( 
extract.XML('uitspraak', flatten=True), extract.XML('conclusie', flatten=True), diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index 1e0ff0d87..35e56ff0f 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -39,7 +39,7 @@ class Times(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'issue' tag_entry = 'article' @@ -424,7 +424,7 @@ def sources(self, start=datetime.min, end=datetime.max): name='content', display_name='Content', display_type='text_content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), visualizations=['wordcloud'], description='Raw OCR\'ed text (content).', results_overview=True, diff --git a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py index b8d416530..0bc8cbc2c 100644 --- a/backend/corpora/troonredes/troonredes.py +++ b/backend/corpora/troonredes/troonredes.py @@ -44,7 +44,7 @@ class Troonredes(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'doc' tag_entry = 'entry' @@ -136,7 +136,7 @@ def sources(self, start=min_date, end=max_date): display_name='Content', display_type='text_content', description='Text content.', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), results_overview=True, search_field_core=True, visualizations=['wordcloud', 'ngram'], diff --git a/backend/corpora/utils/exclude_fields.py b/backend/corpora/utils/exclude_fields.py new file mode 100644 index 000000000..bccc58792 --- /dev/null +++ b/backend/corpora/utils/exclude_fields.py @@ -0,0 +1,9 @@ +from addcorpus import extract + +def has_extractor(field): + if type(field.extractor) != extract.Constant: + return True + return field.extractor.apply() != None + +def exclude_fields_without_extractor(fields): + return list(filter(has_extractor, fields)) \ No newline at end of file diff --git a/backend/corpora/utils/test_corpora_utils.py b/backend/corpora/utils/test_corpora_utils.py new file mode 100644 index 000000000..5b8274bf5 --- /dev/null +++ b/backend/corpora/utils/test_corpora_utils.py @@ -0,0 +1,17 @@ +from addcorpus.corpus import FieldDefinition +from addcorpus.extract import Constant + +from corpora.utils import exclude_fields + +def test_exclude_fields(): + fields = [ + FieldDefinition( + name='test1', + extractor=Constant('some value') + ), + FieldDefinition( + name='test2' + ) + ] + new_fields = exclude_fields.exclude_fields_without_extractor(fields) + assert len(new_fields) == 1 diff --git a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py index 39eb62ce0..ffb8e046a 100644 --- a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py +++ b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py @@ -1,7 +1,9 @@ from datetime import datetime +import os + from addcorpus.corpus import FieldDefinition, CSVCorpusDefinition +from addcorpus.es_mappings import keyword_mapping, text_mapping from addcorpus.extract import CSV -import os # Fake corpus class for unit tests @@ -26,17 +28,13 @@ def 
sources(self, start=min_date, end=max_date): content = FieldDefinition( name = 'content', - es_mapping= { - 'type': 'text', - }, + es_mapping = text_mapping(), extractor = CSV('content') ) language = FieldDefinition( name = 'language', - es_mapping= { - 'type': 'keyword' - }, + es_mapping = keyword_mapping(), extractor = CSV('language') ) diff --git a/backend/es/conftest.py b/backend/es/conftest.py index 8c817a8f7..406d285a6 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -3,7 +3,6 @@ from django.contrib.auth.models import Group from addcorpus.load_corpus import load_corpus_definition -from ianalyzer.elasticsearch import elasticsearch from es import es_index from addcorpus.models import Corpus diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 96eb57ab1..6f69f3611 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -2,7 +2,6 @@ from datetime import datetime from time import sleep -from addcorpus.load_corpus import load_corpus_definition from es.es_index import perform_indexing start = datetime.strptime('1970-01-01','%Y-%m-%d') diff --git a/backend/ianalyzer/common_settings.py b/backend/ianalyzer/common_settings.py index 06b8fcaf0..f78775222 100644 --- a/backend/ianalyzer/common_settings.py +++ b/backend/ianalyzer/common_settings.py @@ -131,3 +131,5 @@ } LOGO_LINK = 'https://dhstatic.hum.uu.nl/logo-cdh/png/UU_CDH_logo_EN_whiteFC.png' + +NLTK_DATA_PATH = os.path.join(BASE_DIR, 'addcorpus', 'nltk_data') \ No newline at end of file diff --git a/backend/requirements.in b/backend/requirements.in index ab5812765..884fc7c85 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -5,6 +5,7 @@ django-livereload-server # django-revproxy, see https://github.com/UUDigitalHumanitieslab/cookiecutter-webapp-deluxe/issues/35 git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a djangosaml2 +langdetect psycopg2 pytest pytest-django diff --git a/backend/requirements.txt b/backend/requirements.txt index 943b2d568..db9101cd3 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -12,6 +12,8 @@ async-timeout==4.0.2 # via redis attrs==22.2.0 # via pytest +backports-zoneinfo==0.2.1 + # via django beautifulsoup4==4.11.1 # via # -r requirements.in @@ -45,22 +47,27 @@ click-repl==0.2.0 # via celery cryptography==39.0.1 # via + # pyjwt # pyopenssl # pysaml2 defusedxml==0.7.1 # via # djangosaml2 # pysaml2 -dj-rest-auth[with_social]==2.2.7 + # python3-openid +dj-rest-auth[with-social,with_social]==2.2.7 # via -r requirements.in -django==4.1.10 +django==4.1.13 # via # -r requirements.in # dj-rest-auth + # django-allauth # django-livereload-server # django-revproxy # djangorestframework # djangosaml2 +django-allauth==0.52.0 + # via dj-rest-auth django-livereload-server==0.4 # via -r requirements.in django-revproxy @ git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a @@ -79,6 +86,8 @@ elementpath==4.1.1 # via xmlschema et-xmlfile==1.1.0 # via openpyxl +exceptiongroup==1.1.3 + # via pytest execnet==1.9.0 # via pytest-xdist fst-pso==1.8.1 @@ -89,6 +98,8 @@ gensim==4.3.0 # via -r requirements.in idna==3.4 # via requests +importlib-resources==6.1.0 + # via pysaml2 iniconfig==2.0.0 # via pytest joblib==1.2.0 @@ -99,6 +110,8 @@ kombu==5.2.4 # via celery langcodes==3.3.0 # via -r requirements.in +langdetect==1.0.9 + # via -r requirements.in language-data==1.1 # via -r requirements.in lxml==4.9.1 @@ -121,6 +134,8 @@ 
numpy==1.24.1 # scikit-learn # scipy # simpful +oauthlib==3.2.2 + # via requests-oauthlib openpyxl==3.1.2 # via -r requirements.in packaging==23.0 @@ -139,6 +154,10 @@ pycparser==2.21 # via cffi pyfume==0.2.25 # via fuzzytm +pyjwt[crypto]==2.8.0 + # via + # django-allauth + # pyjwt pyopenssl==23.1.1 # via pysaml2 pypdf2==3.0.1 @@ -160,6 +179,8 @@ python-dateutil==2.8.2 # via # pandas # pysaml2 +python3-openid==3.2.0 + # via django-allauth pytz==2022.7 # via # celery @@ -172,8 +193,12 @@ regex==2022.10.31 # via nltk requests==2.31.0 # via + # django-allauth # pysaml2 + # requests-oauthlib # simpful +requests-oauthlib==1.3.1 + # via django-allauth scikit-learn==1.2.1 # via -r requirements.in scipy==1.10.0 @@ -190,6 +215,7 @@ six==1.16.0 # via # click-repl # django-livereload-server + # langdetect # python-dateutil smart-open==6.3.0 # via gensim @@ -201,13 +227,17 @@ textdistance==4.5.0 # via -r requirements.in threadpoolctl==3.1.0 # via scikit-learn +tomli==2.0.1 + # via pytest tornado==6.3.3 # via django-livereload-server tqdm==4.64.1 # via # -r requirements.in # nltk -urllib3==1.26.17 +typing-extensions==4.8.0 + # via pypdf2 +urllib3==1.26.18 # via # django-revproxy # elastic-transport @@ -221,6 +251,8 @@ wcwidth==0.2.6 # via prompt-toolkit xmlschema==2.2.3 # via pysaml2 +zipp==3.17.0 + # via importlib-resources # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py b/backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py new file mode 100644 index 000000000..1af0d1928 --- /dev/null +++ b/backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py @@ -0,0 +1,17 @@ +# Generated by Django 4.1.9 on 2023-08-10 10:51 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('tag', '0002_taggeddocument_delete_taginstance'), + ] + + operations = [ + migrations.AddConstraint( + model_name='taggeddocument', + constraint=models.UniqueConstraint(fields=('corpus', 'doc_id'), name='unique_document_ID_for_corpus'), + ), + ] diff --git a/backend/tag/models.py b/backend/tag/models.py index 1182a86a5..54e44cd93 100644 --- a/backend/tag/models.py +++ b/backend/tag/models.py @@ -42,3 +42,11 @@ class TaggedDocument(models.Model): to=Tag, related_name='tagged_docs' ) + + class Meta: + constraints = [ + UniqueConstraint( + fields=['corpus', 'doc_id'], + name='unique_document_ID_for_corpus' + ) + ] diff --git a/backend/tag/tests/test_views.py b/backend/tag/tests/test_views.py index 996799c6a..911691000 100644 --- a/backend/tag/tests/test_views.py +++ b/backend/tag/tests/test_views.py @@ -105,6 +105,39 @@ def test_patch_document_tags(auth_client, auth_user_tag, mock_corpus, auth_user_ assert status.is_success(response.status_code) assert auth_user_tag.count == 0 +def test_assign_multiple_tags_at_once(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): + doc = 'test' + patch_request = lambda data: auth_client.patch( + f'/api/tag/document_tags/{mock_corpus}/{doc}', + data, + content_type='application/json' + ) + + response = patch_request({ + 'tags': [tag.id for tag in multiple_tags] + }) + assert status.is_success(response.status_code) + doc = TaggedDocument.objects.get(doc_id=doc) + assert doc.tags.count() == len(multiple_tags) + +def test_assign_multiple_tags_one_by_one(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): + doc = 'test' + patch_request = lambda data: 
auth_client.patch( + f'/api/tag/document_tags/{mock_corpus}/{doc}', + data, + content_type='application/json' + ) + + for i in range(len(multiple_tags)): + response = patch_request({ + 'tags': [tag.id for tag in multiple_tags][:i+1] + }) + + assert status.is_success(response.status_code) + doc = TaggedDocument.objects.get(doc_id=doc) + n_tags = doc.tags.count() + assert doc.tags.count() == i + 1 + def test_patch_tags_contamination(auth_client, auth_user_tag, admin_user_tag, mock_corpus, mock_corpus_obj, auth_user_corpus_acces): ''' Verify that patching tags does not affect the tags of other users diff --git a/backend/visualization/tests/mock_corpora/large_mock_corpus.py b/backend/visualization/tests/mock_corpora/large_mock_corpus.py index e15652945..466ceb8a6 100644 --- a/backend/visualization/tests/mock_corpora/large_mock_corpus.py +++ b/backend/visualization/tests/mock_corpora/large_mock_corpus.py @@ -1,7 +1,9 @@ from datetime import datetime -from addcorpus.corpus import CorpusDefinition, FieldDefinition import random +from addcorpus.corpus import CorpusDefinition, FieldDefinition +from addcorpus.es_mappings import date_mapping, text_mapping + TOTAL_DOCUMENTS = 11000 # some constants for generating data @@ -48,16 +50,12 @@ def source2dicts(self, source): date = FieldDefinition( name = 'date', - es_mapping = { - 'type': 'date', - } + es_mapping = date_mapping() ) content = FieldDefinition( name = 'content', - es_mapping = { - 'type': 'text' - } + es_mapping = text_mapping() ) fields = [date, content] diff --git a/backend/visualization/tests/mock_corpora/small_mock_corpus.py b/backend/visualization/tests/mock_corpora/small_mock_corpus.py index a3ad7fd2a..f97c42121 100644 --- a/backend/visualization/tests/mock_corpora/small_mock_corpus.py +++ b/backend/visualization/tests/mock_corpora/small_mock_corpus.py @@ -1,9 +1,12 @@ from datetime import datetime +import os + from addcorpus.corpus import FieldDefinition, CSVCorpusDefinition from addcorpus.extract import CSV -import os +from addcorpus.es_mappings import date_mapping, keyword_mapping, main_content_mapping, text_mapping from addcorpus.es_settings import es_settings + # Fake corpus class for unit tests here = os.path.abspath(os.path.dirname(__file__)) @@ -20,7 +23,7 @@ class SmallMockCorpus(CSVCorpusDefinition): languages = ['en'] category = 'book' - es_settings = es_settings('en', stopword_analyzer=True) + es_settings = es_settings(['en'], stopword_analysis=True) def sources(self, start=min_date, end=max_date): for csv_file in os.listdir(os.path.join(here, 'source_files')): @@ -28,45 +31,25 @@ def sources(self, start=min_date, end=max_date): date = FieldDefinition( name = 'date', - es_mapping = { - 'type': 'date', - }, + es_mapping = date_mapping(), extractor = CSV('date') ) title_field = FieldDefinition( name = 'title', - es_mapping = { - 'type': 'text', - }, + es_mapping = text_mapping(), extractor = CSV('title') ) content = FieldDefinition( name = 'content', - es_mapping= { - 'type': 'text', - "fields": { - "clean": { - "type": "text", - }, - "stemmed": { - "type": "text", - }, - "length": { - "type": "token_count", - 'analyzer': 'standard', - } - } - }, + es_mapping = main_content_mapping(True, True, False, 'en'), extractor = CSV('content') ) genre = FieldDefinition( name = 'genre', - es_mapping= { - 'type': 'keyword' - }, + es_mapping = keyword_mapping(), extractor = CSV('genre') ) diff --git a/backend/visualization/tests/test_termvectors.py b/backend/visualization/tests/test_termvectors.py index 967102b53..ea4f6fe4c 100644 --- 
a/backend/visualization/tests/test_termvectors.py +++ b/backend/visualization/tests/test_termvectors.py @@ -67,7 +67,7 @@ def test_find_matches(es_client, termvectors_result, small_mock_corpus): }, { 'query_text': 'regarded with such "evil forebodings"', 'components': ['regarded', 'with', 'such', 'evil forebodings'], - 'analyzed': [['regarded'], ['with'], ['such'], ['evil', 'forebodings']] + 'analyzed': [['regarded'], ['evil', 'forebodings']] }, { 'query_text': 'evil + forebodings', 'components': ['evil', '+', 'forebodings'], @@ -83,7 +83,7 @@ def test_find_matches(es_client, termvectors_result, small_mock_corpus): }, { 'query_text': 'rejoice~1 to hear', 'components': ['rejoice~1', 'to', 'hear'], - 'analyzed': [['rejoice~1'], ['to'], ['hear']] + 'analyzed': [['rejoice~1'], ['hear']] } ] diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py index 32dc21190..5bb5e6c54 100644 --- a/backend/visualization/tests/test_wordcloud.py +++ b/backend/visualization/tests/test_wordcloud.py @@ -127,7 +127,6 @@ def test_wordcloud_counts(small_mock_corpus): def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud): stopwords = ['the', 'and', 'of'] - for stopword in stopwords: match = any( item['key'] == stopword for item in small_mock_corpus_complete_wordcloud) diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index 786929240..68ad5b543 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -1,12 +1,19 @@ from collections import Counter from sklearn.feature_extraction.text import CountVectorizer + from addcorpus.load_corpus import load_corpus_definition from addcorpus.es_settings import get_stopwords_from_settings from es import download as download -def corpus_stopwords(corpus_name): +def field_stopwords(corpus_name, field): corpus = load_corpus_definition(corpus_name) - return get_stopwords_from_settings(corpus.es_settings) + field_definition = next((f for f in corpus.fields if f.name == field), None) + mapping = field_definition.es_mapping + analyzer = mapping.get( + 'fields', {}).get('clean', {}).get('analyzer') + if not analyzer: + return [] + return get_stopwords_from_settings(corpus.es_settings, analyzer) def make_wordcloud_data(documents, field, corpus): texts = [] @@ -14,8 +21,8 @@ def make_wordcloud_data(documents, field, corpus): content = document['_source'][field] if content and content != '': texts.append(content) - - stopwords = corpus_stopwords(corpus) or [] + + stopwords = field_stopwords(corpus, field) cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords) cvtexts = cv.fit_transform(texts) counts = cvtexts.sum(axis=0).A1 diff --git a/docker-compose.yaml b/docker-compose.yaml index 0904061d2..19f75ef4d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,6 +1,6 @@ services: db: - image: postgres + image: docker.io/library/postgres environment: - POSTGRES_DB=${SQL_DATABASE} - POSTGRES_USER=${SQL_USER} @@ -36,7 +36,7 @@ services: - type: bind source: $DATA_DIR target: /corpora - command: bash -c "python manage.py migrate && python manage.py runserver 0.0.0.0:8000" + command: bash -c "python manage.py migrate && python manage.py loadcorpora && python manage.py runserver 0.0.0.0:8000" frontend: build: context: ./frontend @@ -55,6 +55,10 @@ services: - cluster.name=ianalizer-es-data-cluster - bootstrap.memory_lock=true - xpack.security.enabled=false + - 
logger.org.elasticsearch.discovery=ERROR + - logger.org.elasticsearch.transport=ERROR + - logger.org.elasticsearch.http=ERROR + - logger.org.elasticsearch.cluster=ERROR - "ES_JAVA_OPTS=-Xms2g -Xmx2g" - ELASTIC_PASSWORD=$ELASTIC_ROOT_PASSWORD ulimits: @@ -65,6 +69,14 @@ services: - ianalyzer-es:/usr/share/elasticsearch/data ports: - 127.0.0.1:9200:9200 + kibana: + image: docker.elastic.co/kibana/kibana:8.5.0 + depends_on: + - elasticsearch + environment: + - "ELASTICSEARCH_URL=http://elasticsearch:9200" + ports: + - 127.0.0.1:5601:5601 redis: image: redis:latest restart: unless-stopped diff --git a/frontend/Dockerfile b/frontend/Dockerfile index b80b97408..514c7b21d 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,5 +1,5 @@ # base image -FROM node:14-alpine +FROM docker.io/library/node:14-alpine RUN apk update && apk add --no-cache --virtual .gyp python3 make g++ # Install Chrome diff --git a/frontend/src/app/common-test-bed.ts b/frontend/src/app/common-test-bed.ts index 301beb1eb..904532e01 100644 --- a/frontend/src/app/common-test-bed.ts +++ b/frontend/src/app/common-test-bed.ts @@ -20,6 +20,8 @@ import { WordmodelsService } from './services/wordmodels.service'; import { WordmodelsServiceMock } from '../mock-data/wordmodels'; import { VisualizationService } from './services/visualization.service'; import { visualizationServiceMock } from '../mock-data/visualization'; +import { TagService } from './services/tag.service'; +import { TagServiceMock } from '../mock-data/tag'; export const commonTestBed = () => { const filteredImports = imports.filter(value => !(value in [HttpClientModule])); @@ -59,6 +61,10 @@ export const commonTestBed = () => { { provide: VisualizationService, useValue: new visualizationServiceMock(), + }, + { + provide: TagService, + useValue: new TagServiceMock(), } ); diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html index 46c2122cb..e6fac5f82 100644 --- a/frontend/src/app/document-view/document-view.component.html +++ b/frontend/src/app/document-view/document-view.component.html @@ -9,6 +9,12 @@ + + Your tags + + + + {{field.displayName}} diff --git a/frontend/src/app/document/document.module.ts b/frontend/src/app/document/document.module.ts index 4c6c0e8d5..d341caadb 100644 --- a/frontend/src/app/document/document.module.ts +++ b/frontend/src/app/document/document.module.ts @@ -7,6 +7,7 @@ import { SearchRelevanceComponent } from '../search'; import { CorpusModule } from '../corpus-header/corpus.module'; import { DocumentPopupComponent } from './document-popup/document-popup.component'; import { DialogModule } from 'primeng/dialog'; +import { TagModule } from '../tag/tag.module'; @@ -22,6 +23,7 @@ import { DialogModule } from 'primeng/dialog'; CorpusModule, SharedModule, ImageViewModule, + TagModule, ], exports: [ DocumentViewComponent, DocumentPageComponent, diff --git a/frontend/src/app/dropdown/dropdown.component.html b/frontend/src/app/dropdown/dropdown.component.html index 3d17df9e6..27b3eb11a 100644 --- a/frontend/src/app/dropdown/dropdown.component.html +++ b/frontend/src/app/dropdown/dropdown.component.html @@ -1,6 +1,7 @@
diff --git a/frontend/src/app/dropdown/dropdown.component.scss b/frontend/src/app/dropdown/dropdown.component.scss index e69de29bb..c5d28eccf 100644 --- a/frontend/src/app/dropdown/dropdown.component.scss +++ b/frontend/src/app/dropdown/dropdown.component.scss @@ -0,0 +1,11 @@ +@import "_utilities"; + +.dropdown-item { + // this prevents the .dropdown item rule from being overwritten + // when the dropdown is used within a + color: $text !important; + + &.is-active { + color: $text-invert !important; + } +} diff --git a/frontend/src/app/dropdown/dropdown.component.ts b/frontend/src/app/dropdown/dropdown.component.ts index 0ce576a3d..7e08c16c6 100644 --- a/frontend/src/app/dropdown/dropdown.component.ts +++ b/frontend/src/app/dropdown/dropdown.component.ts @@ -17,6 +17,12 @@ export class DropdownComponent implements OnDestroy { @Input() public value: T | undefined = undefined; + @Input() + public disabled = false; + + @Input() + public styleClass: string; + @Input() public options: T[] = []; diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts index baaa9c6a0..161ee64f2 100644 --- a/frontend/src/app/models/found-document.spec.ts +++ b/frontend/src/app/models/found-document.spec.ts @@ -1,6 +1,11 @@ +import { TestBed } from '@angular/core/testing'; import { makeDocument } from '../../mock-data/constructor-helpers'; import { mockCorpus, mockCorpus3 } from '../../mock-data/corpus'; import { FoundDocument } from './found-document'; +import { TagService } from '../services/tag.service'; +import { TagServiceMock, mockTags } from '../../mock-data/tag'; +import { Tag } from './tag'; +import * as _ from 'lodash'; const maxScore = 2.9113607; const mockResponse = { @@ -26,8 +31,19 @@ const mockResponse = { }; describe('FoundDocument', () => { + let tagService: TagService; + + beforeEach(() => { + TestBed.configureTestingModule({ + providers: [ + { provide: TagService, useValue: new TagServiceMock() } + ] + }); + tagService = TestBed.inject(TagService); + }); + it('should construct from an elasticsearch response', () => { - const document = new FoundDocument(mockCorpus, mockResponse, maxScore); + const document = new FoundDocument(tagService, mockCorpus, mockResponse, maxScore); expect(document.id).toBe('1994_troonrede'); expect(document.fieldValues['monarch']).toBe('Beatrix'); @@ -46,4 +62,14 @@ describe('FoundDocument', () => { }, mockCorpus3); expect(shouldHaveContext.hasContext).toBeTrue(); }); + + it('should set tags', () => { + const doc = makeDocument({ great_field: 'test' }); + expect(doc.tags$.value).toEqual(mockTags); + const tag = _.first(mockTags); + doc.removeTag(tag); + expect(doc.tags$.value.length).toBe(1); + doc.addTag(tag); + expect(doc.tags$.value.length).toBe(2); + }); }); diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index b319a008f..be705bbd7 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -2,6 +2,10 @@ import * as _ from 'lodash'; import { makeContextParams } from '../utils/document-context'; import { Corpus, CorpusField } from './corpus'; import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; +import { Tag } from './tag'; +import { BehaviorSubject, Observable, Subject } from 'rxjs'; +import { TagService } from '../services/tag.service'; +import { tap } from 'rxjs/operators'; export class FoundDocument { id: string; @@ -20,14 +24,22 @@ export class FoundDocument { /** highlighted strings */ 
highlight: HighlightResult; - constructor(public corpus: Corpus, hit: SearchHit, maxScore: number = 1) { + /** tags created on the document */ + tags$ = new BehaviorSubject(undefined); + + constructor( + private tagService: TagService, + public corpus: Corpus, + hit: SearchHit, + maxScore: number = 1 + ) { this.id = hit._id; this.relevance = hit._score / maxScore; this.fieldValues = Object.assign({ id: hit._id }, hit._source); this.highlight = hit.highlight; + this.fetchTags(); } - /** * whether the document has a "context" that it belongs to * @@ -40,8 +52,11 @@ export class FoundDocument { return false; } - const notBlank = value => value !== undefined && value !== null && value !== ''; - const contextValues = spec.contextFields.map(this.fieldValue.bind(this)); + const notBlank = (value) => + value !== undefined && value !== null && value !== ''; + const contextValues = spec.contextFields.map( + this.fieldValue.bind(this) + ); return _.every(contextValues, notBlank); } @@ -58,4 +73,25 @@ export class FoundDocument { return this.fieldValues[field.name]; } + addTag(tag: Tag): void { + const newTags = this.tags$.value.concat([tag]); + this.setTags(newTags); + } + + removeTag(tag: Tag): void { + const newTags = _.without(this.tags$.value, tag); + this.setTags(newTags); + } + + setTags(tags: Tag[]): void { + this.tagService + .setDocumentTags(this, tags) + .subscribe((value) => this.tags$.next(value)); + } + + private fetchTags(): void { + this.tagService + .getDocumentTags(this) + .subscribe((value) => this.tags$.next(value)); + } } diff --git a/frontend/src/app/models/index.ts b/frontend/src/app/models/index.ts index f72e3bb80..73ff5c665 100644 --- a/frontend/src/app/models/index.ts +++ b/frontend/src/app/models/index.ts @@ -10,3 +10,4 @@ export * from './user'; export * from './user-role'; export * from './visualization'; export * from './elasticsearch'; +export * from './tag'; diff --git a/frontend/src/app/models/tag.ts b/frontend/src/app/models/tag.ts new file mode 100644 index 000000000..b2bcaf9e2 --- /dev/null +++ b/frontend/src/app/models/tag.ts @@ -0,0 +1,12 @@ +export interface Tag { + id: number; + name: string; + description: string; + count: number; +} + +export interface DocumentTagsResponse { + corpus: string; + doc_id: string; + tags: Tag[]; +}; diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index 08189aed5..ce57982e8 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -10,6 +10,7 @@ import { AggregateTermFrequencyParameters, Corpus, DateTermFrequencyParameters, + DocumentTagsResponse, Download, DownloadOptions, FieldCoverage, @@ -18,6 +19,7 @@ import { NGramRequestParameters, QueryDb, ResultsDownloadParameters, + Tag, TaskResult, TaskSuccess, TasksOutcome, @@ -36,7 +38,9 @@ interface SolisLoginResponse { queries: QueryDb[]; } -@Injectable() +@Injectable({ + providedIn: 'root', +}) export class ApiService { private apiUrl = environment.apiUrl; @@ -44,6 +48,7 @@ export class ApiService { private visApiURL = 'visualization'; private downloadApiURL = 'download'; private corpusApiUrl = 'corpus'; + private tagApiUrl = 'tag'; private authApiRoute = (route: string): string => `/${this.authApiUrl}/${route}/`; @@ -234,6 +239,36 @@ export class ApiService { return this.http.get('/api/corpus/'); } + // Tagging + + public userTags(): Observable { + const url = this.apiRoute(this.tagApiUrl, 'tags/'); + return this.http.get(url); + } + + public createTag(name: string, description?: string): 
Observable { + const url = this.apiRoute(this.tagApiUrl, 'tags/'); + return this.http.post(url, { name, description }); + } + + public documentTags(document: FoundDocument): Observable { + const url = this.apiRoute( + this.tagApiUrl, + `document_tags/${document.corpus.name}/${document.id}` + ); + return this.http.get(url); + } + + public setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { + const url = this.apiRoute( + this.tagApiUrl, + `document_tags/${document.corpus.name}/${document.id}`, + ); + return this.http.patch(url, + { tags: tagIds } + ); + } + // Authentication API public login(username: string, password: string) { return this.http.post<{ key: string }>(this.authApiRoute('login'), { diff --git a/frontend/src/app/services/elastic-search.service.spec.ts b/frontend/src/app/services/elastic-search.service.spec.ts index 59a0e4cd6..bc806258c 100644 --- a/frontend/src/app/services/elastic-search.service.spec.ts +++ b/frontend/src/app/services/elastic-search.service.spec.ts @@ -3,6 +3,8 @@ import { HttpClientTestingModule, HttpTestingController } from '@angular/common/ import { ElasticSearchService, SearchResponse } from './elastic-search.service'; import { Aggregator, QueryModel } from '../models'; import { mockCorpus, mockField, mockField2 } from '../../mock-data/corpus'; +import { TagService } from './tag.service'; +import { TagServiceMock } from '../../mock-data/tag'; const mockResponse: SearchResponse = { took: 4, @@ -62,6 +64,7 @@ describe('ElasticSearchService', () => { TestBed.configureTestingModule({ providers: [ ElasticSearchService, + { provide: TagService, useValue: new TagServiceMock() } ], imports: [ HttpClientTestingModule ] }); diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index a2f8bcb7b..2ba4d6d5e 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -7,6 +7,7 @@ import { AggregateQueryFeedback, SearchHit, EsQuery, Aggregator } from '../models/index'; import * as _ from 'lodash'; +import { TagService } from './tag.service'; import { QueryParameters } from '../models/search-requests'; import { RESULTS_PER_PAGE } from '../models/page-results'; @@ -14,7 +15,7 @@ import { RESULTS_PER_PAGE } from '../models/page-results'; @Injectable() export class ElasticSearchService { - constructor(private http: HttpClient) { + constructor(private http: HttpClient, private tagService: TagService) { } getDocumentById(id: string, corpus: Corpus): Promise { @@ -154,7 +155,7 @@ export class ElasticSearchService { * return the id, relevance and field values of a given document */ private hitToDocument(corpus: Corpus, hit: SearchHit, maxScore: number): FoundDocument { - return new FoundDocument(corpus, hit, maxScore); + return new FoundDocument(this.tagService, corpus, hit, maxScore); } } diff --git a/frontend/src/app/services/tag.service.spec.ts b/frontend/src/app/services/tag.service.spec.ts new file mode 100644 index 000000000..912bd8df4 --- /dev/null +++ b/frontend/src/app/services/tag.service.spec.ts @@ -0,0 +1,26 @@ +import { TestBed } from '@angular/core/testing'; + +import { TagService } from './tag.service'; +import { HttpClientTestingModule } from '@angular/common/http/testing'; +import { ApiService } from './api.service'; +import { ApiServiceMock } from '../../mock-data/api'; + +describe('TagService', () => { + let service: TagService; + + beforeEach(() => { + TestBed.configureTestingModule({ + providers: [ + { 
provide: ApiService, useValue: new ApiServiceMock() },
+            ],
+            imports: [
+                HttpClientTestingModule,
+            ]
+        });
+        service = TestBed.inject(TagService);
+    });
+
+    it('should be created', () => {
+        expect(service).toBeTruthy();
+    });
+});
diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts
new file mode 100644
index 000000000..167ff57c1
--- /dev/null
+++ b/frontend/src/app/services/tag.service.ts
@@ -0,0 +1,42 @@
+import { Injectable } from '@angular/core';
+import { FoundDocument } from '../models';
+import { Observable } from 'rxjs';
+import { Tag } from '../models';
+import { map, tap } from 'rxjs/operators';
+import { ApiService } from './api.service';
+
+
+@Injectable({
+    providedIn: 'root',
+})
+export class TagService {
+    /** all tags from the user */
+    tags$: Observable<Tag[]>;
+
+    constructor(private apiService: ApiService) {
+        this.fetch();
+    }
+
+    makeTag(name: string, description?: string): Observable<Tag> {
+        return this.apiService
+            .createTag(name, description)
+            .pipe(tap(() => this.fetch()));
+    }
+
+    getDocumentTags(document: FoundDocument): Observable<Tag[]> {
+        return this.apiService
+            .documentTags(document)
+            .pipe(map((response) => response.tags));
+    }
+
+    setDocumentTags(document: FoundDocument, tags: Tag[]): Observable<Tag[]> {
+        const tagIds = tags.map((t) => t.id);
+        return this.apiService
+            .setDocumentTags(document, tagIds)
+            .pipe(map((response) => response.tags));
+    }
+
+    private fetch() {
+        this.tags$ = this.apiService.userTags();
+    }
+}
diff --git a/frontend/src/app/tag/document-tags/document-tags.component.html b/frontend/src/app/tag/document-tags/document-tags.component.html
new file mode 100644
index 000000000..a38716edf
--- /dev/null
+++ b/frontend/src/app/tag/document-tags/document-tags.component.html
@@ -0,0 +1,27 @@
+
+<!-- template markup did not survive this extract; only the {{tag.name}} binding remains.
+     The component class below wires per-tag remove buttons (faTimes) and an add-tag
+     control toggled via showAddNew (faPlus). -->
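For orientation, not part of the change itself: a minimal sketch of how a component could consume the TagService added above. The component name, selector, template, and import paths are illustrative assumptions; only the TagService API calls come from the code in this diff.

// Illustrative sketch, assuming TagService as defined in tag.service.ts above.
import { Component } from '@angular/core';
import { Observable } from 'rxjs';
import { mergeMap } from 'rxjs/operators';
import { FoundDocument, Tag } from '../models';
import { TagService } from '../services/tag.service';

@Component({
    // hypothetical component, not part of the PR
    selector: 'ia-tag-usage-example',
    template: '<span class="tag-button" *ngFor="let tag of tags$ | async">{{tag.name}}</span>',
})
export class TagUsageExampleComponent {
    /** all tags belonging to the current user */
    tags$: Observable<Tag[]>;

    constructor(private tagService: TagService) {
        this.tags$ = this.tagService.tags$;
    }

    /** replace the document's tags with the given set */
    applyTags(document: FoundDocument, tags: Tag[]): void {
        this.tagService.setDocumentTags(document, tags).subscribe();
    }

    /** create a new tag and attach it as the document's only tag */
    createAndApply(document: FoundDocument, name: string): void {
        this.tagService
            .makeTag(name)
            .pipe(mergeMap((tag) => this.tagService.setDocumentTags(document, [tag])))
            .subscribe();
    }
}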
diff --git a/frontend/src/app/tag/document-tags/document-tags.component.scss b/frontend/src/app/tag/document-tags/document-tags.component.scss new file mode 100644 index 000000000..e69de29bb diff --git a/frontend/src/app/tag/document-tags/document-tags.component.spec.ts b/frontend/src/app/tag/document-tags/document-tags.component.spec.ts new file mode 100644 index 000000000..a497b691c --- /dev/null +++ b/frontend/src/app/tag/document-tags/document-tags.component.spec.ts @@ -0,0 +1,25 @@ +import { ComponentFixture, TestBed } from '@angular/core/testing'; + +import { DocumentTagsComponent } from './document-tags.component'; +import { commonTestBed } from '../../common-test-bed'; +import { makeDocument } from '../../../mock-data/constructor-helpers'; + +describe('DocumentTagsComponent', () => { + let component: DocumentTagsComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + commonTestBed().testingModule.compileComponents(); + }); + + beforeEach(() => { + fixture = TestBed.createComponent(DocumentTagsComponent); + component = fixture.componentInstance; + component.document = makeDocument({great_field: 'test'}); + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/tag/document-tags/document-tags.component.ts b/frontend/src/app/tag/document-tags/document-tags.component.ts new file mode 100644 index 000000000..89a0682fc --- /dev/null +++ b/frontend/src/app/tag/document-tags/document-tags.component.ts @@ -0,0 +1,31 @@ +import { Component, Input, OnInit } from '@angular/core'; +import { FoundDocument, Tag } from '../../models'; +import { faPlus, faTimes } from '@fortawesome/free-solid-svg-icons'; +import { first, map, mergeMap } from 'rxjs/operators'; +import * as _ from 'lodash'; + +@Component({ + selector: 'ia-document-tags', + templateUrl: './document-tags.component.html', + styleUrls: ['./document-tags.component.scss'], +}) +export class DocumentTagsComponent implements OnInit { + @Input() document: FoundDocument; + + faTimes = faTimes; + faPlus = faPlus; + + showAddNew = false; + + constructor() {} + + ngOnInit(): void {} + + addTag(tag: Tag) { + this.document.addTag(tag); + } + + removeTag(tag: Tag) { + this.document.removeTag(tag); + } +} diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html new file mode 100644 index 000000000..9b1d6a3cf --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -0,0 +1,46 @@ +
+<!-- template markup did not survive this extract; per the component class below it offers
+     a dropdown of existing tags plus a create-new-tag input with confirm/cancel buttons. -->
+ diff --git a/frontend/src/app/tag/tag-select/tag-select.component.scss b/frontend/src/app/tag/tag-select/tag-select.component.scss new file mode 100644 index 000000000..6ec645475 --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.component.scss @@ -0,0 +1,3 @@ +.tag-input { + height: 2em; +} diff --git a/frontend/src/app/tag/tag-select/tag-select.component.ts b/frontend/src/app/tag/tag-select/tag-select.component.ts new file mode 100644 index 000000000..9e49370eb --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.component.ts @@ -0,0 +1,73 @@ +import { + Component, + ElementRef, + EventEmitter, + Input, + OnDestroy, + Output, + ViewChild, +} from '@angular/core'; +import { faCheck, faPlus, faTimes } from '@fortawesome/free-solid-svg-icons'; +import * as _ from 'lodash'; +import { Observable, Subject } from 'rxjs'; +import { Tag } from '../../models'; +import { TagService } from '../../services/tag.service'; +import { takeUntil } from 'rxjs/operators'; + +@Component({ + selector: 'ia-tag-select', + templateUrl: './tag-select.component.html', + styleUrls: ['./tag-select.component.scss'], +}) +export class TagSelectComponent implements OnDestroy { + @Input() exclude: Tag[]; + @Output() selection = new EventEmitter(); + @Output() cancel = new EventEmitter(); + + @ViewChild('tagSelect') tagSelect: ElementRef; + + tags$: Observable; + destroy$ = new Subject(); + + faCheck = faCheck; + faTimes = faTimes; + faPlus = faPlus; + + selectedTag: Tag; + + createMode = false; + newTagName: string; + + constructor(private tagService: TagService) { + this.tags$ = this.tagService.tags$; + } + + filterTags(tags: Tag[], exclude?: Tag[]): Tag[] { + return _.differenceBy(tags, exclude || [], 'name'); + } + + addTag() { + this.selection.emit(this.selectedTag); + this.selectedTag = undefined; + } + + createTag() { + this.tagService + .makeTag(this.newTagName) + .pipe(takeUntil(this.destroy$)) + .subscribe((res) => { + this.selection.emit(res); + this.createMode = false; + }); + } + + toggleCreate(): void { + this.selectedTag = undefined; + this.createMode = !this.createMode; + } + + ngOnDestroy(): void { + this.destroy$.next(); + this.destroy$.complete(); + } +} diff --git a/frontend/src/app/tag/tag-select/tag-select.spec.ts b/frontend/src/app/tag/tag-select/tag-select.spec.ts new file mode 100644 index 000000000..0b2b6e859 --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.spec.ts @@ -0,0 +1,24 @@ + +import { ComponentFixture, TestBed } from '@angular/core/testing'; + +import { TagSelectComponent } from './tag-select.component'; +import { commonTestBed } from '../../common-test-bed'; + +describe('TagSelectComponent', () => { + let component: TagSelectComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + commonTestBed().testingModule.compileComponents(); + }); + + beforeEach(() => { + fixture = TestBed.createComponent(TagSelectComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/tag/tag.module.ts b/frontend/src/app/tag/tag.module.ts new file mode 100644 index 000000000..fccbf4184 --- /dev/null +++ b/frontend/src/app/tag/tag.module.ts @@ -0,0 +1,13 @@ +import { NgModule } from '@angular/core'; +import { SharedModule } from '../shared/shared.module'; +import { TagSelectComponent } from './tag-select/tag-select.component'; +import { DocumentTagsComponent } from './document-tags/document-tags.component'; + + + 
+@NgModule({ + declarations: [DocumentTagsComponent, TagSelectComponent], + imports: [SharedModule], + exports: [DocumentTagsComponent], +}) +export class TagModule {} diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts index dac74ec42..8138f30b8 100644 --- a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts +++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts @@ -4,7 +4,6 @@ import { import { AggregateResult, CorpusField, QueryModel, Corpus, FreqTableHeaders } from '../../models/index'; -import { ApiService } from '../../services/index'; import { BehaviorSubject } from 'rxjs'; import { VisualizationService } from '../../services/visualization.service'; import { showLoading } from '../../utils/utils'; diff --git a/frontend/src/mock-data/api.ts b/frontend/src/mock-data/api.ts index a7ddcf108..b3c8485aa 100644 --- a/frontend/src/mock-data/api.ts +++ b/frontend/src/mock-data/api.ts @@ -70,4 +70,8 @@ export class ApiServiceMock { requestMeQdia() { return Promise.resolve({}); } + + userTags() { + return of([]); + } } diff --git a/frontend/src/mock-data/constructor-helpers.ts b/frontend/src/mock-data/constructor-helpers.ts index acf8c903d..21c4a1aa1 100644 --- a/frontend/src/mock-data/constructor-helpers.ts +++ b/frontend/src/mock-data/constructor-helpers.ts @@ -2,6 +2,9 @@ import { Corpus, FieldValues, FoundDocument, HighlightResult, SearchHit } from '../app/models'; import { mockCorpus } from './corpus'; +import { TagServiceMock } from './tag'; + +const tagService = new TagServiceMock() as any; export const makeDocument = ( fieldValues: FieldValues, @@ -13,6 +16,6 @@ export const makeDocument = ( const hit: SearchHit = { _id: id, _score: relevance, _source: fieldValues, highlight }; - return new FoundDocument(corpus, hit); + return new FoundDocument(tagService, corpus, hit); }; diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts new file mode 100644 index 000000000..7d08e7a13 --- /dev/null +++ b/frontend/src/mock-data/tag.ts @@ -0,0 +1,40 @@ +import { Observable, of } from 'rxjs'; +import { FoundDocument, Tag } from '../app/models'; +import { tap } from 'rxjs/operators'; + +export const mockTags: Tag[] = [ + { + id: 1, + name: 'fascinating', + description: 'interesting documents', + count: 2 + }, { + id: 2, + name: 'boring', + description: 'useless documents', + count: 1 + } +]; + +export class TagServiceMock { + tags$ = of(mockTags); + + getDocumentTags(document: FoundDocument): Observable { + return of(mockTags); + } + + makeTag(name: string, description?: string): Observable { + return of({ + id: 3, name, description, count: 0 + }).pipe(tap(this.fetch.bind(this))); + } + + setDocumentTags(document: FoundDocument, tagIds: Tag[]): Observable { + const tags = mockTags.filter(tag => tagIds.includes(tag)); + return of(tags); + }; + + private fetch() { + this.tags$ = of(mockTags); + } +} diff --git a/frontend/src/styles.scss b/frontend/src/styles.scss index 112864b5a..149750a77 100644 --- a/frontend/src/styles.scss +++ b/frontend/src/styles.scss @@ -44,3 +44,16 @@ .is-loading:not(.button) { @extend %content-loader; } + +.tag-button { + align-items: center; + border-radius: 3px; + display: inline-flex; + font-size: 0.75rem; + height: 2em; + justify-content: center; + line-height: 1.5; + padding-left: 0.75em; + padding-right: 0.75em; + white-space: nowrap; +} diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 3b1ee901a..4a54260f0 100644 --- 
a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -327,7 +327,7 @@ dependencies: "@babel/highlight" "^7.10.3" -"@babel/code-frame@^7.16.7", "@babel/code-frame@^7.22.10", "@babel/code-frame@^7.22.5": +"@babel/code-frame@^7.16.7", "@babel/code-frame@^7.22.5": version "7.22.10" resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.22.10.tgz#1c20e612b768fefa75f6e90d6ecb86329247f0a3" integrity sha512-/KKIMG4UEL35WmI9OlvMhurwtytjvXoFcGNrOvyG9zIzA8YmPjVtIZUf7b05+TPO7G7/GEmLHDaoCgACHl9hhA== @@ -335,6 +335,14 @@ "@babel/highlight" "^7.22.10" chalk "^2.4.2" +"@babel/code-frame@^7.22.13": + version "7.22.13" + resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.22.13.tgz#e3c1c099402598483b7a8c46a721d1038803755e" + integrity sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w== + dependencies: + "@babel/highlight" "^7.22.13" + chalk "^2.4.2" + "@babel/compat-data@^7.13.11", "@babel/compat-data@^7.16.8": version "7.17.0" resolved "https://registry.yarnpkg.com/@babel/compat-data/-/compat-data-7.17.0.tgz#86850b8597ea6962089770952075dcaabb8dba34" @@ -426,7 +434,7 @@ jsesc "^2.5.1" source-map "^0.5.0" -"@babel/generator@^7.17.0", "@babel/generator@^7.22.10", "@babel/generator@^7.22.7": +"@babel/generator@^7.17.0": version "7.22.10" resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.22.10.tgz#c92254361f398e160645ac58831069707382b722" integrity sha512-79KIf7YiWjjdZ81JnLujDRApWtl7BxTqWD88+FFdQEIOG8LJ0etDOM7CXuIgGJa55sGOwZVwuEsaLEm0PJ5/+A== @@ -446,6 +454,16 @@ "@jridgewell/trace-mapping" "^0.3.17" jsesc "^2.5.1" +"@babel/generator@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.23.0.tgz#df5c386e2218be505b34837acbcb874d7a983420" + integrity sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g== + dependencies: + "@babel/types" "^7.23.0" + "@jridgewell/gen-mapping" "^0.3.2" + "@jridgewell/trace-mapping" "^0.3.17" + jsesc "^2.5.1" + "@babel/helper-annotate-as-pure@7.16.7", "@babel/helper-annotate-as-pure@^7.16.7": version "7.16.7" resolved "https://registry.yarnpkg.com/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.16.7.tgz#bb2339a7534a9c128e3102024c60760a3a7f3862" @@ -549,6 +567,11 @@ resolved "https://registry.yarnpkg.com/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.5.tgz#f06dd41b7c1f44e1f8da6c4055b41ab3a09a7e98" integrity sha512-XGmhECfVA/5sAt+H+xpSg0mfrHq6FzNr9Oxh7PSEBBRUb/mL7Kz3NICXb194rCqAEdxkhPT1a88teizAFyvk8Q== +"@babel/helper-environment-visitor@^7.22.20": + version "7.22.20" + resolved "https://registry.yarnpkg.com/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz#96159db61d34a29dba454c959f5ae4a649ba9167" + integrity sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA== + "@babel/helper-explode-assignable-expression@^7.16.7": version "7.16.7" resolved "https://registry.yarnpkg.com/@babel/helper-explode-assignable-expression/-/helper-explode-assignable-expression-7.16.7.tgz#12a6d8522fdd834f194e868af6354e8650242b7a" @@ -556,7 +579,7 @@ dependencies: "@babel/types" "^7.16.7" -"@babel/helper-function-name@^7.16.7", "@babel/helper-function-name@^7.22.5": +"@babel/helper-function-name@^7.16.7": version "7.22.5" resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.22.5.tgz#ede300828905bb15e582c037162f99d5183af1be" integrity 
sha512-wtHSq6jMRE3uF2otvfuD3DIvVhOsSNshQl0Qrd7qC9oQJzHvOL4qQXlQn2916+CXGywIjpGuIkoyZRRxHPiNQQ== @@ -564,6 +587,14 @@ "@babel/template" "^7.22.5" "@babel/types" "^7.22.5" +"@babel/helper-function-name@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz#1f9a3cdbd5b2698a670c30d2735f9af95ed52759" + integrity sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw== + dependencies: + "@babel/template" "^7.22.15" + "@babel/types" "^7.23.0" + "@babel/helper-hoist-variables@^7.16.7", "@babel/helper-hoist-variables@^7.22.5": version "7.22.5" resolved "https://registry.yarnpkg.com/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz#c01a007dac05c085914e8fb652b339db50d823bb" @@ -693,6 +724,11 @@ resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.5.tgz#9544ef6a33999343c8740fa51350f30eeaaaf193" integrity sha512-aJXu+6lErq8ltp+JhkJUfk1MTGyuA4v7f3pA+BJ5HLfNC6nAQ0Cpi9uOquUj8Hehg0aUiHzWQbOVJGao6ztBAQ== +"@babel/helper-validator-identifier@^7.22.20": + version "7.22.20" + resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz#c4ae002c61d2879e724581d96665583dbc1dc0e0" + integrity sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A== + "@babel/helper-validator-option@^7.16.7", "@babel/helper-validator-option@^7.22.5": version "7.22.5" resolved "https://registry.yarnpkg.com/@babel/helper-validator-option/-/helper-validator-option-7.22.5.tgz#de52000a15a177413c8234fa3a8af4ee8102d0ac" @@ -744,16 +780,30 @@ chalk "^2.4.2" js-tokens "^4.0.0" +"@babel/highlight@^7.22.13": + version "7.22.20" + resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.22.20.tgz#4ca92b71d80554b01427815e06f2df965b9c1f54" + integrity sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg== + dependencies: + "@babel/helper-validator-identifier" "^7.22.20" + chalk "^2.4.2" + js-tokens "^4.0.0" + "@babel/parser@^7.14.7", "@babel/parser@^7.16.12": version "7.17.0" resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.17.0.tgz#f0ac33eddbe214e4105363bb17c3341c5ffcc43c" integrity sha512-VKXSCQx5D8S04ej+Dqsr1CzYvvWgf20jIw2D+YhQCrIlr2UZGaDds23Y0xg75/skOxpLCRpUZvk/1EAVkGoDOw== -"@babel/parser@^7.16.7", "@babel/parser@^7.17.0", "@babel/parser@^7.22.10", "@babel/parser@^7.22.5", "@babel/parser@^7.22.7": +"@babel/parser@^7.16.7", "@babel/parser@^7.17.0", "@babel/parser@^7.22.5", "@babel/parser@^7.22.7": version "7.22.10" resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.22.10.tgz#e37634f9a12a1716136c44624ef54283cabd3f55" integrity sha512-lNbdGsQb9ekfsnjFGhEiF4hfFqGgfOP3H3d27re3n+CGhNuTSUEQdfWk556sTLNTloczcdM5TYF2LhzmDQKyvQ== +"@babel/parser@^7.22.15", "@babel/parser@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.0.tgz#da950e622420bf96ca0d0f2909cdddac3acd8719" + integrity sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw== + "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression@^7.16.7": version "7.16.7" resolved "https://registry.yarnpkg.com/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.16.7.tgz#4eda6d6c2a0aa79c70fa7b6da67763dfe2141050" @@ -1412,51 +1462,28 @@ 
"@babel/parser" "^7.22.5" "@babel/types" "^7.22.5" -"@babel/traverse@^7.13.0", "@babel/traverse@^7.16.10", "@babel/traverse@^7.16.8": - version "7.17.0" - resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.17.0.tgz#3143e5066796408ccc880a33ecd3184f3e75cd30" - integrity sha512-fpFIXvqD6kC7c7PUNnZ0Z8cQXlarCLtCUpt2S1Dx7PjoRtCFffvOkHHSom+m5HIxMZn5bIBVb71lhabcmjEsqg== +"@babel/template@^7.22.15": + version "7.22.15" + resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.22.15.tgz#09576efc3830f0430f4548ef971dde1350ef2f38" + integrity sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w== dependencies: - "@babel/code-frame" "^7.16.7" - "@babel/generator" "^7.17.0" - "@babel/helper-environment-visitor" "^7.16.7" - "@babel/helper-function-name" "^7.16.7" - "@babel/helper-hoist-variables" "^7.16.7" - "@babel/helper-split-export-declaration" "^7.16.7" - "@babel/parser" "^7.17.0" - "@babel/types" "^7.17.0" - debug "^4.1.0" - globals "^11.1.0" + "@babel/code-frame" "^7.22.13" + "@babel/parser" "^7.22.15" + "@babel/types" "^7.22.15" -"@babel/traverse@^7.16.7", "@babel/traverse@^7.22.8": - version "7.22.8" - resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.22.8.tgz#4d4451d31bc34efeae01eac222b514a77aa4000e" - integrity sha512-y6LPR+wpM2I3qJrsheCTwhIinzkETbplIgPBbwvqPKc+uljeA5gP+3nP8irdYt1mjQaDnlIcG+dw8OjAco4GXw== +"@babel/traverse@^7.13.0", "@babel/traverse@^7.16.10", "@babel/traverse@^7.16.7", "@babel/traverse@^7.16.8", "@babel/traverse@^7.17.0", "@babel/traverse@^7.22.10", "@babel/traverse@^7.22.6", "@babel/traverse@^7.22.8": + version "7.23.2" + resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.23.2.tgz#329c7a06735e144a506bdb2cad0268b7f46f4ad8" + integrity sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw== dependencies: - "@babel/code-frame" "^7.22.5" - "@babel/generator" "^7.22.7" - "@babel/helper-environment-visitor" "^7.22.5" - "@babel/helper-function-name" "^7.22.5" + "@babel/code-frame" "^7.22.13" + "@babel/generator" "^7.23.0" + "@babel/helper-environment-visitor" "^7.22.20" + "@babel/helper-function-name" "^7.23.0" "@babel/helper-hoist-variables" "^7.22.5" "@babel/helper-split-export-declaration" "^7.22.6" - "@babel/parser" "^7.22.7" - "@babel/types" "^7.22.5" - debug "^4.1.0" - globals "^11.1.0" - -"@babel/traverse@^7.17.0", "@babel/traverse@^7.22.10", "@babel/traverse@^7.22.6": - version "7.22.10" - resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.22.10.tgz#20252acb240e746d27c2e82b4484f199cf8141aa" - integrity sha512-Q/urqV4pRByiNNpb/f5OSv28ZlGJiFiiTh+GAHktbIrkPhPbl90+uW6SmpoLyZqutrg9AEaEf3Q/ZBRHBXgxig== - dependencies: - "@babel/code-frame" "^7.22.10" - "@babel/generator" "^7.22.10" - "@babel/helper-environment-visitor" "^7.22.5" - "@babel/helper-function-name" "^7.22.5" - "@babel/helper-hoist-variables" "^7.22.5" - "@babel/helper-split-export-declaration" "^7.22.6" - "@babel/parser" "^7.22.10" - "@babel/types" "^7.22.10" + "@babel/parser" "^7.23.0" + "@babel/types" "^7.23.0" debug "^4.1.0" globals "^11.1.0" @@ -1486,6 +1513,15 @@ "@babel/helper-validator-identifier" "^7.22.5" to-fast-properties "^2.0.0" +"@babel/types@^7.22.15", "@babel/types@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.23.0.tgz#8c1f020c9df0e737e4e247c0619f58c68458aaeb" + integrity sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg== + dependencies: + 
"@babel/helper-string-parser" "^7.22.5" + "@babel/helper-validator-identifier" "^7.22.20" + to-fast-properties "^2.0.0" + "@colors/colors@1.5.0": version "1.5.0" resolved "https://registry.yarnpkg.com/@colors/colors/-/colors-1.5.0.tgz#bb504579c1cae923e6576a4f5da43d25f97bdbd9" @@ -2624,9 +2660,9 @@ autoprefixer@^10.4.2: postcss-value-parser "^4.2.0" axios@^1.0.0: - version "1.1.3" - resolved "https://registry.yarnpkg.com/axios/-/axios-1.1.3.tgz#8274250dada2edf53814ed7db644b9c2866c1e35" - integrity sha512-00tXVRwKx/FZr/IDVFt4C+f9FYairX517WoGCL6dpOntqLkZofjhu43F/Xl44UOpqa+9sLFDrG/XAnFsUYgkDA== + version "1.6.2" + resolved "https://registry.yarnpkg.com/axios/-/axios-1.6.2.tgz#de67d42c755b571d3e698df1b6504cde9b0ee9f2" + integrity sha512-7i24Ri4pmDRfJTR7LDBhsOTtcm+9kjX5WiY1X3wIisx6G9So3pfMkEiU7emUBe46oceVImccTEM3k6C5dbVW8A== dependencies: follow-redirects "^1.15.0" form-data "^4.0.0"