+ Helper function to extract the `ab` children of
+ `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one).
+ '''
+ if not soup:
+ return
+ return soup.find_all('ab')
+
+
+def extract_translation(soup):
+ '''
+ Helper function to extract the translation from the `<div type="translation">` tag
+ '''
+ if not soup:
+ return
+ translation = soup.find('div', {'type': 'translation'})
+ if translation:
+ return translation.find_all('ab')
+ else:
+ return
+
+
+def extract_commentary(soup):
+ '''
+ Helper function to extract all commentaries from the `<div type="commentary">` tags.
+ A single element will be returned with the commentaries found as text content.
+ '''
+ if not soup: return
+ found = []
+ commentaries = soup.find_all('div', {'type': 'commentary'})
+
+ for commentary in commentaries:
+ if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']:
+ p = commentary.find('p')
+ if p:
+ text = p.get_text()
+ if text:
+ text = clean_commentary(text)
+ found.append('{}:\n{}\n'.format(commentary['subtype'].strip().upper(), text))
+
+ if len(found) > 1:
+ cloned_soup = copy(soup)
+ cloned_soup.clear()
+ cloned_soup.string = "\n".join(found)
+ return cloned_soup
+ else:
+ return None
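+
+# Illustrative sketch (not part of the corpus code) of what `extract_commentary`
+# returns for an Epidat-style fragment; the XML content here is hypothetical.
+# Note that the current implementation only returns a result when more than
+# one commentary is found.
+#
+#   from bs4 import BeautifulSoup
+#   xml = '''<body>
+#       <div type="commentary" subtype="Zitate"><p>Zl 7: Dan 10,11</p></div>
+#       <div type="commentary" subtype="Prosopographie"><p>Son of Moritz Leffmann</p></div>
+#   </body>'''
+#   soup = BeautifulSoup(xml, 'xml').find('body')
+#   print(extract_commentary(soup).get_text())
+#   # ZITATE:
+#   # Zl 7: Dan 10,11
+#   #
+#   # PROSOPOGRAPHIE:
+#   # Son of Moritz Leffmann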
+
+def extract_support_comments(soup):
+ if not soup: return
+ cloned_soup = copy(soup)
+ cloned_soup.clear()
+
+ commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS')
+ commentaries = add_support_comment(soup, commentaries, 'objectType', 'OBJECTTYPE')
+
+ # add any additional text from the <support> element itself,
+ # i.e. if there is text it is the very last node
+ contents = soup.contents
+ text = contents[-1].strip()
+ if text:
+ text = clean_commentary(text)
+ commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text)
+
+ cloned_soup.string = commentaries
+ return cloned_soup
+
+
+def add_support_comment(soup, existing_commentaries, elem_name, commentary_name):
+ elem = soup.find(elem_name)
+ if elem:
+ text = elem.get_text()
+ if text:
+ text = clean_commentary(text)
+ return '{}{}:\n{}\n\n'.format(existing_commentaries, commentary_name, text)
+ return existing_commentaries
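+
+# A minimal sketch (hypothetical input) of how `extract_support_comments` and
+# `add_support_comment` combine into the labelled string stored in the comments field:
+#
+#   from bs4 import BeautifulSoup
+#   support = BeautifulSoup(
+#       '<support><objectType>sepulchral monument</objectType>stone</support>', 'xml').find('support')
+#   print(extract_support_comments(support).get_text())
+#   # OBJECTTYPE:
+#   # sepulchral monument
+#   #
+#   # SUPPORT:
+#   # stone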
+
+
+def extract_death(soup):
+ '''
+ Helper function to extract date of death from multiple person tags.
+ '''
+ if not soup:
+ return
+ return soup.find_all('death')
+
+
+def extract_country(soup):
+ '''
+ Helper function to extract country.
+ This is needed because the output of `flatten` would otherwise include the text contents
+ of the `<region>` child.
+ '''
+ return clone_soup_extract_child(soup, 'region')
+
+
+def extract_settlement(soup):
+ return clone_soup_extract_child(soup, 'geogName')
+
+
+def extract_location_details(soup):
+ return clone_soup_extract_child(soup, 'geo')
+
+
+def clone_soup_extract_child(soup, to_extract):
+ '''
+ Helper function to clone the soup and extract a child element.
+ This is useful when the output of `flatten` would otherwise include the text contents
+ of the child.
+ '''
+ if not soup:
+ return
+ cloned_soup = copy(soup)
+ child = cloned_soup.find(to_extract)
+ if child:
+ child.extract()
+ return cloned_soup
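+
+# Minimal sketch of the cloning behaviour, with a hypothetical fragment:
+#
+#   from bs4 import BeautifulSoup
+#   place = BeautifulSoup(
+#       '<placeName><region>Judaea</region> Jerusalem</placeName>', 'xml').find('placeName')
+#   clone_soup_extract_child(place, 'region').get_text()  # ' Jerusalem' (no <region> text)
+#   place.get_text()  # 'Judaea Jerusalem' -- the original soup is left untouched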
+
+ # TODO: add field
+
+ # TODO: move to a comments field:
+
+ # excluded (for now):
+ # title
+ # organization (incl details, e.g. address)
+ # licence
+ # taxonomy (i.e. things like foto1, foto2 -> no working links to actual images)
+
diff --git a/backend/corpora/peaceportal/fiji_separate.py b/backend/corpora/peaceportal/fiji_separate.py
new file mode 100644
index 000000000..e2b3f564f
--- /dev/null
+++ b/backend/corpora/peaceportal/fiji_separate.py
@@ -0,0 +1,17 @@
+from django.conf import settings
+
+from corpora.peaceportal.peaceportal import PeacePortal
+
+class FIJISEPARATE(PeacePortal):
+
+ es_index = settings.FIJI_ALIAS
+
+ # all fields listed here will be ignored if they are
+ # in the PeacePortal base class definition. Ideal for excluding
+ # filters that are irrelevant
+ redundant_fields = ['source_database', 'region']
+
+ def __init__(self):
+ super().__init__()
+ # filter with a list comprehension: removing items from a list
+ # while iterating over it skips elements
+ self.fields = [field for field in self.fields if field.name not in self.redundant_fields]
diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py
new file mode 100644
index 000000000..e9cd78a84
--- /dev/null
+++ b/backend/corpora/peaceportal/iis.py
@@ -0,0 +1,376 @@
+from copy import copy
+from os.path import join
+
+from django.conf import settings
+
+from addcorpus.corpus import XMLCorpusDefinition
+from addcorpus.extract import Combined, Constant, ExternalFile, FilterAttribute, XML
+from addcorpus.serializers import LanguageField
+from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language
+from corpora.utils.exclude_fields import exclude_fields_without_extractor
+
+class PeaceportalIIS(PeacePortal, XMLCorpusDefinition):
+ data_directory = settings.PEACEPORTAL_IIS_DATA
+ es_index = getattr(settings, 'PEACEPORTAL_IIS_ES_INDEX', 'peaceportal-iis')
+
+ def add_metadata(self, filename):
+ external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA
+ return {
+ 'associated_file': join(external_file_folder, filename)
+ }
+
+ def __init__(self):
+ super().__init__()
+ self.external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA
+ self.source_database.extractor = Constant(
+ value='Inscriptions of Israel/Palestine (Brown University)'
+ )
+
+ self._id.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc',
+ 'msDesc', 'msIdentifier', 'idno'],
+ multiple=False,
+ toplevel=False,
+ flatten=True,
+ transform=lambda x: ''.join(x.lower().split())
+ )
+
+ self.url.extractor = FilterAttribute(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc',
+ 'msDesc', 'msIdentifier', 'idno'],
+ multiple=False,
+ toplevel=False,
+ flatten=True,
+ transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format(
+ ''.join(x.lower().split()))
+ )
+
+ # quick and dirty for now: extract value for 'notBefore'
+ self.year.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'date'],
+ toplevel=False,
+ attribute='notBefore'
+ )
+
+ self.not_before.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'date'],
+ toplevel=False,
+ attribute='notBefore'
+ )
+
+ self.not_after.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'date'],
+ toplevel=False,
+ attribute='notAfter',
+ )
+
+ self.transcription.extractor = ExternalFile(
+ stream_handler=extract_transcript
+ )
+
+ self.transcription_english.extractor = FilterAttribute(
+ tag=['div'],
+ toplevel=True,
+ multiple=False,
+ flatten=True,
+ attribute_filter={
+ 'attribute': 'type',
+ 'value': 'translation'
+ },
+ transform_soup_func=extract_paragraph,
+ transform=lambda x: ' '.join(x.split()) if x else None
+ )
+
+ # is not present in IIS data
+ # self.names.extractor = XML(
+ # tag=['teiHeader', 'profileDesc',
+ # 'particDesc', 'listPerson', 'person'],
+ # flatten=True,
+ # multiple=True,
+ # toplevel=False,
+ # )
+
+ self.iconography.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc',
+ 'msDesc', 'physDesc', 'decoDesc', 'decoNote'],
+ toplevel=False,
+ multiple=True,
+ flatten=True
+ )
+
+ # is not present in IIS data
+ self.sex.extractor = Constant(
+ value='Unknown'
+ )
+
+ self.country.extractor = Constant(
+ value='Israel/Palestine'
+ )
+
+ self.region.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'placeName', 'region'],
+ toplevel=False,
+ flatten=True
+ )
+
+ self.settlement.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'placeName', 'settlement'],
+ toplevel=False,
+ flatten=True
+ )
+
+ self.location_details.extractor = Combined(
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'placeName'],
+ toplevel=False,
+ flatten=True
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'p'],
+ toplevel=False,
+ flatten=True
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'provenance'],
+ toplevel=False,
+ flatten=True
+ )
+ )
+
+ self.material.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc'],
+ attribute='ana',
+ toplevel=False,
+ flatten=True,
+ transform=lambda x: categorize_material(x)
+ )
+
+ self.material_details.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc'],
+ attribute='ana',
+ toplevel=False,
+ flatten=True
+ )
+
+ self.language.extractor = Combined(
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
+ 'textLang'],
+ attribute='mainLang',
+ toplevel=False,
+ transform=lambda x: normalize_language(x)
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
+ 'textLang'],
+ attribute='otherLangs',
+ toplevel=False,
+ transform=lambda x: normalize_language(x)
+ )
+ )
+ self.language_code.extractor = Combined(
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
+ 'textLang'],
+ attribute='mainLang',
+ toplevel=False
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
+ 'textLang'],
+ attribute='otherLangs',
+ toplevel=False
+ )
+ )
+
+ self.comments.extractor = Combined(
+ XML(
+ tag=['text'],
+ toplevel=False,
+ multiple=False,
+ flatten=True,
+ transform_soup_func=extract_comments,
+ transform=lambda x: clean_commentary(x) if x else None
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'condition'],
+ toplevel=False,
+ transform_soup_func=extract_condition
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'layoutDesc', 'layout', 'p'],
+ toplevel=False,
+ transform=lambda x: 'LAYOUT:\n{}\n\n'.format(clean_commentary(x)) if x else None
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc'],
+ toplevel=False,
+ attribute='ana',
+ transform=lambda x: 'OBJECTTYPE:\n{}\n\n'.format(x[1:]) if x else None
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'support', 'dimensions'],
+ toplevel=False,
+ transform_soup_func=extract_dimensions,
+ transform=lambda x: 'DIMENSIONS:\n{}\n\n'.format(
+ x) if x else None
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'support', 'p'],
+ toplevel=False,
+ flatten=True,
+ transform=lambda x: 'SUPPORT:\n{}\n\n'.format(
+ clean_commentary(x)) if x else None
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', 'handDesc', 'handNote'],
+ toplevel=False,
+ transform_soup_func=extract_handnotes
+ ),
+ transform=lambda x: join_commentaries(x)
+ )
+
+ self.bibliography.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'msIdentifier', 'publications', 'publication'],
+ toplevel=False,
+ multiple=True
+ )
+
+ self.transcription_hebrew.extractor = Combined(
+ self.transcription.extractor,
+ Constant('he'),
+ transform=lambda x: get_text_in_language(x)
+ )
+
+ self.transcription_latin.extractor = Combined(
+ self.transcription.extractor,
+ Constant('la'),
+ transform=lambda x: get_text_in_language(x)
+ )
+
+ self.transcription_greek.extractor = Combined(
+ self.transcription.extractor,
+ Constant('el'),
+ transform=lambda x: get_text_in_language(x)
+ )
+
+ self.fields = exclude_fields_without_extractor(self.fields)
+
+
+def extract_transcript(filestream):
+ text = filestream.read().strip()
+ filestream.close()
+ # remove the tabs and spaces inherited from xml
+ text = clean_newline_characters(text)
+ if text:
+ text = text.replace('\t', '')
+ return text
+
+
+def extract_paragraph(soup):
+ '''
+ Extract the first <p> element from `soup`, ignoring the rest.
+ Ideal for ignoring <head> headers in the HTML versions of the body.
+ '''
+ if not soup:
+ return
+ return soup.find('p')
+
+
+def extract_comments(soup):
+ '''
+ Helper function to extract the commentary from a <div type="commentary"> (a sibling of the edition under <text>).
+ '''
+ if not soup:
+ return
+ commentary_div = soup.find('div', {'type': 'commentary'})
+ return extract_paragraph(commentary_div)
+
+
+def extract_attribute_and_child_p(soup, field_header):
+ '''
+ Extract the value of the 'ana' attribute from `soup`,
+ as well as the text from a child <p>. The result is returned
+ in a new soup, i.e. a single element with text content
+ in the following format: `textcontent (attributevalue)`
+ '''
+ result = ''
+ text = ''
+ ana = None
+ if 'ana' in soup.attrs:
+ ana = soup['ana']
+ p = extract_paragraph(soup)
+ if p:
+ text = p.get_text()
+ if text:
+ result = clean_commentary(text)
+ if ana:
+ result = '{} ({})'.format(result, ana)
+
+ if result:
+ cloned_soup = copy(soup)
+ cloned_soup.clear()
+ cloned_soup.string = '{}:\n{}\n\n'.format(field_header, result)
+ return cloned_soup
+
+
+def extract_condition(soup):
+ return extract_attribute_and_child_p(soup, 'CONDITION')
+
+
+def extract_handnotes(soup):
+ if not soup: return
+ return extract_attribute_and_child_p(soup, 'HANDNOTES')
+
+
+def extract_dimensions(soup):
+ result = ''
+ height_elem = soup.find('height')
+ if height_elem:
+ height = height_elem.get_text()
+ if height:
+ result = "H: {} ".format(height)
+
+ width_elem = soup.find('width')
+ if width_elem:
+ width = width_elem.get_text()
+ if width:
+ result = "{}W: {}".format(result, width)
+
+ depth_elem = soup.find('depth')
+ if depth_elem:
+ depth = depth_elem.get_text()
+ if depth:
+ result = "{} D: {}".format(result, depth)
+
+ cloned_soup = copy(soup)
+ cloned_soup.clear()
+ cloned_soup.string = result
+ return cloned_soup
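+
+# Sketch of the resulting string for a typical <dimensions> element (hypothetical
+# values, matching the unit-less format of the source data):
+#
+#   from bs4 import BeautifulSoup
+#   dims = BeautifulSoup(
+#       '<dimensions><height>64</height><width>29</width><depth>35</depth></dimensions>',
+#       'xml').find('dimensions')
+#   extract_dimensions(dims).get_text()  # 'H: 64 W: 29 D: 35'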
+
+
+def normalize_language(text):
+ serializer = LanguageField()
+ return serializer.to_representation(text)
+
+ # excluded (for now):
+ # revision history
+
+ # MISSING (i.e. present in Epidat and Fiji)
+ # person(s) - names (profileDesc is completely missing)
diff --git a/backend/corpora/peaceportal/iis_corpus_preprocessor.py b/backend/corpora/peaceportal/iis_corpus_preprocessor.py
new file mode 100644
index 000000000..9be08fa47
--- /dev/null
+++ b/backend/corpora/peaceportal/iis_corpus_preprocessor.py
@@ -0,0 +1,100 @@
+import os
+import sys
+import glob
+import argparse
+from bs4 import BeautifulSoup
+
+
+def main(sys_args):
+ args = parse_arguments(sys_args)
+ prepare_out_folder(args.out_folder)
+ preprocess(args.xml_folder, args.out_folder)
+
+def prepare_out_folder(out_folder):
+ if not os.path.exists(out_folder):
+ os.makedirs(out_folder)
+ else:
+ files = glob.glob('{}/*'.format(out_folder))
+ for f in files:
+ os.remove(f)
+
+def preprocess(in_folder, out_folder):
+
+ for filepath in glob.iglob('{}/*.xml'.format(in_folder)):
+ with open(filepath, 'r') as xml:
+ soup = BeautifulSoup(xml.read(), 'xml')
+
+ filename = os.path.basename(filepath)
+ keep_only_transcription(filename, soup, out_folder)
+ # TODO: add extraction of foreigns
+
+
+def keep_only_transcription(filename, soup, out_folder):
+ out_file = os.path.join(get_subfolder(out_folder, 'tei_with_transcription_only'), filename)
+
+ text_tag = soup.find('text')
+ transcription = get_transcription(filename, text_tag)
+ text_tag.clear()
+ if transcription:
+ text_tag.append(transcription)
+
+ with open(out_file, 'w') as f_out:
+ f_out.write(str(soup))
+
+
+## TODO: extract foreign and export them to separate file.
+# def do_something_with_foreign(filename, soup):
+# text_tag = soup.find('text')
+ # transcription = get_transcription(filename, text_tag)
+ # if transcription:
+ # foreigns = text_tag.find_all('foreign')
+ # # print(foreigns)
+
+ # for f in foreigns:
+ # if f.findChild():
+ # print(f)
+
+
+def get_transcription(filename, text_tag):
+ transcription = text_tag.find('div', { 'subtype': 'transcription'})
+
+ # if there is no transcription, fallback to diplomatic
+ if not transcription:
+ transcription = text_tag.find('div', { 'subtype': 'diplomatic'})
+
+ if not transcription:
+ print('No transcription found in {}'.format(filename))
+ return transcription
+
+
+def get_subfolder(folder, subfoldername):
+ '''
+ Get a subfolder with `subfoldername` in `folder`.
+ Will be created if it doesn't exist.
+ '''
+ path = os.path.join(folder, subfoldername)
+ if not os.path.exists(path):
+ os.makedirs(path)
+ return path
+
+
+def parse_arguments(sys_args):
+ '''
+ Parse the supplied arguments.
+ '''
+ parser = argparse.ArgumentParser(
+ description='Preprocess EpiDoc scrapes, i.e. extract the (Leiden) transcriptions')
+
+ parser.add_argument(
+ '--xml_folder', '-xml', dest='xml_folder', required=True,
+ help='Path to the folder where the .xml files reside.')
+
+ parser.add_argument(
+ '--out_folder', '-out', dest='out_folder', required=True,
+ help='Path to the folder where the output should end up. Will be created if it doesn\'t exist or emptied out if it does.')
+
+ return parser.parse_args(sys_args[1:])
+
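+# Example invocation (hypothetical paths):
+#
+#   python iis_corpus_preprocessor.py -xml /data/iis/xml -out /data/iis/preprocessed
+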
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py
new file mode 100644
index 000000000..da8653927
--- /dev/null
+++ b/backend/corpora/peaceportal/peaceportal.py
@@ -0,0 +1,522 @@
+import os
+import os.path as op
+import logging
+from datetime import datetime
+from langdetect import detect
+from langdetect.lang_detect_exception import LangDetectException
+
+from django.conf import settings
+
+from addcorpus.corpus import ParentCorpusDefinition, FieldDefinition
+from addcorpus.es_mappings import int_mapping, keyword_mapping, main_content_mapping, text_mapping
+from addcorpus.es_settings import es_settings
+from addcorpus.extract import Constant
+from addcorpus.filters import MultipleChoiceFilter, RangeFilter
+
+class PeacePortal(ParentCorpusDefinition):
+ '''
+ Base class for corpora in the PEACE portal.
+
+ This supplies the frontend with the information it needs.
+ Child corpora should only provide extractors for each field.
+ Consequently, create indices (with alias 'peaceportal') from
+ the corpus-specific definitions, and point the application
+ to this base corpus.
+ '''
+
+ title = "PEACE Portal"
+ description = "A collection of inscriptions on Jewish burial sites"
+ # store min_year as int, since datetime does not support BCE dates
+ min_year = -530
+ max_date = datetime(year=1950, month=12, day=31)
+ visualize = []
+ es_index = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal')
+ es_alias = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal')
+ scan_image_type = 'image/png'
+ # fields below are required by code but not actually used
+ min_date = datetime(year=746, month=1, day=1)
+ image = 'bogus.jpg'
+ category = 'inscription'
+ data_directory = 'bogus'
+
+ # Data overrides from .common.XMLCorpus
+ tag_entry = 'TEI'
+
+ # New data members
+ non_xml_msg = 'Skipping non-XML file {}'
+ non_match_msg = 'Skipping XML file with nonmatching name {}'
+ # overwrite below in child class if you need to extract the (converted) transcription
+ # from external files. See README.
+ languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-)
+
+ @property
+ def es_settings(self):
+ return es_settings(self.languages, stopword_analysis=True, stemming_analysis=True)
+
+ def sources(self, start, end):
+ for directory, _, filenames in os.walk(self.data_directory):
+ for filename in sorted(filenames):
+ name, extension = op.splitext(filename)
+ full_path = op.join(directory, filename)
+ if not self.validate_extension(extension, full_path):
+ continue
+ metadata = self.add_metadata(filename)
+ yield full_path, metadata
+
+ def add_metadata(self, filename):
+ return {}
+
+ def validate_extension(self, extension, full_path):
+ '''
+ Check that the file is valid for this corpus.
+ So far all PeacePortal corpora are XML; CSV corpora may be added in the future.
+ '''
+ logger = logging.getLogger(__name__)
+ if extension == '.xml':
+ return True
+ logger.debug(self.non_xml_msg.format(full_path))
+
+ def request_media(self, document):
+ images = document['fieldValues']['images']
+ if not images:
+ images = []
+ return { 'media': images }
+
+ source_database = FieldDefinition(
+ name='source_database',
+ display_name='Source database',
+ description='The database a record originates from.',
+ es_mapping=keyword_mapping(),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these databases.',
+ option_count=4,
+ ),
+ csv_core=True
+ )
+
+ _id = FieldDefinition(
+ name='id',
+ display_name='ID',
+ description='ID of the inscription entry.',
+ csv_core=True,
+ es_mapping=keyword_mapping(),
+ search_field_core=True
+ )
+
+ url = FieldDefinition(
+ name='url',
+ display_name='URL',
+ description='URL of the inscription entry.',
+ es_mapping=keyword_mapping(),
+ search_field_core=True
+ )
+
+ year = FieldDefinition(
+ name='year',
+ display_name='Year',
+ description='Year of origin of the inscription.',
+ es_mapping=int_mapping(),
+ search_filter=RangeFilter(
+ description='Restrict the years from which search results will be returned.',
+ lower=min_year,
+ upper=max_date.year,
+ ),
+ csv_core=True,
+ sortable=True,
+ visualization_type='term_frequency',
+ visualization_sort='key',
+ results_overview=True
+ )
+
+ not_before = FieldDefinition(
+ name='not_before',
+ display_name='Not before',
+ description='Inscription is dated not earlier than this year.',
+ es_mapping=int_mapping(),
+ hidden=True
+ )
+
+ not_after = FieldDefinition(
+ name='not_after',
+ display_name='Not after',
+ description='Inscription is dated not later than this year.',
+ es_mapping=int_mapping(),
+ hidden=True
+ )
+
+ transcription = FieldDefinition(
+ name='transcription',
+ es_mapping=main_content_mapping(),
+ display_name='Transcription',
+ description='Text content of the inscription.',
+ search_field_core=True,
+ results_overview=True,
+ display_type='text_content'
+ )
+
+ transcription_german = FieldDefinition(
+ name='transcription_de',
+ es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='de'),
+ language='de',
+ hidden=True
+ )
+
+ transcription_english = FieldDefinition(
+ name='transcription_en',
+ es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='en'),
+ language='en',
+ hidden=True
+ )
+
+ transcription_hebrew = FieldDefinition(
+ name='transcription_he', # no stemmers available
+ es_mapping=main_content_mapping(stopword_analysis=True, language='he'),
+ language='he',
+ hidden=True
+ )
+
+ transcription_latin = FieldDefinition(
+ name='transcription_la',
+ es_mapping={'type': 'text'}, # no stopwords / stemmers available
+ language='la',
+ hidden=True
+ )
+
+ transcription_greek = FieldDefinition(
+ name='transcription_el',
+ es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='el'),
+ language='el',
+ hidden=True
+ )
+
+ transcription_dutch = FieldDefinition(
+ name='transcription_nl',
+ es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='nl'),
+ language='nl',
+ hidden=True
+ )
+
+ age = FieldDefinition(
+ name='age',
+ display_name='Age',
+ description='Age of the buried person(s)',
+ es_mapping=int_mapping(),
+ search_filter=RangeFilter(
+ description='Filter by age of the buried persons.',
+ lower=0,
+ upper=100,
+ ),
+ extractor=Constant(
+ value=None
+ )
+ )
+
+ # A string with all the names occurring in the source
+ names = FieldDefinition(
+ name='names',
+ es_mapping=text_mapping(),
+ display_name='Names',
+ description='Names of the buried persons.',
+ search_field_core=True
+ )
+
+ # Should be an array with potentially multiple values from these: 'M', 'F', or None.
+ sex = FieldDefinition(
+ name='sex',
+ display_name='Sex',
+ description='Gender(s) of the buried person(s). None if the sex is unknown.',
+ es_mapping=keyword_mapping(),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these genders.',
+ option_count=3,
+ ),
+ csv_core=True
+ )
+
+ country = FieldDefinition(
+ name='country',
+ display_name='Country',
+ description='Country where the inscription was found.',
+ es_mapping=keyword_mapping(True),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these countries.',
+ option_count=5
+ ),
+ visualization_type='term_frequency',
+ results_overview=True
+ )
+
+ settlement = FieldDefinition(
+ name='settlement',
+ display_name='Settlement',
+ description='The settlement where the inscription was found.',
+ es_mapping=keyword_mapping(True),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these settlements.',
+ option_count=29
+ ),
+ visualization_type='term_frequency'
+ )
+
+ region = FieldDefinition(
+ name='region',
+ display_name='Region',
+ description='The region where the inscription was found.',
+ es_mapping=keyword_mapping(True),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these regions.',
+ option_count=29
+ ),
+ visualization_type='term_frequency'
+ )
+
+ location_details = FieldDefinition(
+ name='location_details',
+ display_name='Location details',
+ description='Details about the location of the inscription',
+ es_mapping=text_mapping()
+ )
+
+ material = FieldDefinition(
+ name='material',
+ display_name='Material',
+ description='Type of material the inscription is written on.',
+ es_mapping=keyword_mapping(),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these material types.',
+ option_count=39
+ ),
+ visualization_type='term_frequency'
+ )
+
+ material_details = FieldDefinition(
+ name='material_details',
+ display_name='Material details',
+ description='Details about the material the inscription is written on.',
+ es_mapping=text_mapping(),
+ search_field_core=True
+ )
+
+ language = FieldDefinition(
+ name='language',
+ display_name='Language',
+ description='Language of the inscription.',
+ es_mapping=keyword_mapping(),
+ search_filter=MultipleChoiceFilter(
+ description='Search only within these languages.',
+ option_count=10
+ ),
+ csv_core=True,
+ visualization_type='term_frequency'
+ )
+
+ language_code = FieldDefinition(
+ name='language_code',
+ display_name='Language code',
+ description='ISO 639 code for the language of the inscription.',
+ es_mapping=keyword_mapping()
+ )
+
+ bibliography = FieldDefinition(
+ name='bibliography',
+ es_mapping=keyword_mapping(),
+ display_name='Bibliography',
+ description='Reference(s) to who edited and published this funerary inscription.'
+ )
+
+ comments = FieldDefinition(
+ name='comments',
+ es_mapping=text_mapping(),
+ display_name='Commentary',
+ description='Extra comments, questions or remarks on this inscription.',
+ search_field_core=True,
+ )
+
+ images = FieldDefinition(
+ name='images',
+ es_mapping=keyword_mapping(),
+ display_name='Images',
+ description='Links to image(s) of the inscription.',
+ hidden=True
+ )
+
+ coordinates = FieldDefinition(
+ name='coordinates',
+ es_mapping=keyword_mapping(),
+ display_name='Coordinates',
+ description='GIS coordinates for the inscription.'
+ )
+
+ iconography = FieldDefinition(
+ name='iconography',
+ es_mapping=text_mapping(),
+ display_name='Iconography',
+ description='Description of the icons used in the inscription.',
+ search_field_core=True
+ )
+
+ dates_of_death = FieldDefinition(
+ name='dates_of_death',
+ es_mapping=keyword_mapping(),
+ display_name='Date of death',
+ )
+
+ def __init__(self):
+ self.fields = [
+ self._id,
+ self.url,
+ self.year,
+ self.not_before,
+ self.not_after,
+ self.source_database,
+ self.transcription,
+ self.names,
+ self.sex,
+ self.dates_of_death,
+ self.age,
+ self.country,
+ self.region,
+ self.settlement,
+ self.location_details,
+ self.language,
+ self.language_code,
+ self.iconography,
+ self.images,
+ self.coordinates,
+ self.material,
+ self.material_details,
+ self.bibliography,
+ self.comments,
+ self.transcription_german,
+ self.transcription_hebrew,
+ self.transcription_latin,
+ self.transcription_greek,
+ self.transcription_english,
+ self.transcription_dutch
+ ]
+
+
+def clean_newline_characters(text):
+ '''
+ Remove all spaces surrounding newlines in `text`.
+ Also removes multiple newline characters in a row.
+ '''
+ if not text: return
+ parts = text.split('\n')
+ cleaned = []
+ for part in parts:
+ stripped = part.strip()
+ if stripped:
+ cleaned.append(stripped)
+ return '\n'.join(cleaned)
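+
+# Sketch of the intended cleanup (hypothetical input):
+#
+#   clean_newline_characters('Hier ruhet  \n\n  der Kaufmann\n')
+#   # -> 'Hier ruhet\nder Kaufmann'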
+
+
+def clean_commentary(commentary):
+ '''
+ Clean a commentary by collapsing all whitespace between words
+ into single spaces.
+ '''
+ return ' '.join(commentary.split())
+
+def join_commentaries(commentaries):
+ '''
+ Helper function to join the result of a Combined extractor
+ into one string, separating items by a newline
+ '''
+ results = []
+ for comm in commentaries:
+ if comm:
+ results.append(comm)
+ return "\n".join(results)
+
+def categorize_material(text):
+ '''
+ Helper function to (significantly) reduce the material field to a set of categories.
+ The Epidat corpus in particular has mainly descriptions of the material.
+ Returns a list of categories, i.e. those that appear in `text`.
+ '''
+ if not text: return ['Unknown']
+
+ categories = ['Sandstein', 'Kalkstein', 'Stein', 'Granit', 'Kunststein',
+ 'Lavatuff', 'Marmor', 'Kalk', 'Syenit', 'Labrador', 'Basalt', 'Beton',
+ 'Glas', 'Rosenquarz', 'Gabbro', 'Diorit', 'Bronze',
+ # below from FIJI and IIS
+ 'Limestone', 'Stone', 'Clay', 'Plaster', 'Glass', 'Kurkar', 'Granite',
+ 'Marble', 'Metal', 'Bone', 'Lead' ]
+ result = []
+ ltext = text.lower()
+
+ for c in categories:
+ if c.lower() in ltext:
+ result.append(translate_category(c))
+
+ if len(result) == 0:
+ # reduce unknown, other and ? to Unknown
+ # 'schrifttafel' removes some clutter from Epidat
+ if 'unknown' in ltext or 'other' in ltext or '?' in ltext or 'schrifttafel' in ltext:
+ result.append('Unknown')
+ else:
+ result.append(text)
+
+ return result
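+
+# Sketch of typical in- and output (hypothetical descriptions). Note that overlapping
+# names can yield multiple categories, e.g. 'Kalkstein' also matches 'Stein' and 'Kalk':
+#
+#   categorize_material('white marble plaque')  # -> ['Marble']
+#   categorize_material('Schrifttafel')         # -> ['Unknown']
+#   categorize_material(None)                   # -> ['Unknown']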
+
+def translate_category(category):
+ '''
+ Helper function to translate non-English categories of material into English
+ '''
+ pairs = {
+ 'Sandstein': 'Sandstone',
+ 'Kalkstein': 'Limestone',
+ 'Stein': 'Stone',
+ 'Granit': 'Granite',
+ 'Kunststein': 'Artificial stone',
+ 'Lavatuff': 'Tufa',
+ 'Marmor': 'Marble',
+ 'Kalk': 'Limestone',
+ 'Syenit': 'Syenite',
+ 'Labrador': 'Labradorite',
+ 'Beton': 'Concrete',
+ 'Glas': 'Glass',
+ 'Rosenquarz': 'Rose quartz',
+ 'Diorit': 'Diorite'
+ }
+
+ return pairs.get(category, category)
+
+
+def get_text_in_language(_input):
+ '''
+ Get all the lines from a transcription that are in a certain language
+ (according to the `langdetect` package). Note that `transcription` will
+ be split on newlines to create lines that will be fed to langdetect one by one.
+ All lines that are in `language_code` will be collected and returned as one string,
+ i.e. they will be joined with a space (no newlines!).
+
+ Parameters:
+ _input -- A tuple or list with (transcription, language_code). Will typically be the output
+ of a Combined extractor, i.e. one for the transcript and a Constant extractor with the language code.
+ For a list of language codes detected by langdetect, see https://pypi.org/project/langdetect/
+ '''
+ results = []
+ if len(_input) != 2 or not _input[0]:
+ return ''
+ lines = _input[0].split('\n')
+ language_code = _input[1]
+
+ for line in lines:
+ if not line: continue
+ detected_code = None
+ try:
+ # note that Aramaic is detected as Hebrew
+ detected_code = detect(line)
+ except LangDetectException:
+ # sometimes langdetect isn't happy with some stuff like
+ # very short strings with mainly numbers in it
+ pass
+ if detected_code and detected_code == language_code:
+ results.append(line)
+ return ' '.join(results)
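+
+# Sketch of typical usage, with a hypothetical bilingual transcription as produced
+# by a Combined extractor (detection results depend on langdetect):
+#
+#   get_text_in_language(('Hier ruhet der Kaufmann\nפ״נ איש חמדות יקר רוח', 'he'))
+#   # -> the Hebrew line only: 'פ״נ איש חמדות יקר רוח'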
diff --git a/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml b/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml
new file mode 100644
index 000000000..90136bb1c
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml
@@ -0,0 +1,216 @@
+
+
+
+
+
+epidat, blr-4
+
+
+
+
+
+
+
+
+
+ Salomon Ludwig Steinheim-Institut
+
+Edmund-Körner-Platz 2
+D-45127 Essen
+
+
+
+blr-4
+http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4
+
+
+[Distributed under a Creative Commons licence Attribution-BY 4.0](http://creativecommons.org/licenses/by/4.0/)
+
+ All reuse or distribution of this work must contain somewhere a link back to the URL
+ [http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4]
+
+
+
+
+
+
+
+
+born digital
+
+
+epidat
+blr-4
+
+http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4
+
+
+
+http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4-t
+
+
+
+
+
+
+
+
+stone
+sepulchral monument
+
+
+
+
+
+
+
+
+
+
+
+1865-02-28
+
+
+
+ Germany
+ Thuringa
+
+
+ Bleicherode
+
+ Jewish Cemetery
+ 51.434387 10.571183
+
+
+
+
+
+
+
+
+
+
+EpiDoc: TEI XML for epigraphic Documents Schema
+
+
+
+
+Julia Buchmann, Nicola Wiemann, Maike Schlotterhose; Bleicherode
+
+
+
+World Geodetic System
+
+
+
+
+
+Natan Schönfeld (Nathan Schönfeld)
+
+
+
+
+
+
+Hebrew
+German
+
+
+
+
+
+
+
+
+
+
+
+ recto
+
+
+
+
+
+
+ Detail
+
+
+
+
+
+
+ verso
+
+
+
+
+
+
+Edition
+
+
+
+ Hier ruhet
+
+ der Kaufmann
+
+ Nathan Schönfeld
+
+ geb. d. 4. April 1812
+
+ gest. d. [28.] Februar 1865
+
+
+
+
+
+ פ״נ
+
+ איש חמדות יקר רוח אוהב
+
+ צדק ופועל טוב כ״ה נתן
+
+ שאנפעלד נולד ח׳ של פסח
+
+ תקע״ב ונפטר בשם טוב יום ג׳
+
+ ב׳ אדר תרכ״ה לפ״ק
+
+ תנצב״ה
+
+
+
+
+Übersetzung
+
+
+ Hier ist begraben
+
+ #.:ein werter Mann#.;, #.:edelmütig#.;, Wohltat
+
+ liebend und Gutes wirkend, der geehrte Herr Natan
+
+ Schönfeld, geboren 8. (Tag) von Pessach 572
+
+ und verschieden #.:mit gutem Namen#.; Tag 3,
+
+ 2. Adar 625 der kleinen Zählung.
+
+ Seine Seele sei eingebunden in das Bündel des Lebens
+
+
+Zitate
+
Zl 7: Dan 10,11 | Zl 7: Spr 17,27
+
Zl 10: bBer 17a
+
+
+Prosopographie
+
+
+Bibliographie
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml b/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml
new file mode 100644
index 000000000..63a21a51d
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml
@@ -0,0 +1,302 @@
+
+
+
+
+
+epidat, hlh-12
+
+
+
+
+
+
+
+
+
+ Salomon Ludwig Steinheim-Institut
+
+Edmund-Körner-Platz 2
+D-45127 Essen
+
+
+
+hlh-12
+http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12
+
+
+[Distributed under a Creative Commons licence Attribution-BY 4.0](http://creativecommons.org/licenses/by/4.0/)
+
+ All reuse or distribution of this work must contain somewhere a link back to the URL
+ [http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12]
+
+
+
+
+
+
+
+
+born digital
+
+
+epidat
+hlh-12
+
+http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12
+
+
+
+http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12-t
+
+
+Stadt Mülheim an der Ruhr, Sterberegister Broich 1891 (1196/5/14), Nr. 247.Kaufhold, Barbara, Jüdischen Leben in Mülheim an der Ruhr, Essen 2004.
+
+
+
+
+
+stone
+sepulchral monument
+
+
+
+
+2013
+ Der Zustand des Steins hat sich seit 1986 kaum verändert
+
+
+
+
+
+
+
+
+
+sechzackiger Stern
+
+
+
+
+
+1891-12-06
+
+
+
+ Germany
+ North Rhine-Westphalia
+
+
+ Kettwig (Neuer Friedhof in Heiligenhaus)
+
+ Jewish Cemetery
+ 51.346014 6.924709
+
+
+
+
+
+
+
+
+
+
+EpiDoc: TEI XML for epigraphic Documents Schema
+
+
+
+
+
+ Epigraphisches Bildarchiv,
+ Steinheim-Institut
+
+
+
+Nathanja Hüttenmeister, Carmen Wedemeyer
+
+
+
+World Geodetic System
+
+
+
+
+
+Gitle bat Mosche (Clara Leffmann)
+
+
+
+
+
+
+
+
+
+Hebrew
+German
+
+
+
+
+
+
+
+
+
+
+
+ recto
+
+
+
+
+
+
+ recto
+
+
+
+
+
+
+ recto
+
+
+
+
+
+
+ Detail
+
+
+
+
+
+
+ Detail
+
+
+
+
+
+
+ Detail
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Edition
+
+
+
+ פ״ט
+
+ הבתולה צנועה וחמודה
+
+ מ׳ גיטלא בת משה
+
+ ה״ה ראשנה שנקברה בבית
+
+ החיים החדשה בק״ק
+
+ קעטטוויג ומתה בשם ט׳
+
+ ביום א׳ ה׳ כסלו תרנ״ב ל׳
+
+ תנצב״ה
+
+ Hier ruht die Jungfrau
+
+ Clara Leffmann
+
+ Sie starb erst 19
+
+ Jahre alt, gottergeben und
+
+ tief betrauert von den ihrigen,
+
+ den 8. Dezbr. 1891
+
+
+
+
+
+ Friede ihrer Asche.
+
+
+
+
+Übersetzung
+
+
+
+ Hier ist geborgen
+
+ die züchtige und liebliche Jungfrau,
+
+ Frau Gitle, Tochter des Mosche,
+
+ sie ist die Erste, die begraben wurde auf dem neuen
+
+ Friedhof der heiligen Gemeinde
+
+ Kettwig, und sie starb #.:mit gutem Namen#.;
+
+ am Tag 1, 5. Kislev 652 der Zählung.
+
+ Ihre Seele sei eingebunden in das Bündel des Lebens
+
+
+
+
+
+Zitate
+
Zl 6: bBer 17a
+
+
+Zeilenkommentar
+
Zl 5: Friedhof, wörtl. "Haus des Lebens".
+
+
+Endkommentar
+
Vermutlich handelt es sich bei der Angabe des Sterbedatums in der deutschen Inschrift um das Begräbnisdatum. Dieser Stein ist der erste des Friedhofes am Görscheider Weg.
+
Zwischen den jüdischen Familien aus Kettwig vor der Brücke und Saarn gab es verwandtschaftliche Verhältnisse, so stammte die Familie Leffmann, deren Angehörige z. T. hier bestattet sind, aus Saarn (Kaufhold, Jüdisches Leben in Mülheim a. d. R., S. ).
+
+
+Prosopographie
+
Clara Leffmann war die Tochter des Saarner Metzgers Moritz Leffmann und seiner Frau Sara geb. Herz. Der Bruder Artur fiel 1917 im Krieg (Engelhardt, Chronik, S. 81).
+
+
+Bibliographie
+
+[Systematische bauhistorische Beschreibung](http://steinheim-institut.de/daten/maps/rir/description/hlh.xml)
+ durch
+ Bau- und Stadtbaugeschichte, Fakultät 6, Institut für Architektur, TU Berlin
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/fiji/299.xml b/backend/corpora/peaceportal/tests/data/fiji/299.xml
new file mode 100644
index 000000000..622c94738
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/fiji/299.xml
@@ -0,0 +1,64 @@
+
+
+
+
+ 299
+
+
+
+
+
+
+
+
+ Museo Vaticano, lapidario ebraico ex-Lateranense; inv.no.30762
+
+ Noy 1995, p. 69-70 (83)
+
+
+
+
+ Rome, Monteverde
+ 3rd-4th century
+ Uncertain
+
+
+
+
+
+
+
+
+
+ Felicissima ( the commemorator) Emarantus ( the decaesed) (Φη<λ>ικίσσιμα Ἠμαράντῳ)
+
+
+
+
+
+
+
+ Greek
+
+
+
+
+ CIJ i 1936, 266 no.339
+ None
+ None
+
+
+
+ Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν.
+ Epitaph
+ none
+
+ Stone (white marble plaque)
+ Φη<λ>ικίσσιμα
+ not mentioned
+ not mentioned
+ Found on the 3rd of December 1904 in Cub.XL. The lower third of the plaque was left unused. There are poits between the syllables. Ferrua thought it might be pagan.
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/fiji/687.xml b/backend/corpora/peaceportal/tests/data/fiji/687.xml
new file mode 100644
index 000000000..d860857cb
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/fiji/687.xml
@@ -0,0 +1,61 @@
+
+
+
+
+ 687
+
+
+
+
+
+
+
+
+ In the catacomb
+
+ Noy 1995, p. 351 (417)
+
+
+
+
+ Rome, Villa Torlonia (lower cat.)
+ 3rd- 4th century
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Greek
+
+
+
+
+ not available
+ None
+ None
+
+
+
+ ἐνθάδε [κεῖται--]
+ Εpitaph
+ ?
+
+ Stone (marble fragment)
+ ἐνθάδε [κεῖται--]
+ not mentioned or lost
+ not mentioned or lost
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/fiji/759.xml b/backend/corpora/peaceportal/tests/data/fiji/759.xml
new file mode 100644
index 000000000..74441bf40
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/fiji/759.xml
@@ -0,0 +1,65 @@
+
+
+
+
+ 759
+
+
+
+
+
+
+
+
+ Formerly in Villa Torlonia stables
+
+ Noy 1995, p. 390-1 (489)
+
+
+
+
+ Rome, Villa Torlonia (lower cat.)
+ 3rd- 4th century
+
+
+
+
+
+
+
+
+
+
+ Irene (Εἰρήνη)
+
+
+
+
+ Greek
+
+
+
+
+ CIJ i 1936, p. 19-20 no.21
+ None
+ None
+
+
+
+ Εἰρήνη τρεζπτὴ προσήλυτος πατρὸς καὶ μητρὸς Εἰουδε͂α
+
+Ἰσδραηλίτης ἔζησεν ἤτ(η) γ΄ μ(ῆνας) ζ΄ vac.ἡμ(έ)ρ(αν) α΄.
+
+͂
+ Εpitaph
+ none
+
+ Stone (grey-blue marble plaque)
+ Εἰρήνη
+ 3
+ The precise age was 3 years, 7 months and 1 day.
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml
new file mode 100644
index 000000000..de749a662
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
Χάρητος
Χάρητος
Χάρητος
Χάρητος
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml
new file mode 100644
index 000000000..235b943e8
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
Ἀβρᾶ καὶ Σαμῆ
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml
new file mode 100644
index 000000000..b4ac3b202
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
אמא
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml b/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml
new file mode 100644
index 000000000..5f7921f49
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml
@@ -0,0 +1,196 @@
+
+
+
+
+
+
+Inscriptions of Israel/Palestine
+
+Prinicipal Investigator
+Michael Satlow
+
+
+
+
+
+
+ERROR-could not find publication information which should appear in this space.
+
+
+
+
+
+
+
+
+
+Akld 0002
+Shadmi, T. (1996). The Ossuaries and the Sarcophagus. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 41–55). Jerusalem: Israel Antiquities Authority. (page 52)Ilan, T. (1996). The Ossuary and Sarcophagus Inscriptions. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 57–72). Jerusalem: Israel Antiquities Authority. (page 58)
+
+
+
+
+
+
+Jerusalem Akeldama Caves confluence of Kidron and Hinnom Valleys,
+ First century CE. Ossuary. Funerary.
+
+
+
+
+
+
+
+
+64
+29
+35
+
+
+
+
+
+
+
+
+once on each side
+
+
+
+
+
+
+
+
+
+
+
+
+Painted Red
+
+
+
+
+
+
+
+
+
+First century CE
+
+Judaea
+Jerusalem
+Akeldama
+Cave 2 chamber B
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Taxonomies for IIP controlled values
+
+
+
+
+
+
+
+Initial Entry
+Normalized objectDesc/@ana
+Adding Pleiades IDs to origin/placenames
+
+ adding period attribute to date element, with Periodo value.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
ΧΑΡΗΤΟϹ
ΧΑ ΡΗ ΤΟ Ϲ
ΧΑΡΗΤΟϹ
ΧΑΡΗΤΟϹ
+
+
+
Χάρητος
Χάρητος
Χάρητος
Χάρητος
+
+
+
+
+
+
+
+
+
+
+52
+
+
+
+58
+
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml b/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml
new file mode 100644
index 000000000..f61d5a5d2
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml
@@ -0,0 +1,143 @@
+
+
+
+
+
+Inscriptions of Israel/Palestine
+
+Prinicipal Investigator
+Michael Satlow
+
+
+
+
+
+ERROR-could not find publication information which should appear in this space.
+
+
+beth0042
+
+
+
+
+
+
+Beth 0042
+Frey, J. B. (1952). Corpus Inscriptionum Iudaicarum (Vol. II (Asie-Afrique)). Roma: Pontificio Istituto di Archeologia Cristiana. (insc)Schwabe, M., & Lifshitz, B. (1974). Beth She’arim. Vol. 2, The Greek Inscriptions. Massada Press on behalf of the Israel Exploration Society. (page 25-26)
+
+
+
+
+Galilee. Beth Shearim. 250 CE to 350 CE. Red painted wall of arcosolium. Funerary.
+
+
+
+
+
+
+
+
+60
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+250 CE to 350 CE
+
+Galilee
+Beth Shearim
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ERROR: could not find taxonomies file, which should appear in this space.
+
+
+
+
+
+
+Creation
+Revision
+Changed graphic element to facsimile and kept existing url
+Adding Pleiades IDs to origin/placenames
+
+ adding period attribute to date element, with Periodo value.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Catacomb 1, Hall G, room IV, arcosolium 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+25-26
+
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml b/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml
new file mode 100644
index 000000000..d188209a8
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml
@@ -0,0 +1,140 @@
+
+
+
+
+
+Inscriptions of Israel/Palestine
+
+Prinicipal Investigator
+Michael Satlow
+
+
+
+
+
+ERROR-could not find publication information which should appear in this space.
+
+
+jeru0014
+
+
+
+
+
+
+jeru0014
+Rahmani, L. Y. (1994). A Catalogue of Jewish Ossuaries in the Collections of the State of Israel. (A. Sussmann, Ed.). Israel Antiquities Authority: Israel Academy of Sciences and Humanities. (page 80, plate 4, fig. 21)
+
+
+
+
+Judaea. Jerusalem. 20 BCE to 70 CE. Soft limestone ossuary. Funerary.
+
+
+
+
+
+
+
+
+29.5
+52
+23
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+20 BCE to 70 CE
+
+Judaea
+Jerusalem
+Kidron Valley
+southeast of 'En Rogel
+
+Judaea. Jerusalem. Kidron Valley, southeast of Ἑn Rogel.
+
+
+
+
+
+
+
+
+
+
+
+
+
+ERROR: could not find taxonomies file, which should appear in this space.
+
+
+
+
+
+
+Creation
+Normalized objectDesc/@ana
+Adding Pleiades IDs to origin/placenames
+
+ adding period attribute to date element, with Periodo value.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
The ossuary has an inner ledge on three sides and a flat, sliding lid with a small fingergrip on its outer edge. The word אמא could be a name or the word meaning "mother." Several examples of a name occuring along with this word support the second interpretation.
+
+
+
+
+
+80
+
+
+
+plate 4, fig. 21
+
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/safed/safed.csv b/backend/corpora/peaceportal/tests/data/safed/safed.csv
new file mode 100644
index 000000000..769adb10d
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/safed/safed.csv
@@ -0,0 +1,10 @@
+MISPAR;ADDITION;Map;Px;Who helped;First Name;First Name (hebrew);Middle Name;Middle Name (hebrew);Title;Parent / First;Parent / First (hebrew);Parent Middle Name;Parent Middle Name (hebrew);Family Name;Family Name (hebrew);City;City (hebrew);CHELKA;AREA;NOTES;YOM;CHODESH;SHANA;DAY;MONTH;YEAR;
+1;;;;;Avraham;אברהם;;;;;;;;Harbon;חרבון;;;א;A;החכם הרופא;ה;;רכו;;;1466;
+1;A;;;;Lechichl;לחיחל;;;;;;;;;;;;א;A;;י;שבט;תשי;28;1;1950;
+2;;;;;Pinchas;פנחס;;;;Zvi;צבי;;;;;;;א;A;;כט;טבת;תשכב;05;01;1962;
+3;;;;;Melech;מלך;;;;Meir;מאיר; Yisrael; ישראל;;;;;א;A;;ט;טבת;תשכב;16;12;1961;
+4;;;;;Rachel;רחל;;;;;;;;Negrenik Bahagen;נגריניק בהגן;;;א;A;;טו;טבת;תשכא;03;01;1961;
+5;;m;px;;Eliyahu;אליהו;Manig;מאניג;;Zev;זאב;;;Katz;כץ;;;א;A;age 68;א;ניסן;תשכ;29;03;1960;
+5;A;m;p-x;;Yitzhak;יצחק;;;;Moshe;משה ;David;דוד;Rozenthal HaCohen;רוזנטל הכהן;;;א;A;age 73;כח;חשון;תשכא;;;1960;
+6;;m;px;;Dvasi;דוואסי;;;;Zvi;צבי;;;Masiroka ?;מסירוקא ?;Siruka;סירוקא;א;A;above Mik-Ari Path;א;אייר;תשכ;28;04;1960;
+7;;m;px;;Sima;סימה;;;;Avraham;אברהם;;;Reuven;רובין;Batshan;באטשן;א;A;above Mik-Ari Path;כג;שבט;תשכ;;;1960;
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-11.xml b/backend/corpora/peaceportal/tests/data/tol/tol-11.xml
new file mode 100644
index 000000000..9259da682
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/tol/tol-11.xml
@@ -0,0 +1,214 @@
+
+
+
+
+
+
+
+ epidat, tol-11
+
+ edited by
+ Elíshabá Mata
+
+
+
+
+
+
+
+
+
+ Salomon Ludwig Steinheim-Institut
+
+ Edmund-Körner-Platz 2
+ D-45127 Essen
+
+
+
+ tol-11
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11
+
+
+ [Distributed under a Creative Commons licence Attribution-BY 4.0](http://creativecommons.org/licenses/by/4.0/)
+
+ All reuse or distribution of this work must contain somewhere a link back to the URL
+ [http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11]
+
+
+
+
+
+
+
+
+ born digital
+
+
+ epidat
+ tol-11
+
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11
+
+
+
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11-t
+
+
+
+
+
+
+
+
+ stone (material not specified)
+ sepulchral monument
+
+
+
+
+
+
+
+
+
+
+
+
+ Spain
+
+ Toledo
+
+ Jewish Cemetery
+ 39.871036 -4.022968
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Israel
+ Moshe
+ Israel
+
+ Hypothetical date
+ Other transcription: YIŚRA#[2019]EL BEN MOŠEH BEN YIŚRA#[2019]EL #[000D]#[000A]Young murdered person
+
+
+
+
+ Hebrew
+
+
+
+
+
+
+
+
+
+ Edition
+
+
+
+ מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר
+
+ לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר
+
+ עַל מוֹת לַבֵּן בָּחוּר וָטוֹב
+
+ כְּגַן רָטוֹב
+
+ קָם עָלָיו כַּזְּדוֹנִים
+
+ גּוֹי עַז פָּנִים
+
+ הִשְׁקֵהוּ מֵי רוֹשׁ
+
+ בָּא עַד הָרֹאשׁ
+
+ וַיַּכֵּהוּ בִצְדִיָּה
+
+ מַכָּה טְרִיָּה
+
+ לָאָרֶץ חַיְתוֹ דִכָּה
+
+ וַיִּצֶק דַּם הַמַּכָּה
+
+ נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל
+
+ נַעַר יִשְׂרָאֵל
+
+ הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה
+
+ בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה
+
+ הַצְּבִי יִשְׂרָאֵל חָלָל
+
+ בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל
+
+ אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]
+
+ וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ
+
+ עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ
+
+ בְּפֶסַח וַיָּמֶת אוֹתוֹ
+
+ תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ
+
+ וַיֵּאָסֵף אֶל עַמּוֹ
+
+ תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים
+
+ צְרוּרָה בִּצְרוֹר הַחַיִּים
+
+ יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל
+
+ אֱלֹהֵי יִשְׂרָאֵל
+
+
+
+
+
+
+ Prosopographie
+
+
+ Bibliographie
+
+
+ 61-62
+ 62
+
+
+
+
+ 174-175
+ 17
+
+
+
+
+ 83-84
+ 41
+
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-27.xml b/backend/corpora/peaceportal/tests/data/tol/tol-27.xml
new file mode 100644
index 000000000..0c710ec92
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/tol/tol-27.xml
@@ -0,0 +1,189 @@
+
+
+
+
+
+
+
+ epidat, tol-27
+
+ edited by
+ Elíshabá Mata
+
+
+
+
+
+
+
+
+
+ Salomon Ludwig Steinheim-Institut
+
+ Edmund-Körner-Platz 2
+ D-45127 Essen
+
+
+
+ tol-27
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27
+
+
+ [Distributed under a Creative Commons licence Attribution-BY 4.0](http://creativecommons.org/licenses/by/4.0/)
+
+ All reuse or distribution of this work must contain somewhere a link back to the URL
+ [http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27]
+
+
+
+
+
+
+
+
+ born digital
+
+
+ epidat
+ tol-27
+
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27
+
+
+
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27-t
+
+
+
+
+
+
+
+
+ stone (material not specified)
+ sepulchral monument
+
+
+
+
+
+
+
+
+
+
+
+
+ Spain
+
+ Toledo
+
+ Jewish Cemetery
+ 39.871036 -4.022968
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Moshe
+ Yizhaq ben Elfats
+
+
+ Other transcription of the name: MOŠEH BEN YIṢḤAQ BEN #[2019]ELFAṬS#[000D]#[000A]Young man
+
+
+
+
+ Hebrew
+
+
+
+
+
+
+
+
+
+ Edition
+
+
+
+ בְּקֶבֶר זֶה נִטְמָן
+
+ בָּחוּר נֶטַע נַעֲמָן
+
+ לְדֵרֶךְ מוּסָר סָר
+
+ וּמִדֵּרֶךְ יָשָׁר לֹא סָר
+
+ ז״ךְ שָׁנִים חָיָה
+
+ וְזַךְ לֵבָב הָיָה
+
+ וּבז״ךְ בְּמַרְחֶשׁוָן פָּנָה
+
+ וְעָזַב אֶת אָבִיו בֶּן שִׁבְעִים שָׁנָה
+
+ נֶאֱנַח מַשְׁ#[05בּצּ]מִים
+
+ כִּי אָרְכוּ לוֹ אַחֲרָיו הַיָּמִים
+
+ וּבִשְׁנַת חֲמֵשֶׁת אֲלָפִים וְתִשִׁעִים וְשָׁלֹש
+
+ נִלְכַּד בְּפַח וּפַחַת
+
+ וּמִבֵּין רֵעָיו נֶאֱסַף וְנִכְתַּשׁ בְּתֹךְ מַכְתֵּשׁ
+
+ הוּא מֹשֶה נ״ע בר׳ יִצְחָק נ״ע בֶּן אֵלְפַטְשׂ
+
+ אֱלֹהָיו יְרַחֵם עָלָיו
+
+ וְיָנוּחַ וְיַעֲמוֹד לְקֵץ הַיָּמִין לְגוֹרָלוֹ
+
+
+
+
+
+
+ Prosopographie
+
+
+ Bibliographie
+
+
+ 41-42
+ 39
+
+
+
+
+ 182-183
+ 25
+
+
+
+
+ 94-95
+ 49
+
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-36.xml b/backend/corpora/peaceportal/tests/data/tol/tol-36.xml
new file mode 100644
index 000000000..b8d7a8be5
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/data/tol/tol-36.xml
@@ -0,0 +1,197 @@
+
+
+
+
+
+
+
+ epidat, tol-36
+
+ edited by
+ Elíshabá Mata
+
+
+
+
+
+
+
+
+
+ Salomon Ludwig Steinheim-Institut
+
+ Edmund-Körner-Platz 2
+ D-45127 Essen
+
+
+
+ tol-36
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36
+
+
+ [Distributed under a Creative Commons licence Attribution-BY 4.0](http://creativecommons.org/licenses/by/4.0/)
+
+ All reuse or distribution of this work must contain somewhere a link back to the URL
+ [http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36]
+
+
+
+
+
+
+
+
+ born digital
+
+
+ epidat
+ tol-36
+
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36
+
+
+
+ http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36-t
+
+
+
+
+
+
+
+
+ stone (material not specified)
+ sepulchral monument
+
+
+
+
+
+
+
+
+
+
+
+
+ Spain
+
+ Toledo
+
+ Jewish Cemetery
+ 39.871036 -4.022968
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Yaakov
+ Yizhaq
+
+
+ Other transcription of the name: YA#[2018]ĂQŌḆ BEN YIṢḤAQ BEN AL-SARAQOSTAN#[000D]#[000A]Occupation: Physician and counselor#[000D]#[000A]Death in the Black Death
+
+
+
+
+ Hebrew
+
+
+
+
+
+
+
+
+
+ Edition
+
+
+
+ בְּקֶבֶר זֶה נִקְבַּר
+
+ אִישׁ שֵׂכֶל וּנְבוֹן דָּבָר
+
+ נְקִי כַפָיִם וּבַר
+
+ מָלֵא הוֹד וְחָכְמָה
+
+ וְדַעַת וּמְזִמָּה
+
+ יוֹעֵץ וַחֲכָם חֲרָשִׁים
+
+ טוֹב עִם ה׳ וְעִם אֲנָשִׁים
+
+ רוֹפֵא מַחֲלִים הַנְפָשִׁים
+
+ וּמִזְּרַע קְדוֹשִׁים
+
+ שְׁמוֹ ר׳ יַעֲקֹב בר׳ יִצְחָק נ׳ע ן׳ אַלְסָארַקֹסְטַן
+
+ נָתַן כָּל־יָמָיו אֶל לִבּוֹ
+
+ לֶאֱהוֹב אֶת ה׳ וּלְדָבְקָה בוֹ
+
+ וְכַאֲשֶׁר בָּאָרֶץ פָּרַץ פֶּרֶץ
+
+ בִּקְדוֹשִׂים אֲשֶׁר בָּאָרֶץ
+
+ וַתִּפְרֹץ בָּם הַמַּגֵּפָה
+
+ נֶאֱסַף אֶל עַמּוֹ
+
+ וְעָזַב אֶת הָאָרֶץ וְעָלָה לִשְׁכוֹן מְרוֹמוֹ
+
+ ובי׳׳ב בְּתַמּוּז שְׁנַת מְנוּחָה הָיְתָה יַד אֱלֹהָיו עָלָיו
+
+ לְשׁוֹבֵב יַעֲקֹב אֵלָיו
+
+ לָתֵּת לוֹ יָד בֵּין חֲסִידָיו וּלַעֲבוֹד בְּרֹאשָׁם
+
+ וַיֹּאמֶר ה׳ אֶל יַעֲקֹב קוּם עֲלֵה בֵית אֵל וְשֶׁב שָׁם
+
+ וְיַעֲקֹב הָלַךְ לְדַרְכּוֹ לִרְאוֹת פְּנֵי דָר נְגָהִים
+
+ וַיִּפְגְּעוּ בוֹ מַלְאֲכֵי אֱלֹהִים
+
+
+
+
+
+
+ Prosopographie
+
+
+ Bibliographie
+
+
+ 65-66
+ 70
+
+
+
+
+ 209-210;C/M (82),135-138
+ 58
+
+
+
+
+
+
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tests/test_peace.py b/backend/corpora/peaceportal/tests/test_peace.py
new file mode 100644
index 000000000..54db50ef3
--- /dev/null
+++ b/backend/corpora/peaceportal/tests/test_peace.py
@@ -0,0 +1,280 @@
+import os
+import pytest
+
+from addcorpus.load_corpus import load_corpus_definition
+from addcorpus.save_corpus import load_and_save_all_corpora
+from addcorpus.models import Corpus
+
+CORPUS_TEST_DATA = [
+ {
+ 'name': 'peaceportal-epidat',
+ 'docs': [{
+ "id": "blr-4",
+ "url": "http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4",
+ "year": "1865",
+ "not_before": "1865",
+ "not_after": None,
+ "source_database": "Epidat (Steinheim Institute)",
+ "transcription": """Hier ruhet
+der Kaufmann
+Nathan Schönfeld
+geb. d. 4. April 1812
+gest. d. [28.] Februar 1865
+פ״נ
+איש חמדות יקר רוח אוהב
+צדק ופועל טוב כ״ה נתן
+שאנפעלד נולד ח׳ של פסח
+תקע״ב ונפטר בשם טוב יום ג׳
+ב׳ אדר תרכ״ה לפ״ק
+תנצב״ה""",
+ "names": "Natan Schönfeld (Nathan Schönfeld)",
+ "sex": [
+ "M"
+ ],
+ "dates_of_death": [
+ "1865-02-28"
+ ],
+ "country": "Germany",
+ "region": "Thuringa",
+ "settlement": "Bleicherode",
+ "location_details": "Jewish Cemetery",
+ "language": [
+ "Hebrew",
+ "German"
+ ],
+ "iconography": None,
+ "images": [
+ "http://steinheim-institut.de/daten/picsblr/xl/0004_blr_2012.jpg",
+ "http://steinheim-institut.de/daten/picsblr/xl/0004rblr_2012.jpg",
+ "http://steinheim-institut.de/daten/picsblr/xl/0004dblr_2012.jpg"
+ ],
+ "coordinates": "51.434387 10.571183",
+ "material": [
+ "Stone"
+ ],
+ "material_details": "stone",
+ "bibliography": None,
+ "comments": """OBJECTTYPE:
+sepulchral monument
+
+""",
+ "transcription_de": None,
+ "transcription_he": "פ״נ איש חמדות יקר רוח אוהב צדק ופועל טוב כ״ה נתן שאנפעלד נולד ח׳ של פסח תקע״ב ונפטר בשם טוב יום ג׳ ב׳ אדר תרכ״ה לפ״ק תנצב״ה",
+ "transcription_en": "",
+ "transcription_nl": "Hier ruhet"
+ }],
+ 'n_documents': 2
+ },
+ {
+ 'name': 'peaceportal-iis',
+ 'docs': [{
+ "id": "akld0002",
+ "url": "https://library.brown.edu/iip/viewinscr/akld0002",
+ "year": "0001",
+ "not_before": "0001",
+ "not_after": "0100",
+ "source_database": "Inscriptions of Israel/Palestine (Brown University)",
+ "transcription": """Χάρητος
+Χάρητος
+Χάρητος
+Χάρητος""",
+ "sex": "Unknown",
+ "country": "Israel/Palestine",
+ "region": "Judaea",
+ "settlement": "Jerusalem",
+ "location_details": (
+ "Judaea Jerusalem Akeldama Cave 2 chamber B",
+ "",
+ ""
+ ),
+ "language": (
+ "Ancient Greek",
+ "Unknown"
+ ),
+ "iconography": "Painted Red",
+ "material": [
+ "Limestone",
+ "Stone"
+ ],
+ "material_details": "#limestone",
+ "bibliography": [
+ "Shadmi, T. (1996). The Ossuaries and the Sarcophagus. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 41–55). Jerusalem: Israel Antiquities Authority. (page 52)",
+ "Ilan, T. (1996). The Ossuary and Sarcophagus Inscriptions. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 57–72). Jerusalem: Israel Antiquities Authority. (page 58)"
+ ],
+ "comments": """CONDITION:
+ (#complete.intact)
+
+
+LAYOUT:
+once on each side
+
+
+OBJECTTYPE:
+ossuary
+
+
+DIMENSIONS:
+H: 64 W: 29 D: 35
+
+
+HANDNOTES:
+ (#impressed.inscribed)
+
+""",
+ "transcription_he": "",
+ "transcription_la": "",
+ "transcription_el": "Χάρητος Χάρητος Χάρητος Χάρητος",
+ "transcription_en": "of Chares"
+ }],
+ 'n_documents': 3
+ },
+ {
+ 'name': 'peaceportal-fiji',
+ 'docs': [{
+ "id": "299",
+ "source_database": "Funerary Inscriptions of Jews from Italy (Utrecht University)",
+ "transcription": "Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν.",
+ "names": "Felicissima ( the commemorator) Emarantus ( the decaesed) (Φη<λ>ικίσσιμα Ἠμαράντῳ)",
+ "sex": [
+ "M",
+ "F"
+ ],
+ "age": None,
+ "country": "Italy",
+ "settlement": "Rome, Monteverde",
+ "location_details": "Museo Vaticano, lapidario ebraico ex-Lateranense; inv.no.30762",
+ "language": [
+ "Greek"
+ ],
+ "iconography": "none",
+ "material": [
+ "Stone",
+ "Marble"
+ ],
+ "bibliography": [
+ "Noy 1995, p. 69-70 (83)"
+ ],
+ "comments": """DATE:
+Uncertain
+""",
+ "transcription_he": "",
+ "transcription_la": "",
+ "transcription_el": "Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν."
+ }],
+ 'n_documents': 3
+ },
+ {
+ 'name': 'peaceportal-tol',
+ 'docs': [{
+ "id": "tol-11",
+ "url": "http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11",
+ "year": None,
+ "not_before": None,
+ "not_after": None,
+ "source_database": "Medieval funerary inscriptions from Toledo",
+ "transcription": """מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר
+לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר
+עַל מוֹת לַבֵּן בָּחוּר וָטוֹב
+כְּגַן רָטוֹב
+קָם עָלָיו כַּזְּדוֹנִים
+גּוֹי עַז פָּנִים
+הִשְׁקֵהוּ מֵי רוֹשׁ
+בָּא עַד הָרֹאשׁ
+וַיַּכֵּהוּ בִצְדִיָּה
+מַכָּה טְרִיָּה
+לָאָרֶץ חַיְתוֹ דִכָּה
+וַיִּצֶק דַּם הַמַּכָּה
+נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל
+נַעַר יִשְׂרָאֵל
+הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה
+בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה
+הַצְּבִי יִשְׂרָאֵל חָלָל
+בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל
+אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]
+וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ
+עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ
+בְּפֶסַח וַיָּמֶת אוֹתוֹ
+תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ
+וַיֵּאָסֵף אֶל עַמּוֹ
+תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים
+צְרוּרָה בִּצְרוֹר הַחַיִּים
+יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל
+אֱלֹהֵי יִשְׂרָאֵל""",
+ "names": None,
+ "sex": [
+ "Unknown"
+ ],
+ "dates_of_death": None,
+ "country": "Spain",
+ "region": None,
+ "settlement": "Toledo",
+ "location_details": "Jewish Cemetery",
+ "language": [
+ "Hebrew"
+ ],
+ "iconography": None,
+ "images": None,
+ "coordinates": "39.871036 -4.022968",
+ "material": [
+ "Stone"
+ ],
+ "material_details": "stone (material not specified)",
+ "bibliography": None,
+ "comments": """OBJECTTYPE:
+sepulchral monument
+
+""",
+ "transcription_he": "מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר עַל מוֹת לַבֵּן בָּחוּר וָטוֹב כְּגַן רָטוֹב קָם עָלָיו כַּזְּדוֹנִים גּוֹי עַז פָּנִים הִשְׁקֵהוּ מֵי רוֹשׁ בָּא עַד הָרֹאשׁ וַיַּכֵּהוּ בִצְדִיָּה מַכָּה טְרִיָּה לָאָרֶץ חַיְתוֹ דִכָּה וַיִּצֶק דַּם הַמַּכָּה נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל נַעַר יִשְׂרָאֵל הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה הַצְּבִי יִשְׂרָאֵל חָלָל בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?] וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ בְּפֶסַח וַיָּמֶת אוֹתוֹ תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ וַיֵּאָסֵף אֶל עַמּוֹ תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים צְרוּרָה בִּצְרוֹר הַחַיִּים יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל אֱלֹהֵי יִשְׂרָאֵל",
+ "transcription_en": "",
+ "transcription_nl": ""
+ }],
+ 'n_documents': 3
+ }
+]
+
+def corpus_test_name(corpus_spec):
+ return corpus_spec['name']
+
+@pytest.mark.parametrize("corpus_object", CORPUS_TEST_DATA, ids=corpus_test_name)
+def test_imports(peace_test_settings, corpus_object):
+ parent_corpus = load_corpus_definition('peaceportal')
+ corpus = load_corpus_definition(corpus_object.get('name'))
+ assert len(os.listdir(os.path.abspath(corpus.data_directory))) != 0
+ fully_specified = ['peaceportal-iis', 'peaceportal-tol']
+ if corpus_object.get('name') not in fully_specified:
+ # only IIS / TOL have all fields
+ assert len(corpus.fields) != len(parent_corpus.fields)
+
+ start = corpus_object.get('start', corpus.min_date)
+ end = corpus_object.get('end', corpus.max_date)
+
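+ # record which fields the fixtures cover and which fields the documents actually contain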
+ tested_fields = set()
+ resulted_fields = set()
+
+ docs = get_documents(corpus, start, end)
+ for target in corpus_object.get('docs'):
+ doc = next(docs)
+ for key in target:
+ tested_fields.add(key)
+ assert key in doc
+ assert doc[key] == target[key]
+
+ for key in doc:
+ resulted_fields.add(key)
+
+ docs = get_documents(corpus, start, end)
+ assert len(list(docs)) == corpus_object.get('n_documents')
+
+def get_documents(corpus, start, end):
+ sources = corpus.sources(
+ start=start,
+ end=end
+ )
+ return corpus.documents(sources)
+
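+# saving all corpora to the database should leave each PEACE corpus active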
+def test_peaceportal_validation(db, peace_test_settings):
+ load_and_save_all_corpora()
+ corpus_names = [case['name'] for case in CORPUS_TEST_DATA]
+ for corpus_name in corpus_names:
+ corpus = Corpus.objects.get(name=corpus_name)
+ assert corpus.active
\ No newline at end of file
diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py
new file mode 100644
index 000000000..4d75f4cd4
--- /dev/null
+++ b/backend/corpora/peaceportal/tol.py
@@ -0,0 +1,394 @@
+import re
+from copy import copy
+
+from django.conf import settings
+
+from addcorpus.corpus import XMLCorpusDefinition
+from addcorpus.extract import XML, Constant, Combined, FilterAttribute
+from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language
+from corpora.utils.exclude_fields import exclude_fields_without_extractor
+
+class PeaceportalTOL(PeacePortal, XMLCorpusDefinition):
+ data_directory = settings.PEACEPORTAL_TOL_DATA
+ es_index = getattr(settings, 'PEACEPORTAL_TOL_ES_INDEX', 'peaceportal-tol')
+
+ languages = ['en', 'nl', 'he']
+
+ def __init__(self):
+ super().__init__()
+ self.source_database.extractor = Constant(
+ value='Medieval funerary inscriptions from Toledo'
+ )
+
+ self._id.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc',
+ 'msDesc', 'msIdentifier', 'idno'],
+ multiple=False,
+ toplevel=False,
+ flatten=True
+ )
+
+ self.url.extractor = FilterAttribute(
+ tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'],
+ multiple=False,
+ toplevel=False,
+ flatten=True,
+ attribute_filter={
+ 'attribute': 'type',
+ 'value': 'url'
+ }
+ )
+
+ self.year.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origDate', 'date'],
+ toplevel=False,
+ transform=lambda x: get_year(x),
+ )
+
+ self.not_before.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origDate', 'date'],
+ toplevel=False,
+ attribute='notBefore',
+ transform=lambda x: get_year(x),
+ )
+
+ self.not_after.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origDate', 'date'],
+ toplevel=False,
+ attribute='notAfter',
+ transform=lambda x: get_year(x),
+ )
+
+ self.transcription.extractor = XML(
+ tag=['text', 'body', 'div'],
+ toplevel=False,
+ multiple=False,
+ flatten=True,
+ transform=lambda x: clean_newline_characters(x),
+ transform_soup_func=extract_transcript
+ )
+
+ self.names.extractor = XML(
+ tag=['teiHeader', 'profileDesc',
+ 'particDesc', 'listPerson', 'person'],
+ flatten=True,
+ multiple=True,
+ toplevel=False,
+ )
+
+ self.sex.extractor = XML(
+ tag=['teiHeader', 'profileDesc',
+ 'particDesc', 'listPerson', 'person'],
+ attribute='sex',
+ multiple=True,
+ toplevel=False,
+ transform=lambda x: convert_sex(x)
+ )
+
+ self.dates_of_death.extractor = XML(
+ tag=['teiHeader', 'profileDesc',
+ 'particDesc', 'listPerson'],
+ transform_soup_func=extract_death,
+ attribute='when',
+ multiple=False,
+ toplevel=False,
+ )
+
+ self.country.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origPlace', 'country'],
+ toplevel=False,
+ transform_soup_func=extract_country,
+ transform=lambda x: clean_country(x),
+ flatten=True,
+ )
+
+ self.region.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origPlace', 'country', 'region'],
+ toplevel=False,
+ flatten=True
+ )
+
+ self.settlement.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origPlace', 'settlement'],
+ toplevel=False,
+ flatten=True,
+ transform_soup_func=extract_settlement,
+ )
+
+ self.location_details.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origPlace', 'settlement', 'geogName'],
+ toplevel=False,
+ flatten=True,
+ transform_soup_func=extract_location_details,
+ )
+
+ self.material.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'support', 'p', 'material'],
+ toplevel=False,
+ flatten=True,
+ transform=lambda x: categorize_material(x)
+ )
+
+ self.material_details.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'support', 'p', 'material'],
+ toplevel=False,
+ flatten=True
+ )
+
+ self.language.extractor = XML(
+ tag=['teiHeader', 'profileDesc', 'langUsage', 'language'],
+ toplevel=False,
+ multiple=True,
+ transform=lambda x: get_language(x)
+ )
+
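+ # comments are assembled from three sources: editorial commentary divs,
+ # the condition note, and the support description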
+ self.comments.extractor = Combined(
+ XML(
+ tag=['text', 'body'],
+ toplevel=False,
+ transform_soup_func=extract_commentary,
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'condition'],
+ toplevel=False,
+ flatten=True,
+ transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x
+ ),
+ XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
+ 'objectDesc', 'supportDesc', 'support', 'p'],
+ toplevel=False,
+ transform_soup_func=extract_support_comments,
+ ),
+ transform=lambda x: join_commentaries(x)
+ )
+
+ self.images.extractor = XML(
+ tag=['facsimile', 'graphic'],
+ multiple=True,
+ attribute='url',
+ toplevel=False
+ )
+
+ self.coordinates.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'],
+ toplevel=False,
+ multiple=False,
+ flatten=True
+ )
+
+ self.iconography.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc',
+ 'msDesc', 'physDesc', 'decoDesc', 'decoNote'],
+ toplevel=False,
+ multiple=False
+ )
+
+ self.bibliography.extractor = XML(
+ tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
+ 'msIdentifier', 'publications', 'publication'],
+ toplevel=False,
+ multiple=True
+ )
+
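+ # per-language transcriptions are filtered out of the full transcription;
+ # get_text_in_language presumably relies on langdetect (added to the
+ # requirements in this changeset)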
+ self.transcription_hebrew.extractor = Combined(
+ self.transcription.extractor,
+ Constant('he'),
+ transform=lambda x: get_text_in_language(x)
+ )
+
+ self.transcription_english.extractor = Combined(
+ self.transcription.extractor,
+ Constant('en'),
+ transform=lambda x: get_text_in_language(x)
+ )
+
+ self.transcription_dutch.extractor = Combined(
+ self.transcription.extractor,
+ Constant('nl'),
+ transform=lambda x: get_text_in_language(x)
+ )
+
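+ # drop inherited fields for which this corpus defines no extractor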
+ self.fields = exclude_fields_without_extractor(self.fields)
+
+
+def convert_sex(values):
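+ # sex appears to be encoded numerically in the source (1 = male, 2 = female),
+ # in the style of ISO/IEC 5218; anything else maps to 'Unknown'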
+ if not values:
+ return ['Unknown']
+ result = []
+ for value in values:
+ if value == '1':
+ result.append('M')
+ elif value == '2':
+ result.append('F')
+ else:
+ result.append('Unknown')
+ return result
+
+
+def clean_country(text):
+ if not text:
+ return 'Unknown'
+ if text.lower().strip() == 'tobedone':
+ return 'Unknown'
+ return text
+
+
+def get_year(text):
+ if not text or text == '--':
+ return
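+ # first match of an optional millennium digit (1 or 2) followed by three digits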
+ matches = re.search(r'[1-2]?[0-9]{3}', text)
+ if matches:
+ return matches[0]
+
+
+def get_language(values):
+ if not values:
+ return ['Unknown']
+ if 'German in Hebrew letters' in values:
+ return ['German (transliterated)', 'Hebrew']
+ return values
+
+
+def extract_transcript(soup):
+ '''
+ Helper function to ensure correct extraction of the transcripts.
+ Note that there are multiple formats in which these are stored,
+ but the text that we need is always in the `<ab>` children of
+ `['text', 'body', 'div']` (the div with `type=edition`, which is always the first one).
+ '''
+ if not soup:
+ return
+ return soup.find_all('ab')
+
+
+def extract_translation(soup):
+ '''
+ Helper function to extract the translation from the `<div type="translation">` element.
+ '''
+ if not soup:
+ return
+ translation = soup.find('div', {'type': 'translation'})
+ if translation:
+ return translation.find_all('ab')
+ else:
+ return
+
+
+def extract_commentary(soup):
+ '''
+ Helper function to extract all commentaries (`<div type="commentary">` elements) from the given tag.
+ A single element will be returned with the commentaries found as text content.
+ '''
+ if not soup:
+ return
+ found = []
+ commentaries = soup.find_all('div', {'type': 'commentary'})
+
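+ # keep only the recognised (German-labelled) commentary categories:
+ # quotations, line commentary, prosopography, abbreviations,
+ # end commentary and stylistic devices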
+ for commentary in commentaries:
+ if commentary.get('subtype') in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']:
+ p = commentary.find('p')
+ if p:
+ text = p.get_text()
+ if text:
+ text = clean_commentary(text)
+ found.append('{}:\n{}\n'.format(commentary['subtype'].strip().upper(), text))
+
+ if found:
+ cloned_soup = copy(soup)
+ cloned_soup.clear()
+ cloned_soup.string = "\n".join(found)
+ return cloned_soup
+ else:
+ return None
+
+def extract_support_comments(soup):
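+ '''
+ Helper function to combine the dimensions, object type and any trailing
+ free text of the support description into a single comments string.
+ '''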
+ if not soup:
+ return
+ cloned_soup = copy(soup)
+ cloned_soup.clear()
+
+ commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS')
+ commentaries = add_support_comment(soup, commentaries, 'objectType', 'OBJECTTYPE')
+
+ # add any additional text from the <p> element itself,
+ # i.e. if there is trailing text it is the very last node
+ contents = soup.contents
+ last_node = contents[-1] if contents else ''
+ text = last_node.strip() if isinstance(last_node, str) else ''
+ if text:
+ text = clean_commentary(text)
+ commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text)
+
+ cloned_soup.string = commentaries
+ return cloned_soup
+
+
+def add_support_comment(soup, existing_commentaries, elem_name, commentary_name):
+ elem = soup.find(elem_name)
+ if elem:
+ text = elem.get_text()
+ if text:
+ text = clean_commentary(text)
+ return '{}{}:\n{}\n\n'.format(existing_commentaries, commentary_name, text)
+ return existing_commentaries
+
+
+def extract_death(soup):
+ '''
+ Helper function to extract date of death from multiple person tags.
+ '''
+ if not soup:
+ return
+ return soup.find_all('death')
+
+
+def extract_country(soup):
+ '''
+ Helper function to extract country.
+ This is needed because the output of `flatten` would otherwise include the text contents
+ of the `<region>` child.
+ '''
+ return clone_soup_extract_child(soup, 'region')
+
+
+def extract_settlement(soup):
+ return clone_soup_extract_child(soup, 'geogName')
+
+
+def extract_location_details(soup):
+ return clone_soup_extract_child(soup, 'geo')
+
+
+def clone_soup_extract_child(soup, to_extract):
+ '''
+ Helper function to clone the soup and extract a child element.
+ This is useful when the output of `flatten` would otherwise include the text contents
+ of the child.
+ '''
+ if not soup:
+ return
+ cloned_soup = copy(soup)
+ child = cloned_soup.find(to_extract)
+ if child:
+ child.extract()
+ return cloned_soup
+
+ # TODO: add field
+
+ # TODO: move to a comments field:
+
+ # excluded (for now):
+ # title
+ # organization (incl details, e.g. address)
+ # licence
+ # taxonomy (i.e. things like foto1, foto2 -> no working links to actual images)
+
diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py
index 3b905c4d7..72882bc41 100644
--- a/backend/corpora/periodicals/periodicals.py
+++ b/backend/corpora/periodicals/periodicals.py
@@ -38,7 +38,7 @@ class Periodicals(XMLCorpusDefinition):
@property
def es_settings(self):
- return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
+ return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)
tag_toplevel = 'articles'
tag_entry = 'artInfo'
@@ -145,7 +145,7 @@ def sources(self, start=min_date, end=max_date):
display_name='Content',
display_type='text_content',
description='Text content.',
- es_mapping=main_content_mapping(True, True, True),
+ es_mapping=main_content_mapping(True, True, True, 'en'),
results_overview=True,
extractor=extract.XML(tag='ocrText', flatten=True),
search_field_core=True,
diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py
index b8b6d0892..2404ee06b 100644
--- a/backend/corpora/rechtspraak/rechtspraak.py
+++ b/backend/corpora/rechtspraak/rechtspraak.py
@@ -45,7 +45,7 @@ class Rechtspraak(XMLCorpusDefinition):
@property
def es_settings(self):
- return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
+ return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)
tag_toplevel = 'open-rechtspraak'
@@ -287,7 +287,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None
name='content',
display_name='Content',
display_type='text_content',
- es_mapping=main_content_mapping(True, True, True),
+ es_mapping=main_content_mapping(True, True, True, 'nl'),
extractor=extract.Backup(
extract.XML('uitspraak', flatten=True),
extract.XML('conclusie', flatten=True),
diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py
index 1e0ff0d87..35e56ff0f 100644
--- a/backend/corpora/times/times.py
+++ b/backend/corpora/times/times.py
@@ -39,7 +39,7 @@ class Times(XMLCorpusDefinition):
@property
def es_settings(self):
- return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
+ return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)
tag_toplevel = 'issue'
tag_entry = 'article'
@@ -424,7 +424,7 @@ def sources(self, start=datetime.min, end=datetime.max):
name='content',
display_name='Content',
display_type='text_content',
- es_mapping=main_content_mapping(True, True, True),
+ es_mapping=main_content_mapping(True, True, True, 'en'),
visualizations=['wordcloud'],
description='Raw OCR\'ed text (content).',
results_overview=True,
diff --git a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py
index b8d416530..0bc8cbc2c 100644
--- a/backend/corpora/troonredes/troonredes.py
+++ b/backend/corpora/troonredes/troonredes.py
@@ -44,7 +44,7 @@ class Troonredes(XMLCorpusDefinition):
@property
def es_settings(self):
- return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
+ return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)
tag_toplevel = 'doc'
tag_entry = 'entry'
@@ -136,7 +136,7 @@ def sources(self, start=min_date, end=max_date):
display_name='Content',
display_type='text_content',
description='Text content.',
- es_mapping=main_content_mapping(True, True, True),
+ es_mapping=main_content_mapping(True, True, True, 'nl'),
results_overview=True,
search_field_core=True,
visualizations=['wordcloud', 'ngram'],
diff --git a/backend/corpora/utils/exclude_fields.py b/backend/corpora/utils/exclude_fields.py
new file mode 100644
index 000000000..bccc58792
--- /dev/null
+++ b/backend/corpora/utils/exclude_fields.py
@@ -0,0 +1,9 @@
+from addcorpus import extract
+
+def has_extractor(field):
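+ # a field is kept unless its extractor is a Constant whose value is None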
+ if not isinstance(field.extractor, extract.Constant):
+ return True
+ return field.extractor.apply() is not None
+
+def exclude_fields_without_extractor(fields):
+ return list(filter(has_extractor, fields))
\ No newline at end of file
diff --git a/backend/corpora/utils/test_corpora_utils.py b/backend/corpora/utils/test_corpora_utils.py
new file mode 100644
index 000000000..5b8274bf5
--- /dev/null
+++ b/backend/corpora/utils/test_corpora_utils.py
@@ -0,0 +1,17 @@
+from addcorpus.corpus import FieldDefinition
+from addcorpus.extract import Constant
+
+from corpora.utils import exclude_fields
+
+def test_exclude_fields():
+ fields = [
+ FieldDefinition(
+ name='test1',
+ extractor=Constant('some value')
+ ),
+ FieldDefinition(
+ name='test2'
+ )
+ ]
+ new_fields = exclude_fields.exclude_fields_without_extractor(fields)
+ assert len(new_fields) == 1
diff --git a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py
index 39eb62ce0..ffb8e046a 100644
--- a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py
+++ b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py
@@ -1,7 +1,9 @@
from datetime import datetime
+import os
+
from addcorpus.corpus import FieldDefinition, CSVCorpusDefinition
+from addcorpus.es_mappings import keyword_mapping, text_mapping
from addcorpus.extract import CSV
-import os
# Fake corpus class for unit tests
@@ -26,17 +28,13 @@ def sources(self, start=min_date, end=max_date):
content = FieldDefinition(
name = 'content',
- es_mapping= {
- 'type': 'text',
- },
+ es_mapping = text_mapping(),
extractor = CSV('content')
)
language = FieldDefinition(
name = 'language',
- es_mapping= {
- 'type': 'keyword'
- },
+ es_mapping = keyword_mapping(),
extractor = CSV('language')
)
diff --git a/backend/es/conftest.py b/backend/es/conftest.py
index 8c817a8f7..406d285a6 100644
--- a/backend/es/conftest.py
+++ b/backend/es/conftest.py
@@ -3,7 +3,6 @@
from django.contrib.auth.models import Group
from addcorpus.load_corpus import load_corpus_definition
-from ianalyzer.elasticsearch import elasticsearch
from es import es_index
from addcorpus.models import Corpus
diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py
index 96eb57ab1..6f69f3611 100644
--- a/backend/es/tests/test_es_index.py
+++ b/backend/es/tests/test_es_index.py
@@ -2,7 +2,6 @@
from datetime import datetime
from time import sleep
-from addcorpus.load_corpus import load_corpus_definition
from es.es_index import perform_indexing
start = datetime.strptime('1970-01-01','%Y-%m-%d')
diff --git a/backend/ianalyzer/common_settings.py b/backend/ianalyzer/common_settings.py
index 06b8fcaf0..f78775222 100644
--- a/backend/ianalyzer/common_settings.py
+++ b/backend/ianalyzer/common_settings.py
@@ -131,3 +131,5 @@
}
LOGO_LINK = 'https://dhstatic.hum.uu.nl/logo-cdh/png/UU_CDH_logo_EN_whiteFC.png'
+
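+# local directory where nltk data (e.g. stopword lists for the es settings) is stored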
+NLTK_DATA_PATH = os.path.join(BASE_DIR, 'addcorpus', 'nltk_data')
\ No newline at end of file
diff --git a/backend/requirements.in b/backend/requirements.in
index ab5812765..884fc7c85 100644
--- a/backend/requirements.in
+++ b/backend/requirements.in
@@ -5,6 +5,7 @@ django-livereload-server
# django-revproxy, see https://github.com/UUDigitalHumanitieslab/cookiecutter-webapp-deluxe/issues/35
git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a
djangosaml2
+langdetect
psycopg2
pytest
pytest-django
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 943b2d568..80293e02e 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -12,6 +12,8 @@ async-timeout==4.0.2
# via redis
attrs==22.2.0
# via pytest
+backports-zoneinfo==0.2.1
+ # via django
beautifulsoup4==4.11.1
# via
# -r requirements.in
@@ -45,22 +47,27 @@ click-repl==0.2.0
# via celery
cryptography==39.0.1
# via
+ # pyjwt
# pyopenssl
# pysaml2
defusedxml==0.7.1
# via
# djangosaml2
# pysaml2
+ # python3-openid
dj-rest-auth[with_social]==2.2.7
# via -r requirements.in
django==4.1.10
# via
# -r requirements.in
# dj-rest-auth
+ # django-allauth
# django-livereload-server
# django-revproxy
# djangorestframework
# djangosaml2
+django-allauth==0.52.0
+ # via dj-rest-auth
django-livereload-server==0.4
# via -r requirements.in
django-revproxy @ git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a
@@ -79,6 +86,8 @@ elementpath==4.1.1
# via xmlschema
et-xmlfile==1.1.0
# via openpyxl
+exceptiongroup==1.1.3
+ # via pytest
execnet==1.9.0
# via pytest-xdist
fst-pso==1.8.1
@@ -89,6 +98,8 @@ gensim==4.3.0
# via -r requirements.in
idna==3.4
# via requests
+importlib-resources==6.1.0
+ # via pysaml2
iniconfig==2.0.0
# via pytest
joblib==1.2.0
@@ -99,6 +110,8 @@ kombu==5.2.4
# via celery
langcodes==3.3.0
# via -r requirements.in
+langdetect==1.0.9
+ # via -r requirements.in
language-data==1.1
# via -r requirements.in
lxml==4.9.1
@@ -121,6 +134,8 @@ numpy==1.24.1
# scikit-learn
# scipy
# simpful
+oauthlib==3.2.2
+ # via requests-oauthlib
openpyxl==3.1.2
# via -r requirements.in
packaging==23.0
@@ -139,6 +154,8 @@ pycparser==2.21
# via cffi
pyfume==0.2.25
# via fuzzytm
+pyjwt[crypto]==2.8.0
+ # via django-allauth
pyopenssl==23.1.1
# via pysaml2
pypdf2==3.0.1
@@ -160,6 +177,8 @@ python-dateutil==2.8.2
# via
# pandas
# pysaml2
+python3-openid==3.2.0
+ # via django-allauth
pytz==2022.7
# via
# celery
@@ -172,8 +191,12 @@ regex==2022.10.31
# via nltk
requests==2.31.0
# via
+ # django-allauth
# pysaml2
+ # requests-oauthlib
# simpful
+requests-oauthlib==1.3.1
+ # via django-allauth
scikit-learn==1.2.1
# via -r requirements.in
scipy==1.10.0
@@ -190,6 +213,7 @@ six==1.16.0
# via
# click-repl
# django-livereload-server
+ # langdetect
# python-dateutil
smart-open==6.3.0
# via gensim
@@ -201,12 +225,16 @@ textdistance==4.5.0
# via -r requirements.in
threadpoolctl==3.1.0
# via scikit-learn
+tomli==2.0.1
+ # via pytest
tornado==6.3.3
# via django-livereload-server
tqdm==4.64.1
# via
# -r requirements.in
# nltk
+typing-extensions==4.8.0
+ # via pypdf2
urllib3==1.26.17
# via
# django-revproxy
@@ -221,6 +249,8 @@ wcwidth==0.2.6
# via prompt-toolkit
xmlschema==2.2.3
# via pysaml2
+zipp==3.17.0
+ # via importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# setuptools
diff --git a/backend/visualization/tests/mock_corpora/large_mock_corpus.py b/backend/visualization/tests/mock_corpora/large_mock_corpus.py
index e15652945..466ceb8a6 100644
--- a/backend/visualization/tests/mock_corpora/large_mock_corpus.py
+++ b/backend/visualization/tests/mock_corpora/large_mock_corpus.py
@@ -1,7 +1,9 @@
from datetime import datetime
-from addcorpus.corpus import CorpusDefinition, FieldDefinition
import random
+from addcorpus.corpus import CorpusDefinition, FieldDefinition
+from addcorpus.es_mappings import date_mapping, text_mapping
+
TOTAL_DOCUMENTS = 11000
# some constants for generating data
@@ -48,16 +50,12 @@ def source2dicts(self, source):
date = FieldDefinition(
name = 'date',
- es_mapping = {
- 'type': 'date',
- }
+ es_mapping = date_mapping()
)
content = FieldDefinition(
name = 'content',
- es_mapping = {
- 'type': 'text'
- }
+ es_mapping = text_mapping()
)
fields = [date, content]
diff --git a/backend/visualization/tests/mock_corpora/small_mock_corpus.py b/backend/visualization/tests/mock_corpora/small_mock_corpus.py
index a3ad7fd2a..f97c42121 100644
--- a/backend/visualization/tests/mock_corpora/small_mock_corpus.py
+++ b/backend/visualization/tests/mock_corpora/small_mock_corpus.py
@@ -1,9 +1,12 @@
from datetime import datetime
+import os
+
from addcorpus.corpus import FieldDefinition, CSVCorpusDefinition
from addcorpus.extract import CSV
-import os
+from addcorpus.es_mappings import date_mapping, keyword_mapping, main_content_mapping, text_mapping
from addcorpus.es_settings import es_settings
+
# Fake corpus class for unit tests
here = os.path.abspath(os.path.dirname(__file__))
@@ -20,7 +23,7 @@ class SmallMockCorpus(CSVCorpusDefinition):
languages = ['en']
category = 'book'
- es_settings = es_settings('en', stopword_analyzer=True)
+ es_settings = es_settings(['en'], stopword_analysis=True)
def sources(self, start=min_date, end=max_date):
for csv_file in os.listdir(os.path.join(here, 'source_files')):
@@ -28,45 +31,25 @@ def sources(self, start=min_date, end=max_date):
date = FieldDefinition(
name = 'date',
- es_mapping = {
- 'type': 'date',
- },
+ es_mapping = date_mapping(),
extractor = CSV('date')
)
title_field = FieldDefinition(
name = 'title',
- es_mapping = {
- 'type': 'text',
- },
+ es_mapping = text_mapping(),
extractor = CSV('title')
)
content = FieldDefinition(
name = 'content',
- es_mapping= {
- 'type': 'text',
- "fields": {
- "clean": {
- "type": "text",
- },
- "stemmed": {
- "type": "text",
- },
- "length": {
- "type": "token_count",
- 'analyzer': 'standard',
- }
- }
- },
+ es_mapping = main_content_mapping(True, True, False, 'en'),
extractor = CSV('content')
)
genre = FieldDefinition(
name = 'genre',
- es_mapping= {
- 'type': 'keyword'
- },
+ es_mapping = keyword_mapping(),
extractor = CSV('genre')
)
diff --git a/backend/visualization/tests/test_termvectors.py b/backend/visualization/tests/test_termvectors.py
index 967102b53..ea4f6fe4c 100644
--- a/backend/visualization/tests/test_termvectors.py
+++ b/backend/visualization/tests/test_termvectors.py
@@ -67,7 +67,7 @@ def test_find_matches(es_client, termvectors_result, small_mock_corpus):
}, {
'query_text': 'regarded with such "evil forebodings"',
'components': ['regarded', 'with', 'such', 'evil forebodings'],
- 'analyzed': [['regarded'], ['with'], ['such'], ['evil', 'forebodings']]
+ 'analyzed': [['regarded'], ['evil', 'forebodings']]
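+ # 'with' and 'such' are stopwords and are now dropped during analysis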
}, {
'query_text': 'evil + forebodings',
'components': ['evil', '+', 'forebodings'],
@@ -83,7 +83,7 @@ def test_find_matches(es_client, termvectors_result, small_mock_corpus):
}, {
'query_text': 'rejoice~1 to hear',
'components': ['rejoice~1', 'to', 'hear'],
- 'analyzed': [['rejoice~1'], ['to'], ['hear']]
+ 'analyzed': [['rejoice~1'], ['hear']]
}
]
diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py
index 32dc21190..5bb5e6c54 100644
--- a/backend/visualization/tests/test_wordcloud.py
+++ b/backend/visualization/tests/test_wordcloud.py
@@ -127,7 +127,6 @@ def test_wordcloud_counts(small_mock_corpus):
def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud):
stopwords = ['the', 'and', 'of']
-
for stopword in stopwords:
match = any(
item['key'] == stopword for item in small_mock_corpus_complete_wordcloud)
diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py
index 786929240..68ad5b543 100644
--- a/backend/visualization/wordcloud.py
+++ b/backend/visualization/wordcloud.py
@@ -1,12 +1,19 @@
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
+
from addcorpus.load_corpus import load_corpus_definition
from addcorpus.es_settings import get_stopwords_from_settings
from es import download as download
-def corpus_stopwords(corpus_name):
+def field_stopwords(corpus_name, field):
corpus = load_corpus_definition(corpus_name)
- return get_stopwords_from_settings(corpus.es_settings)
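+ # the stopword list is tied to the analyzer configured on the field's
+ # 'clean' subfield; fields without such an analyzer have no stopwords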
+ field_definition = next((f for f in corpus.fields if f.name == field), None)
+ if not field_definition:
+ return []
+ mapping = field_definition.es_mapping
+ analyzer = mapping.get('fields', {}).get('clean', {}).get('analyzer')
+ if not analyzer:
+ return []
+ return get_stopwords_from_settings(corpus.es_settings, analyzer)
def make_wordcloud_data(documents, field, corpus):
texts = []
@@ -14,8 +21,8 @@ def make_wordcloud_data(documents, field, corpus):
content = document['_source'][field]
if content and content != '':
texts.append(content)
-
- stopwords = corpus_stopwords(corpus) or []
+
+ stopwords = field_stopwords(corpus, field)
cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords)
cvtexts = cv.fit_transform(texts)
counts = cvtexts.sum(axis=0).A1
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 0904061d2..19f75ef4d 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,6 +1,6 @@
services:
db:
- image: postgres
+ image: docker.io/library/postgres
environment:
- POSTGRES_DB=${SQL_DATABASE}
- POSTGRES_USER=${SQL_USER}
@@ -36,7 +36,7 @@ services:
- type: bind
source: $DATA_DIR
target: /corpora
- command: bash -c "python manage.py migrate && python manage.py runserver 0.0.0.0:8000"
+ command: bash -c "python manage.py migrate && python manage.py loadcorpora && python manage.py runserver 0.0.0.0:8000"
frontend:
build:
context: ./frontend
@@ -55,6 +55,10 @@ services:
- cluster.name=ianalizer-es-data-cluster
- bootstrap.memory_lock=true
- xpack.security.enabled=false
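+ # raise these loggers to ERROR to reduce startup noise in local development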
+ - logger.org.elasticsearch.discovery=ERROR
+ - logger.org.elasticsearch.transport=ERROR
+ - logger.org.elasticsearch.http=ERROR
+ - logger.org.elasticsearch.cluster=ERROR
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
- ELASTIC_PASSWORD=$ELASTIC_ROOT_PASSWORD
ulimits:
@@ -65,6 +69,14 @@ services:
- ianalyzer-es:/usr/share/elasticsearch/data
ports:
- 127.0.0.1:9200:9200
+ kibana:
+ image: docker.elastic.co/kibana/kibana:8.5.0
+ depends_on:
+ - elasticsearch
+ environment:
+ - "ELASTICSEARCH_URL=http://elasticsearch:9200"
+ ports:
+ - 127.0.0.1:5601:5601
redis:
image: redis:latest
restart: unless-stopped
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index b80b97408..514c7b21d 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -1,5 +1,5 @@
# base image
-FROM node:14-alpine
+FROM docker.io/library/node:14-alpine
RUN apk update && apk add --no-cache --virtual .gyp python3 make g++
# Install Chrome
diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts
index dac74ec42..8138f30b8 100644
--- a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts
+++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts
@@ -4,7 +4,6 @@ import {
import { AggregateResult, CorpusField, QueryModel, Corpus, FreqTableHeaders } from '../../models/index';
-import { ApiService } from '../../services/index';
import { BehaviorSubject } from 'rxjs';
import { VisualizationService } from '../../services/visualization.service';
import { showLoading } from '../../utils/utils';