diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 010ddc757..4bc194496 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -5,6 +5,15 @@ HERE = os.path.abspath(os.path.dirname(__file__)) NLTK_DATA_PATH = os.path.join(HERE, 'nltk_data') +# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html] +AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian', + 'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch', + 'english', 'estonian', 'finnish', 'french', 'galician', + 'german', 'greek', 'hindi', 'hungarian', 'indonesian', + 'irish', 'italian', 'latvian', 'lithuanian', 'norwegian', + 'persian', 'portuguese', 'romanian', 'russian', 'sorani', + 'spanish', 'swedish', 'turkish', 'thai'] + def get_language_key(language_code): ''' Get the nltk stopwords file / elasticsearch stemmer name for a language code @@ -44,6 +53,8 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): stemmer_filter_name = 'stemmer' stemmed_analyzer_name = 'stemmed' + set_char_filter(settings) + for language in languages: # do not attach language isocodes if there is just one language language_string = language if len(languages) > 1 else None @@ -57,9 +68,8 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): settings, add_language_string(stopword_filter_name, language_string), add_language_string(clean_analyzer_name, language_string), - language ) - if stemming_analyzer: + if stemming_analyzer and get_language_key(language) in AVAILABLE_ES_STEMMERS: set_stemmed_analyzer( settings, add_language_string(stopword_filter_name, language_string), @@ -118,23 +128,31 @@ def get_stopwords_from_settings(es_settings): return stopwords def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language): - 
settings['analysis']['filter'][stemmer_filter_name] = make_stemmer_filter(language) - settings["analysis"]['analyzer'][stemmed_analyzer_name] = make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name) + filters = settings['analysis'].get('filter', {}) + filters.update({stemmer_filter_name: make_stemmer_filter(language)}) + settings['analysis']['filter'] = filters + analyzers = settings['analysis'].get('analyzer', {}) + analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name)}) + settings['analysis']['analyzer'] = analyzers + +def set_char_filter(settings): + settings["analysis"] = { + "char_filter": { "number_filter": number_filter() } + } def set_stopword_filter(settings, stopword_filter_name, language): stopword_filter = make_stopword_filter(stopword_filter_name, language) if not stopword_filter: return False - settings["analysis"] = { - "analyzer": {}, - "char_filter":{ "number_filter": number_filter() }, - 'filter': { - stopword_filter_name: stopword_filter - } - } + filters = settings['analysis'].get('filter', {}) + filters.update({ + stopword_filter_name: stopword_filter + }) + settings['analysis']['filter'] = filters return True -def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name, language): - settings["analysis"]['analyzer'][clean_analyzer_name] = make_clean_analyzer( - stopword_filter_name - ) \ No newline at end of file +def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name): + clean_analyzer = make_clean_analyzer(stopword_filter_name) + analyzers = settings['analysis'].get('analyzer', {}) + analyzers.update({clean_analyzer_name: clean_analyzer}) + settings["analysis"]['analyzer'] = analyzers \ No newline at end of file diff --git a/backend/addcorpus/save_corpus.py b/backend/addcorpus/save_corpus.py index 1c9010754..82a0db368 100644 --- a/backend/addcorpus/save_corpus.py +++ b/backend/addcorpus/save_corpus.py @@ -48,7 +48,10 @@ def
_copy_corpus_attributes(corpus_definition: CorpusDefinition, configuration: 'word_models_present', ] - defined = get_defined_attributes(corpus_definition, attributes_to_copy) + try: + defined = get_defined_attributes(corpus_definition, attributes_to_copy) + except Exception: + raise for attr, value in defined.items(): configuration.__setattr__(attr, value) diff --git a/backend/addcorpus/tests/test_es_settings.py b/backend/addcorpus/tests/test_es_settings.py index e69de29bb..9f94b1b69 100644 --- a/backend/addcorpus/tests/test_es_settings.py +++ b/backend/addcorpus/tests/test_es_settings.py @@ -0,0 +1,70 @@ +import pytest + +from addcorpus.es_settings import es_settings + +char_filter_tokenizer = {'char_filter': ['number_filter'], 'tokenizer': 'standard'} + +test_cases = { + 'single_language': { + 'languages': ['en'], + 'stopword': True, + 'stemming': True, + 'expected': { + 'filter': { + 'stemmer': {'type': 'stemmer', 'language': 'english'}, + 'stopwords': {'type': 'stop', 'stopwords': list()} + }, + 'analyzer': { + 'clean': { + 'filter': ['lowercase', 'stopwords'], + **char_filter_tokenizer + }, + 'stemmed': { + 'filter': ['lowercase', 'stopwords', 'stemmer'], + **char_filter_tokenizer + } + } + } + }, + 'multiple_languages': { + 'languages': ['en', 'de'], + 'stopword': True, + 'stemming': True, + 'expected': { + 'filter': { + 'stemmer_de': {'type': 'stemmer', 'language': 'german'}, + 'stopwords_de': {'type': 'stop', 'stopwords': list()}, + 'stemmer_en': {'type': 'stemmer', 'language': 'english'}, + 'stopwords_en': {'type': 'stop', 'stopwords': list()}, + }, + 'analyzer': { + 'clean_de': { + 'filter': ['lowercase', 'stopwords_de'], + **char_filter_tokenizer + }, + 'stemmed_de': { + 'filter': ['lowercase', 'stopwords_de', 'stemmer_de'], + **char_filter_tokenizer + }, + 'clean_en': { + 'filter': ['lowercase', 'stopwords_en'], + **char_filter_tokenizer + }, + 'stemmed_en': { + 'filter': ['lowercase', 'stopwords_en', 'stemmer_en'], + **char_filter_tokenizer + }
+ } + } + } +} + +@pytest.mark.parametrize('test_config', list(test_cases.values())) +def test_es_settings(test_config): + settings = es_settings(test_config['languages'], test_config['stopword'], test_config['stemming']) + assert settings['analysis']['filter'].keys() == test_config['expected']['filter'].keys() + assert settings['analysis']['analyzer'].keys() == test_config['expected']['analyzer'].keys() + for analyzer in settings['analysis']['analyzer'].keys(): + assert settings['analysis']['analyzer'][analyzer]['filter'][1] in settings['analysis']['filter'] + if analyzer.startswith('stemmed'): + assert settings['analysis']['analyzer'][analyzer]['filter'][2] in settings['analysis']['filter'] \ No newline at end of file diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index 341970e5e..0a97b25d6 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -30,10 +30,6 @@ class Ecco(XMLCorpusDefinition): min_date = datetime(year=1700, month=1, day=1) max_date = datetime(year=1800, month=12, day=31) - @property - def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) - data_directory = settings.ECCO_DATA es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco') image = 'ecco.jpg' @@ -49,7 +45,7 @@ def es_settings(self): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) def sources(self, start=min_date, end=max_date): logging.basicConfig(filename='ecco.log', level=logging.INFO) diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index 917f7c59c..c0c4b42cb 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -14,10 +14,7 @@ class PeaceportalEpidat(PeacePortal): es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 
'peaceportal-epidat') es_alias = settings.PEACEPORTAL_ALIAS - languages = ['german', 'hebrew', 'english', 'dutch'] - - def es_settings(self): - return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) + languages = ['de', 'he', 'en', 'nl'] def __init__(self): super().__init__() diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 1f721d50b..c81f9fa06 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -36,7 +36,8 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): scan_image_type = 'image/png' # fields below are required by code but not actually used min_date = datetime(year=746, month=1, day=1) - image = 'bogus' + image = 'bogus.jpg' + category = 'inscription' data_directory = 'bogus' # Data overrides from .common.XMLCorpus @@ -50,8 +51,9 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): external_file_folder = None languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-) + @property def es_settings(self): - return es_settings(self.languages, True, True) + return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) def sources(self, start, end): logger = logging.getLogger(__name__) @@ -166,8 +168,8 @@ def request_media(self, document): ) transcription_hebrew = FieldDefinition( - name='transcription_he', # no stopwords / stemmers available - es_mapping={'type': 'text'}, + name='transcription_he', # no stemmers available + es_mapping=main_content_mapping(stopword_analysis=True, language='he'), language='he', hidden=True ) diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index f59c54b43..5790a2b9e 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -43,7 +43,6 @@ def test_imports(peace_corpus_settings, corpus_object): 
resulted_fields = set() docs = get_documents(corpus, start, end) - print(list(docs)) for target in corpus_object.get('docs'): doc = next(docs) for key in target: