Skip to content

Commit

Permalink
Fix es_settings: merge per-language filters/analyzers into existing analysis settings instead of overwriting them, and only add stemmed analyzers for languages with an available Elasticsearch stemmer
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Nov 9, 2023
1 parent 5492847 commit 9c0e40c
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 30 deletions.
48 changes: 33 additions & 15 deletions backend/addcorpus/es_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@
HERE = os.path.abspath(os.path.dirname(__file__))
NLTK_DATA_PATH = os.path.join(HERE, 'nltk_data')

# Stemmers built into Elasticsearch, keyed by the names that get_language_key
# returns; a stemmed analyzer is only configured for a language whose key is
# in this list.
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html
AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
    'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
    'english', 'estonian', 'finnish', 'french', 'galician',
    'german', 'greek', 'hindi', 'hungarian', 'indonesian',
    'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
    'persian', 'portuguese', 'romanian', 'russian', 'sorani',
    'spanish', 'swedish', 'turkish', 'thai']

def get_language_key(language_code):
'''
Get the nltk stopwords file / elasticsearch stemmer name for a language code
Expand Down Expand Up @@ -44,6 +53,8 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False):
stemmer_filter_name = 'stemmer'
stemmed_analyzer_name = 'stemmed'

set_char_filter(settings)

for language in languages:
# do not attach language isocodes if there is just one language
language_string = language if len(languages) > 1 else None
Expand All @@ -57,9 +68,8 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False):
settings,
add_language_string(stopword_filter_name, language_string),
add_language_string(clean_analyzer_name, language_string),
language
)
if stemming_analyzer:
if stemming_analyzer and get_language_key(language) in AVAILABLE_ES_STEMMERS:
set_stemmed_analyzer(
settings,
add_language_string(stopword_filter_name, language_string),
Expand Down Expand Up @@ -118,23 +128,31 @@ def get_stopwords_from_settings(es_settings):
return stopwords

def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language):
    '''
    Register a stemmer token filter and a stemmed analyzer on the index settings.

    Merges into any existing `analysis.filter` / `analysis.analyzer` entries
    rather than overwriting them, so filters registered for other languages
    are preserved.

    Parameters:
        settings -- index settings dict; must already contain an 'analysis' key
        stopword_filter_name -- name of the stopword filter the analyzer chains after
        stemmer_filter_name -- name under which the stemmer filter is registered
        stemmed_analyzer_name -- name under which the stemmed analyzer is registered
        language -- language name passed to the Elasticsearch stemmer filter
    '''
    analysis = settings['analysis']
    filters = analysis.get('filter', {})
    filters[stemmer_filter_name] = make_stemmer_filter(language)
    analysis['filter'] = filters
    # default to {} here as well: 'analyzer' may not have been set yet,
    # and .get without a default would return None and crash on update
    analyzers = analysis.get('analyzer', {})
    analyzers[stemmed_analyzer_name] = make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name)
    analysis['analyzer'] = analyzers

def set_char_filter(settings):
    '''
    Register the number char filter on the index settings.

    Uses setdefault so that any pre-existing 'analysis' configuration is
    preserved; the previous implementation assigned a fresh dict and thereby
    clobbered analysis settings registered before this call.
    '''
    analysis = settings.setdefault('analysis', {})
    char_filters = analysis.setdefault('char_filter', {})
    char_filters['number_filter'] = number_filter()

def set_stopword_filter(settings, stopword_filter_name, language):
    '''
    Register a stopword token filter for `language` on the index settings.

    Merges into any existing `analysis.filter` entries rather than
    overwriting the whole analysis section.

    Parameters:
        settings -- index settings dict; must already contain an 'analysis' key
        stopword_filter_name -- name under which the stopword filter is registered
        language -- language code used to look up the stopword list

    Returns:
        True when the filter was registered; False (leaving `settings`
        unchanged) when no stopword list is available for the language.
    '''
    stopword_filter = make_stopword_filter(stopword_filter_name, language)
    if not stopword_filter:
        return False
    analysis = settings['analysis']
    filters = analysis.get('filter', {})
    filters[stopword_filter_name] = stopword_filter
    analysis['filter'] = filters
    return True

def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name):
    '''
    Register a "clean" analyzer (lowercase + stopword removal) on the settings.

    Merges into any existing `analysis.analyzer` entries rather than
    overwriting them, so analyzers for other languages are preserved.

    Parameters:
        settings -- index settings dict; must already contain an 'analysis' key
        stopword_filter_name -- name of the stopword filter the analyzer uses
        clean_analyzer_name -- name under which the analyzer is registered
    '''
    analysis = settings['analysis']
    analyzers = analysis.get('analyzer', {})
    analyzers[clean_analyzer_name] = make_clean_analyzer(stopword_filter_name)
    analysis['analyzer'] = analyzers
5 changes: 4 additions & 1 deletion backend/addcorpus/save_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ def _copy_corpus_attributes(corpus_definition: CorpusDefinition, configuration:
'word_models_present',
]

defined = get_defined_attributes(corpus_definition, attributes_to_copy)
try:
defined = get_defined_attributes(corpus_definition, attributes_to_copy)
except Exception as e:
raise e

for attr, value in defined.items():
configuration.__setattr__(attr, value)
Expand Down
70 changes: 70 additions & 0 deletions backend/addcorpus/tests/test_es_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pytest

from addcorpus.es_settings import es_settings

# Analyzer fields shared by every expected analyzer in the cases below:
# the number char filter plus the standard tokenizer.
char_filter_tokenizer = {'char_filter': ['number_filter'], 'tokenizer': 'standard'}

# Parametrized cases for test_es_settings. Each case maps the arguments of
# es_settings (languages, stopword analyzer on/off, stemming analyzer on/off)
# to the 'filter' and 'analyzer' sections expected in the generated settings.
# With a single language the names are unsuffixed ('clean', 'stemmed'); with
# multiple languages each name carries a '_<language code>' suffix.
test_cases = {
    'single_language': {
        'languages': ['en'],
        'stopword': True,
        'stemming': True,
        'expected': {
            'filter': {
                'stemmer': {'type': 'stemmer', 'language': 'english'},
                'stopwords': {'type': 'stop', 'stopwords': list()}
            },
            'analyzer': {
                'clean': {
                    'filter': ['lowercase', 'stopwords'],
                    **char_filter_tokenizer
                },
                'stemmed': {
                    'filter': ['lowercase', 'stopwords', 'stemmer'],
                    **char_filter_tokenizer
                }
            }
        }
    },
    'multiple_languages': {
        'languages': ['en', 'de'],
        'stopword': True,
        'stemming': True,
        'expected': {
            'filter': {
                'stemmer_de': {'type': 'stemmer', 'language': 'german'},
                'stopwords_de': {'type': 'stop', 'stopwords': list()},
                'stemmer_en': {'type': 'stemmer', 'language': 'english'},
                'stopwords_en': {'type': 'stop', 'stopwords': list()},
            },
            'analyzer': {
                'clean_de': {
                    'filter': ['lowercase', 'stopwords_de'],
                    **char_filter_tokenizer
                },
                'stemmed_de': {
                    'filter': ['lowercase', 'stopwords_de', 'stemmer_de'],
                    **char_filter_tokenizer
                },
                'clean_en': {
                    'filter': ['lowercase', 'stopwords_en'],
                    **char_filter_tokenizer
                },
                'stemmed_en': {
                    'filter': ['lowercase', 'stopwords_en', 'stemmer_en'],
                    **char_filter_tokenizer
                }
            }
        }
    }
}

@pytest.mark.parametrize('test_config', list(test_cases.values()))
def test_es_settings(test_config):
    '''Check that es_settings registers the expected filters and analyzers.'''
    settings = es_settings(test_config['languages'], test_config['stopword'], test_config['stemming'])
    analysis = settings['analysis']
    expected = test_config['expected']
    assert analysis['filter'].keys() == expected['filter'].keys()
    assert analysis['analyzer'].keys() == expected['analyzer'].keys()
    for name, definition in analysis['analyzer'].items():
        # filter chains look like [lowercase, stopword filter(, stemmer filter)];
        # the named filters must actually be registered in analysis.filter
        assert definition['filter'][1] in analysis['filter']
        if name.startswith('stemmed'):
            assert definition['filter'][2] in analysis['filter']
6 changes: 1 addition & 5 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ class Ecco(XMLCorpusDefinition):
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

@property
def es_settings(self):
return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True)

data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
Expand All @@ -49,7 +45,7 @@ def es_settings(self):

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True)

def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)
Expand Down
5 changes: 1 addition & 4 deletions backend/corpora/peaceportal/epidat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ class PeaceportalEpidat(PeacePortal):
es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 'peaceportal-epidat')
es_alias = settings.PEACEPORTAL_ALIAS

languages = ['german', 'hebrew', 'english', 'dutch']

def es_settings(self):
return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True)
languages = ['de', 'he', 'en', 'nl']

def __init__(self):
super().__init__()
Expand Down
10 changes: 6 additions & 4 deletions backend/corpora/peaceportal/peaceportal.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition):
scan_image_type = 'image/png'
# fields below are required by code but not actually used
min_date = datetime(year=746, month=1, day=1)
image = 'bogus'
image = 'bogus.jpg'
category = 'inscription'
data_directory = 'bogus'

# Data overrides from .common.XMLCorpus
Expand All @@ -50,8 +51,9 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition):
external_file_folder = None
languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-)

@property
def es_settings(self):
return es_settings(self.languages, True, True)
return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True)

def sources(self, start, end):
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -166,8 +168,8 @@ def request_media(self, document):
)

transcription_hebrew = FieldDefinition(
name='transcription_he', # no stopwords / stemmers available
es_mapping={'type': 'text'},
name='transcription_he', # no stemmers available
es_mapping=main_content_mapping(stopword_analysis=True, language='he'),
language='he',
hidden=True
)
Expand Down
1 change: 0 additions & 1 deletion backend/corpora/peaceportal/tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def test_imports(peace_corpus_settings, corpus_object):
resulted_fields = set()

docs = get_documents(corpus, start, end)
print(list(docs))
for target in corpus_object.get('docs'):
doc = next(docs)
for key in target:
Expand Down

0 comments on commit 9c0e40c

Please sign in to comment.