diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py index 0b4996a35..a98884358 100644 --- a/backend/addcorpus/constants.py +++ b/backend/addcorpus/constants.py @@ -1,3 +1,5 @@ +from enum import Enum + CATEGORIES = [ ('parliament', 'Parliamentary debates'), ('periodical', 'Newspapers and other periodicals'), @@ -11,3 +13,44 @@ ''' Types of data ''' + +class MappingType(Enum): + 'Elasticsearch mapping types that are implemented in I-analyzer' + + TEXT = 'text' + KEYWORD = 'keyword' + DATE = 'date' + INTEGER = 'integer' + FLOAT = 'float' + BOOLEAN = 'boolean' + + +class VisualizationType(Enum): + '''Types of visualisations available''' + + RESULTS_COUNT = 'resultscount' + TERM_FREQUENCY = 'termfrequency' + NGRAM = 'ngram' + WORDCLOUD = 'wordcloud' + +FORBIDDEN_FIELD_NAMES = [ + 'query', + 'fields', + 'sort', + 'highlight', + 'visualize', + 'visualizedField', + 'normalize', + 'size', + 'positions', + 'freqCompensation', + 'analysis', + 'maxDocuments', + 'numberOfNgrams', + 'dateField', +] +''' +Field names that cannot be used because they are also query parameters in frontend routes. + +Using them would make routing ambiguous. +''' diff --git a/backend/addcorpus/filters.py b/backend/addcorpus/filters.py index d9ff1111a..a8353e978 100644 --- a/backend/addcorpus/filters.py +++ b/backend/addcorpus/filters.py @@ -4,6 +4,7 @@ ''' from datetime import datetime +from addcorpus.constants import MappingType class Filter(object): ''' @@ -11,6 +12,9 @@ class Filter(object): the ElasticSearch filter that is sent to the client. ''' + mapping_types = tuple() + '''accepted mapping types for this filter''' + def __init__(self, description=None): self.field = None # Must be filled after initialising self.description = description @@ -31,6 +35,8 @@ class DateFilter(Filter): Filter for datetime values: produces two datepickers for min and max date. 
''' + mapping_types = (MappingType.DATE,) + def __init__(self, lower, upper, *nargs, **kwargs): self.lower = lower self.upper = upper @@ -42,6 +48,8 @@ class RangeFilter(Filter): Filter for numerical values: produces a slider between two values. ''' + mapping_types = (MappingType.INTEGER, MappingType.FLOAT) + def __init__(self, lower, upper, *nargs, **kwargs): self.lower = lower self.upper = upper @@ -53,6 +61,11 @@ class MultipleChoiceFilter(Filter): Filter for keyword values: produces a set of buttons. ''' + mapping_types = (MappingType.KEYWORD,) + # note: the multiple choice filter is implemented as a terms query + # which is also valid for integer/float/boolean/date, + # but those should be rejected so the appropriate filter is used instead + def __init__(self, option_count=10, *nargs, **kwargs): self.option_count = option_count # option_count defines how many buckets are retrieved @@ -63,9 +76,17 @@ def __init__(self, option_count=10, *nargs, **kwargs): class BooleanFilter(Filter): ''' Filter for boolean values: produces a drop-down menu. - ''' #TODO checkbox? 
+ ''' + + mapping_types = (MappingType.BOOLEAN,) def __init__(self, true, false, *nargs, **kwargs): self.true = true self.false = false super().__init__(*nargs, **kwargs) + +VALID_MAPPINGS = { + f.__name__: tuple(mt.value for mt in f.mapping_types) + for f in + [DateFilter, RangeFilter, MultipleChoiceFilter, BooleanFilter] +} diff --git a/backend/addcorpus/migrations/0005_add_validators.py b/backend/addcorpus/migrations/0005_add_validators.py new file mode 100644 index 000000000..abb8fdb7c --- /dev/null +++ b/backend/addcorpus/migrations/0005_add_validators.py @@ -0,0 +1,44 @@ +# Generated by Django 4.1.9 on 2023-09-13 16:15 + +import addcorpus.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('addcorpus', '0004_alter_corpusconfiguration_category'), + ] + + operations = [ + migrations.AlterField( + model_name='corpusconfiguration', + name='description_page', + field=models.CharField(blank=True, help_text='filename of the markdown documentation file for this corpus', max_length=128, validators=[addcorpus.validators.validate_markdown_filename_extension]), + ), + migrations.AlterField( + model_name='corpusconfiguration', + name='image', + field=models.CharField(help_text='filename of the corpus image', max_length=126, validators=[addcorpus.validators.validate_image_filename_extension]), + ), + migrations.AlterField( + model_name='corpusconfiguration', + name='scan_image_type', + field=models.CharField(blank=True, help_text='MIME type of scan images', max_length=64, validators=[addcorpus.validators.validate_mimetype]), + ), + migrations.AlterField( + model_name='field', + name='es_mapping', + field=models.JSONField(help_text='specification of the elasticsearch mapping of this field', validators=[addcorpus.validators.validate_es_mapping]), + ), + migrations.AlterField( + model_name='field', + name='name', + field=models.SlugField(help_text='internal name for the field', max_length=126, 
validators=[addcorpus.validators.validate_name_is_not_a_route_parameter]), + ), + migrations.AlterField( + model_name='field', + name='search_filter', + field=models.JSONField(blank=True, help_text='specification of the search filter for this field (if any)', validators=[addcorpus.validators.validate_search_filter]), + ), + ] diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py index 485757ea5..0c91431ee 100644 --- a/backend/addcorpus/models.py +++ b/backend/addcorpus/models.py @@ -1,9 +1,14 @@ from django.db import models from django.contrib.postgres.fields import ArrayField from django.contrib.auth.models import Group +from django.core.exceptions import ValidationError +import warnings -from addcorpus.constants import CATEGORIES -from addcorpus.validators import validate_language_code +from addcorpus.constants import CATEGORIES, MappingType, VisualizationType +from addcorpus.validators import validate_language_code, validate_image_filename_extension, \ + validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \ + validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \ + validate_visualizations_with_mapping, validate_implication MAX_LENGTH_NAME = 126 MAX_LENGTH_DESCRIPTION = 254 @@ -63,6 +68,7 @@ class CorpusConfiguration(models.Model): description_page = models.CharField( max_length=128, blank=True, + validators=[validate_markdown_filename_extension], help_text='filename of the markdown documentation file for this corpus', ) description = models.CharField( @@ -85,6 +91,7 @@ class CorpusConfiguration(models.Model): ) image = models.CharField( max_length=126, + validators=[validate_image_filename_extension], help_text='filename of the corpus image', ) languages = ArrayField( @@ -104,6 +111,7 @@ class CorpusConfiguration(models.Model): scan_image_type = models.CharField( max_length=64, blank=True, + validators=[validate_mimetype], 
help_text='MIME type of scan images', ) title = models.CharField( @@ -120,29 +128,33 @@ def __str__(self): FIELD_DISPLAY_TYPES = [ ('text_content', 'text content'), - ('text', 'text'), - ('keyword', 'keyword'), - ('date', 'date'), - ('integer', 'integer'), - ('float', 'float'), - ('boolean', 'boolean'), + (MappingType.TEXT.value, 'text'), + (MappingType.KEYWORD.value, 'keyword'), + (MappingType.DATE.value, 'date'), + (MappingType.INTEGER.value, 'integer'), + (MappingType.FLOAT.value, 'float'), + (MappingType.BOOLEAN.value, 'boolean'), ] FIELD_VISUALIZATIONS = [ - ('resultscount', 'Number of results'), - ('termfrequency', 'Frequency of the search term'), - ('ngram', 'Neighbouring words'), - ('wordcloud', 'Most frequent words'), + (VisualizationType.RESULTS_COUNT.value, 'Number of results'), + (VisualizationType.TERM_FREQUENCY.value, 'Frequency of the search term'), + (VisualizationType.NGRAM.value, 'Neighbouring words'), + (VisualizationType.WORDCLOUD.value, 'Most frequent words'), ] +'''Options for `visualizations` field''' VISUALIZATION_SORT_OPTIONS = [ ('key', 'By the value of the field'), ('value', 'By frequency') ] +'''Options for `visualization_sort` field''' + class Field(models.Model): name = models.SlugField( max_length=MAX_LENGTH_NAME, + validators=[validate_name_is_not_a_route_parameter], help_text='internal name for the field', ) corpus_configuration = models.ForeignKey( @@ -167,6 +179,7 @@ class Field(models.Model): ) search_filter = models.JSONField( blank=True, + validators=[validate_search_filter], help_text='specification of the search filter for this field (if any)', ) results_overview = models.BooleanField( @@ -197,6 +210,7 @@ class Field(models.Model): help_text='if the field has results/term frequency charts: how is the x-axis sorted?', ) es_mapping = models.JSONField( + validators=[validate_es_mapping], help_text='specification of the elasticsearch mapping of this field', ) indexed = models.BooleanField( @@ -236,3 +250,22 @@ class Meta: def 
__str__(self) -> str: return f'{self.name} ({self.corpus_configuration.corpus.name})' + + def clean(self): + validate_searchable_field_has_full_text_search(self.es_mapping, self.searchable) + + if self.search_filter: + validate_search_filter_with_mapping(self.es_mapping, self.search_filter) + + if self.visualizations: + validate_visualizations_with_mapping(self.es_mapping, self.visualizations) + + validate_implication(self.primary_sort, self.sortable, "The primary sorting field must be sortable") + validate_implication(self.csv_core, self.downloadable, "Core download fields must be downloadable") + + # core search fields must be searchable + # not a hard requirement because it is not currently satisfied in all corpora + try: + validate_implication(self.search_field_core, self.searchable, "Core search fields must be searchable") + except ValidationError as e: + warnings.warn(e.message) diff --git a/backend/addcorpus/tests/test_validators.py b/backend/addcorpus/tests/test_validators.py new file mode 100644 index 000000000..f5d95e6a3 --- /dev/null +++ b/backend/addcorpus/tests/test_validators.py @@ -0,0 +1,73 @@ +import pytest +from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping +from addcorpus.validators import * + +def test_validate_mimetype(): + validate_mimetype('image/jpeg') + + with pytest.raises(ValidationError): + validate_mimetype('nonsense') + +def test_validate_es_mapping(): + validate_es_mapping({'type': 'text'}) + + with pytest.raises(ValidationError): + validate_es_mapping({}) + + with pytest.raises(ValidationError): + validate_es_mapping({'type': 'perlocator'}) + +def test_validate_search_filter(): + validate_search_filter({ + 'name': 'RangeFilter', + 'lower': 0, + 'upper': 100, + 'description': '...' 
+ }) + + with pytest.raises(ValidationError): + validate_search_filter({'name': 'UnkownFilter'}) + +def test_validate_search_filter_with_mapping(): + filter = { + 'name': 'RangeFilter', + 'lower': 0, + 'upper': 100, + 'description': '...' + } + + validate_search_filter_with_mapping(int_mapping(), filter) + + with pytest.raises(ValidationError): + validate_search_filter_with_mapping(keyword_mapping(), filter) + +def test_validate_visualizations_with_mapping(): + validate_visualizations_with_mapping(text_mapping(), ['ngram']) + validate_visualizations_with_mapping(keyword_mapping(), ['resultscount']) + validate_visualizations_with_mapping(keyword_mapping(enable_full_text_search=True), ['ngram']) + + with pytest.raises(ValidationError): + validate_visualizations_with_mapping(keyword_mapping(), ['ngram']) + + with pytest.raises(ValidationError): + validate_visualizations_with_mapping(text_mapping(), ['resultscount']) + +def test_validate_searchable_fields_has_fts(): + validate_searchable_field_has_full_text_search(text_mapping(), True) + validate_searchable_field_has_full_text_search( + keyword_mapping(enable_full_text_search=True), True + ) + validate_searchable_field_has_full_text_search(int_mapping(), False) + + with pytest.raises(ValidationError): + validate_searchable_field_has_full_text_search(int_mapping(), True) + + with pytest.warns(Warning): + validate_searchable_field_has_full_text_search(keyword_mapping(), True) + +def test_filename_validation(): + validate_image_filename_extension('image.jpg') + + with pytest.raises(ValidationError): + validate_image_filename_extension('image.txt') + diff --git a/backend/addcorpus/validators.py b/backend/addcorpus/validators.py index c4a26b408..fe11fb33d 100644 --- a/backend/addcorpus/validators.py +++ b/backend/addcorpus/validators.py @@ -1,5 +1,22 @@ from django.core.exceptions import ValidationError from langcodes import tag_is_valid +import mimetypes +import warnings +import os + +from addcorpus.constants import 
MappingType, VisualizationType, FORBIDDEN_FIELD_NAMES +from addcorpus.filters import VALID_MAPPINGS as VALID_SEARCH_FILTER_MAPPINGS + +def primary_mapping_type(es_mapping): + return es_mapping.get('type', None) + +def supports_full_text_search(es_mapping): + is_text = primary_mapping_type(es_mapping) == MappingType.TEXT.value + has_text_multifield = 'text' in es_mapping.get('fields', {}) + return is_text or has_text_multifield + +def supports_aggregation(es_mapping): + return primary_mapping_type(es_mapping) != MappingType.TEXT.value def validate_language_code(value): ''' @@ -8,3 +25,130 @@ def validate_language_code(value): if not tag_is_valid(value) or value == '': raise ValidationError(f'{value} is not a valid ISO-639 language tag') + +def validate_mimetype(value): + ''' + verify that a value is a valid MIME type + ''' + + if not value in mimetypes.types_map.values(): + raise ValidationError(f'{value} is not a valid MIME type') + +def validate_search_filter(value): + '''validate the search filter JSON''' + + if value: + name = value.get('name', None) + if not name in VALID_SEARCH_FILTER_MAPPINGS: + raise ValidationError(f'Unknown search filter type: {name}') + +def validate_es_mapping(value): + '''validate that the field mapping specifies a mapping type''' + + mapping_type = primary_mapping_type(value) + + if not mapping_type: + raise ValidationError('No mapping type specified') + + valid_types = [t.value for t in list(MappingType)] + if mapping_type not in valid_types: + raise ValidationError(f'Invalid mapping type: {mapping_type}') + +def validate_search_filter_with_mapping(es_mapping, search_filter_dict): + ''' + validate that the search filter is appropriate for the mapping type + ''' + + filter_type = search_filter_dict.get('name') + mapping_type = primary_mapping_type(es_mapping) + + valid_mappings = VALID_SEARCH_FILTER_MAPPINGS.get(filter_type, ()) + if not mapping_type in valid_mappings: + raise ValidationError(f'{filter_type} cannot be used with {mapping_type} 
mapping') + + +def validate_visualizations_with_mapping(es_mapping, visualizations): + ''' + validate that the specified visualisations are compatible with the field mapping + ''' + + if not supports_full_text_search(es_mapping): + if VisualizationType.NGRAM.value in visualizations: + raise ValidationError(f'ngram visualisation requires a text mapping') + + if VisualizationType.WORDCLOUD.value in visualizations: + warnings.warn( + 'A field uses a wordcloud visualisation but does not tokenise data. ' + 'This is technically possible, but suggests the mapping type is inappropriate.', + ) + + use_aggregations = [vt.value for vt in (VisualizationType.RESULTS_COUNT, VisualizationType.TERM_FREQUENCY)] + uses_aggregations = lambda vis: vis in use_aggregations + + if any(map(uses_aggregations, visualizations)) and not supports_aggregation(es_mapping): + vis = next(filter(uses_aggregations, visualizations)) + raise ValidationError(f'{vis} visualisation cannot be used on text mapping') + + +def validate_name_is_not_a_route_parameter(value): + ''' + reject names that are also used as query parameters in frontend routes. + + This would create serious bugs in the frontend as those parameters will also + be interpreted as filter settings for the field. + ''' + + if value in FORBIDDEN_FIELD_NAMES: + raise ValidationError( + f'{value} cannot be used as a field name, because it is also a route parameter' + ) + +def mapping_can_be_searched(es_mapping): + ''' + Verify if a mapping is appropriate for searching + ''' + + if supports_full_text_search(es_mapping): + return True + + if primary_mapping_type(es_mapping) == MappingType.KEYWORD.value: + warnings.warn( + 'It is strongly discouraged to use text search for keyword fields without ' + 'text analysis. Consider adding a text multifield or using a filter instead.' 
+ ) + return True + + return False + +def validate_searchable_field_has_full_text_search(es_mapping, searchable): + mapping_type = primary_mapping_type(es_mapping) + validate_implication( + searchable, es_mapping, + message=f'Text search is not supported for mapping type {mapping_type}', + conclusion_predicate=mapping_can_be_searched, + ) + +def identity(obj): + return obj + +def validate_implication(premise_value, conclusion_value, message, premise_predicate = identity, conclusion_predicate = identity): + ''' + shorthand for a lot of "if A then B" validations. + ''' + + if premise_predicate(premise_value) and not conclusion_predicate(conclusion_value): + raise ValidationError(message) + + +def validate_filename_extension(filename, allowed_extensions): + _, extension = os.path.splitext(filename) + if not extension in allowed_extensions: + raise ValidationError(f'Extension {extension} is not allowed') + +def validate_markdown_filename_extension(filename): + allowed = ['.md', '.markdown'] + validate_filename_extension(filename, allowed) + +def validate_image_filename_extension(filename): + allowed = ['.jpeg', '.jpg', '.png', '.JPG'] + validate_filename_extension(filename, allowed) diff --git a/backend/corpora/dbnl/dbnl.py b/backend/corpora/dbnl/dbnl.py index d28629807..b48a28963 100644 --- a/backend/corpora/dbnl/dbnl.py +++ b/backend/corpora/dbnl/dbnl.py @@ -383,6 +383,7 @@ def _xml_files(self): display_name='Primary', description='Whether this is the primary document for this book - each book has only one primary document', extractor=Order(transform = lambda index : index == 0), + es_mapping=bool_mapping(), search_filter=BooleanFilter( true='Primary', false='Other',