Skip to content

Commit

Permalink
Merge pull request #1266 from UUDigitalHumanitieslab/feature/corpus-m…
Browse files Browse the repository at this point in the history
…odel-validation

Feature/corpus model validation
  • Loading branch information
lukavdplas authored Sep 26, 2023
2 parents 4aa606a + 3542e46 commit fcba57a
Show file tree
Hide file tree
Showing 7 changed files with 372 additions and 13 deletions.
43 changes: 43 additions & 0 deletions backend/addcorpus/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from enum import Enum

CATEGORIES = [
('parliament', 'Parliamentary debates'),
('periodical', 'Newspapers and other periodicals'),
Expand All @@ -11,3 +13,44 @@
'''
Types of data
'''

class MappingType(Enum):
    '''
    Elasticsearch mapping types that are implemented in I-analyzer.

    The value of each member is the name of the mapping type as a string.
    '''

    TEXT = 'text'
    KEYWORD = 'keyword'
    DATE = 'date'
    INTEGER = 'integer'
    FLOAT = 'float'
    BOOLEAN = 'boolean'


class VisualizationType(Enum):
    '''
    Types of visualizations that are available.

    The value of each member is the name of the visualization as a string.
    '''

    RESULTS_COUNT = 'resultscount'
    TERM_FREQUENCY = 'termfrequency'
    NGRAM = 'ngram'
    WORDCLOUD = 'wordcloud'

FORBIDDEN_FIELD_NAMES = [
    'query', 'fields', 'sort', 'highlight',
    'visualize', 'visualizedField', 'normalize',
    'size', 'positions', 'freqCompensation', 'analysis',
    'maxDocuments', 'numberOfNgrams', 'dateField',
]
'''
Names that may not be used as field names: each of them is also a query
parameter in the frontend routes, so a field with the same name would make
routing ambiguous.
'''
23 changes: 22 additions & 1 deletion backend/addcorpus/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@
'''

from datetime import datetime
from addcorpus.constants import MappingType

class Filter(object):
'''
A filter is the interface between the form that is presented to users and
the ElasticSearch filter that is sent to the client.
'''

mapping_types = tuple()
'''accepted mapping types for this filter'''

def __init__(self, description=None):
self.field = None # Must be filled after initialising
self.description = description
Expand All @@ -31,6 +35,8 @@ class DateFilter(Filter):
Filter for datetime values: produces two datepickers for min and max date.
'''

mapping_types = (MappingType.DATE,)

def __init__(self, lower, upper, *nargs, **kwargs):
self.lower = lower
self.upper = upper
Expand All @@ -42,6 +48,8 @@ class RangeFilter(Filter):
Filter for numerical values: produces a slider between two values.
'''

mapping_types = (MappingType.INTEGER, MappingType.FLOAT)

def __init__(self, lower, upper, *nargs, **kwargs):
self.lower = lower
self.upper = upper
Expand All @@ -53,6 +61,11 @@ class MultipleChoiceFilter(Filter):
Filter for keyword values: produces a set of buttons.
'''

mapping_types = (MappingType.KEYWORD,)
# note: the multiple choice filter is implemented as a terms query
# which is also valid for integer/float/boolean/date,
# but those should be rejected so the appropriate filter is used instead

def __init__(self, option_count=10, *nargs, **kwargs):
self.option_count = option_count
# option_count defines how many buckets are retrieved
Expand All @@ -63,9 +76,17 @@ def __init__(self, option_count=10, *nargs, **kwargs):
class BooleanFilter(Filter):
    '''
    Filter for boolean values: produces a drop-down menu.
    '''

    # accepted mapping types for this filter
    mapping_types = (MappingType.BOOLEAN,)

    def __init__(self, true, false, *nargs, **kwargs):
        '''
        Store the `true` and `false` arguments on the filter; any remaining
        arguments are passed on to the parent constructor.

        NOTE(review): `true` and `false` are presumably the labels shown for
        the two options - confirm against the code that renders the filter.
        '''
        self.true = true
        self.false = false
        super().__init__(*nargs, **kwargs)

# For each filter class: its class name, mapped to the string values of the
# mapping types that the filter accepts.
VALID_MAPPINGS = dict(
    (
        filter_class.__name__,
        tuple(mapping_type.value for mapping_type in filter_class.mapping_types),
    )
    for filter_class in (DateFilter, RangeFilter, MultipleChoiceFilter, BooleanFilter)
)
44 changes: 44 additions & 0 deletions backend/addcorpus/migrations/0005_add_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Generated by Django 4.1.9 on 2023-09-13 16:15

import addcorpus.validators
from django.db import migrations, models


class Migration(migrations.Migration):
    '''
    Alters fields of the `corpusconfiguration` and `field` models; every
    altered field declares a validator from `addcorpus.validators`.
    '''

    dependencies = [
        ('addcorpus', '0004_alter_corpusconfiguration_category'),
    ]

    operations = [
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='description_page',
            field=models.CharField(
                blank=True,
                help_text='filename of the markdown documentation file for this corpus',
                max_length=128,
                validators=[addcorpus.validators.validate_markdown_filename_extension],
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='image',
            field=models.CharField(
                help_text='filename of the corpus image',
                max_length=126,
                validators=[addcorpus.validators.validate_image_filename_extension],
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='scan_image_type',
            field=models.CharField(
                blank=True,
                help_text='MIME type of scan images',
                max_length=64,
                validators=[addcorpus.validators.validate_mimetype],
            ),
        ),
        migrations.AlterField(
            model_name='field',
            name='es_mapping',
            field=models.JSONField(
                help_text='specification of the elasticsearch mapping of this field',
                validators=[addcorpus.validators.validate_es_mapping],
            ),
        ),
        migrations.AlterField(
            model_name='field',
            name='name',
            field=models.SlugField(
                help_text='internal name for the field',
                max_length=126,
                validators=[addcorpus.validators.validate_name_is_not_a_route_parameter],
            ),
        ),
        migrations.AlterField(
            model_name='field',
            name='search_filter',
            field=models.JSONField(
                blank=True,
                help_text='specification of the search filter for this field (if any)',
                validators=[addcorpus.validators.validate_search_filter],
            ),
        ),
    ]
57 changes: 45 additions & 12 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField
from django.contrib.auth.models import Group
from django.core.exceptions import ValidationError
import warnings

from addcorpus.constants import CATEGORIES
from addcorpus.validators import validate_language_code
from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validators import validate_language_code, validate_image_filename_extension, \
validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
Expand Down Expand Up @@ -63,6 +68,7 @@ class CorpusConfiguration(models.Model):
description_page = models.CharField(
max_length=128,
blank=True,
validators=[validate_markdown_filename_extension],
help_text='filename of the markdown documentation file for this corpus',
)
description = models.CharField(
Expand All @@ -85,6 +91,7 @@ class CorpusConfiguration(models.Model):
)
image = models.CharField(
max_length=126,
validators=[validate_image_filename_extension],
help_text='filename of the corpus image',
)
languages = ArrayField(
Expand All @@ -104,6 +111,7 @@ class CorpusConfiguration(models.Model):
scan_image_type = models.CharField(
max_length=64,
blank=True,
validators=[validate_mimetype],
help_text='MIME type of scan images',
)
title = models.CharField(
Expand All @@ -120,29 +128,33 @@ def __str__(self):

# Display type choices: 'text_content', plus one choice per mapping type.
# The keys of the mapping-type choices are taken from the MappingType enum
# so they cannot drift from the mapping type names.
FIELD_DISPLAY_TYPES = [
    ('text_content', 'text content'),
    (MappingType.TEXT.value, 'text'),
    (MappingType.KEYWORD.value, 'keyword'),
    (MappingType.DATE.value, 'date'),
    (MappingType.INTEGER.value, 'integer'),
    (MappingType.FLOAT.value, 'float'),
    (MappingType.BOOLEAN.value, 'boolean'),
]

# The keys are taken from the VisualizationType enum so they cannot drift
# from the visualization names defined there.
FIELD_VISUALIZATIONS = [
    (VisualizationType.RESULTS_COUNT.value, 'Number of results'),
    (VisualizationType.TERM_FREQUENCY.value, 'Frequency of the search term'),
    (VisualizationType.NGRAM.value, 'Neighbouring words'),
    (VisualizationType.WORDCLOUD.value, 'Most frequent words'),
]
'''Options for `visualizations` field'''

VISUALIZATION_SORT_OPTIONS = [
    ('key', 'By the value of the field'),
    ('value', 'By frequency'),
]
'''Choices for the `visualization_sort` field, as (key, label) pairs'''


class Field(models.Model):
name = models.SlugField(
max_length=MAX_LENGTH_NAME,
validators=[validate_name_is_not_a_route_parameter],
help_text='internal name for the field',
)
corpus_configuration = models.ForeignKey(
Expand All @@ -167,6 +179,7 @@ class Field(models.Model):
)
search_filter = models.JSONField(
blank=True,
validators=[validate_search_filter],
help_text='specification of the search filter for this field (if any)',
)
results_overview = models.BooleanField(
Expand Down Expand Up @@ -197,6 +210,7 @@ class Field(models.Model):
help_text='if the field has results/term frequency charts: how is the x-axis sorted?',
)
es_mapping = models.JSONField(
validators=[validate_es_mapping],
help_text='specification of the elasticsearch mapping of this field',
)
indexed = models.BooleanField(
Expand Down Expand Up @@ -236,3 +250,22 @@ class Meta:

def __str__(self) -> str:
    '''Return the field name, followed by the name of its corpus in parentheses.'''
    return '{} ({})'.format(self.name, self.corpus_configuration.corpus.name)

def clean(self):
    '''
    Validate combinations of attributes that cannot be checked per attribute.

    Checks the search filter and the visualizations (when set) and the
    `searchable` flag against the elasticsearch mapping, and checks that
    the primary sort field is sortable and core download fields are
    downloadable. A core search field that is not searchable only produces
    a warning.
    '''
    mapping = self.es_mapping

    validate_searchable_field_has_full_text_search(mapping, self.searchable)

    if self.search_filter:
        validate_search_filter_with_mapping(mapping, self.search_filter)

    if self.visualizations:
        validate_visualizations_with_mapping(mapping, self.visualizations)

    validate_implication(
        self.primary_sort, self.sortable,
        "The primary sorting field must be sortable"
    )
    validate_implication(
        self.csv_core, self.downloadable,
        "Core download fields must be downloadable"
    )

    # Core search fields must be searchable. This is reported as a warning
    # rather than an error, because not all corpora currently satisfy it.
    try:
        validate_implication(
            self.search_field_core, self.searchable,
            "Core search fields must be searchable"
        )
    except ValidationError as error:
        warnings.warn(error.message)
73 changes: 73 additions & 0 deletions backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pytest
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping
from addcorpus.validators import *

def test_validate_mimetype():
    '''Accept 'image/jpeg'; reject a string that is not a MIME type.'''
    validate_mimetype('image/jpeg')

    with pytest.raises(ValidationError):
        validate_mimetype('nonsense')

def test_validate_es_mapping():
    '''Accept a 'text' mapping; reject an empty mapping and an unknown type.'''
    validate_es_mapping({'type': 'text'})

    for invalid_mapping in ({}, {'type': 'perlocator'}):
        with pytest.raises(ValidationError):
            validate_es_mapping(invalid_mapping)

def test_validate_search_filter():
    '''Accept a range filter specification; reject an unknown filter name.'''
    range_filter = {
        'name': 'RangeFilter',
        'lower': 0,
        'upper': 100,
        'description': '...'
    }
    validate_search_filter(range_filter)

    unknown_filter = {'name': 'UnkownFilter'}
    with pytest.raises(ValidationError):
        validate_search_filter(unknown_filter)

def test_validate_search_filter_with_mapping():
    '''A range filter is accepted on an integer mapping, not on a keyword mapping.'''
    range_filter = dict(
        name='RangeFilter',
        lower=0,
        upper=100,
        description='...',
    )

    validate_search_filter_with_mapping(int_mapping(), range_filter)

    with pytest.raises(ValidationError):
        validate_search_filter_with_mapping(keyword_mapping(), range_filter)

def test_validate_visualizations_with_mapping():
    '''Check which visualizations are accepted for text and keyword mappings.'''
    accepted = (
        (text_mapping(), ['ngram']),
        (keyword_mapping(), ['resultscount']),
        (keyword_mapping(enable_full_text_search=True), ['ngram']),
    )
    for mapping, visualizations in accepted:
        validate_visualizations_with_mapping(mapping, visualizations)

    rejected = (
        (keyword_mapping(), ['ngram']),
        (text_mapping(), ['resultscount']),
    )
    for mapping, visualizations in rejected:
        with pytest.raises(ValidationError):
            validate_visualizations_with_mapping(mapping, visualizations)

def test_validate_searchable_fields_has_fts():
    '''
    Check the validation of the `searchable` flag against the mapping:
    an integer mapping marked as searchable is an error, a plain keyword
    mapping marked as searchable gives a warning.
    '''
    accepted = (
        (text_mapping(), True),
        (keyword_mapping(enable_full_text_search=True), True),
        (int_mapping(), False),
    )
    for mapping, searchable in accepted:
        validate_searchable_field_has_full_text_search(mapping, searchable)

    with pytest.raises(ValidationError):
        validate_searchable_field_has_full_text_search(int_mapping(), True)

    with pytest.warns(Warning):
        validate_searchable_field_has_full_text_search(keyword_mapping(), True)

def test_filename_validation():
    '''Accept a '.jpg' filename as an image; reject a '.txt' filename.'''
    validate_image_filename_extension('image.jpg')

    with pytest.raises(ValidationError):
        validate_image_filename_extension('image.txt')

Loading

0 comments on commit fcba57a

Please sign in to comment.