Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/corpus model validation #1266

Merged
merged 7 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions backend/addcorpus/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from enum import Enum

CATEGORIES = [
('parliament', 'Parliamentary debates'),
('periodical', 'Newspapers and other periodicals'),
Expand All @@ -11,3 +13,44 @@
'''
Types of data
'''

class MappingType(Enum):
    '''Elasticsearch mapping types that are implemented in I-analyzer'''

    # Each member's value is the string used as the `type` of an
    # Elasticsearch field mapping.
    TEXT = 'text'
    KEYWORD = 'keyword'
    DATE = 'date'
    INTEGER = 'integer'
    FLOAT = 'float'
    BOOLEAN = 'boolean'


class VisualizationType(Enum):
    '''Types of visualisations available'''

    # Each member's value is the identifier string of the visualisation.
    RESULTS_COUNT = 'resultscount'
    TERM_FREQUENCY = 'termfrequency'
    NGRAM = 'ngram'
    WORDCLOUD = 'wordcloud'

# Field names that cannot be used because they are also query parameters in
# frontend routes. Using them would make routing ambiguous.
FORBIDDEN_FIELD_NAMES = [
    'query', 'fields', 'sort', 'highlight',
    'visualize', 'visualizedField', 'normalize', 'size',
    'positions', 'freqCompensation', 'analysis', 'maxDocuments',
    'numberOfNgrams', 'dateField',
]
23 changes: 22 additions & 1 deletion backend/addcorpus/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@
'''

from datetime import datetime
from addcorpus.constants import MappingType

class Filter(object):
'''
A filter is the interface between the form that is presented to users and
the ElasticSearch filter that is sent to the client.
'''

mapping_types = tuple()
'''accepted mapping types for this filter'''

def __init__(self, description=None):
self.field = None # Must be filled after initialising
self.description = description
Expand All @@ -31,6 +35,8 @@ class DateFilter(Filter):
Filter for datetime values: produces two datepickers for min and max date.
'''

mapping_types = (MappingType.DATE,)

def __init__(self, lower, upper, *nargs, **kwargs):
self.lower = lower
self.upper = upper
Expand All @@ -42,6 +48,8 @@ class RangeFilter(Filter):
Filter for numerical values: produces a slider between two values.
'''

mapping_types = (MappingType.INTEGER, MappingType.FLOAT)

def __init__(self, lower, upper, *nargs, **kwargs):
self.lower = lower
self.upper = upper
Expand All @@ -53,6 +61,11 @@ class MultipleChoiceFilter(Filter):
Filter for keyword values: produces a set of buttons.
'''

mapping_types = (MappingType.KEYWORD,)
# note: the multiple choice filter is implemented as a terms query
# which is also valid for integer/float/boolean/date,
# but those should be rejected so the appropriate filter is used instead

def __init__(self, option_count=10, *nargs, **kwargs):
self.option_count = option_count
# option_count defines how many buckets are retrieved
Expand All @@ -63,9 +76,17 @@ def __init__(self, option_count=10, *nargs, **kwargs):
class BooleanFilter(Filter):
    '''
    Filter for boolean values: produces a drop-down menu.
    '''

    # accepted mapping types for this filter: boolean fields only
    mapping_types = (MappingType.BOOLEAN,)

    def __init__(self, true, false, *nargs, **kwargs):
        '''
        Store the `true` and `false` arguments on the filter and hand all
        remaining arguments to `Filter.__init__`.

        NOTE(review): `true` and `false` are presumably the labels shown for
        the two options in the menu - confirm against the frontend.
        '''
        self.true = true
        self.false = false
        super().__init__(*nargs, **kwargs)

# Lookup table: name of each filter class -> tuple with the values of the
# mapping types that the filter accepts.
VALID_MAPPINGS = {
    filter_class.__name__: tuple(
        mapping_type.value for mapping_type in filter_class.mapping_types
    )
    for filter_class in (
        DateFilter,
        RangeFilter,
        MultipleChoiceFilter,
        BooleanFilter,
    )
}
44 changes: 44 additions & 0 deletions backend/addcorpus/migrations/0005_add_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Generated by Django 4.1.9 on 2023-09-13 16:15

import addcorpus.validators
from django.db import migrations, models


class Migration(migrations.Migration):
    # Alters existing columns of `corpusconfiguration` and `field` so that
    # each one carries a validator from `addcorpus.validators`.

    dependencies = [
        ('addcorpus', '0004_alter_corpusconfiguration_category'),
    ]

    operations = [
        # --- corpusconfiguration ---
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='description_page',
            field=models.CharField(
                blank=True,
                help_text='filename of the markdown documentation file for this corpus',
                max_length=128,
                validators=[addcorpus.validators.validate_markdown_filename_extension],
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='image',
            field=models.CharField(
                help_text='filename of the corpus image',
                max_length=126,
                validators=[addcorpus.validators.validate_image_filename_extension],
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='scan_image_type',
            field=models.CharField(
                blank=True,
                help_text='MIME type of scan images',
                max_length=64,
                validators=[addcorpus.validators.validate_mimetype],
            ),
        ),
        # --- field ---
        migrations.AlterField(
            model_name='field',
            name='es_mapping',
            field=models.JSONField(
                help_text='specification of the elasticsearch mapping of this field',
                validators=[addcorpus.validators.validate_es_mapping],
            ),
        ),
        migrations.AlterField(
            model_name='field',
            name='name',
            field=models.SlugField(
                help_text='internal name for the field',
                max_length=126,
                validators=[addcorpus.validators.validate_name_is_not_a_route_parameter],
            ),
        ),
        migrations.AlterField(
            model_name='field',
            name='search_filter',
            field=models.JSONField(
                blank=True,
                help_text='specification of the search filter for this field (if any)',
                validators=[addcorpus.validators.validate_search_filter],
            ),
        ),
    ]
57 changes: 45 additions & 12 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField
from django.contrib.auth.models import Group
from django.core.exceptions import ValidationError
import warnings

from addcorpus.constants import CATEGORIES
from addcorpus.validators import validate_language_code
from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validators import validate_language_code, validate_image_filename_extension, \
validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
Expand Down Expand Up @@ -63,6 +68,7 @@ class CorpusConfiguration(models.Model):
description_page = models.CharField(
max_length=128,
blank=True,
validators=[validate_markdown_filename_extension],
help_text='filename of the markdown documentation file for this corpus',
)
description = models.CharField(
Expand All @@ -85,6 +91,7 @@ class CorpusConfiguration(models.Model):
)
image = models.CharField(
max_length=126,
validators=[validate_image_filename_extension],
help_text='filename of the corpus image',
)
languages = ArrayField(
Expand All @@ -104,6 +111,7 @@ class CorpusConfiguration(models.Model):
scan_image_type = models.CharField(
max_length=64,
blank=True,
validators=[validate_mimetype],
help_text='MIME type of scan images',
)
title = models.CharField(
Expand All @@ -120,29 +128,33 @@ def __str__(self):

# (value, label) pairs for the display types of a field.
# Apart from 'text_content', every value is taken from `MappingType`, so the
# stored strings stay in sync with the implemented Elasticsearch mapping types.
FIELD_DISPLAY_TYPES = [
    ('text_content', 'text content'),
    (MappingType.TEXT.value, 'text'),
    (MappingType.KEYWORD.value, 'keyword'),
    (MappingType.DATE.value, 'date'),
    (MappingType.INTEGER.value, 'integer'),
    (MappingType.FLOAT.value, 'float'),
    (MappingType.BOOLEAN.value, 'boolean'),
]

# Options for `visualizations` field: (value, label) pairs.
# Values are taken from `VisualizationType` rather than repeated as literals.
FIELD_VISUALIZATIONS = [
    (VisualizationType.RESULTS_COUNT.value, 'Number of results'),
    (VisualizationType.TERM_FREQUENCY.value, 'Frequency of the search term'),
    (VisualizationType.NGRAM.value, 'Neighbouring words'),
    (VisualizationType.WORDCLOUD.value, 'Most frequent words'),
]

# Options for `visualization_sort` field: (value, label) pairs.
VISUALIZATION_SORT_OPTIONS = [
    (
        'key',
        'By the value of the field',
    ),
    (
        'value',
        'By frequency',
    ),
]


class Field(models.Model):
name = models.SlugField(
max_length=MAX_LENGTH_NAME,
validators=[validate_name_is_not_a_route_parameter],
help_text='internal name for the field',
)
corpus_configuration = models.ForeignKey(
Expand All @@ -167,6 +179,7 @@ class Field(models.Model):
)
search_filter = models.JSONField(
blank=True,
validators=[validate_search_filter],
help_text='specification of the search filter for this field (if any)',
)
results_overview = models.BooleanField(
Expand Down Expand Up @@ -197,6 +210,7 @@ class Field(models.Model):
help_text='if the field has results/term frequency charts: how is the x-axis sorted?',
)
es_mapping = models.JSONField(
validators=[validate_es_mapping],
help_text='specification of the elasticsearch mapping of this field',
)
indexed = models.BooleanField(
Expand Down Expand Up @@ -236,3 +250,22 @@ class Meta:

def __str__(self) -> str:
    '''Return the field name followed by the name of its corpus in parentheses.'''
    field_name = self.name
    corpus_name = self.corpus_configuration.corpus.name
    return '{} ({})'.format(field_name, corpus_name)

def clean(self):
    '''
    Run the validation checks that depend on more than one attribute.

    The mapping is checked against the `searchable` flag, the search filter
    and the visualizations; the last two only when they are set. Two
    implications are enforced after that. A third implication, that core
    search fields are searchable, only produces a warning when violated.
    '''
    mapping = self.es_mapping

    validate_searchable_field_has_full_text_search(mapping, self.searchable)

    # an empty filter or an empty list of visualizations needs no checking
    if self.search_filter:
        validate_search_filter_with_mapping(mapping, self.search_filter)
    if self.visualizations:
        validate_visualizations_with_mapping(mapping, self.visualizations)

    # hard requirements
    validate_implication(
        self.primary_sort,
        self.sortable,
        "The primary sorting field must be sortable",
    )
    validate_implication(
        self.csv_core,
        self.downloadable,
        "Core download fields must be downloadable",
    )

    # Soft requirement: core search fields must be searchable.
    # This is reported as a warning instead of an error because it is not
    # currently satisfied in all corpora.
    try:
        validate_implication(
            self.search_field_core,
            self.searchable,
            "Core search fields must be searchable",
        )
    except ValidationError as error:
        warnings.warn(error.message)
73 changes: 73 additions & 0 deletions backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pytest
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping
from addcorpus.validators import *

def test_validate_mimetype():
    '''`image/jpeg` passes validation; a string that is no MIME type is rejected.'''
    accepted = 'image/jpeg'
    rejected = 'nonsense'

    validate_mimetype(accepted)

    with pytest.raises(ValidationError):
        validate_mimetype(rejected)

def test_validate_es_mapping():
    '''A text mapping passes; an empty mapping or one with an unknown type is rejected.'''
    validate_es_mapping({'type': 'text'})

    for invalid_mapping in ({}, {'type': 'perlocator'}):
        with pytest.raises(ValidationError):
            validate_es_mapping(invalid_mapping)

def test_validate_search_filter():
    '''A range filter specification passes; an unknown filter name is rejected.'''
    range_filter = {
        'name': 'RangeFilter',
        'lower': 0,
        'upper': 100,
        'description': '...'
    }
    unknown_filter = {'name': 'UnkownFilter'}

    validate_search_filter(range_filter)

    with pytest.raises(ValidationError):
        validate_search_filter(unknown_filter)

def test_validate_search_filter_with_mapping():
    '''A range filter is accepted on an integer mapping and rejected on a keyword mapping.'''
    range_filter = dict(
        name='RangeFilter',
        lower=0,
        upper=100,
        description='...',
    )

    # matching mapping type: no error
    validate_search_filter_with_mapping(int_mapping(), range_filter)

    # mismatching mapping type: rejected
    with pytest.raises(ValidationError):
        validate_search_filter_with_mapping(keyword_mapping(), range_filter)

def test_validate_visualizations_with_mapping():
    '''Check which visualizations are accepted for text and keyword mappings.'''
    ngram = ['ngram']
    results_count = ['resultscount']

    # accepted combinations
    validate_visualizations_with_mapping(text_mapping(), ngram)
    validate_visualizations_with_mapping(keyword_mapping(), results_count)
    validate_visualizations_with_mapping(
        keyword_mapping(enable_full_text_search=True), ngram
    )

    # ngram on a keyword mapping without full text search is rejected
    with pytest.raises(ValidationError):
        validate_visualizations_with_mapping(keyword_mapping(), ngram)

    # results count on a text mapping is rejected
    with pytest.raises(ValidationError):
        validate_visualizations_with_mapping(text_mapping(), results_count)

def test_validate_searchable_fields_has_fts():
    '''Check the validator that ties the `searchable` flag to the field mapping.'''
    check = validate_searchable_field_has_full_text_search

    # accepted: searchable text, searchable keyword with full text search,
    # and a non-searchable integer
    check(text_mapping(), True)
    check(keyword_mapping(enable_full_text_search=True), True)
    check(int_mapping(), False)

    # a searchable integer field is an error
    with pytest.raises(ValidationError):
        check(int_mapping(), True)

    # a searchable keyword field without full text search only warns
    with pytest.warns(Warning):
        check(keyword_mapping(), True)

def test_filename_validation():
    '''`image.jpg` is accepted as an image filename; `image.txt` is rejected.'''
    accepted = 'image.jpg'
    rejected = 'image.txt'

    validate_image_filename_extension(accepted)

    with pytest.raises(ValidationError):
        validate_image_filename_extension(rejected)

Loading