Skip to content

Commit

Permalink
Merge pull request #1284 from UUDigitalHumanitieslab/bugfix/remove-ng…
Browse files Browse the repository at this point in the history
…rams-from-dbnl

Bugfix/remove ngrams from dbnl
  • Loading branch information
lukavdplas authored Oct 11, 2023
2 parents 0aea109 + c277cb1 commit c674f9a
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 4 deletions.
8 changes: 7 additions & 1 deletion backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from addcorpus.validators import validate_language_code, validate_image_filename_extension, \
validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication
validate_visualizations_with_mapping, validate_implication, any_date_fields, visualisations_require_date_field

# Length limits for names and descriptions.
# NOTE(review): presumably used as max_length on the model fields below — confirm.
MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
Expand Down Expand Up @@ -269,3 +269,9 @@ def clean(self):
validate_implication(self.search_field_core, self.searchable, "Core search fields must be searchable")
except ValidationError as e:
warnings.warn(e.message)

validate_implication(
self.visualizations, self.corpus_configuration.fields.all(),
'The ngram visualisation requires a date field on the corpus',
visualisations_require_date_field, any_date_fields,
)
32 changes: 31 additions & 1 deletion backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping
from addcorpus.models import Field
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping, main_content_mapping, date_mapping
from addcorpus.validators import *

def test_validate_mimetype():
Expand Down Expand Up @@ -71,3 +72,32 @@ def test_filename_validation():
with pytest.raises(ValidationError):
validate_image_filename_extension('image.txt')

def test_validate_ngram_has_date_field():
    """The ngram visualisation passes validation only when a date field is among the fields."""
    content = Field(
        name='content',
        es_mapping=main_content_mapping(),
        visualizations=['wordcloud', 'ngram']
    )
    date = Field(name='date', es_mapping=date_mapping())

    def check(fields):
        # Same implication check in both scenarios; only the field list differs.
        validate_implication(
            content.visualizations,
            fields,
            '',
            visualisations_require_date_field,
            any_date_fields,
        )

    # A date field is present: validation must not raise.
    check([content, date])

    # No date field: the implication is violated.
    with pytest.raises(ValidationError):
        check([content])
7 changes: 7 additions & 0 deletions backend/addcorpus/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,10 @@ def validate_markdown_filename_extension(filename):
def validate_image_filename_extension(filename):
    """Validate `filename` against the accepted image file extensions.

    Delegates to `validate_filename_extension` with the list of allowed
    extensions.
    """
    validate_filename_extension(filename, ['.jpeg', '.jpg', '.png', '.JPG'])

def any_date_fields(fields):
    """Return whether at least one of `fields` is a date field.

    A field counts as a date field when `primary_mapping_type` applied to
    its `es_mapping` gives 'date'.

    Parameters:
        fields: iterable of objects exposing an `es_mapping` attribute.

    Returns:
        True if any field is a date field; False otherwise, including when
        `fields` is empty.
    """
    return any(
        primary_mapping_type(field.es_mapping) == 'date'
        for field in fields
    )

def visualisations_require_date_field(visualisations):
    """Return whether the given visualisations need a date field on the corpus.

    Only the 'ngram' visualisation is treated as requiring a date field.

    Parameters:
        visualisations: collection of visualisation names; may be None or
            empty.

    Returns:
        True if 'ngram' is listed, False otherwise. The result is always a
        real bool, so a None or empty input is not passed back to the caller.
    """
    if not visualisations:
        return False
    return 'ngram' in visualisations
2 changes: 1 addition & 1 deletion backend/corpora/dbnl/dbnl.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def _xml_files(self):
transform_soup_func=utils.pad_content,
),
es_mapping=main_content_mapping(token_counts=True),
visualizations=['wordcloud', 'ngram'],
visualizations=['wordcloud'],
)

has_content = FieldDefinition(
Expand Down
2 changes: 1 addition & 1 deletion documentation/Defining-corpus-fields.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ The following properties determine how a field appears in the interface.

`search_filter` can be set if the interface should include a search filter widget for the field. I-analyzer includes date filters, multiplechoice filters (used for keyword data), range filters, and boolean filters. See [filters.py](../backend/addcorpus/filters.py).

`visualizations` optionally specifies a list of visualisations that apply for the field. Generally speaking, this is based on the type of data. For date fields and categorical/ordinal fields (usually keyword type), you can use `['resultscount', 'termfrequency']`. For text fields, you can use `['wordcloud', 'ngram']`.
`visualizations` optionally specifies a list of visualisations that apply for the field. Generally speaking, this is based on the type of data. For date fields and categorical/ordinal fields (usually keyword type), you can use `['resultscount', 'termfrequency']`. For text fields, you can use `['wordcloud', 'ngram']`. Note, however, that the ngram visualisation requires the corpus to have at least one date field; only include `'ngram'` if that is the case.

If a field includes the `'resultscount'` and/or `'termfrequency'` visualisations and it is not a date field, you can also specify `visualisation_sort`, which determines how to sort the x-axis of the graph. Default is `'value'`, where categories are sorted based on the y-axis value (i.e., frequency). You may specify that they should be sorted on `'key'`, so that categories are sorted alphabetically (for keywords) or small-to-large (for numbers).

Expand Down

0 comments on commit c674f9a

Please sign in to comment.