Skip to content

Commit

Permalink
Merge branch 'develop' into feature/results-model
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas committed Nov 21, 2023
2 parents a363af7 + f99068f commit 728f4f8
Show file tree
Hide file tree
Showing 99 changed files with 5,675 additions and 265 deletions.
36 changes: 0 additions & 36 deletions .github/ISSUE_TEMPLATE/bug_report.md

This file was deleted.

66 changes: 66 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
name: Bug report
description: Let us know that something isn't working right
labels:
- bug
body:
- type: markdown
attributes:
value: |
Thank you for making a bug report! Please fill in this information so we can get to the
bottom of your issue.
- type: textarea
id: what-happened
attributes:
label: What went wrong?
description: Please describe what happened.
validations:
required: true
- type: textarea
id: expected
attributes:
label: What did you expect to happen?
validations:
required: true
- type: textarea
id: screenshot
attributes:
label: Screenshot
description: If you can make a screenshot of the issue, please include it!
validations:
required: false
- type: checkboxes
id: instance
attributes:
label: Where did you find the bug?
description: Please add where you found the bug.
options:
- label: https://ianalyzer.hum.uu.nl
- label: https://peopleandparliament.hum.uu.nl
- label: https://peace.sites.uu.nl
- label: a server hosted elsewhere (i.e. not by the research software lab)
- label: a local server
validations:
required: true
- type: input
id: version
attributes:
label: Version
description: |
For third-party and local servers, please add information about the version of the
software, if you know it. A version number (e.g. "1.2.3") is great. For a pre-release
build, you can provide the branch or commit hash.
validations:
required: false
- type: textarea
id: to-reproduce
attributes:
label: Steps to reproduce
description: |
How can a developer replicate the issue? Please provide any information you can. For
example: "I went to
https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then
clicked on Download CSV. I pressed cancel and then I clicked Download CSV again."
validations:
required: true
---
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Base image
FROM python:3.8-buster
FROM docker.io/library/python:3.8-buster
# Setting this means stdout and stderr streams are sent to terminal in real time
ENV PYTHONUNBUFFERED 1
# Get required libraries for xmlsec
Expand Down
60 changes: 42 additions & 18 deletions backend/addcorpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
from os.path import isdir

from django.conf import settings
from langcodes import Language, standardize_tag

from addcorpus.constants import CATEGORIES

import logging

Expand All @@ -33,37 +30,37 @@ class CorpusDefinition(object):
@property
def title(self):
'''
Path to source data directory.
Title of the corpus
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing title')

@property
def description(self):
'''
Short description of the corpus
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing description')

@property
def data_directory(self):
'''
Path to source data directory.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing data_directory')

@property
def min_date(self):
'''
Minimum timestamp for data files.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing min_date')

@property
def max_date(self):
'''
Maximum timestamp for data files.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing max_date')


'''
Expand All @@ -81,14 +78,14 @@ def category(self):
See addcorpus.constants.CATEGORIES for options
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing category')

@property
def es_index(self):
'''
ElasticSearch index name.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing es_index')

'''
Elasticsearch alias. Defaults to None.
Expand All @@ -111,7 +108,7 @@ def fields(self):
the `Field` class, containing information about each attribute.
MUST include a field with `name='id'`.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing fields')


'''
Expand Down Expand Up @@ -139,7 +136,7 @@ def image(self):
Name of the corpus image. Should be relative path from a directory 'images'
in the same directory as the corpus definition file.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing image')

'''
MIME type of scanned documents (images)
Expand Down Expand Up @@ -241,15 +238,15 @@ def sources(self, start=datetime.min, end=datetime.max):
empty or contains only a timestamp; but any data that is to be
extracted without reading the file itself can be specified there.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing sources')

def source2dicts(self, sources):
'''
Generate an iterator of document dictionaries from a given source file.
The dictionaries are created from this corpus' `Field`s.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing source2dicts')

def documents(self, sources=None):
'''
Expand All @@ -274,6 +271,31 @@ def _reject_extractors(self, *inapplicable_extractors):
if isinstance(field.extractor, inapplicable_extractors):
raise RuntimeError(
"Specified extractor method cannot be used with this type of data")

class ParentCorpusDefinition(CorpusDefinition):
''' A class from which other corpus definitions can inherit.
This class is in charge of setting fields, usually without defining an extractor.
The subclassed CorpusDefinitions will set extractors on the fields -
this way, CorpusDefinitions can share the same mappings and filters,
while the logic to collect sources and populate the fields can be different.
The ParentCorpusDefinition can also be used to allow cross-corpus search and filtering.
'''
#define fields property so it can be set in __init__
@property
def fields(self):
return self._fields

@fields.setter
def fields(self, value):
self._fields = value

def __init__(self):
''' Specify a list of fields which all subclasses share
A subclass of ParentCorpusDefinition will provide extractors for the fields,
and potentially prune down the list of fields to those which have an extractor
'''
self.fields = []


class XMLCorpusDefinition(CorpusDefinition):
'''
Expand Down Expand Up @@ -309,7 +331,7 @@ def source2dicts(self, source):
default implementation for XML layouts; may be subclassed if more
'''
# Make sure that extractors are sensible
self._reject_extractors(extract.HTML, extract.CSV)
self._reject_extractors(extract.CSV)

# extract information from external xml files first, if applicable
metadata = {}
Expand Down Expand Up @@ -519,7 +541,7 @@ def source2dicts(self, source):
'''
(filename, metadata) = source

self._reject_extractors(extract.XML, extract.CSV)
self._reject_extractors(extract.CSV)

# Loading HTML
logger.info('Reading HTML file {} ...'.format(filename))
Expand Down Expand Up @@ -594,7 +616,7 @@ class CSVCorpusDefinition(CorpusDefinition):
def source2dicts(self, source):
# make sure the field size is as big as the system permits
csv.field_size_limit(sys.maxsize)
self._reject_extractors(extract.XML, extract.HTML)
self._reject_extractors(extract.XML, extract.FilterAttribute)

if isinstance(source, str):
filename = source
Expand Down Expand Up @@ -693,6 +715,7 @@ def __init__(self,
visualizations=[],
visualization_sort=None,
es_mapping={'type': 'text'},
language=None,
search_filter=None,
extractor=extract.Constant(None),
sortable=None,
Expand All @@ -716,6 +739,7 @@ def __init__(self,
self.visualizations = visualizations
self.visualization_sort = visualization_sort
self.es_mapping = es_mapping
self.language = language
self.indexed = indexed
self.hidden = not indexed or hidden
self.extractor = extractor
Expand Down
14 changes: 8 additions & 6 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
from addcorpus.es_settings import add_language_string

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
'''
Mapping for the main content field. Options:
- `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
- `stopword_analysis`: enables analysis using stopword removal. Requires setting a `clean` analyser in the `es_settings` of the corpus.
- `stemming_analysis`: enables analysis using stemming. Requires a `stemmed` analyser in the `es_settings` for the corpus.
- 'updated_highlighting': enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
- `stopword_analysis`: enables analysis using stopword removal.
- `stemming_analysis`: enables analysis using stemming.
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
'''

mapping = {
Expand All @@ -27,13 +29,13 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
if stopword_analysis:
multifields['clean'] = {
"type": "text",
"analyzer": "clean",
"analyzer": add_language_string('clean', language),
"term_vector": "with_positions_offsets" # include character positions for highlighting
}
if stemming_analysis:
multifields['stemmed'] = {
"type": "text",
"analyzer": "stemmed",
"analyzer": add_language_string('stemmed', language),
"term_vector": "with_positions_offsets",
}
mapping['fields'] = multifields
Expand Down
Loading

0 comments on commit 728f4f8

Please sign in to comment.