Skip to content

Commit

Permalink
Merge branch 'develop' into feature/results-model
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas committed Nov 21, 2023
2 parents a363af7 + f99068f commit 728f4f8
Show file tree
Hide file tree
Showing 99 changed files with 5,675 additions and 265 deletions.
36 changes: 0 additions & 36 deletions .github/ISSUE_TEMPLATE/bug_report.md

This file was deleted.

66 changes: 66 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
name: Bug report
description: Let us know that something isn't working right
labels:
- bug
body:
- type: markdown
attributes:
value: |
Thank you for making a bug report! Please fill in this information so we can get to the
bottom of your issue.
- type: textarea
id: what-happened
attributes:
label: What went wrong?
description: Please describe what happened.
validations:
required: true
- type: textarea
id: expected
attributes:
label: What did you expect to happen?
validations:
required: true
- type: textarea
id: screenshot
attributes:
label: Screenshot
description: If you can make a screenshot of the issue, please include it!
validations:
required: false
- type: checkboxes
id: instance
attributes:
label: Where did you find the bug?
description: Please add where you found the bug.
options:
- label: https://ianalyzer.hum.uu.nl
- label: https://peopleandparliament.hum.uu.nl
- label: https://peace.sites.uu.nl
- label: a server hosted elsewhere (i.e. not by the research software lab)
- label: a local server
validations:
required: true
- type: input
id: version
attributes:
label: Version
description: |
For third-party and local servers, please add information about the version of the
software, if you know it. A version number (e.g. "1.2.3") is great. For a pre-release
build, you can provide the branch or commit hash.
validations:
required: false
- type: textarea
id: to-reproduce
attributes:
label: Steps to reproduce
description: |
How can a developer replicate the issue? Please provide any information you can. For
example: "I went to
https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then
clicked on Download CSV. I pressed cancel and then I clicked Download CSV again."
validations:
required: true
---
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Base image
FROM python:3.8-buster
FROM docker.io/library/python:3.8-buster
# Setting this means stdout and stderr streams are sent to terminal in real time
ENV PYTHONUNBUFFERED 1
# Get required libraries for xmlsec
Expand Down
60 changes: 42 additions & 18 deletions backend/addcorpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
from os.path import isdir

from django.conf import settings
from langcodes import Language, standardize_tag

from addcorpus.constants import CATEGORIES

import logging

Expand All @@ -33,37 +30,37 @@ class CorpusDefinition(object):
@property
def title(self):
'''
Path to source data directory.
Title of the corpus
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing title')

@property
def description(self):
'''
Short description of the corpus
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing description')

@property
def data_directory(self):
'''
Path to source data directory.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing data_directory')

@property
def min_date(self):
'''
Minimum timestamp for data files.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing min_date')

@property
def max_date(self):
'''
Maximum timestamp for data files.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing max_date')


'''
Expand All @@ -81,14 +78,14 @@ def category(self):
See addcorpus.constants.CATEGORIES for options
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing category')

@property
def es_index(self):
'''
ElasticSearch index name.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing es_index')

'''
Elasticsearch alias. Defaults to None.
Expand All @@ -111,7 +108,7 @@ def fields(self):
the `Field` class, containing information about each attribute.
MUST include a field with `name='id'`.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing fields')


'''
Expand Down Expand Up @@ -139,7 +136,7 @@ def image(self):
Name of the corpus image. Should be relative path from a directory 'images'
in the same directory as the corpus definition file.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing image')

'''
MIME type of scanned documents (images)
Expand Down Expand Up @@ -241,15 +238,15 @@ def sources(self, start=datetime.min, end=datetime.max):
empty or contains only a timestamp; but any data that is to be
extracted without reading the file itself can be specified there.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing sources')

def source2dicts(self, sources):
'''
Generate an iterator of document dictionaries from a given source file.
The dictionaries are created from this corpus' `Field`s.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing source2dicts')

def documents(self, sources=None):
'''
Expand All @@ -274,6 +271,31 @@ def _reject_extractors(self, *inapplicable_extractors):
if isinstance(field.extractor, inapplicable_extractors):
raise RuntimeError(
"Specified extractor method cannot be used with this type of data")

class ParentCorpusDefinition(CorpusDefinition):
''' A class from which other corpus definitions can inherit.
This class is in charge of setting fields, usually without defining an extractor.
The subclassed CorpusDefinitions will set extractors on the fields -
this way, CorpusDefinitions can share the same mappings and filters,
while the logic to collect sources and populate the fields can be different.
The ParentCorpusDefinition can also be used to allow cross-corpus search and filtering.
'''
#define fields property so it can be set in __init__
@property
def fields(self):
return self._fields

@fields.setter
def fields(self, value):
self._fields = value

def __init__(self):
''' Specify a list of fields which all subclasses share
A subclass of ParentCorpusDefinition will provide extractors for the fields,
and potentially prune down the list of fields to those which have an extractor
'''
self.fields = []


class XMLCorpusDefinition(CorpusDefinition):
'''
Expand Down Expand Up @@ -309,7 +331,7 @@ def source2dicts(self, source):
default implementation for XML layouts; may be subclassed if more
'''
# Make sure that extractors are sensible
self._reject_extractors(extract.HTML, extract.CSV)
self._reject_extractors(extract.CSV)

# extract information from external xml files first, if applicable
metadata = {}
Expand Down Expand Up @@ -519,7 +541,7 @@ def source2dicts(self, source):
'''
(filename, metadata) = source

self._reject_extractors(extract.XML, extract.CSV)
self._reject_extractors(extract.CSV)

# Loading HTML
logger.info('Reading HTML file {} ...'.format(filename))
Expand Down Expand Up @@ -594,7 +616,7 @@ class CSVCorpusDefinition(CorpusDefinition):
def source2dicts(self, source):
# make sure the field size is as big as the system permits
csv.field_size_limit(sys.maxsize)
self._reject_extractors(extract.XML, extract.HTML)
self._reject_extractors(extract.XML, extract.FilterAttribute)

if isinstance(source, str):
filename = source
Expand Down Expand Up @@ -693,6 +715,7 @@ def __init__(self,
visualizations=[],
visualization_sort=None,
es_mapping={'type': 'text'},
language=None,
search_filter=None,
extractor=extract.Constant(None),
sortable=None,
Expand All @@ -716,6 +739,7 @@ def __init__(self,
self.visualizations = visualizations
self.visualization_sort = visualization_sort
self.es_mapping = es_mapping
self.language = language
self.indexed = indexed
self.hidden = not indexed or hidden
self.extractor = extractor
Expand Down
14 changes: 8 additions & 6 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
from addcorpus.es_settings import add_language_string

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
'''
Mapping for the main content field. Options:
- `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
- `stopword_analysis`: enables analysis using stopword removal. Requires setting a `clean` analyser in the `es_settings` of the corpus.
- `stemming_analysis`: enables analysis using stemming. Requires a `stemmed` analyser in the `es_settings` for the corpus.
- 'updated_highlighting': enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
- `stopword_analysis`: enables analysis using stopword removal.
- `stemming_analysis`: enables analysis using stemming.
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
'''

mapping = {
Expand All @@ -27,13 +29,13 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
if stopword_analysis:
multifields['clean'] = {
"type": "text",
"analyzer": "clean",
"analyzer": add_language_string('clean', language),
"term_vector": "with_positions_offsets" # include character positions for highlighting
}
if stemming_analysis:
multifields['stemmed'] = {
"type": "text",
"analyzer": "stemmed",
"analyzer": add_language_string('stemmed', language),
"term_vector": "with_positions_offsets",
}
mapping['fields'] = multifields
Expand Down
Loading

0 comments on commit 728f4f8

Please sign in to comment.