Skip to content

Commit

Permalink
Merge pull request #1313 from UUDigitalHumanitieslab/feature/peace-co…
Browse files Browse the repository at this point in the history
…rpora

Feature/peace corpora
  • Loading branch information
BeritJanssen authored Nov 16, 2023
2 parents 82099d0 + dc3fab2 commit de7ab44
Show file tree
Hide file tree
Showing 65 changed files with 4,936 additions and 169 deletions.
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Base image
FROM python:3.8-buster
FROM docker.io/library/python:3.8-buster
# Setting this means stdout and stderr streams are sent to terminal in real time
ENV PYTHONUNBUFFERED 1
# Get required libraries for xmlsec
Expand Down
60 changes: 42 additions & 18 deletions backend/addcorpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
from os.path import isdir

from django.conf import settings
from langcodes import Language, standardize_tag

from addcorpus.constants import CATEGORIES

import logging

Expand All @@ -33,37 +30,37 @@ class CorpusDefinition(object):
@property
def title(self):
'''
Path to source data directory.
Title of the corpus
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing title')

@property
def description(self):
'''
Short description of the corpus
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing description')

@property
def data_directory(self):
'''
Path to source data directory.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing data_directory')

@property
def min_date(self):
'''
Minimum timestamp for data files.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing min_date')

@property
def max_date(self):
'''
Maximum timestamp for data files.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing max_date')


'''
Expand All @@ -81,14 +78,14 @@ def category(self):
See addcorpus.constants.CATEGORIES for options
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing category')

@property
def es_index(self):
'''
ElasticSearch index name.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing es_index')

'''
Elasticsearch alias. Defaults to None.
Expand All @@ -111,7 +108,7 @@ def fields(self):
the `Field` class, containing information about each attribute.
MUST include a field with `name='id'`.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing fields')


'''
Expand Down Expand Up @@ -139,7 +136,7 @@ def image(self):
Name of the corpus image. Should be relative path from a directory 'images'
in the same directory as the corpus definition file.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing image')

'''
MIME type of scanned documents (images)
Expand Down Expand Up @@ -241,15 +238,15 @@ def sources(self, start=datetime.min, end=datetime.max):
empty or contains only a timestamp; but any data that is to be
extracted without reading the file itself can be specified there.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing sources')

def source2dicts(self, sources):
'''
Generate an iterator of document dictionaries from a given source file.
The dictionaries are created from this corpus' `Field`s.
'''
raise NotImplementedError()
raise NotImplementedError('CorpusDefinition missing source2dicts')

def documents(self, sources=None):
'''
Expand All @@ -274,6 +271,31 @@ def _reject_extractors(self, *inapplicable_extractors):
if isinstance(field.extractor, inapplicable_extractors):
raise RuntimeError(
"Specified extractor method cannot be used with this type of data")

class ParentCorpusDefinition(CorpusDefinition):
    '''
    A class from which other corpus definitions can inherit.

    This class is in charge of setting fields, usually without defining an
    extractor. The subclassed CorpusDefinitions will set extractors on the
    fields - this way, CorpusDefinitions can share the same mappings and
    filters, while the logic to collect sources and populate the fields can
    be different.

    The ParentCorpusDefinition can also be used to allow cross-corpus search
    and filtering.
    '''

    # Define `fields` with both a getter and a setter so that it can be
    # assigned in __init__; the value is stored on `self._fields`.
    @property
    def fields(self):
        '''
        The list of fields of this corpus, as last assigned.
        '''
        return self._fields

    @fields.setter
    def fields(self, value):
        self._fields = value

    def __init__(self):
        '''
        Specify a list of fields which all subclasses share.

        A subclass of ParentCorpusDefinition will provide extractors for the
        fields, and potentially prune down the list of fields to those which
        have an extractor.
        '''
        # Starts out empty here.
        self.fields = []


class XMLCorpusDefinition(CorpusDefinition):
'''
Expand Down Expand Up @@ -309,7 +331,7 @@ def source2dicts(self, source):
default implementation for XML layouts; may be subclassed if more
'''
# Make sure that extractors are sensible
self._reject_extractors(extract.HTML, extract.CSV)
self._reject_extractors(extract.CSV)

# extract information from external xml files first, if applicable
metadata = {}
Expand Down Expand Up @@ -519,7 +541,7 @@ def source2dicts(self, source):
'''
(filename, metadata) = source

self._reject_extractors(extract.XML, extract.CSV)
self._reject_extractors(extract.CSV)

# Loading HTML
logger.info('Reading HTML file {} ...'.format(filename))
Expand Down Expand Up @@ -594,7 +616,7 @@ class CSVCorpusDefinition(CorpusDefinition):
def source2dicts(self, source):
# make sure the field size is as big as the system permits
csv.field_size_limit(sys.maxsize)
self._reject_extractors(extract.XML, extract.HTML)
self._reject_extractors(extract.XML, extract.FilterAttribute)

if isinstance(source, str):
filename = source
Expand Down Expand Up @@ -693,6 +715,7 @@ def __init__(self,
visualizations=[],
visualization_sort=None,
es_mapping={'type': 'text'},
language=None,
search_filter=None,
extractor=extract.Constant(None),
sortable=None,
Expand All @@ -716,6 +739,7 @@ def __init__(self,
self.visualizations = visualizations
self.visualization_sort = visualization_sort
self.es_mapping = es_mapping
self.language = language
self.indexed = indexed
self.hidden = not indexed or hidden
self.extractor = extractor
Expand Down
14 changes: 8 additions & 6 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
from addcorpus.es_settings import add_language_string

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
'''
Mapping for the main content field. Options:
- `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
- `stopword_analysis`: enables analysis using stopword removal. Requires setting a `clean` analyser in the `es_settings` of the corpus.
- `stemming_analysis`: enables analysis using stemming. Requires a `stemmed` analyser in the `es_settings` for the corpus.
- 'updated_highlighting': enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
- `stopword_analysis`: enables analysis using stopword removal.
- `stemming_analysis`: enables analysis using stemming.
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
'''

mapping = {
Expand All @@ -27,13 +29,13 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
if stopword_analysis:
multifields['clean'] = {
"type": "text",
"analyzer": "clean",
"analyzer": add_language_string('clean', language),
"term_vector": "with_positions_offsets" # include character positions for highlighting
}
if stemming_analysis:
multifields['stemmed'] = {
"type": "text",
"analyzer": "stemmed",
"analyzer": add_language_string('stemmed', language),
"term_vector": "with_positions_offsets",
}
mapping['fields'] = multifields
Expand Down
Loading

0 comments on commit de7ab44

Please sign in to comment.