Skip to content

Commit

Permalink
Merge branch 'develop' into feature/wordcloud-chartjs
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas authored Oct 26, 2023
2 parents a206cba + 9d28132 commit 359f3a2
Show file tree
Hide file tree
Showing 29 changed files with 225 additions and 87 deletions.
2 changes: 1 addition & 1 deletion backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
'''
Mapping for the main content field. Options:
Expand Down
6 changes: 5 additions & 1 deletion backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition
from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.load_corpus import corpus_dir

from addcorpus.es_mappings import keyword_mapping, main_content_mapping
from addcorpus.es_settings import es_settings

from media.media_url import media_url

Expand Down Expand Up @@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition):

dutchannualreports_map = {}

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f:
reader = csv.DictReader(f)
for line in reader:
Expand Down
9 changes: 4 additions & 5 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition):
description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
Expand All @@ -47,6 +42,10 @@ def es_settings(self):

meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)

Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
title = 'People and Parliament (Finland, 1863-1905)'
description = 'Speeches from the early Finnish estates'
max_date = datetime(year=1905, month=12, day=31)
max_date = datetime(year=1906, month=12, day=31)
min_date = datetime(year=1863, month=1, day=1)
data_directory = settings.PP_FINLAND_OLD_DATA
es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/netherlands.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition):
title = "People & Parliament (Netherlands)"
description = "Speeches from the Eerste Kamer and Tweede Kamer"
min_date = datetime(year=1815, month=1, day=1)
max_date = datetime(year=2020, month=12, day=31)
max_date = datetime(year=2022, month=12, day=31)
data_directory = settings.PP_NL_DATA
data_directory_recent = settings.PP_NL_RECENT_DATA
word_model_path = getattr(settings, 'PP_NL_WM', None)
Expand Down
21 changes: 20 additions & 1 deletion backend/download/create_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from django.conf import settings

from visualization.query import get_query_text
from visualization.term_frequency import parse_datestring

def write_file(filename, fieldnames, rows, dialect = 'excel'):
Expand Down Expand Up @@ -99,4 +100,22 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
return date.strftime(formats[unit])
return date.strftime(formats[unit])

def ngram_csv(results, log_id):
    '''
    Export ngram visualization results to a CSV file.

    `results` is the ngram task output (see `ngram_table`); `log_id`
    identifies the Download log entry and is used to derive the file name.
    Returns the path of the written file.
    '''
    table = ngram_table(results)
    headers = ['date', 'N-gram', 'Frequency']
    return write_file(create_filename(log_id), headers, table)

def ngram_table(results):
    '''
    Flatten ngram results into rows suitable for CSV export.

    `results` must contain 'time_points' (date labels) and 'words'
    (dicts with a 'label' and a 'data' list parallel to 'time_points').
    Returns one row per (time point, ngram) pair, ordered by time point
    first, then by the order of 'words'.
    '''
    return [
        {
            'date': time_point,
            'N-gram': word['label'],
            'Frequency': word['data'][i],
        }
        for i, time_point in enumerate(results['time_points'])
        for word in results['words']
    ]
18 changes: 18 additions & 0 deletions backend/download/migrations/0002_alter_download_download_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.1.10 on 2023-10-18 12:52

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the 'ngram' option to Download.download_type so ngram
    # visualisation exports can be logged alongside the existing
    # search-results and term-frequency download types.

    dependencies = [
        ('download', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='download',
            name='download_type',
            field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
        ),
    ]
8 changes: 5 additions & 3 deletions backend/download/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from django.db import models
from django.conf import settings
from django.utils import timezone

from users.models import CustomUser
from addcorpus.models import Corpus
from django.conf import settings
from datetime import datetime

MAX_LENGTH_FILENAME = 254

Expand All @@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
Expand Down Expand Up @@ -49,7 +51,7 @@ def complete(self, filename = None):
'''

self.filename = filename
self.completed = datetime.now()
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
Expand Down
19 changes: 12 additions & 7 deletions backend/download/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
import re
from django.conf import settings
from celery import shared_task, chain, group
from django.urls import reverse

from es import download as es_download
from download import create_csv
from download.models import Download
from addcorpus.models import Corpus
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks
from visualization import query
from download.mail import send_csv_email

Expand Down Expand Up @@ -90,10 +89,12 @@ def download_search_results(request_json, user):
return try_download(make_chain, download)

@shared_task()
def make_term_frequency_csv(results_per_series, parameters_per_series, log_id):
def make_full_data_csv(results_per_series, visualization_type, parameters_per_series, log_id):
'''
Export term frequency results to a csv.
'''
if visualization_type == 'ngram':
return create_csv.ngram_csv(results_per_series, log_id)
query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series)
return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, log_id, unit = unit)

Expand All @@ -110,6 +111,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type):
task_function(series_parameters, True) for series_parameters in parameters_unlimited
)

def ngram_full_data_tasks(ngram_parameters, dummy):
    # Build the celery task(s) for a full-data ngram download.
    # NOTE: mutates the caller's dict in place — `max_size_per_interval`
    # is cleared, presumably so the whole corpus is analysed instead of a
    # sample (confirm against ngram_data_tasks).
    # `dummy` exists only so the signature matches
    # term_frequency_full_data_tasks(parameters, visualization_type) and
    # both can be dispatched from the same task table.
    ngram_parameters['max_size_per_interval'] = None
    return ngram_data_tasks(ngram_parameters)

def extract_term_frequency_download_metadata(parameters_per_series):
'''
Get some relevant metadata for a term frequency request:
Expand Down Expand Up @@ -148,16 +153,16 @@ def download_full_data(request_json, user):
'''
Download the full data for a visualisation
'''

visualization_type = request_json['visualization']

task_per_type = {
'date_term_frequency': term_frequency_full_data_tasks,
'aggregate_term_frequency': term_frequency_full_data_tasks
'aggregate_term_frequency': term_frequency_full_data_tasks,
'ngram': ngram_full_data_tasks,
}

parameters = request_json['parameters']
corpus_name = request_json['corpus']
corpus_name = request_json['corpus_name']
corpus = Corpus.objects.get(name=corpus_name)
task = task_per_type[visualization_type](parameters, visualization_type)

Expand All @@ -166,7 +171,7 @@ def download_full_data(request_json, user):

make_chain = lambda : chain(
task,
make_term_frequency_csv.s(parameters, download.id),
make_full_data_csv.s(visualization_type, parameters, download.id),
complete_download.s(download.id),
csv_data_email.s(user.email, user.username),
).on_error(complete_failed_download.s(download.id))
Expand Down
23 changes: 23 additions & 0 deletions backend/download/tests/test_csv_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,26 @@ def test_date_format():

for value, unit, expected in cases:
assert create_csv.format_field_value(value, unit) == expected


# Fake ngram visualization results in the shape the ngram task produces:
# each entry in a word's 'data' list is parallel to 'time_points'.
mock_ngram_data = {
    'words': [
        {'label': 'ex parrot', 'data': [2, 3]},
        {'label': 'this parrot what', 'data': [4, 8]},
        {'label': 'dead parrot when', 'data': [4, 6]},
    ],
    'time_points': ['1960-1965', '1962-1967']
}

# Rows expected from create_csv.ngram_table for the data above:
# grouped by time point, one row per ngram.
expected_csv_table = [
    {'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
    {'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4},
    {'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4},
    {'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3},
    {'date': '1962-1967', 'N-gram': 'this parrot what', 'Frequency': 8},
    {'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6},
]

def test_ngram_table():
    # ngram_table should flatten the results dict into CSV-ready rows.
    table = create_csv.ngram_table(mock_ngram_data)
    assert table == expected_csv_table
58 changes: 35 additions & 23 deletions backend/download/tests/test_download_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from download import SEARCH_RESULTS_DIALECT
from addcorpus.models import Corpus
import io
from visualization.query import MATCH_ALL
from visualization import query
from es.search import hits
from tag.models import Tag, TaggedDocument

Expand Down Expand Up @@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
# TODO: construct query from query module, which is much more convenient
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
query = {
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": query_text,
"fields": [search_field],
"lenient": True,
"default_operator": "or"
}
},
"filter": []
}
}
}
query = mock_es_query(query_text, search_field)
return {
'es_query': query,
'corpus_name': mock_corpus,
Expand All @@ -78,14 +64,40 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
'unit': 'year',
}

def ngram_parameters(mock_corpus, mock_corpus_specs):
    '''
    Build a request payload for an ngram full-data download against the
    mock corpus, mirroring what the frontend would send.
    '''
    content_field = mock_corpus_specs['content_field']
    es_query = mock_es_query(mock_corpus_specs['example_query'], content_field)
    parameters = {
        'corpus_name': mock_corpus,
        'es_query': es_query,
        'field': content_field,
        'ngram_size': 2,
        'term_position': 'any',
        'freq_compensation': True,
        'subfield': 'clean',
        'max_size_per_interval': 50,
        'number_of_ngrams': 10,
        'date_field': 'date',
    }
    return parameters

def mock_es_query(query_text, search_field):
    # Build an elasticsearch query dict via the query helpers: start from
    # the match-all template, set the query text, then restrict the
    # searched fields.
    # NOTE(review): assumes query.set_query_text / set_search_fields return
    # new dicts rather than mutating query.MATCH_ALL in place — confirm
    # against the visualization.query module.
    q = query.MATCH_ALL
    q = query.set_query_text(q, query_text)
    q = query.set_search_fields(q, [search_field])
    return q

@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)])
def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus,
index_small_mock_corpus, small_mock_corpus_specs, celery_worker,
csv_directory):
parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs)
csv_directory, visualization_type, request_parameters):
parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs)
if visualization_type != 'ngram':
# timeline and histogram expect a series of parameters
parameters = [parameters]
request_json = {
'visualization': 'date_term_frequency',
'parameters': [parameters],
'corpus': small_mock_corpus
'visualization': visualization_type,
'parameters': parameters,
'corpus_name': small_mock_corpus
}
response = admin_client.post(
'/api/download/full_data',
Expand Down Expand Up @@ -160,7 +172,7 @@ def test_csv_download_view(admin_client, finished_download):
def some_document_id(admin_client, small_mock_corpus, index_small_mock_corpus):
search_response = admin_client.post(
f'/api/es/{small_mock_corpus}/_search',
{'es_query': MATCH_ALL},
{'es_query': query.MATCH_ALL},
content_type='application/json'
)

Expand Down Expand Up @@ -188,7 +200,7 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock
encoding = 'utf-8'
download_request_json = {
'corpus': small_mock_corpus,
'es_query': MATCH_ALL,
'es_query': query.MATCH_ALL,
'tags': [tag_on_some_document.id],
'fields': ['date','content'],
'size': 3,
Expand Down
6 changes: 3 additions & 3 deletions backend/download/tests/test_full_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo
],
'unit': 'year'
}]

group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency')
visualization_type = 'date_term_frequency'
group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type)
results = group.apply().get()
log_id = 0 # fake ID
filename = tasks.make_term_frequency_csv(results, full_data_parameters, log_id)
filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters, log_id)

with open(filename) as f:
reader = csv.DictReader(f)
Expand Down
4 changes: 2 additions & 2 deletions backend/download/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ class FullDataDownloadTaskView(APIView):
permission_classes = [IsAuthenticated, CorpusAccessPermission]

def post(self, request, *args, **kwargs):
check_json_keys(request, ['visualization', 'parameters', 'corpus'])
check_json_keys(request, ['visualization', 'parameters', 'corpus_name'])

visualization_type = request.data['visualization']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram']
if visualization_type not in known_visualisations:
raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"')

Expand Down
2 changes: 1 addition & 1 deletion backend/es/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size):
result = search(
corpus = corpus,
query_model=query_model,
size = size,
size=size,
)
return hits(result)
Loading

0 comments on commit 359f3a2

Please sign in to comment.