Merge pull request #1289 from UUDigitalHumanitieslab/feature/full-ngram
Feature/full ngram
BeritJanssen authored Oct 26, 2023
2 parents c1b99ae + d3bd098 commit 9d28132
Showing 21 changed files with 202 additions and 77 deletions.
21 changes: 20 additions & 1 deletion backend/download/create_csv.py
@@ -5,6 +5,7 @@

from django.conf import settings

from visualization.query import get_query_text
from visualization.term_frequency import parse_datestring

def write_file(filename, fieldnames, rows, dialect = 'excel'):
@@ -99,4 +100,22 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
return date.strftime(formats[unit])

def ngram_csv(results, log_id):
rows = ngram_table(results)
fieldnames = ['date', 'N-gram', 'Frequency']
filename = create_filename(log_id)
filepath = write_file(filename, fieldnames, rows)
return filepath

def ngram_table(results):
rows = []
for index, time_point in enumerate(results['time_points']):
for ngram in results['words']:
rows.append({
'date': time_point,
'N-gram': ngram['label'],
'Frequency': ngram['data'][index]
})
return rows
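
In short, `ngram_table` flattens the chart-style results (one series per ngram, with one value per time bin) into one long-format row per (bin, ngram) pair, which `ngram_csv` then writes to file. A minimal usage sketch with illustrative data, assuming this codebase is importable:

```python
from download import create_csv

# Chart-style ngram output: parallel arrays over the time bins.
results = {
    'words': [
        {'label': 'ex parrot', 'data': [2, 3]},
    ],
    'time_points': ['1960-1965', '1962-1967'],
}

rows = create_csv.ngram_table(results)
# [{'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
#  {'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3}]
```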
18 changes: 18 additions & 0 deletions backend/download/migrations/0002_alter_download_download_type.py
@@ -0,0 +1,18 @@
# Generated by Django 4.1.10 on 2023-10-18 12:52

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('download', '0001_initial'),
]

operations = [
migrations.AlterField(
model_name='download',
name='download_type',
field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
),
]
8 changes: 5 additions & 3 deletions backend/download/models.py
@@ -1,8 +1,9 @@
from django.db import models
from django.conf import settings
from django.utils import timezone

from users.models import CustomUser
from addcorpus.models import Corpus
from django.conf import settings
from datetime import datetime

MAX_LENGTH_FILENAME = 254

@@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
@@ -49,7 +51,7 @@ def complete(self, filename = None):
'''

self.filename = filename
self.completed = datetime.now()
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
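
The switch from `datetime.now()` to `timezone.now()` avoids saving naive timestamps: with `USE_TZ = True`, Django stores aware UTC datetimes and raises a warning when a naive one is written to a `DateTimeField`. Roughly, inside a configured Django project:

```python
from datetime import datetime
from django.utils import timezone

naive = datetime.now()   # naive local time (tzinfo is None); Django warns on save
aware = timezone.now()   # timezone-aware UTC timestamp when USE_TZ = True
```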
19 changes: 12 additions & 7 deletions backend/download/tasks.py
@@ -2,13 +2,12 @@
import re
from django.conf import settings
from celery import shared_task, chain, group
from django.urls import reverse

from es import download as es_download
from download import create_csv
from download.models import Download
from addcorpus.models import Corpus
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks
from visualization import query
from download.mail import send_csv_email

@@ -90,10 +89,12 @@ def download_search_results(request_json, user):
return try_download(make_chain, download)

@shared_task()
def make_term_frequency_csv(results_per_series, parameters_per_series, log_id):
def make_full_data_csv(results_per_series, visualization_type, parameters_per_series, log_id):
'''
Export term frequency results to a csv.
'''
if visualization_type == 'ngram':
return create_csv.ngram_csv(results_per_series, log_id)
query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series)
return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, log_id, unit = unit)

@@ -110,6 +111,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type):
task_function(series_parameters, True) for series_parameters in parameters_unlimited
)

def ngram_full_data_tasks(ngram_parameters, dummy):
ngram_parameters['max_size_per_interval'] = None
return ngram_data_tasks(ngram_parameters)

def extract_term_frequency_download_metadata(parameters_per_series):
'''
Get some relevant metadata for a term frequency request:
Expand Down Expand Up @@ -148,16 +153,16 @@ def download_full_data(request_json, user):
'''
Download the full data for a visualisation
'''

visualization_type = request_json['visualization']

task_per_type = {
'date_term_frequency': term_frequency_full_data_tasks,
'aggregate_term_frequency': term_frequency_full_data_tasks
'aggregate_term_frequency': term_frequency_full_data_tasks,
'ngram': ngram_full_data_tasks,
}

parameters = request_json['parameters']
corpus_name = request_json['corpus']
corpus_name = request_json['corpus_name']
corpus = Corpus.objects.get(name=corpus_name)
task = task_per_type[visualization_type](parameters, visualization_type)

@@ -166,7 +171,7 @@

make_chain = lambda : chain(
task,
make_term_frequency_csv.s(parameters, download.id),
make_full_data_csv.s(visualization_type, parameters, download.id),
complete_download.s(download.id),
csv_data_email.s(user.email, user.username),
).on_error(complete_failed_download.s(download.id))
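
The ngram path differs from the term-frequency path in two ways: the parameters arrive as a single dict rather than a list of series, and the per-interval document cap is cleared so the export covers everything. A self-contained sketch of that preparation step (function name hypothetical, mirroring `ngram_full_data_tasks` above):

```python
def prepare_ngram_download(request_json):
    """Sketch of what download_full_data sets up when visualization == 'ngram'."""
    parameters = request_json['parameters']      # a single dict, not a series list
    parameters['max_size_per_interval'] = None   # lift the per-bin document cap
    return parameters

params = prepare_ngram_download({
    'visualization': 'ngram',
    'parameters': {'corpus_name': 'times', 'max_size_per_interval': 50},
    'corpus_name': 'times',
})
assert params['max_size_per_interval'] is None
```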
23 changes: 23 additions & 0 deletions backend/download/tests/test_csv_results.py
@@ -208,3 +208,26 @@ def test_date_format():

for value, unit, expected in cases:
assert create_csv.format_field_value(value, unit) == expected


mock_ngram_data = {
'words': [
{'label': 'ex parrot', 'data': [2, 3]},
{'label': 'this parrot what', 'data': [4, 8]},
{'label': 'dead parrot when', 'data': [4, 6]},
],
'time_points': ['1960-1965', '1962-1967']
}

expected_csv_table = [
{'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
{'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4},
{'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4},
{'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3},
{'date': '1962-1967', 'N-gram': 'this parrot what', 'Frequency': 8},
{'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6},
]

def test_ngram_table():
table = create_csv.ngram_table(mock_ngram_data)
assert table == expected_csv_table
58 changes: 35 additions & 23 deletions backend/download/tests/test_download_views.py
@@ -6,7 +6,7 @@
from download import SEARCH_RESULTS_DIALECT
from addcorpus.models import Corpus
import io
from visualization.query import MATCH_ALL
from visualization import query
from es.search import hits
from tag.models import Tag, TaggedDocument

@@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
# TODO: construct query from query module, which is much more convenient
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
query = {
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": query_text,
"fields": [search_field],
"lenient": True,
"default_operator": "or"
}
},
"filter": []
}
}
}
query = mock_es_query(query_text, search_field)
return {
'es_query': query,
'corpus_name': mock_corpus,
@@ -78,14 +64,40 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
'unit': 'year',
}

def ngram_parameters(mock_corpus, mock_corpus_specs):
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
return {
'corpus_name': mock_corpus,
'es_query': mock_es_query(query_text, search_field),
'field': search_field,
'ngram_size': 2,
'term_position': 'any',
'freq_compensation': True,
'subfield': 'clean',
'max_size_per_interval': 50,
'number_of_ngrams': 10,
'date_field': 'date'
}

def mock_es_query(query_text, search_field):
q = query.MATCH_ALL
q = query.set_query_text(q, query_text)
q = query.set_search_fields(q, [search_field])
return q

@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)])
def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus,
index_small_mock_corpus, small_mock_corpus_specs, celery_worker,
csv_directory):
parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs)
csv_directory, visualization_type, request_parameters):
parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs)
if visualization_type != 'ngram':
# timeline and histogram expect a series of parameters
parameters = [parameters]
request_json = {
'visualization': 'date_term_frequency',
'parameters': [parameters],
'corpus': small_mock_corpus
'visualization': visualization_type,
'parameters': parameters,
'corpus_name': small_mock_corpus
}
response = admin_client.post(
'/api/download/full_data',
@@ -160,7 +172,7 @@ def test_csv_download_view(admin_client, finished_download):
def some_document_id(admin_client, small_mock_corpus, index_small_mock_corpus):
search_response = admin_client.post(
f'/api/es/{small_mock_corpus}/_search',
{'es_query': MATCH_ALL},
{'es_query': query.MATCH_ALL},
content_type='application/json'
)

@@ -188,7 +200,7 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock
encoding = 'utf-8'
download_request_json = {
'corpus': small_mock_corpus,
'es_query': MATCH_ALL,
'es_query': query.MATCH_ALL,
'tags': [tag_on_some_document.id],
'fields': ['date','content'],
'size': 3,
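
For reference, a client request against the extended endpoint might look like the sketch below. The host, corpus name, and authentication are assumptions; the payload shape mirrors `ngram_parameters` above:

```python
import requests

payload = {
    'visualization': 'ngram',
    'parameters': {                    # one dict for ngrams, not a series list
        'corpus_name': 'times',        # hypothetical corpus
        'es_query': {'query': {'match_all': {}}},
        'field': 'content',
        'ngram_size': 2,
        'term_position': 'any',
        'freq_compensation': True,
        'subfield': 'clean',
        'max_size_per_interval': 50,
        'number_of_ngrams': 10,
        'date_field': 'date',
    },
    'corpus_name': 'times',
}
response = requests.post(
    'http://localhost:8000/api/download/full_data',
    json=payload,   # session cookies / auth headers omitted
)
```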
6 changes: 3 additions & 3 deletions backend/download/tests/test_full_data.py
@@ -21,11 +21,11 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo
],
'unit': 'year'
}]

group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency')
visualization_type = 'date_term_frequency'
group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type)
results = group.apply().get()
log_id = 0 # fake ID
filename = tasks.make_term_frequency_csv(results, full_data_parameters, log_id)
filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters, log_id)

with open(filename) as f:
reader = csv.DictReader(f)
4 changes: 2 additions & 2 deletions backend/download/views.py
@@ -98,10 +98,10 @@ class FullDataDownloadTaskView(APIView):
permission_classes = [IsAuthenticated, CorpusAccessPermission]

def post(self, request, *args, **kwargs):
check_json_keys(request, ['visualization', 'parameters', 'corpus'])
check_json_keys(request, ['visualization', 'parameters', 'corpus_name'])

visualization_type = request.data['visualization']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram']
if visualization_type not in known_visualisations:
raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"')

2 changes: 1 addition & 1 deletion backend/es/download.py
@@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size):
result = search(
corpus = corpus,
query_model=query_model,
size = size,
size=size,
)
return hits(result)
23 changes: 12 additions & 11 deletions backend/visualization/ngram.py
@@ -5,6 +5,7 @@
from addcorpus.models import CorpusConfiguration
from datetime import datetime
from es.search import get_index, search
from es.download import scroll
from ianalyzer.elasticsearch import elasticsearch
from visualization import query, termvectors

@@ -50,7 +51,7 @@ def get_total_time_interval(es_query, corpus):
def get_time_bins(es_query, corpus):
"""Wide bins for a query. Depending on the total time range of the query, time intervervals are
10 years (>100 yrs), 5 years (100-20 yrs) of 1 year (<20 yrs)."""

min_date, max_date = get_total_time_interval(es_query, corpus)
min_year, max_year = min_date.year, max_date.year
time_range = max_year - min_year
@@ -77,9 +78,9 @@ def get_time_bins(es_query, corpus):
return bins


def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field):
index = get_index(corpus)
client = elasticsearch(corpus)
def tokens_by_time_interval(corpus_name, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field, **kwargs):
index = get_index(corpus_name)
client = elasticsearch(corpus_name)
positions_dict = {
'any': list(range(ngram_size)),
'first': [0],
@@ -100,21 +101,21 @@ def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_posit
date_filter = query.make_date_filter(start_date, end_date, date_field)
narrow_query = query.add_filter(es_query, date_filter)
#search for the query text
search_results = search(
corpus=corpus,
query_model = narrow_query,
client = client,
size = max_size_per_interval,
search_results, _total = scroll(
corpus=corpus_name,
query_model=narrow_query,
client=client,
download_size=max_size_per_interval,
)
bin_ngrams = Counter()
for hit in search_results['hits']['hits']:
for hit in search_results:
identifier = hit['_id']
# get the term vectors for the hit
result = client.termvectors(
index=index,
id=identifier,
term_statistics=freq_compensation,
fields = [field]
fields=[field]
)
terms = termvectors.get_terms(result, field)
if terms:
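
The substantive change in this file is `search` → `scroll`: a plain search returns a single page of hits capped by `size`, while scrolling pages through the entire result set, which is what lets `max_size_per_interval=None` mean "all documents in the bin". The underlying Elasticsearch pattern, as a generic sketch (not this project's `es.download.scroll` implementation):

```python
from elasticsearch import Elasticsearch

def iter_all_hits(client: Elasticsearch, index: str, body: dict, page_size: int = 500):
    """Yield every matching hit by scrolling, instead of one capped page."""
    response = client.search(index=index, body=body, size=page_size, scroll='2m')
    scroll_id = response['_scroll_id']
    while response['hits']['hits']:
        yield from response['hits']['hits']
        response = client.scroll(scroll_id=scroll_id, scroll='2m')
        scroll_id = response.get('_scroll_id', scroll_id)
    client.clear_scroll(scroll_id=scroll_id)
```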
4 changes: 2 additions & 2 deletions backend/visualization/tasks.py
@@ -25,7 +25,7 @@ def ngram_data_tasks(request_json):

return chord(group([
get_ngram_data_bin.s(
corpus=corpus,
corpus_name=corpus,
es_query=es_query,
field=request_json['field'],
bin=b,
@@ -40,7 +40,7 @@
]), integrate_ngram_results.s(
number_of_ngrams=request_json['number_of_ngrams']
)
)()
)

@shared_task()
def get_histogram_term_frequency_bin(es_query, corpus_name, field_name, field_value, size, include_query_in_result = False):
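
Dropping the trailing `()` means `ngram_data_tasks` now returns the chord signature instead of applying it on the spot, which is what lets `download_full_data` splice it into a longer chain. A toy illustration of the distinction (task bodies are stand-ins):

```python
from celery import chord, group, shared_task

@shared_task
def count_bin(i):
    return i * 2            # stand-in for get_ngram_data_bin

@shared_task
def integrate(results):
    return sum(results)     # stand-in for integrate_ngram_results

workflow = chord(group(count_bin.s(i) for i in range(3)), integrate.s())

# Old behaviour, `chord(...)()`: apply immediately.
# result = workflow()
# New behaviour: return the signature, so callers can compose it first,
# e.g. chain(workflow, make_full_data_csv.s(...)).apply_async()
```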
4 changes: 2 additions & 2 deletions backend/visualization/tests/test_ngrams.py
@@ -111,10 +111,10 @@ def test_top_10_ngrams():
for w in target_data }
assert dataset_relative['data'] == relative_frequencies[word]

def get_binned_results(corpus, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
def get_binned_results(corpus_name, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
return [
ngram.tokens_by_time_interval(
corpus, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
corpus_name, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
for bin in time_bins
]
