Merge pull request #1289 from UUDigitalHumanitieslab/feature/full-ngram
Feature/full ngram
BeritJanssen authored Oct 26, 2023
2 parents c1b99ae + d3bd098 commit 9d28132
Showing 21 changed files with 202 additions and 77 deletions.
21 changes: 20 additions & 1 deletion backend/download/create_csv.py
@@ -5,6 +5,7 @@

from django.conf import settings

from visualization.query import get_query_text
from visualization.term_frequency import parse_datestring

def write_file(filename, fieldnames, rows, dialect = 'excel'):
@@ -99,4 +100,22 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
return date.strftime(formats[unit])

def ngram_csv(results, log_id):
rows = ngram_table(results)
fieldnames = ['date', 'N-gram', 'Frequency']
filename = create_filename(log_id)
filepath = write_file(filename, fieldnames, rows)
return filepath

def ngram_table(results):
rows = []
for index, time_point in enumerate(results['time_points']):
for ngram in results['words']:
rows.append({
'date': time_point,
'N-gram': ngram['label'],
'Frequency': ngram['data'][index]
})
return rows
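
In short, `ngram_table` flattens the chart-style results (one series per ngram, with one value per time bin) into one long-format row per (bin, ngram) pair, which `ngram_csv` then writes to file. A minimal usage sketch with illustrative data, assuming this codebase is importable:

```python
from download import create_csv

# Chart-style ngram output: parallel arrays over the time bins.
results = {
    'words': [
        {'label': 'ex parrot', 'data': [2, 3]},
    ],
    'time_points': ['1960-1965', '1962-1967'],
}

rows = create_csv.ngram_table(results)
# [{'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
#  {'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3}]
```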
18 changes: 18 additions & 0 deletions backend/download/migrations/0002_alter_download_download_type.py
@@ -0,0 +1,18 @@
# Generated by Django 4.1.10 on 2023-10-18 12:52

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('download', '0001_initial'),
]

operations = [
migrations.AlterField(
model_name='download',
name='download_type',
field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
),
]
8 changes: 5 additions & 3 deletions backend/download/models.py
@@ -1,8 +1,9 @@
from django.db import models
from django.conf import settings
from django.utils import timezone

from users.models import CustomUser
from addcorpus.models import Corpus
from django.conf import settings
from datetime import datetime

MAX_LENGTH_FILENAME = 254

@@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
@@ -49,7 +51,7 @@ def complete(self, filename = None):
'''

self.filename = filename
self.completed = datetime.now()
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
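
The switch from `datetime.now()` to `timezone.now()` avoids saving naive timestamps: with `USE_TZ = True`, Django stores aware UTC datetimes and raises a warning when a naive one is written to a `DateTimeField`. Roughly, inside a configured Django project:

```python
from datetime import datetime
from django.utils import timezone

naive = datetime.now()   # naive local time (tzinfo is None); Django warns on save
aware = timezone.now()   # timezone-aware UTC timestamp when USE_TZ = True
```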
19 changes: 12 additions & 7 deletions backend/download/tasks.py
@@ -2,13 +2,12 @@
import re
from django.conf import settings
from celery import shared_task, chain, group
from django.urls import reverse

from es import download as es_download
from download import create_csv
from download.models import Download
from addcorpus.models import Corpus
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks
from visualization import query
from download.mail import send_csv_email

@@ -90,10 +89,12 @@ def download_search_results(request_json, user):
return try_download(make_chain, download)

@shared_task()
def make_term_frequency_csv(results_per_series, parameters_per_series, log_id):
def make_full_data_csv(results_per_series, visualization_type, parameters_per_series, log_id):
'''
Export term frequency results to a csv.
'''
if visualization_type == 'ngram':
return create_csv.ngram_csv(results_per_series, log_id)
query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series)
return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, log_id, unit = unit)

@@ -110,6 +111,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type):
task_function(series_parameters, True) for series_parameters in parameters_unlimited
)

def ngram_full_data_tasks(ngram_parameters, dummy):
ngram_parameters['max_size_per_interval'] = None
return ngram_data_tasks(ngram_parameters)

def extract_term_frequency_download_metadata(parameters_per_series):
'''
Get some relevant metadata for a term frequency request:
Expand Down Expand Up @@ -148,16 +153,16 @@ def download_full_data(request_json, user):
'''
Download the full data for a visualisation
'''

visualization_type = request_json['visualization']

task_per_type = {
'date_term_frequency': term_frequency_full_data_tasks,
'aggregate_term_frequency': term_frequency_full_data_tasks
'aggregate_term_frequency': term_frequency_full_data_tasks,
'ngram': ngram_full_data_tasks,
}

parameters = request_json['parameters']
corpus_name = request_json['corpus']
corpus_name = request_json['corpus_name']
corpus = Corpus.objects.get(name=corpus_name)
task = task_per_type[visualization_type](parameters, visualization_type)

@@ -166,7 +171,7 @@

make_chain = lambda : chain(
task,
make_term_frequency_csv.s(parameters, download.id),
make_full_data_csv.s(visualization_type, parameters, download.id),
complete_download.s(download.id),
csv_data_email.s(user.email, user.username),
).on_error(complete_failed_download.s(download.id))
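
The ngram path differs from the term-frequency path in two ways: the parameters arrive as a single dict rather than a list of series, and the per-interval document cap is cleared so the export covers everything. A self-contained sketch of that preparation step (function name hypothetical, mirroring `ngram_full_data_tasks` above):

```python
def prepare_ngram_download(request_json):
    """Sketch of what download_full_data sets up when visualization == 'ngram'."""
    parameters = request_json['parameters']      # a single dict, not a series list
    parameters['max_size_per_interval'] = None   # lift the per-bin document cap
    return parameters

params = prepare_ngram_download({
    'visualization': 'ngram',
    'parameters': {'corpus_name': 'times', 'max_size_per_interval': 50},
    'corpus_name': 'times',
})
assert params['max_size_per_interval'] is None
```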
23 changes: 23 additions & 0 deletions backend/download/tests/test_csv_results.py
@@ -208,3 +208,26 @@ def test_date_format():

for value, unit, expected in cases:
assert create_csv.format_field_value(value, unit) == expected


mock_ngram_data = {
'words': [
{'label': 'ex parrot', 'data': [2, 3]},
{'label': 'this parrot what', 'data': [4, 8]},
{'label': 'dead parrot when', 'data': [4, 6]},
],
'time_points': ['1960-1965', '1962-1967']
}

expected_csv_table = [
{'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
{'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4},
{'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4},
{'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3},
{'date': '1962-1967', 'N-gram': 'this parrot what', 'Frequency': 8},
{'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6},
]

def test_ngram_table():
table = create_csv.ngram_table(mock_ngram_data)
assert table == expected_csv_table
58 changes: 35 additions & 23 deletions backend/download/tests/test_download_views.py
@@ -6,7 +6,7 @@
from download import SEARCH_RESULTS_DIALECT
from addcorpus.models import Corpus
import io
from visualization.query import MATCH_ALL
from visualization import query
from es.search import hits
from tag.models import Tag, TaggedDocument

@@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
# TODO: construct query from query module, which is much more convenient
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
query = {
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": query_text,
"fields": [search_field],
"lenient": True,
"default_operator": "or"
}
},
"filter": []
}
}
}
query = mock_es_query(query_text, search_field)
return {
'es_query': query,
'corpus_name': mock_corpus,
@@ -78,14 +64,40 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
'unit': 'year',
}

def ngram_parameters(mock_corpus, mock_corpus_specs):
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
return {
'corpus_name': mock_corpus,
'es_query': mock_es_query(query_text, search_field),
'field': search_field,
'ngram_size': 2,
'term_position': 'any',
'freq_compensation': True,
'subfield': 'clean',
'max_size_per_interval': 50,
'number_of_ngrams': 10,
'date_field': 'date'
}

def mock_es_query(query_text, search_field):
q = query.MATCH_ALL
q = query.set_query_text(q, query_text)
q = query.set_search_fields(q, [search_field])
return q

@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)])
def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus,
index_small_mock_corpus, small_mock_corpus_specs, celery_worker,
csv_directory):
parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs)
csv_directory, visualization_type, request_parameters):
parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs)
if visualization_type != 'ngram':
# timeline and histogram expect a series of parameters
parameters = [parameters]
request_json = {
'visualization': 'date_term_frequency',
'parameters': [parameters],
'corpus': small_mock_corpus
'visualization': visualization_type,
'parameters': parameters,
'corpus_name': small_mock_corpus
}
response = admin_client.post(
'/api/download/full_data',
@@ -160,7 +172,7 @@ def test_csv_download_view(admin_client, finished_download):
def some_document_id(admin_client, small_mock_corpus, index_small_mock_corpus):
search_response = admin_client.post(
f'/api/es/{small_mock_corpus}/_search',
{'es_query': MATCH_ALL},
{'es_query': query.MATCH_ALL},
content_type='application/json'
)

@@ -188,7 +200,7 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock
encoding = 'utf-8'
download_request_json = {
'corpus': small_mock_corpus,
'es_query': MATCH_ALL,
'es_query': query.MATCH_ALL,
'tags': [tag_on_some_document.id],
'fields': ['date','content'],
'size': 3,
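
For reference, a client request against the extended endpoint might look like the sketch below. The host, corpus name, and authentication are assumptions; the payload shape mirrors `ngram_parameters` above:

```python
import requests

payload = {
    'visualization': 'ngram',
    'parameters': {                    # one dict for ngrams, not a series list
        'corpus_name': 'times',        # hypothetical corpus
        'es_query': {'query': {'match_all': {}}},
        'field': 'content',
        'ngram_size': 2,
        'term_position': 'any',
        'freq_compensation': True,
        'subfield': 'clean',
        'max_size_per_interval': 50,
        'number_of_ngrams': 10,
        'date_field': 'date',
    },
    'corpus_name': 'times',
}
response = requests.post(
    'http://localhost:8000/api/download/full_data',
    json=payload,   # session cookies / auth headers omitted
)
```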
6 changes: 3 additions & 3 deletions backend/download/tests/test_full_data.py
@@ -21,11 +21,11 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo
],
'unit': 'year'
}]

group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency')
visualization_type = 'date_term_frequency'
group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type)
results = group.apply().get()
log_id = 0 # fake ID
filename = tasks.make_term_frequency_csv(results, full_data_parameters, log_id)
filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters, log_id)

with open(filename) as f:
reader = csv.DictReader(f)
4 changes: 2 additions & 2 deletions backend/download/views.py
@@ -98,10 +98,10 @@ class FullDataDownloadTaskView(APIView):
permission_classes = [IsAuthenticated, CorpusAccessPermission]

def post(self, request, *args, **kwargs):
check_json_keys(request, ['visualization', 'parameters', 'corpus'])
check_json_keys(request, ['visualization', 'parameters', 'corpus_name'])

visualization_type = request.data['visualization']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram']
if visualization_type not in known_visualisations:
raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"')

2 changes: 1 addition & 1 deletion backend/es/download.py
@@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size):
result = search(
corpus = corpus,
query_model=query_model,
size = size,
size=size,
)
return hits(result)
23 changes: 12 additions & 11 deletions backend/visualization/ngram.py
@@ -5,6 +5,7 @@
from addcorpus.models import CorpusConfiguration
from datetime import datetime
from es.search import get_index, search
from es.download import scroll
from ianalyzer.elasticsearch import elasticsearch
from visualization import query, termvectors

@@ -50,7 +51,7 @@ def get_total_time_interval(es_query, corpus):
def get_time_bins(es_query, corpus):
"""Wide bins for a query. Depending on the total time range of the query, time intervervals are
10 years (>100 yrs), 5 years (100-20 yrs) of 1 year (<20 yrs)."""

min_date, max_date = get_total_time_interval(es_query, corpus)
min_year, max_year = min_date.year, max_date.year
time_range = max_year - min_year
@@ -77,9 +78,9 @@ def get_time_bins(es_query, corpus):
return bins


def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field):
index = get_index(corpus)
client = elasticsearch(corpus)
def tokens_by_time_interval(corpus_name, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field, **kwargs):
index = get_index(corpus_name)
client = elasticsearch(corpus_name)
positions_dict = {
'any': list(range(ngram_size)),
'first': [0],
@@ -100,21 +101,21 @@ def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_posit
date_filter = query.make_date_filter(start_date, end_date, date_field)
narrow_query = query.add_filter(es_query, date_filter)
#search for the query text
search_results = search(
corpus=corpus,
query_model = narrow_query,
client = client,
size = max_size_per_interval,
search_results, _total = scroll(
corpus=corpus_name,
query_model=narrow_query,
client=client,
download_size=max_size_per_interval,
)
bin_ngrams = Counter()
for hit in search_results['hits']['hits']:
for hit in search_results:
identifier = hit['_id']
# get the term vectors for the hit
result = client.termvectors(
index=index,
id=identifier,
term_statistics=freq_compensation,
fields = [field]
fields=[field]
)
terms = termvectors.get_terms(result, field)
if terms:
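
The substantive change in this file is `search` → `scroll`: a plain search returns a single page of hits capped by `size`, while scrolling pages through the entire result set, which is what lets `max_size_per_interval=None` mean "all documents in the bin". The underlying Elasticsearch pattern, as a generic sketch (not this project's `es.download.scroll` implementation):

```python
from elasticsearch import Elasticsearch

def iter_all_hits(client: Elasticsearch, index: str, body: dict, page_size: int = 500):
    """Yield every matching hit by scrolling, instead of one capped page."""
    response = client.search(index=index, body=body, size=page_size, scroll='2m')
    scroll_id = response['_scroll_id']
    while response['hits']['hits']:
        yield from response['hits']['hits']
        response = client.scroll(scroll_id=scroll_id, scroll='2m')
        scroll_id = response.get('_scroll_id', scroll_id)
    client.clear_scroll(scroll_id=scroll_id)
```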
4 changes: 2 additions & 2 deletions backend/visualization/tasks.py
@@ -25,7 +25,7 @@ def ngram_data_tasks(request_json):

return chord(group([
get_ngram_data_bin.s(
corpus=corpus,
corpus_name=corpus,
es_query=es_query,
field=request_json['field'],
bin=b,
@@ -40,7 +40,7 @@
]), integrate_ngram_results.s(
number_of_ngrams=request_json['number_of_ngrams']
)
)()
)

@shared_task()
def get_histogram_term_frequency_bin(es_query, corpus_name, field_name, field_value, size, include_query_in_result = False):
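
Dropping the trailing `()` means `ngram_data_tasks` now returns the chord signature instead of applying it on the spot, which is what lets `download_full_data` splice it into a longer chain. A toy illustration of the distinction (task bodies are stand-ins):

```python
from celery import chord, group, shared_task

@shared_task
def count_bin(i):
    return i * 2            # stand-in for get_ngram_data_bin

@shared_task
def integrate(results):
    return sum(results)     # stand-in for integrate_ngram_results

workflow = chord(group(count_bin.s(i) for i in range(3)), integrate.s())

# Old behaviour, `chord(...)()`: apply immediately.
# result = workflow()
# New behaviour: return the signature, so callers can compose it first,
# e.g. chain(workflow, make_full_data_csv.s(...)).apply_async()
```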
4 changes: 2 additions & 2 deletions backend/visualization/tests/test_ngrams.py
@@ -111,10 +111,10 @@ def test_top_10_ngrams():
for w in target_data }
assert dataset_relative['data'] == relative_frequencies[word]

def get_binned_results(corpus, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
def get_binned_results(corpus_name, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
return [
ngram.tokens_by_time_interval(
corpus, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
corpus_name, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
for bin in time_bins
]
