Skip to content

Commit

Permalink
Merge branch 'develop' into feature/wordcloud-chartjs
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas authored Oct 26, 2023
2 parents a206cba + 9d28132 commit 359f3a2
Show file tree
Hide file tree
Showing 29 changed files with 225 additions and 87 deletions.
2 changes: 1 addition & 1 deletion backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
'''
Mapping for the main content field. Options:
Expand Down
6 changes: 5 additions & 1 deletion backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition
from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.load_corpus import corpus_dir

from addcorpus.es_mappings import keyword_mapping, main_content_mapping
from addcorpus.es_settings import es_settings

from media.media_url import media_url

Expand Down Expand Up @@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition):

dutchannualreports_map = {}

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f:
reader = csv.DictReader(f)
for line in reader:
Expand Down
9 changes: 4 additions & 5 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition):
description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
Expand All @@ -47,6 +42,10 @@ def es_settings(self):

meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)

Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
title = 'People and Parliament (Finland, 1863-1905)'
description = 'Speeches from the early Finnish estates'
max_date = datetime(year=1905, month=12, day=31)
max_date = datetime(year=1906, month=12, day=31)
min_date = datetime(year=1863, month=1, day=1)
data_directory = settings.PP_FINLAND_OLD_DATA
es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/netherlands.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition):
title = "People & Parliament (Netherlands)"
description = "Speeches from the Eerste Kamer and Tweede Kamer"
min_date = datetime(year=1815, month=1, day=1)
max_date = datetime(year=2020, month=12, day=31)
max_date = datetime(year=2022, month=12, day=31)
data_directory = settings.PP_NL_DATA
data_directory_recent = settings.PP_NL_RECENT_DATA
word_model_path = getattr(settings, 'PP_NL_WM', None)
Expand Down
21 changes: 20 additions & 1 deletion backend/download/create_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from django.conf import settings

from visualization.query import get_query_text
from visualization.term_frequency import parse_datestring

def write_file(filename, fieldnames, rows, dialect = 'excel'):
Expand Down Expand Up @@ -99,4 +100,22 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
return date.strftime(formats[unit])
return date.strftime(formats[unit])

def ngram_csv(results, log_id):
    '''
    Export ngram visualization results to a CSV file.

    `results` is the ngram task output (see `ngram_table`); `log_id`
    identifies the Download log entry and is used to derive the file name.
    Returns the path of the written file.
    '''
    table = ngram_table(results)
    headers = ['date', 'N-gram', 'Frequency']
    return write_file(create_filename(log_id), headers, table)

def ngram_table(results):
    '''
    Flatten ngram results into rows suitable for CSV export.

    `results` must contain 'time_points' (date labels) and 'words'
    (dicts with a 'label' and a 'data' list parallel to 'time_points').
    Returns one row per (time point, ngram) pair, ordered by time point
    first, then by the order of 'words'.
    '''
    return [
        {
            'date': time_point,
            'N-gram': word['label'],
            'Frequency': word['data'][i],
        }
        for i, time_point in enumerate(results['time_points'])
        for word in results['words']
    ]
18 changes: 18 additions & 0 deletions backend/download/migrations/0002_alter_download_download_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.1.10 on 2023-10-18 12:52

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the 'ngram' option to Download.download_type so ngram
    # visualisation exports can be logged alongside the existing
    # search-results and term-frequency download types.

    dependencies = [
        ('download', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='download',
            name='download_type',
            field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
        ),
    ]
8 changes: 5 additions & 3 deletions backend/download/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from django.db import models
from django.conf import settings
from django.utils import timezone

from users.models import CustomUser
from addcorpus.models import Corpus
from django.conf import settings
from datetime import datetime

MAX_LENGTH_FILENAME = 254

Expand All @@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
Expand Down Expand Up @@ -49,7 +51,7 @@ def complete(self, filename = None):
'''

self.filename = filename
self.completed = datetime.now()
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
Expand Down
19 changes: 12 additions & 7 deletions backend/download/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
import re
from django.conf import settings
from celery import shared_task, chain, group
from django.urls import reverse

from es import download as es_download
from download import create_csv
from download.models import Download
from addcorpus.models import Corpus
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks
from visualization import query
from download.mail import send_csv_email

Expand Down Expand Up @@ -90,10 +89,12 @@ def download_search_results(request_json, user):
return try_download(make_chain, download)

@shared_task()
def make_term_frequency_csv(results_per_series, parameters_per_series, log_id):
def make_full_data_csv(results_per_series, visualization_type, parameters_per_series, log_id):
'''
Export term frequency results to a csv.
'''
if visualization_type == 'ngram':
return create_csv.ngram_csv(results_per_series, log_id)
query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series)
return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, log_id, unit = unit)

Expand All @@ -110,6 +111,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type):
task_function(series_parameters, True) for series_parameters in parameters_unlimited
)

def ngram_full_data_tasks(ngram_parameters, dummy):
    # Build the celery task(s) for a full-data ngram download.
    # NOTE: mutates the caller's dict in place — `max_size_per_interval`
    # is cleared, presumably so the whole corpus is analysed instead of a
    # sample (confirm against ngram_data_tasks).
    # `dummy` exists only so the signature matches
    # term_frequency_full_data_tasks(parameters, visualization_type) and
    # both can be dispatched from the same task table.
    ngram_parameters['max_size_per_interval'] = None
    return ngram_data_tasks(ngram_parameters)

def extract_term_frequency_download_metadata(parameters_per_series):
'''
Get some relevant metadata for a term frequency request:
Expand Down Expand Up @@ -148,16 +153,16 @@ def download_full_data(request_json, user):
'''
Download the full data for a visualisation
'''

visualization_type = request_json['visualization']

task_per_type = {
'date_term_frequency': term_frequency_full_data_tasks,
'aggregate_term_frequency': term_frequency_full_data_tasks
'aggregate_term_frequency': term_frequency_full_data_tasks,
'ngram': ngram_full_data_tasks,
}

parameters = request_json['parameters']
corpus_name = request_json['corpus']
corpus_name = request_json['corpus_name']
corpus = Corpus.objects.get(name=corpus_name)
task = task_per_type[visualization_type](parameters, visualization_type)

Expand All @@ -166,7 +171,7 @@ def download_full_data(request_json, user):

make_chain = lambda : chain(
task,
make_term_frequency_csv.s(parameters, download.id),
make_full_data_csv.s(visualization_type, parameters, download.id),
complete_download.s(download.id),
csv_data_email.s(user.email, user.username),
).on_error(complete_failed_download.s(download.id))
Expand Down
23 changes: 23 additions & 0 deletions backend/download/tests/test_csv_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,26 @@ def test_date_format():

for value, unit, expected in cases:
assert create_csv.format_field_value(value, unit) == expected


# Fake ngram visualization results in the shape the ngram task produces:
# each entry in a word's 'data' list is parallel to 'time_points'.
mock_ngram_data = {
    'words': [
        {'label': 'ex parrot', 'data': [2, 3]},
        {'label': 'this parrot what', 'data': [4, 8]},
        {'label': 'dead parrot when', 'data': [4, 6]},
    ],
    'time_points': ['1960-1965', '1962-1967']
}

# Rows expected from create_csv.ngram_table for the data above:
# grouped by time point, one row per ngram.
expected_csv_table = [
    {'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
    {'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4},
    {'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4},
    {'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3},
    {'date': '1962-1967', 'N-gram': 'this parrot what', 'Frequency': 8},
    {'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6},
]

def test_ngram_table():
    # ngram_table should flatten the results dict into CSV-ready rows.
    table = create_csv.ngram_table(mock_ngram_data)
    assert table == expected_csv_table
58 changes: 35 additions & 23 deletions backend/download/tests/test_download_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from download import SEARCH_RESULTS_DIALECT
from addcorpus.models import Corpus
import io
from visualization.query import MATCH_ALL
from visualization import query
from es.search import hits
from tag.models import Tag, TaggedDocument

Expand Down Expand Up @@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
# TODO: construct query from query module, which is much more convenient
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
query = {
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": query_text,
"fields": [search_field],
"lenient": True,
"default_operator": "or"
}
},
"filter": []
}
}
}
query = mock_es_query(query_text, search_field)
return {
'es_query': query,
'corpus_name': mock_corpus,
Expand All @@ -78,14 +64,40 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
'unit': 'year',
}

def ngram_parameters(mock_corpus, mock_corpus_specs):
    '''
    Build a request payload for an ngram full-data download against the
    mock corpus, mirroring what the frontend would send.
    '''
    content_field = mock_corpus_specs['content_field']
    es_query = mock_es_query(mock_corpus_specs['example_query'], content_field)
    parameters = {
        'corpus_name': mock_corpus,
        'es_query': es_query,
        'field': content_field,
        'ngram_size': 2,
        'term_position': 'any',
        'freq_compensation': True,
        'subfield': 'clean',
        'max_size_per_interval': 50,
        'number_of_ngrams': 10,
        'date_field': 'date',
    }
    return parameters

def mock_es_query(query_text, search_field):
    # Build an elasticsearch query dict via the query helpers: start from
    # the match-all template, set the query text, then restrict the
    # searched fields.
    # NOTE(review): assumes query.set_query_text / set_search_fields return
    # new dicts rather than mutating query.MATCH_ALL in place — confirm
    # against the visualization.query module.
    q = query.MATCH_ALL
    q = query.set_query_text(q, query_text)
    q = query.set_search_fields(q, [search_field])
    return q

@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)])
def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus,
index_small_mock_corpus, small_mock_corpus_specs, celery_worker,
csv_directory):
parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs)
csv_directory, visualization_type, request_parameters):
parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs)
if visualization_type != 'ngram':
# timeline and histogram expect a series of parameters
parameters = [parameters]
request_json = {
'visualization': 'date_term_frequency',
'parameters': [parameters],
'corpus': small_mock_corpus
'visualization': visualization_type,
'parameters': parameters,
'corpus_name': small_mock_corpus
}
response = admin_client.post(
'/api/download/full_data',
Expand Down Expand Up @@ -160,7 +172,7 @@ def test_csv_download_view(admin_client, finished_download):
def some_document_id(admin_client, small_mock_corpus, index_small_mock_corpus):
search_response = admin_client.post(
f'/api/es/{small_mock_corpus}/_search',
{'es_query': MATCH_ALL},
{'es_query': query.MATCH_ALL},
content_type='application/json'
)

Expand Down Expand Up @@ -188,7 +200,7 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock
encoding = 'utf-8'
download_request_json = {
'corpus': small_mock_corpus,
'es_query': MATCH_ALL,
'es_query': query.MATCH_ALL,
'tags': [tag_on_some_document.id],
'fields': ['date','content'],
'size': 3,
Expand Down
6 changes: 3 additions & 3 deletions backend/download/tests/test_full_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo
],
'unit': 'year'
}]

group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency')
visualization_type = 'date_term_frequency'
group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type)
results = group.apply().get()
log_id = 0 # fake ID
filename = tasks.make_term_frequency_csv(results, full_data_parameters, log_id)
filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters, log_id)

with open(filename) as f:
reader = csv.DictReader(f)
Expand Down
4 changes: 2 additions & 2 deletions backend/download/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ class FullDataDownloadTaskView(APIView):
permission_classes = [IsAuthenticated, CorpusAccessPermission]

def post(self, request, *args, **kwargs):
check_json_keys(request, ['visualization', 'parameters', 'corpus'])
check_json_keys(request, ['visualization', 'parameters', 'corpus_name'])

visualization_type = request.data['visualization']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram']
if visualization_type not in known_visualisations:
raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"')

Expand Down
2 changes: 1 addition & 1 deletion backend/es/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size):
result = search(
corpus = corpus,
query_model=query_model,
size = size,
size=size,
)
return hits(result)
Loading

0 comments on commit 359f3a2

Please sign in to comment.