diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py
index 0b3e673ce..a98884358 100644
--- a/backend/addcorpus/constants.py
+++ b/backend/addcorpus/constants.py
@@ -1,9 +1,8 @@
from enum import Enum
CATEGORIES = [
- ('newspaper', 'Newspapers'),
('parliament', 'Parliamentary debates'),
- ('periodical', 'Periodicals'),
+ ('periodical', 'Newspapers and other periodicals'),
('finance', 'Financial reports'),
('ruling', 'Court rulings'),
('review', 'Online reviews'),
diff --git a/backend/addcorpus/migrations/0004_alter_corpusconfiguration_category.py b/backend/addcorpus/migrations/0004_alter_corpusconfiguration_category.py
new file mode 100644
index 000000000..f336ae2d8
--- /dev/null
+++ b/backend/addcorpus/migrations/0004_alter_corpusconfiguration_category.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.1.9 on 2023-09-21 14:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('addcorpus', '0003_add_corpusconfiguration'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='corpusconfiguration',
+ name='category',
+ field=models.CharField(choices=[('parliament', 'Parliamentary debates'), ('periodical', 'Newspapers and other periodicals'), ('finance', 'Financial reports'), ('ruling', 'Court rulings'), ('review', 'Online reviews'), ('inscription', 'Funerary inscriptions'), ('oration', 'Orations'), ('book', 'Books')], help_text='category/medium of documents in this dataset', max_length=64),
+ ),
+ ]
diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py
index 9b14b841b..f326ced2a 100644
--- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py
+++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py
@@ -36,7 +36,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition):
es_index = getattr(settings, 'DUTCHNEWSPAPERS_ES_INDEX', 'dutchnewspapers-public')
image = 'dutchnewspapers.jpg'
languages = ['nl']
- category = 'newspaper'
+ category = 'periodical'
@property
def es_settings(self):
diff --git a/backend/corpora/ecco/description/ecco.md b/backend/corpora/ecco/description/ecco.md
new file mode 100644
index 000000000..69fa16bc8
--- /dev/null
+++ b/backend/corpora/ecco/description/ecco.md
@@ -0,0 +1,38 @@
+*Eighteenth Century Collections Online (ECCO)* is a fully text-searchable corpus of books, pamphlets, and broadsides on all subjects, printed between 1701 and 1800. It currently contains over 135,000 titles amounting to over 26 million fully searchable pages. *ECCO* is a digitization of the eighteenth-century section of the works catalogued in the *English Short-title Catalogue (ESTC)*.
+
+Most of these works were printed in England, Scotland, Ireland, and the United States, but the collection also contains works printed in territories under British colonial rule, as well as in countries across Europe and Asia.
+
+The corpus includes everything from six-penny broadsheets and pamphlets to books and government documents, written by or about people of all professions and classes.
+
+### Subjects
+
+- Multidisciplinary
+- Eighteenth-century knowledge, thought, beliefs, events
+- Age of Enlightenment
+- Histories
+- Poetry
+- Novels
+- Plays
+- Law books
+- Biographies
+- Science
+- Philosophy
+- Dictionaries
+- Theology/ Religion
+- Diaries
+- Almanacs
+- … and many more
+
+### Read more
+
+Additional information can be found in the links below.
+
+- [Access through publisher website (requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/start.do?p=ECCO&u=utrecht)
+- [About this archive (publisher website; requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/helpCenter?userGroupName=utrecht&inPS=true&nspage=true&prodId=ECCO&docId=EFZIPA587871271)
+- [Sample topics and searches (publisher website; requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/helpCenter?userGroupName=utrecht&inPS=true&nspage=true&prodId=ECCO&docId=OAWADC058207024&title=Sample%20Topics%20and%20Searches)
+
+### Availability
+
+*ECCO* is published by [Gale](https://en.wikipedia.org/wiki/Gale_(publisher)) and is only available to members of Utrecht University.
+
+*Note:* Only *ECCO Part I* is available on I-analyzer.
diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py
index aeb02df5a..ed1ac4c02 100644
--- a/backend/corpora/ecco/ecco.py
+++ b/backend/corpora/ecco/ecco.py
@@ -26,6 +26,7 @@
class Ecco(XMLCorpusDefinition):
title = "Eighteenth Century Collections Online"
description = "Digital collection of books published in Great Britain during the 18th century."
+ description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)
diff --git a/backend/corpora/guardianobserver/description/guardianobserver.md b/backend/corpora/guardianobserver/description/guardianobserver.md
new file mode 100644
index 000000000..745bbc2d4
--- /dev/null
+++ b/backend/corpora/guardianobserver/description/guardianobserver.md
@@ -0,0 +1,30 @@
+
+This corpus contains articles from *The Guardian* and *The Observer*.
+
+### The Guardian
+
+*The Guardian* is a British daily newspaper, originally founded in 1821 as *The Manchester Guardian*. It is a sister newspaper to both *The Observer* and *The Guardian Weekly*. It is considered a “newspaper of record” and is currently one of the most widely read newspapers in the UK and among the most respected in the world.
+
+Political alignment: Centre-left
+
+### The Observer
+
+*The Observer* is a British newspaper published weekly on Sundays. It is the world's oldest Sunday newspaper and is a sister paper to both *The Guardian* and *The Guardian Weekly*.
+
+Political alignment: Centre-left; British republicanism
+
+### Subjects
+
+- Historical local, regional and national news
+- Multidisciplinary
+
+### Read more
+
+- [The Guardian (Wikipedia)](https://en.wikipedia.org/wiki/The_Guardian)
+- [Official website of The Guardian](https://www.theguardian.com/international)
+- [The Observer (Wikipedia)](https://en.wikipedia.org/wiki/The_Observer)
+- [Access through publisher website (requires Utrecht University login)](https://www.proquest.com/hnpguardianobserver/index?parentSessionId=SBW10zSG6gyVTa17wSPUIoNhfaXQZBxx2UvOA9%2FiYto%3D&accountid=14772)
+
+### Availability
+
+The Guardian/Observer corpus is published by [ProQuest](https://en.wikipedia.org/wiki/ProQuest) and is only available to members of Utrecht University.
diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py
index 38d56b5a4..b700e82c1 100644
--- a/backend/corpora/guardianobserver/guardianobserver.py
+++ b/backend/corpora/guardianobserver/guardianobserver.py
@@ -34,6 +34,7 @@
class GuardianObserver(XMLCorpusDefinition):
title = "Guardian-Observer"
description = "Newspaper archive, 1791-2003"
+ description_page = 'guardianobserver.md'
min_date = datetime(year=1791, month=1, day=1)
max_date = datetime(year=2003, month=12, day=31)
data_directory = settings.GO_DATA
@@ -41,7 +42,7 @@ class GuardianObserver(XMLCorpusDefinition):
image = 'guardianobserver.jpg'
scan_image_type = getattr(settings, 'GO_SCAN_IMAGE_TYPE', 'application/pdf')
languages = ['en']
- category = 'newspaper'
+ category = 'periodical'
@property
def es_settings(self):
diff --git a/backend/corpora/periodicals/description/19thCenturyUKPeriodicals.md b/backend/corpora/periodicals/description/19thCenturyUKPeriodicals.md
index 38bb18753..2715e6392 100644
--- a/backend/corpora/periodicals/description/19thCenturyUKPeriodicals.md
+++ b/backend/corpora/periodicals/description/19thCenturyUKPeriodicals.md
@@ -1,10 +1,26 @@
-### 19th Century UK Periodicals: new readerships
+The *Nineteenth Century UK Periodicals* series covers the events, lives, values, and themes that shaped the nineteenth-century world.
-The 19th century was a time of revolutionary change and expansion. Britain was one of the
-world’s first industrial, urban superpowers and developed a press to feed the demands
-of its increasingly literate population: 19th Century UK Periodicals covers the events, lives,
-values and themes that shaped the nineteenth-century world.
+The collection comprises material published primarily in England, but also includes titles from Australia, Canada, India, South Africa, and many other countries.
+The collection was predominantly sourced from two major libraries – the British Library and the National Library of Scotland.
+
+### Subjects
+
+- Empire and Colonialism
+- Science and Industry
+- Cities and Society
+- Sport and Leisure
+- Politics
+- Daily Life
+- Feminism
+- Art and Culture
+- Philosophy
+- Literature
+- Parenting
+- Medicine
+- … and many more
+
+### Titles
The corpus includes the following 91 titles:
- Alexandra Magazine and Womans Social and Industrial Advocate
- Atalanta
@@ -96,4 +112,14 @@ The corpus includes the following 91 titles:
- Walters Theatrical and Sporting Directory and Book of Reference
- Womans Advocate
- Women and Work: A Weekly Industrial Educational and Household Register for Women
-- Womens Penny Paper.
\ No newline at end of file
+- Womens Penny Paper.
+
+### Read more
+
+- [Access through publisher website (requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/start.do?p=NCUK&u=utrecht)
+- [About this archive (publisher website; requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/helpCenter?userGroupName=utrecht&inPS=true&nspage=true&prodId=NCUK&docId=DWSDAY911647535)
+- [Sample topics and searches (publisher website; requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/helpCenter?userGroupName=utrecht&inPS=true&nspage=true&prodId=NCUK&docId=KEECCH350737398&title=Sample%20Topics%20and%20Searches)
+
+### Availability
+
+This corpus is published by [Gale](https://en.wikipedia.org/wiki/Gale_(publisher)) and is only available to members of Utrecht University.
diff --git a/backend/corpora/times/description/times.md b/backend/corpora/times/description/times.md
index d91c7956d..0d880b005 100644
--- a/backend/corpora/times/description/times.md
+++ b/backend/corpora/times/description/times.md
@@ -1,6 +1,34 @@
-### The Times Digtial Archive 1785-2012
+*The Times* is a British daily national newspaper, originally founded in 1785 as *The Daily Universal Register*. *The Times* is the oldest daily newspaper in continuous publication and remains one of the most widely read and respected newspapers in the world. It is a sister newspaper to *The Sunday Times*.
+
+Political alignment: Conservative; Centre-right
+
+This corpus contains a full-text version of 200 years of *The Times*, a critical source for studying a range of subjects.
-This corpus contains a full-text version of 200 years of The Times, a critical source for studying a range of subjects.
All issues of this period are present, with the following exceptions:
-- Issues of march 1785: they are missing in the publisher's archive.
-- Issues in date range 01/01/1979 - 31/10/1979: during this period, a major general strike occured and no newspaper editions were published.
\ No newline at end of file
+- Issues of March 1785: they are missing from the publisher's archive.
+- Issues in the date range 01/01/1979 - 31/10/1979: during this period, publication was halted by a major industrial dispute and no newspaper editions were published.
+
+### Subjects
+
+- Historical local, regional and national news
+- Multidisciplinary
+- Business
+- Humanities
+- Political Science
+- Philosophy
+- Major international historical events
+
+### Read more
+
+- [The Times (Wikipedia)](https://en.wikipedia.org/wiki/The_Times)
+- [Access through publisher website (requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/start.do?p=TTDA&u=utrecht)
+- [About this archive (publisher website; requires Utrecht University login)](https://go-gale-com.proxy.library.uu.nl/ps/helpCenter?userGroupName=utrecht&inPS=true&nspage=true&prodId=TTDA&docId=QCOGMG579883681)
+- [Sample topics and searches](https://go-gale-com.proxy.library.uu.nl/ps/helpCenter?userGroupName=utrecht&inPS=true&nspage=true&prodId=TTDA&docId=GCANVE436736839&title=Sample%20Topics%20and%20Searches)
+
+### Availability
+
+This corpus is published by [Gale](https://en.wikipedia.org/wiki/Gale_(publisher)) and is only available to members of Utrecht University.
+
+### Image source
+
+Corpus image from [Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Twice_round_the_clock;_or,_The_hours_of_the_day_and_night_in_London_(1859)_(14776691334).jpg)
diff --git a/backend/corpora/times/images/times.jpg b/backend/corpora/times/images/times.jpg
index 0da3342c2..6acbd5524 100644
Binary files a/backend/corpora/times/images/times.jpg and b/backend/corpora/times/images/times.jpg differ
diff --git a/backend/corpora/times/images/times.jpg~ b/backend/corpora/times/images/times.jpg~
new file mode 100644
index 000000000..a00f4da7a
Binary files /dev/null and b/backend/corpora/times/images/times.jpg~ differ
diff --git a/backend/corpora/times/images/times_thumb.jpg b/backend/corpora/times/images/times_thumb.jpg
deleted file mode 100644
index 5f34caad2..000000000
Binary files a/backend/corpora/times/images/times_thumb.jpg and /dev/null differ
diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py
index 38ec07ddc..1e0ff0d87 100644
--- a/backend/corpora/times/times.py
+++ b/backend/corpora/times/times.py
@@ -35,7 +35,7 @@ class Times(XMLCorpusDefinition):
scan_image_type = getattr(settings, 'TIMES_SCAN_IMAGE_TYPE', 'image/png')
description_page = 'times.md'
languages = ['en']
- category = 'newspaper'
+ category = 'periodical'
@property
def es_settings(self):
diff --git a/backend/visualization/ngram.py b/backend/visualization/ngram.py
index bcf5c748f..3e568a612 100644
--- a/backend/visualization/ngram.py
+++ b/backend/visualization/ngram.py
@@ -1,41 +1,23 @@
from collections import Counter
+
+import numpy as np
+
from addcorpus.models import CorpusConfiguration
from datetime import datetime
from es.search import get_index, search
from ianalyzer.elasticsearch import elasticsearch
from visualization import query, termvectors
-from es import download
-def get_ngrams(es_query, corpus, field,
- ngram_size=2, positions='any', freq_compensation=True, subfield='none', max_size_per_interval=50,
- number_of_ngrams=10, date_field = 'date'):
+def get_ngrams(results, number_of_ngrams=10):
"""Given a query and a corpus, get the words that occurred most frequently around the query term"""
+ ngrams = []
+ ngrams = get_top_n_ngrams(results, number_of_ngrams)
- bins = get_time_bins(es_query, corpus)
- time_labels = [format_time_label(start_year, end_year) for start_year, end_year in bins]
-
- positions_dict = {
- 'any': list(range(ngram_size)),
- 'first': [0],
- 'second': [1],
- 'third': [2],
- 'fourth': [3],
+ return {
+ 'words': ngrams,
+ 'time_points': sorted([result['time_interval'] for result in results])
}
- term_positions = positions_dict[positions]
-
- # find ngrams
-
- docs, total_frequencies = tokens_by_time_interval(
- corpus, es_query, field, bins, ngram_size, term_positions, freq_compensation, subfield, max_size_per_interval,
- date_field
- )
- if freq_compensation:
- ngrams = get_top_n_ngrams(docs, total_frequencies, number_of_ngrams)
- else:
- ngrams = get_top_n_ngrams(docs, dict(), number_of_ngrams)
-
- return { 'words': ngrams, 'time_points' : time_labels }
def format_time_label(start_year, end_year):
@@ -95,105 +77,107 @@ def get_time_bins(es_query, corpus):
return bins
-def tokens_by_time_interval(corpus, es_query, field, bins, ngram_size, term_positions, freq_compensation, subfield, max_size_per_interval, date_field):
+def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field):
index = get_index(corpus)
client = elasticsearch(corpus)
- ngrams_per_bin = []
+ positions_dict = {
+ 'any': list(range(ngram_size)),
+ 'first': [0],
+ 'second': [1],
+ 'third': [2],
+ 'fourth': [3],
+ }
+ term_positions = positions_dict[term_position]
ngram_ttfs = dict()
query_text = query.get_query_text(es_query)
field = field if subfield == 'none' else '.'.join([field, subfield])
- for (start_year, end_year) in bins:
- start_date = datetime(start_year, 1, 1)
- end_date = datetime(end_year, 12, 31)
-
- # filter query on this time bin
- date_filter = query.make_date_filter(start_date, end_date, date_field)
- narrow_query = query.add_filter(es_query, date_filter)
-
- #search for the query text
- search_results = search(
- corpus=corpus,
- query_model = narrow_query,
- client = client,
- size = max_size_per_interval,
+ start_date = datetime(bin[0], 1, 1)
+ end_date = datetime(bin[1], 12, 31)
+
+ # filter query on this time bin
+ date_filter = query.make_date_filter(start_date, end_date, date_field)
+ narrow_query = query.add_filter(es_query, date_filter)
+ #search for the query text
+ search_results = search(
+ corpus=corpus,
+ query_model = narrow_query,
+ client = client,
+ size = max_size_per_interval,
+ )
+ bin_ngrams = Counter()
+ for hit in search_results['hits']['hits']:
+ identifier = hit['_id']
+ # get the term vectors for the hit
+ result = client.termvectors(
+ index=index,
+ id=identifier,
+ term_statistics=freq_compensation,
+ fields = [field]
)
-
- bin_ngrams = Counter()
-
- for hit in search_results['hits']['hits']:
- identifier = hit['_id']
-
- # get the term vectors for the hit
- result = client.termvectors(
- index=index,
- id=identifier,
- term_statistics=freq_compensation,
- fields = [field]
- )
-
- terms = termvectors.get_terms(result, field)
-
- if terms:
- sorted_tokens = termvectors.get_tokens(terms, sort=True)
-
- for match_start, match_stop, match_content in termvectors.token_matches(sorted_tokens, query_text, index, field, client):
- for j in term_positions:
- start = match_start - j
- stop = match_stop - 1 - j + ngram_size
- if start >= 0 and stop <= len(sorted_tokens):
- ngram = sorted_tokens[start:stop]
- words = ' '.join([token['term'] for token in ngram])
+ terms = termvectors.get_terms(result, field)
+ if terms:
+ sorted_tokens = termvectors.get_tokens(terms, sort=True)
+ for match_start, match_stop, match_content in termvectors.token_matches(sorted_tokens, query_text, index, field, client):
+ for j in term_positions:
+ start = match_start - j
+ stop = match_stop - 1 - j + ngram_size
+ if start >= 0 and stop <= len(sorted_tokens):
+ ngram = sorted_tokens[start:stop]
+ words = ' '.join([token['term'] for token in ngram])
+ if freq_compensation:
ttf = sum(token['ttf'] for token in ngram) / len(ngram)
ngram_ttfs[words] = ttf
- bin_ngrams.update({ words: 1})
-
- # output per bin: all tokens from this time interval
- ngrams_per_bin.append(bin_ngrams)
-
- return ngrams_per_bin, ngram_ttfs
+ bin_ngrams.update({ words: 1})
+
+ results = {
+ 'time_interval': format_time_label(bin[0], bin[1]),
+ 'ngrams': bin_ngrams
+ }
+ if freq_compensation:
+ results['ngram_ttfs'] = ngram_ttfs
+ return results
-def get_top_n_ngrams(counters, total_frequencies = None, number_of_ngrams=10):
+def get_top_n_ngrams(results, number_of_ngrams=10):
"""
Converts a list of documents with tokens into n dataseries, listing the
frequency of the top n tokens and their frequency in each document.
Input:
- - `docs`: a list of Counter objects with ngram frequencies. The division into counters reflects how the data is grouped,
- i.e. by time interval. Each counter object reflects how often ngram tokens have been observed per interval. Presumably,
- each token is a string containing an ngram.
- but can be any immutable object. The division into documents reflects how the data is grouped (e.g. by time interval).
- - `total_frequencies`: dict or `None`. If a dict, it should give the total frequency for every ngram that features in `docs`. In
- practice, this is the average frequency of each word in the ngram. If the dict is provided, the frequency of the ngram will be divided
- by it.
+ - `results`: a list of dictionaries with the following fields:
+        'ngrams': a Counter object with ngram frequencies
+        'time_interval': the time interval for which the ngrams were counted
+        'ngram_ttfs' (optional): averaged total term frequencies; only present if freq_compensation was requested
+ - `number_of_ngrams`: the number of top ngrams to return
Output:
- A list of 10 data series. Each series is a dict with two keys: `'label'` contains the content of a token (presumably an
+ A list of number_of_ngrams data series. Each series is a dict with two keys: `'label'` contains the content of a token (presumably an
ngram string), `'data'` contains a list of the frequency of that token in each document. Depending on `divide_by_ttf`,
this is absolute or relative to the total term frequencies provided.
"""
-
total_counter = Counter()
- for c in counters:
- total_counter.update(c)
+ for result in results:
+ total_counter.update(result['ngrams'])
+ sorted_results = sorted(results, key=lambda r: r['time_interval'])
number_of_results = min(number_of_ngrams, len(total_counter))
- if total_frequencies:
- def frequency(ngram, counter): return counter[ngram] / total_frequencies[ngram]
+ if 'ngram_ttfs' in results[0]:
+ total_frequencies = {}
+ for result in results:
+ total_frequencies.update(result['ngram_ttfs'])
+ def frequency(ngram, counter): return counter.get(ngram, 0.0) / max(1.0, total_frequencies[ngram])
def overall_frequency(ngram): return frequency(ngram, total_counter)
top_ngrams = sorted(total_counter.keys(), key=overall_frequency, reverse=True)[:number_of_results]
else:
- def frequency(ngram, counter): return counter[ngram]
+ def frequency(ngram, counter): return counter.get(ngram, 0)
top_ngrams = [word for word, freq in total_counter.most_common(number_of_results)]
-
-
output = [{
'label': ngram,
- 'data': [frequency(ngram, c)
- for c in counters]
+ 'data': [frequency(ngram, result['ngrams'])
+ for result in sorted_results]
}
for ngram in top_ngrams]
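For orientation, a minimal sketch of the data shapes involved in this refactor, using hypothetical bin labels and counts (not taken from the repository): `tokens_by_time_interval` now returns one dict per time bin, and `get_ngrams` / `get_top_n_ngrams` aggregate a list of such dicts.

```python
from collections import Counter
from visualization import ngram

# Hypothetical per-bin results, mirroring the dicts returned by
# tokens_by_time_interval (one per time bin, no freq_compensation).
results = [
    {'time_interval': '1800-1850', 'ngrams': Counter({'east india': 3, 'india company': 2})},
    {'time_interval': '1850-1900', 'ngrams': Counter({'india company': 5})},
]

output = ngram.get_ngrams(results, number_of_ngrams=2)
# output['time_points'] == ['1800-1850', '1850-1900']  (sorted interval labels)
# output['words'] == [
#     {'label': 'india company', 'data': [2, 5]},
#     {'label': 'east india', 'data': [3, 0]},
# ]
```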
diff --git a/backend/visualization/tasks.py b/backend/visualization/tasks.py
index 6ec169eec..3b7dcf88a 100644
--- a/backend/visualization/tasks.py
+++ b/backend/visualization/tasks.py
@@ -1,4 +1,4 @@
-from celery import shared_task, group
+from celery import chord, group, shared_task
from django.conf import settings
from visualization import wordcloud, ngram, term_frequency
from es import download as es_download
@@ -9,21 +9,39 @@ def get_wordcloud_data(request_json):
word_counts = wordcloud.make_wordcloud_data(list_of_texts, request_json['field'], request_json['corpus'])
return word_counts
-@shared_task()
-def get_ngram_data(request_json):
- return ngram.get_ngrams(
- request_json['es_query'],
- request_json['corpus_name'],
- request_json['field'],
- ngram_size=request_json['ngram_size'],
- positions=request_json['term_position'],
- freq_compensation=request_json['freq_compensation'],
- subfield=request_json['subfield'],
- max_size_per_interval=request_json['max_size_per_interval'],
- number_of_ngrams=request_json['number_of_ngrams'],
- date_field = request_json['date_field']
- )
+@shared_task
+def get_ngram_data_bin(**kwargs):
+ return ngram.tokens_by_time_interval(**kwargs)
+@shared_task
+def integrate_ngram_results(results, **kwargs):
+ return ngram.get_ngrams(results, **kwargs)
+
+def ngram_data_tasks(request_json):
+ corpus = request_json['corpus_name']
+ es_query = request_json['es_query']
+ freq_compensation = request_json['freq_compensation']
+ bins = ngram.get_time_bins(es_query, corpus)
+
+ return chord(group([
+ get_ngram_data_bin.s(
+ corpus=corpus,
+ es_query=es_query,
+ field=request_json['field'],
+ bin=b,
+ ngram_size=request_json['ngram_size'],
+ term_position=request_json['term_position'],
+ freq_compensation=freq_compensation,
+ subfield=request_json['subfield'],
+ max_size_per_interval=request_json['max_size_per_interval'],
+ date_field=request_json['date_field']
+ )
+ for b in bins
+ ]), integrate_ngram_results.s(
+ number_of_ngrams=request_json['number_of_ngrams']
+ )
+ )()
+
@shared_task()
def get_histogram_term_frequency_bin(es_query, corpus_name, field_name, field_value, size, include_query_in_result = False):
'''
diff --git a/backend/visualization/tests/test_ngrams.py b/backend/visualization/tests/test_ngrams.py
index f907c8047..93de4bbdb 100644
--- a/backend/visualization/tests/test_ngrams.py
+++ b/backend/visualization/tests/test_ngrams.py
@@ -1,7 +1,5 @@
-from random import sample
from typing import Counter
from visualization import query, ngram
-from visualization.tests.mock_corpora.small_mock_corpus import SmallMockCorpus
from datetime import datetime, date
import pytest
@@ -83,7 +81,7 @@ def test_top_10_ngrams():
['a', 'c']
]
- counts = [Counter(doc) for doc in docs]
+ time_intervals = ['1820-1830','1830-1840','1840-1850']
target_data = {
'a': [1, 1, 1],
@@ -96,14 +94,15 @@ def test_top_10_ngrams():
'b': 200,
'c': 150,
}
+ test_results = [{'ngrams': Counter(doc), 'time_interval': time_intervals[i]} for i, doc in enumerate(docs)]
- output_absolute = ngram.get_top_n_ngrams(counts)
+ output_absolute = ngram.get_top_n_ngrams(test_results)
for word in target_data:
dataset_absolute = next(series for series in output_absolute if series['label'] == word)
assert dataset_absolute['data'] == target_data[word]
-
- output_relative = ngram.get_top_n_ngrams(counts, ttf)
+    for r in test_results: r.update({'ngram_ttfs': ttf})
+ output_relative = ngram.get_top_n_ngrams(test_results)
for word in target_data:
dataset_relative = next(series for series in output_relative if series['label'] == word)
@@ -112,14 +111,17 @@ def test_top_10_ngrams():
for w in target_data }
assert dataset_relative['data'] == relative_frequencies[word]
-
+def get_binned_results(corpus, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
+ return [
+ ngram.tokens_by_time_interval(
+ corpus, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
+ for bin in time_bins
+ ]
def test_absolute_bigrams(small_mock_corpus, index_small_mock_corpus, basic_query):
# search for a word that occurs a few times
frequent_query = query.set_query_text(basic_query, 'to')
-
-
# expected bigram frequencies
bigrams = [
{
@@ -160,12 +162,13 @@ def test_absolute_bigrams(small_mock_corpus, index_small_mock_corpus, basic_quer
}
]
- result = ngram.get_ngrams(frequent_query, small_mock_corpus, 'content', freq_compensation=False)
+ results = get_binned_results(small_mock_corpus, frequent_query)
- assert result['time_points'] == ['{}-{}'.format(start, end) for start, end in CENTURY_BINS]
+ assert sorted([r['time_interval'] for r in results]) == sorted(['{}-{}'.format(start, end) for start, end in CENTURY_BINS])
+ integrated_results = ngram.get_ngrams(results)
for bigram in bigrams:
- data = next((item for item in result['words'] if item['label'] == bigram['label']), None)
+ data = next((item for item in integrated_results['words'] if item['label'] == bigram['label']), None)
assert data
for bin, freq in enumerate(data['data']):
@@ -204,7 +207,7 @@ def test_bigrams_with_quote(small_mock_corpus, index_small_mock_corpus, basic_qu
# search for a word that occurs a few times
case_query = query.set_query_text(basic_query, case['query'])
- result = ngram.get_ngrams(case_query, small_mock_corpus, 'content', freq_compensation=False)
+ result = ngram.get_ngrams(get_binned_results(small_mock_corpus, case_query))
ngrams = case['ngrams']
@@ -259,8 +262,16 @@ def test_number_of_ngrams(small_mock_corpus, index_small_mock_corpus, basic_quer
max_frequency = 6
- for size in range(1, max_frequency + 2):
- result = ngram.get_ngrams(frequent_query, small_mock_corpus, 'content', number_of_ngrams= size)
+ for number_of_ngrams in range(1, max_frequency + 2):
+ result = ngram.get_ngrams(get_binned_results(small_mock_corpus, frequent_query), number_of_ngrams=number_of_ngrams)
series = result['words']
- assert len(series) == min(max_frequency, size)
+ assert len(series) == min(max_frequency, number_of_ngrams)
+
+def test_freq_compensation(small_mock_corpus, index_small_mock_corpus, basic_query):
+ frequent_query = query.set_query_text(basic_query, 'to')
+ results = get_binned_results(small_mock_corpus, frequent_query, freq_compensation=True)
+ top_grams = ngram.get_top_n_ngrams(results)
+ assert top_grams
+
+
diff --git a/backend/visualization/views.py b/backend/visualization/views.py
index ee997e9bc..034a7d584 100644
--- a/backend/visualization/views.py
+++ b/backend/visualization/views.py
@@ -57,10 +57,9 @@ def post(self, request, *args, **kwargs):
try:
handle_tags_in_request(request)
- ngram_counts_task = tasks.get_ngram_data.delay(request.data)
- return Response({
- 'task_ids': [ngram_counts_task.id]
- })
+ chord = tasks.ngram_data_tasks(request.data)
+ subtasks = [chord, *chord.parent.children]
+ return Response({'task_ids': [task.id for task in subtasks]})
except Exception as e:
logger.error(e)
raise APIException(detail='Could not set up ngram generation.')
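For context on the task ids returned above: this relies on standard Celery behaviour, where applying a chord yields the callback's `AsyncResult` whose `parent` is the `GroupResult` of the header tasks, so `[chord, *chord.parent.children]` collects one id per time bin plus the callback. A minimal sketch of that generic pattern, with illustrative task names that are not part of this project:

```python
from celery import Celery, chord, group

app = Celery('sketch', broker='memory://', backend='cache+memory://')

@app.task
def count_bin(bin_label):
    # Stand-in for get_ngram_data_bin: one header task per time bin.
    return {'time_interval': bin_label, 'ngrams': {}}

@app.task
def integrate(results):
    # Stand-in for integrate_ngram_results: the chord callback receives
    # the list of header results once every bin has finished.
    return {'bins': len(results)}

# Applying the chord returns the callback's AsyncResult; its .parent is the
# GroupResult of the header group, whose .children are the per-bin results.
result = chord(group(count_bin.s(b) for b in ['1800-1850', '1850-1900']), integrate.s())()
task_ids = [result.id, *(child.id for child in result.parent.children)]
```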
diff --git a/frontend/src/app/filter/ad-hoc-filter.component.spec.ts b/frontend/src/app/filter/ad-hoc-filter.component.spec.ts
deleted file mode 100644
index 4c92ce6bb..000000000
--- a/frontend/src/app/filter/ad-hoc-filter.component.spec.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import { ComponentFixture, TestBed } from '@angular/core/testing';
-import { commonTestBed } from '../common-test-bed';
-
-import { AdHocFilterComponent } from './ad-hoc-filter.component';
-
-describe('AdHocFilterComponent', () => {
- let component: AdHocFilterComponent;
- let fixture: ComponentFixture<AdHocFilterComponent>;