Skip to content

Commit

Permalink
Merge branch 'release/5.1.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Oct 30, 2023
2 parents 8bab032 + 2285937 commit aac4ce4
Show file tree
Hide file tree
Showing 76 changed files with 1,020 additions and 213 deletions.
36 changes: 36 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
name: Bug report
about: Let us know about something that isn't working right
title: ''
labels: bug
assignees: ''

---

### What went wrong?

Describe what happened.

### Expected behavior

What did you expect to happen?

### Screenshots

If applicable, please add a screenshot of the problem!

### Which version?

Please specify where you encountered the issue:

- [ ] https://ianalyzer.hum.uu.nl
- [ ] https://peopleandparliament.hum.uu.nl
- [ ] https://peace.sites.uu.nl/
- [ ] a server hosted elsewhere (i.e. not by the research software lab)
- [ ] a local server

If this happened on a local or third-party server, it helps if you can be more specific about the version. Please include the version number (e.g. "3.2.4") or a commit hash if you know it!

### To reproduce

How can a developer replicate the issue? Please provide any information you can. For example: "I went to https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then clicked on *Download CSV*. I pressed *cancel* and then I clicked *Download CSV* again."
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for something new
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. For example: I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
2 changes: 1 addition & 1 deletion backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
'''
Mapping for the main content field. Options:
Expand Down
41 changes: 29 additions & 12 deletions backend/api/tests/test_api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,11 @@
from addcorpus.models import Corpus
from rest_framework.status import is_success

def test_search_history_view(admin_user, admin_client):
corpus = Corpus.objects.create(name = 'mock-corpus')

# get search history
response = admin_client.get('/api/search_history/')
assert is_success(response.status_code)
assert len(response.data) == 0

# add a query to search history
data = {
def mock_query_data(user, corpus_name):
return {
'aborted': False,
'corpus': corpus.name,
'user': admin_user.id,
'corpus': corpus_name,
'user': user.id,
'started': datetime.now().isoformat(),
'completed': datetime.now().isoformat(),
'query_json': {
Expand All @@ -25,6 +17,17 @@ def test_search_history_view(admin_user, admin_client):
'total_results': 10,
'transferred': 0,
}

def test_search_history_view(admin_user, admin_client):
corpus = Corpus.objects.create(name = 'mock-corpus')

# get search history
response = admin_client.get('/api/search_history/')
assert is_success(response.status_code)
assert len(response.data) == 0

# add a query to search history
data = mock_query_data(admin_user, 'mock-corpus')
response = admin_client.post('/api/search_history/', data, content_type='application/json')
assert is_success(response.status_code)

Expand All @@ -34,6 +37,20 @@ def test_search_history_view(admin_user, admin_client):
assert len(response.data) == 1


def test_delete_search_history(auth_client, auth_user, db):
    '''
    Check that posting to `delete_all` removes the user's saved queries.

    One query is saved through the API first, so that there is something
    to delete.
    '''
    mock_corpus = 'mock-corpus'
    # create the corpus that the mock query refers to by name
    Corpus.objects.create(name=mock_corpus)
    query = mock_query_data(auth_user, mock_corpus)
    response = auth_client.post('/api/search_history/', query, content_type='application/json')
    # fail here, rather than on the count below, if the setup request is rejected
    assert is_success(response.status_code)

    assert auth_user.queries.count() == 1

    response = auth_client.post('/api/search_history/delete_all/')
    assert is_success(response.status_code)

    assert auth_user.queries.count() == 0


def test_task_status_view(transactional_db, admin_client, celery_worker):
bad_request = {
'bad_key': 'data'
Expand Down
8 changes: 7 additions & 1 deletion backend/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from api.serializers import QuerySerializer
from rest_framework.permissions import IsAuthenticated
from rest_framework.exceptions import APIException
from rest_framework.decorators import action
import logging
from rest_framework.permissions import IsAuthenticated
from api.utils import check_json_keys
from celery import current_app as celery_app

Expand All @@ -23,6 +23,12 @@ class QueryViewset(viewsets.ModelViewSet):
def get_queryset(self):
return self.request.user.queries.all()

@action(detail=False, methods=['post'])
def delete_all(self, request):
    '''
    Delete every query returned by `get_queryset` and respond with 'success'.
    '''
    self.get_queryset().delete()
    return Response('success')

class TaskStatusView(APIView):
'''
Get the status of an array of backend tasks (working/done/failed),
Expand Down
6 changes: 5 additions & 1 deletion backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition
from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.load_corpus import corpus_dir

from addcorpus.es_mappings import keyword_mapping, main_content_mapping
from addcorpus.es_settings import es_settings

from media.media_url import media_url

Expand Down Expand Up @@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition):

dutchannualreports_map = {}

@property
def es_settings(self):
    '''
    Settings built by `es_settings` for the first entry of `self.languages`,
    with the stopword and stemming analyzers enabled.
    '''
    primary_language = self.languages[0]
    return es_settings(
        primary_language,
        stopword_analyzer=True,
        stemming_analyzer=True,
    )

with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f:
reader = csv.DictReader(f)
for line in reader:
Expand Down
9 changes: 4 additions & 5 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition):
description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
Expand All @@ -47,6 +42,10 @@ def es_settings(self):

meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')

@property
def es_settings(self):
    '''
    Settings built by `es_settings` for the first entry of `self.languages`,
    with the stopword and stemming analyzers enabled.
    '''
    primary_language = self.languages[0]
    return es_settings(
        primary_language,
        stopword_analyzer=True,
        stemming_analyzer=True,
    )

def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)

Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
title = 'People and Parliament (Finland, 1863-1905)'
description = 'Speeches from the early Finnish estates'
max_date = datetime(year=1905, month=12, day=31)
max_date = datetime(year=1906, month=12, day=31)
min_date = datetime(year=1863, month=1, day=1)
data_directory = settings.PP_FINLAND_OLD_DATA
es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/netherlands.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition):
title = "People & Parliament (Netherlands)"
description = "Speeches from the Eerste Kamer and Tweede Kamer"
min_date = datetime(year=1815, month=1, day=1)
max_date = datetime(year=2020, month=12, day=31)
max_date = datetime(year=2022, month=12, day=31)
data_directory = settings.PP_NL_DATA
data_directory_recent = settings.PP_NL_RECENT_DATA
word_model_path = getattr(settings, 'PP_NL_WM', None)
Expand Down
10 changes: 7 additions & 3 deletions backend/download/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,22 @@ def index_ml_mock_corpus(es_client, ml_mock_corpus):
def index_mock_corpus(es_client, mock_corpus, index_small_mock_corpus, index_large_mock_corpus, index_ml_mock_corpus):
yield mock_corpus

def save_all_results_csv(mock_corpus, mock_corpus_specs):
def all_results_request_json(mock_corpus, mock_corpus_specs):
fields = mock_corpus_specs['fields']
query = mock_corpus_specs['example_query']

request_json = {
return {
'corpus': mock_corpus,
'es_query': MATCH_ALL,
'fields': fields,
'route': '/search/{};query={}'.format(mock_corpus, query)
}

def save_all_results_csv(mock_corpus, mock_corpus_specs):
request_json = all_results_request_json(mock_corpus, mock_corpus_specs)
results = tasks.download_scroll(request_json)
filename = tasks.make_csv(results, request_json)
fake_id = mock_corpus + '_all_results'
filename = tasks.make_csv(results, request_json, fake_id)

return filename

Expand Down
42 changes: 26 additions & 16 deletions backend/download/create_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from django.conf import settings

from visualization.query import get_query_text
from visualization.term_frequency import parse_datestring

def write_file(filename, fieldnames, rows, dialect = 'excel'):
Expand All @@ -21,12 +22,10 @@ def write_file(filename, fieldnames, rows, dialect = 'excel'):

return filepath

def create_filename(descriptive_part, essential_suffix = '.csv'):
max_length = 255 - (len(essential_suffix) + len(settings.CSV_FILES_PATH))
truncated = descriptive_part[:min(max_length, len(descriptive_part))]
return truncated + essential_suffix
def create_filename(download_id):
    '''
    Return the CSV filename for a download: its ID followed by `.csv`.
    '''
    return '{}.csv'.format(download_id)

def search_results_csv(results, fields, query):
def search_results_csv(results, fields, query, download_id):
entries = []
field_set = set(fields)
field_set.update(['query'])
Expand All @@ -50,14 +49,14 @@ def search_results_csv(results, fields, query):
entry.update({highlight_field_name: soup.get_text()})
entries.append(entry)

filename = create_filename(query)
filename = create_filename(download_id)
field_set.discard('context')
fieldnames = sorted(field_set)
filepath = write_file(filename, fieldnames, entries, dialect = 'resultsDialect')
return filepath


def term_frequency_csv(queries, results, field_name, unit = None):
def term_frequency_csv(queries, results, field_name, download_id, unit = None):
has_token_counts = results[0].get('token_count', None) != None
query_column = ['Query'] if len(queries) > 1 else []
freq_columns = ['Term frequency', 'Relative term frequency (by # documents)', 'Total documents']
Expand All @@ -66,17 +65,10 @@ def term_frequency_csv(queries, results, field_name, unit = None):

rows = term_frequency_csv_rows(queries, results, field_name, unit)

filename = term_frequency_filename(queries, field_name)
filename = create_filename(download_id)
filepath = write_file(filename, fieldnames, rows)
return filepath

def term_frequency_filename(queries, field_name):
querystring = '_'.join(queries)
timestamp = datetime.now().isoformat(sep='_', timespec='minutes') # ensure csv filenames are unique with timestamp
suffix = '_' + timestamp + '.csv'
description = 'term_frequency_{}_{}'.format(field_name, querystring)
return create_filename(description, suffix)

def term_frequency_csv_rows(queries, results, field_name, unit):
for result in results:
field_value = format_field_value(result['key'], unit)
Expand Down Expand Up @@ -108,4 +100,22 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
return date.strftime(formats[unit])
return date.strftime(formats[unit])

def ngram_csv(results, log_id):
    '''
    Write ngram results to a CSV file named after `log_id`.

    The rows come from `ngram_table`; returns the path given by `write_file`.
    '''
    table = ngram_table(results)
    columns = ['date', 'N-gram', 'Frequency']
    return write_file(create_filename(log_id), columns, table)

def ngram_table(results):
    '''
    Flatten ngram results into a list of row dicts, one per (time point, ngram).

    `results` must provide `time_points` (a sequence) and `words` (a sequence
    of dicts with a `label` and a `data` sequence indexed like `time_points`).
    Rows are grouped by time point, with ngrams in their original order.
    '''
    return [
        {
            'date': moment,
            'N-gram': word['label'],
            'Frequency': word['data'][position],
        }
        for position, moment in enumerate(results['time_points'])
        for word in results['words']
    ]
4 changes: 2 additions & 2 deletions backend/download/mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def send_csv_email(user_email, username, download_id):

subject = 'I-Analyzer CSV download'
from_email = settings.DEFAULT_FROM_EMAIL
path = Download.objects.get(id=download_id).filename
_, filename = os.path.split(path)
download = Download.objects.get(id=download_id)
filename = download.descriptive_filename()

context = {
'email_title': 'Download CSV',
Expand Down
18 changes: 18 additions & 0 deletions backend/download/migrations/0002_alter_download_download_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.1.10 on 2023-10-18 12:52

from django.db import migrations, models


class Migration(migrations.Migration):
    # Migration for the `download` app; depends on `0001_initial`.

    dependencies = [
        ('download', '0001_initial'),
    ]

    # Redefine the `download_type` field of the `download` model with the
    # choices listed below, which include 'ngram' ("Neighbouring words").
    operations = [
        migrations.AlterField(
            model_name='download',
            name='download_type',
            field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
        ),
    ]
15 changes: 12 additions & 3 deletions backend/download/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from django.db import models
from django.conf import settings
from django.utils import timezone

from users.models import CustomUser
from addcorpus.models import Corpus
from django.conf import settings
from datetime import datetime

MAX_LENGTH_FILENAME = 254

Expand All @@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
Expand Down Expand Up @@ -49,5 +51,12 @@ def complete(self, filename = None):
'''

self.filename = filename
self.completed = datetime.now()
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
    '''
    Build a human-readable CSV filename for this download.

    The name joins the download type, the corpus name and the completion
    time (formatted to the minute) with double underscores.
    '''
    template = '{kind}__{corpus}__{when}.csv'
    return template.format(
        corpus=self.corpus.name,
        kind=self.download_type,
        when=self.completed.strftime('%Y-%m-%d %H:%M'),
    )
Loading

0 comments on commit aac4ce4

Please sign in to comment.