From 7413ac598c68095d20d3aeb64d76ec424aedbb19 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 15:13:45 +0200 Subject: [PATCH 01/98] create tag interface --- frontend/src/app/models/index.ts | 1 + frontend/src/app/models/tag.ts | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 frontend/src/app/models/tag.ts diff --git a/frontend/src/app/models/index.ts b/frontend/src/app/models/index.ts index 764f71ba5..6eae337ad 100644 --- a/frontend/src/app/models/index.ts +++ b/frontend/src/app/models/index.ts @@ -9,3 +9,4 @@ export * from './user'; export * from './user-role'; export * from './visualization'; export * from './elasticsearch'; +export * from './tag'; diff --git a/frontend/src/app/models/tag.ts b/frontend/src/app/models/tag.ts new file mode 100644 index 000000000..55a650412 --- /dev/null +++ b/frontend/src/app/models/tag.ts @@ -0,0 +1,6 @@ +export interface Tag { + id: number; + name: string; + description: string; + count: number; +} From 8659990f7d50835329cae0ddabf70beef0eb8de7 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 15:13:53 +0200 Subject: [PATCH 02/98] create tag service --- frontend/src/app/services/tag.service.spec.ts | 21 +++++++++++++++++++ frontend/src/app/services/tag.service.ts | 18 ++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 frontend/src/app/services/tag.service.spec.ts create mode 100644 frontend/src/app/services/tag.service.ts diff --git a/frontend/src/app/services/tag.service.spec.ts b/frontend/src/app/services/tag.service.spec.ts new file mode 100644 index 000000000..e04f69429 --- /dev/null +++ b/frontend/src/app/services/tag.service.spec.ts @@ -0,0 +1,21 @@ +import { TestBed } from '@angular/core/testing'; + +import { TagService } from './tag.service'; +import { HttpClientTestingModule } from '@angular/common/http/testing'; + +describe('TagService', () => { + let service: TagService; + + beforeEach(() => { + TestBed.configureTestingModule({ + imports: [ + 
HttpClientTestingModule + ] + }); + service = TestBed.inject(TagService); + }); + + it('should be created', () => { + expect(service).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts new file mode 100644 index 000000000..65e73b7e1 --- /dev/null +++ b/frontend/src/app/services/tag.service.ts @@ -0,0 +1,18 @@ +import { Injectable } from '@angular/core'; +import { Corpus, FoundDocument } from '../models'; +import { HttpClient } from '@angular/common/http'; +import { Observable } from 'rxjs'; +import { Tag } from '../models'; + + +@Injectable({ + providedIn: 'root' +}) +export class TagService { + + constructor(private http: HttpClient) { } + + getDocumentTags(corpus: Corpus, document: FoundDocument): Observable { + return this.http.get(`/api/tag/document_tags/${corpus.name}/${document.id}`); + } +} From f6d9adbbb6bfbcbcc18d30ac335bbb92247f6527 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 15:33:09 +0200 Subject: [PATCH 03/98] move SearchHit type definition --- frontend/src/app/models/elasticsearch.ts | 10 ++++++++++ frontend/src/app/services/elastic-search.service.ts | 9 +-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/frontend/src/app/models/elasticsearch.ts b/frontend/src/app/models/elasticsearch.ts index 697fb6060..b0816dacb 100644 --- a/frontend/src/app/models/elasticsearch.ts +++ b/frontend/src/app/models/elasticsearch.ts @@ -58,3 +58,13 @@ export interface SimpleQueryString { } export type EsSearchClause = MatchAll | SimpleQueryString; + +export interface FieldValues { [fieldName: string]: any }; +export interface HighlightResult { [fieldName: string]: string[] } + +export interface SearchHit { + _id: string; + _score: number; + _source: FieldValues; + highlight?: HighlightResult; +} diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index fb71eb6e6..ff7382797 100644 --- 
a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -5,7 +5,7 @@ import { HttpClient, HttpParams } from '@angular/common/http'; import { FoundDocument, Corpus, QueryModel, SearchResults, AggregateQueryFeedback, EsSearchClause, BooleanQuery, - EsFilter + EsFilter, SearchHit } from '../models/index'; import * as _ from 'lodash'; @@ -244,10 +244,3 @@ export interface SearchResponse { }; aggregations?: any; } - -export interface SearchHit { - _id: string; - _score: number; - _source: {}; - highlight: {}; -} From 8fa81a49af264db7dd313988a816009e761e6665 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 16:20:56 +0200 Subject: [PATCH 04/98] add constructor to FoundDocument --- .../document-view.component.spec.ts | 7 ++-- .../image-view/image-view.component.spec.ts | 3 +- .../src/app/models/found-document.spec.ts | 33 +++++++++++++++++++ frontend/src/app/models/found-document.ts | 32 +++++++++++++----- .../search/search-results.component.spec.ts | 14 +++----- .../app/services/elastic-search.service.ts | 9 ++--- .../src/app/utils/document-context.spec.ts | 18 ++++------ frontend/src/mock-data/constructor-helpers.ts | 16 +++++++++ frontend/src/mock-data/elastic-search.ts | 12 ++----- 9 files changed, 92 insertions(+), 52 deletions(-) create mode 100644 frontend/src/app/models/found-document.spec.ts create mode 100644 frontend/src/mock-data/constructor-helpers.ts diff --git a/frontend/src/app/document-view/document-view.component.spec.ts b/frontend/src/app/document-view/document-view.component.spec.ts index 5e7d9e458..863dd2631 100644 --- a/frontend/src/app/document-view/document-view.component.spec.ts +++ b/frontend/src/app/document-view/document-view.component.spec.ts @@ -6,6 +6,7 @@ import { mockCorpus, mockField } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; import { DocumentViewComponent } from './document-view.component'; +import { makeDocument } from 
'../../mock-data/constructor-helpers'; describe('DocumentViewComponent', () => { let component: DocumentViewComponent; @@ -22,11 +23,7 @@ describe('DocumentViewComponent', () => { scan_image_type: 'farout_image_type', fields: [mockField] }, mockCorpus); - component.document = { - id: 'test', - relevance: 0.5, - fieldValues: { great_field: 'Hello world!' } - }; + component.document = makeDocument({ great_field: 'Hello world!' }); fixture.detectChanges(); }); diff --git a/frontend/src/app/image-view/image-view.component.spec.ts b/frontend/src/app/image-view/image-view.component.spec.ts index 8c4c020ba..2b967c49c 100644 --- a/frontend/src/app/image-view/image-view.component.spec.ts +++ b/frontend/src/app/image-view/image-view.component.spec.ts @@ -3,6 +3,7 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { commonTestBed } from '../common-test-bed'; import { ImageViewComponent } from './image-view.component'; +import { makeDocument } from '../../mock-data/constructor-helpers'; describe('ImageViewComponent', () => { let component: ImageViewComponent; @@ -15,7 +16,7 @@ describe('ImageViewComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(ImageViewComponent); component = fixture.componentInstance; - component.document = {id: '42', relevance: 42, fieldValues: {image_path: 'great/image/path'}}; + component.document = makeDocument({image_path: 'great/image/path'}); fixture.detectChanges(); }); diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts new file mode 100644 index 000000000..76b240250 --- /dev/null +++ b/frontend/src/app/models/found-document.spec.ts @@ -0,0 +1,33 @@ +import { FoundDocument } from './found-document'; + +const maxScore = 2.9113607; +const mockResponse = { + _index: 'troonredes', + _id: '1994_troonrede', + _score: 2.9113607, + _source: { + date: '1994-09-20', + id: '1994_troonrede', + title: 'Troonrede 20 september 1994', + monarch: 
'Beatrix', + speech_type: 'troonrede', + content: 'Om op langere termijn de zekerheid te kunnen blijven bieden ' + + 'van een gegarandeerd basispensioen, en om solidaire regelingen bij ' + + 'arbeidsongeschiktheid en werkloosheid in stand te houden, is een ' + + 'kritische toets van het bestaande stelsel nu geboden.' + }, + highlight: { + content: [ + 'toets' + ] + } +}; + +describe('FoundDocument', () => { + it('should construct from an elasticsearch response', () => { + const document = new FoundDocument(mockResponse, maxScore); + + expect(document.id).toBe('1994_troonrede'); + expect(document.fieldValues['monarch']).toBe('Beatrix'); + }); +}); diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index 514da30c3..c7639d067 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -1,13 +1,27 @@ -export interface FoundDocument { +import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; + +export class FoundDocument { id: string; - /** - * Normalized relevance [0,1] with 1 being most relevant + + /** relevance score for the query; + * in [0,1] with 1 being most relevant */ relevance: number; - fieldValues: { [fieldName: string]: any }; - /** - * Position of the document in the array of results - */ - position?: number; - highlight?: {[fieldName: string]: string[]}; + + /** values for fields */ + fieldValues: FieldValues; + + /** position of the document in the array of results */ + position: number; + + /** highlighted strings */ + highlight: HighlightResult; + + constructor(hit: SearchHit, maxScore: number = 1) { + this.id = hit._id; + this.relevance = hit._score / maxScore; + this.fieldValues = Object.assign({ id: hit._id }, hit._source); + this.highlight = hit.highlight; + } + } diff --git a/frontend/src/app/search/search-results.component.spec.ts b/frontend/src/app/search/search-results.component.spec.ts index d66457861..cef8f1bd0 100644 --- 
a/frontend/src/app/search/search-results.component.spec.ts +++ b/frontend/src/app/search/search-results.component.spec.ts @@ -3,9 +3,10 @@ import * as _ from 'lodash'; import { mockCorpus, mockField } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; -import { CorpusField, QueryModel } from '../models/index'; +import { CorpusField, FoundDocument, QueryModel } from '../models/index'; import { SearchResultsComponent } from './search-results.component'; +import { makeDocument } from '../../mock-data/constructor-helpers'; describe('Search Results Component', () => { @@ -22,7 +23,7 @@ describe('Search Results Component', () => { component = fixture.componentInstance; component.results = { fields, - documents: [createDocument({ + documents: [makeDocument({ a: '1', b: '2', c: 'Hide-and-seek!' @@ -30,7 +31,7 @@ describe('Search Results Component', () => { { c: ['Where is Wally?', 'I cannot find Wally anywhere!'] }), - createDocument({ + makeDocument({ a: '3', b: '4', c: 'Wally is here' @@ -56,13 +57,6 @@ describe('Search Results Component', () => { return field; }; - const createDocument = ( - fieldValues: { [name: string]: string }, - id: string, - relevance: number, - highlight?: {[fieldName: string]: string[]} - ) => ({ id, relevance, fieldValues, highlight }); - it('should be created', () => { expect(component).toBeTruthy(); }); diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index ff7382797..c23005caa 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -168,13 +168,8 @@ export class ElasticSearchService { /** * return the id, relevance and field values of a given document */ - private hitToDocument(hit: SearchHit, maxScore: number) { - return { - id: hit._id, - relevance: hit._score / maxScore, - fieldValues: Object.assign({ id: hit._id }, hit._source), - highlight: hit.highlight, - } as 
FoundDocument; + private hitToDocument(hit: SearchHit, maxScore: number): FoundDocument { + return new FoundDocument(hit, maxScore); } } diff --git a/frontend/src/app/utils/document-context.spec.ts b/frontend/src/app/utils/document-context.spec.ts index 565ffc9bb..f0980f78e 100644 --- a/frontend/src/app/utils/document-context.spec.ts +++ b/frontend/src/app/utils/document-context.spec.ts @@ -1,20 +1,16 @@ +import { makeDocument } from '../../mock-data/constructor-helpers'; import { mockCorpus3 } from '../../mock-data/corpus'; -import { FoundDocument } from '../models'; import { makeContextParams } from './document-context'; describe('document context utils', () => { const corpus = mockCorpus3; - const document: FoundDocument = { - id: '1', - relevance: undefined, - fieldValues: { - great_field: 'true', - speech: 'whatever', - ordering: '42', - date: '1900-01-01' - } - }; + const document = makeDocument({ + great_field: 'true', + speech: 'whatever', + ordering: '42', + date: '1900-01-01' + }); it('should create a document context link', () => { const params = makeContextParams(document, corpus); diff --git a/frontend/src/mock-data/constructor-helpers.ts b/frontend/src/mock-data/constructor-helpers.ts new file mode 100644 index 000000000..6dc1f7db3 --- /dev/null +++ b/frontend/src/mock-data/constructor-helpers.ts @@ -0,0 +1,16 @@ +// these functions are shorthands to create objects that would normally come out the API + +import { FieldValues, FoundDocument, HighlightResult, SearchHit } from '../app/models'; + +export const makeDocument = ( + fieldValues: FieldValues, + id: string = '0', + relevance: number = 1, + highlight: HighlightResult = undefined +): FoundDocument => { + const hit: SearchHit = { + _id: id, _score: relevance, _source: fieldValues, highlight + }; + return new FoundDocument(hit); +}; + diff --git a/frontend/src/mock-data/elastic-search.ts b/frontend/src/mock-data/elastic-search.ts index 942f2fa3e..13c61af23 100644 --- 
a/frontend/src/mock-data/elastic-search.ts +++ b/frontend/src/mock-data/elastic-search.ts @@ -1,5 +1,5 @@ -import { Corpus, FoundDocument, QueryModel } from '../app/models'; -import { EsQuery } from '../app/services'; +import { FoundDocument } from '../app/models'; +import { makeDocument } from './constructor-helpers'; export class ElasticSearchServiceMock { /** @@ -9,12 +9,6 @@ export class ElasticSearchServiceMock { } getDocumentById(): Promise { - return Promise.resolve({ - id: '0', - relevance: null, - fieldValues: { - content: 'Hello world!' - } - }); + return Promise.resolve(makeDocument({content: 'Hello world!'})); } } From ea4ae285d9f4cbde3894b09e424437b4b12c4bad Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 16:31:13 +0200 Subject: [PATCH 05/98] include corpus in FoundDocument --- .../src/app/models/found-document.spec.ts | 3 ++- frontend/src/app/models/found-document.ts | 5 ++++- .../search/search-results.component.spec.ts | 4 ++-- .../app/services/elastic-search.service.ts | 22 +++++++++---------- frontend/src/mock-data/constructor-helpers.ts | 6 +++-- 5 files changed, 22 insertions(+), 18 deletions(-) diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts index 76b240250..11fc4a865 100644 --- a/frontend/src/app/models/found-document.spec.ts +++ b/frontend/src/app/models/found-document.spec.ts @@ -1,3 +1,4 @@ +import { mockCorpus } from '../../mock-data/corpus'; import { FoundDocument } from './found-document'; const maxScore = 2.9113607; @@ -25,7 +26,7 @@ const mockResponse = { describe('FoundDocument', () => { it('should construct from an elasticsearch response', () => { - const document = new FoundDocument(mockResponse, maxScore); + const document = new FoundDocument(mockCorpus, mockResponse, maxScore); expect(document.id).toBe('1994_troonrede'); expect(document.fieldValues['monarch']).toBe('Beatrix'); diff --git a/frontend/src/app/models/found-document.ts 
b/frontend/src/app/models/found-document.ts index c7639d067..a9a04e316 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -1,3 +1,4 @@ +import { Corpus } from './corpus'; import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; export class FoundDocument { @@ -17,11 +18,13 @@ export class FoundDocument { /** highlighted strings */ highlight: HighlightResult; - constructor(hit: SearchHit, maxScore: number = 1) { + constructor(public corpus: Corpus, hit: SearchHit, maxScore: number = 1) { this.id = hit._id; this.relevance = hit._score / maxScore; this.fieldValues = Object.assign({ id: hit._id }, hit._source); this.highlight = hit.highlight; } + + } diff --git a/frontend/src/app/search/search-results.component.spec.ts b/frontend/src/app/search/search-results.component.spec.ts index cef8f1bd0..60569c692 100644 --- a/frontend/src/app/search/search-results.component.spec.ts +++ b/frontend/src/app/search/search-results.component.spec.ts @@ -27,7 +27,7 @@ describe('Search Results Component', () => { a: '1', b: '2', c: 'Hide-and-seek!' 
- }, '1', 1, + }, mockCorpus, '1', 1, { c: ['Where is Wally?', 'I cannot find Wally anywhere!'] }), @@ -35,7 +35,7 @@ describe('Search Results Component', () => { a: '3', b: '4', c: 'Wally is here' - }, '2', 0.5)], + }, mockCorpus, '2', 0.5)], total: { value: 2, relation: 'gte' diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index c23005caa..bbde3172a 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -31,11 +31,11 @@ export class ElasticSearchService { size: 1, index: corpus.index, }; - return this.client.search(query).then(this.firstDocumentFromResponse.bind(this)); + return this.client.search(query).then(this.firstDocumentFromResponse.bind(this, corpus)); } - private firstDocumentFromResponse(response: SearchResponse): FoundDocument { - const parsed = this.parseResponse(response); + private firstDocumentFromResponse(corpus: Corpus, response: SearchResponse): FoundDocument { + const parsed = this.parseResponse(corpus, response); if (parsed.documents.length) { return _.first(parsed.documents); } @@ -133,7 +133,7 @@ export class ElasticSearchService { // Perform the search const response = await this.execute(queryModel.corpus, esQuery, size || this.resultsPerPage); - return this.parseResponse(response); + return this.parseResponse(queryModel.corpus, response); } @@ -146,21 +146,19 @@ export class ElasticSearchService { const esQuery = queryModel.toEsQuery(); // Perform the search const response = await this.execute(queryModel.corpus, esQuery, size || this.resultsPerPage, from); - return this.parseResponse(response); + return this.parseResponse(queryModel.corpus, response); } /** * Extract relevant information from dictionary returned by ES * + * @param corpus * @param response - * @param queryModel - * @param alreadyRetrieved - * @param completed */ - private parseResponse(response: SearchResponse): SearchResults { + 
private parseResponse(corpus: Corpus, response: SearchResponse): SearchResults { const hits = response.hits.hits; return { - documents: hits.map(hit => this.hitToDocument(hit, response.hits.max_score)), + documents: hits.map(hit => this.hitToDocument(corpus, hit, response.hits.max_score)), total: response.hits.total }; } @@ -168,8 +166,8 @@ export class ElasticSearchService { /** * return the id, relevance and field values of a given document */ - private hitToDocument(hit: SearchHit, maxScore: number): FoundDocument { - return new FoundDocument(hit, maxScore); + private hitToDocument(corpus: Corpus, hit: SearchHit, maxScore: number): FoundDocument { + return new FoundDocument(corpus, hit, maxScore); } } diff --git a/frontend/src/mock-data/constructor-helpers.ts b/frontend/src/mock-data/constructor-helpers.ts index 6dc1f7db3..8ae7e0e64 100644 --- a/frontend/src/mock-data/constructor-helpers.ts +++ b/frontend/src/mock-data/constructor-helpers.ts @@ -1,9 +1,11 @@ // these functions are shorthands to create objects that would normally come out the API -import { FieldValues, FoundDocument, HighlightResult, SearchHit } from '../app/models'; +import { Corpus, FieldValues, FoundDocument, HighlightResult, SearchHit } from '../app/models'; +import { mockCorpus } from './corpus'; export const makeDocument = ( fieldValues: FieldValues, + corpus: Corpus = mockCorpus, id: string = '0', relevance: number = 1, highlight: HighlightResult = undefined @@ -11,6 +13,6 @@ export const makeDocument = ( const hit: SearchHit = { _id: id, _score: relevance, _source: fieldValues, highlight }; - return new FoundDocument(hit); + return new FoundDocument(mockCorpus, hit); }; From 5582a8319c88a47879450a590e869be21159eed0 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 16:58:09 +0200 Subject: [PATCH 06/98] add basic methods to FoundDocument --- .../document-page.component.html | 2 +- .../document-page.component.spec.ts | 2 ++ .../document-page/document-page.component.ts | 6 +--- 
.../document-view.component.html | 8 ++--- .../src/app/models/found-document.spec.ts | 17 +++++++++- frontend/src/app/models/found-document.ts | 33 ++++++++++++++++++- .../app/search/search-results.component.html | 6 ++-- .../app/search/search-results.component.ts | 12 ------- frontend/src/app/utils/document-context.ts | 2 +- frontend/src/mock-data/constructor-helpers.ts | 2 +- 10 files changed, 61 insertions(+), 29 deletions(-) diff --git a/frontend/src/app/document-page/document-page.component.html b/frontend/src/app/document-page/document-page.component.html index 496c487f9..3d3e23112 100644 --- a/frontend/src/app/document-page/document-page.component.html +++ b/frontend/src/app/document-page/document-page.component.html @@ -5,7 +5,7 @@
-
+
View {{contextDisplayName}} diff --git a/frontend/src/app/document-page/document-page.component.spec.ts b/frontend/src/app/document-page/document-page.component.spec.ts index a44696969..b00df857b 100644 --- a/frontend/src/app/document-page/document-page.component.spec.ts +++ b/frontend/src/app/document-page/document-page.component.spec.ts @@ -2,6 +2,7 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { commonTestBed } from '../common-test-bed'; import { DocumentPageComponent } from './document-page.component'; +import { makeDocument } from '../../mock-data/constructor-helpers'; describe('DocumentPageComponent', () => { let component: DocumentPageComponent; @@ -14,6 +15,7 @@ describe('DocumentPageComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(DocumentPageComponent); component = fixture.componentInstance; + component.document = makeDocument({great_field: 'Hello world!'}); fixture.detectChanges(); }); diff --git a/frontend/src/app/document-page/document-page.component.ts b/frontend/src/app/document-page/document-page.component.ts index 96b43b4c5..70541b48e 100644 --- a/frontend/src/app/document-page/document-page.component.ts +++ b/frontend/src/app/document-page/document-page.component.ts @@ -27,12 +27,8 @@ export class DocumentPageComponent implements OnInit { private activatedRoute: ActivatedRoute, ) { } - get hasContext(): boolean { - return this.corpus && !_.isUndefined(this.corpus.documentContext); - } - get contextDisplayName(): string { - if (this.hasContext) { + if (this.document?.hasContext) { return this.corpus.documentContext.displayName; } } diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html index e7b3759bc..d72dd8332 100644 --- a/frontend/src/app/document-view/document-view.component.html +++ b/frontend/src/app/document-view/document-view.component.html @@ -10,11 +10,11 @@ - + {{field.displayName}} - + - 
{{document.fieldValues[field.name]}} + {{document.fieldValue(field)}} @@ -41,7 +41,7 @@
-
+
diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts index 11fc4a865..baaa9c6a0 100644 --- a/frontend/src/app/models/found-document.spec.ts +++ b/frontend/src/app/models/found-document.spec.ts @@ -1,4 +1,5 @@ -import { mockCorpus } from '../../mock-data/corpus'; +import { makeDocument } from '../../mock-data/constructor-helpers'; +import { mockCorpus, mockCorpus3 } from '../../mock-data/corpus'; import { FoundDocument } from './found-document'; const maxScore = 2.9113607; @@ -31,4 +32,18 @@ describe('FoundDocument', () => { expect(document.id).toBe('1994_troonrede'); expect(document.fieldValues['monarch']).toBe('Beatrix'); }); + + it('should reflect context', () => { + const notDefinedInCorpus = makeDocument({great_field: 'test'}, mockCorpus); + expect(notDefinedInCorpus.hasContext).toBeFalse(); + + const missingValues = makeDocument({great_field: 'test'}, mockCorpus3); + expect(missingValues.hasContext).toBeFalse(); + + const shouldHaveContext = makeDocument({ + great_field: 'test', + date: new Date('1800-01-01') + }, mockCorpus3); + expect(shouldHaveContext.hasContext).toBeTrue(); + }); }); diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index a9a04e316..b319a008f 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -1,4 +1,6 @@ -import { Corpus } from './corpus'; +import * as _ from 'lodash'; +import { makeContextParams } from '../utils/document-context'; +import { Corpus, CorpusField } from './corpus'; import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; export class FoundDocument { @@ -26,5 +28,34 @@ export class FoundDocument { } + /** + * whether the document has a "context" that it belongs to + * + * e.g. 
the publication it was a part of + */ + get hasContext(): boolean { + const spec = this.corpus.documentContext; + + if (_.isUndefined(spec)) { + return false; + } + + const notBlank = value => value !== undefined && value !== null && value !== ''; + const contextValues = spec.contextFields.map(this.fieldValue.bind(this)); + return _.every(contextValues, notBlank); + } + + /** + * query parameters for a search request for the context of the document + * + * e.g. the publication it was a part of + */ + get contextQueryParams() { + return makeContextParams(this, this.corpus); + } + + fieldValue(field: CorpusField) { + return this.fieldValues[field.name]; + } } diff --git a/frontend/src/app/search/search-results.component.html b/frontend/src/app/search/search-results.component.html index 2a42ad4c4..fb009b0a8 100644 --- a/frontend/src/app/search/search-results.component.html +++ b/frontend/src/app/search/search-results.component.html @@ -40,7 +40,7 @@

- + @@ -52,7 +52,7 @@

-

+ @@ -101,7 +101,7 @@

Link   - diff --git a/frontend/src/app/search/search-results.component.ts b/frontend/src/app/search/search-results.component.ts index 086e7fbcf..2b1231639 100644 --- a/frontend/src/app/search/search-results.component.ts +++ b/frontend/src/app/search/search-results.component.ts @@ -182,16 +182,4 @@ export class SearchResultsComponent implements OnChanges { this.onViewDocument(document); } - hasContext(document: FoundDocument) { - if (this.corpus.documentContext) { - const contextFields = this.corpus.documentContext.contextFields; - const notBlank = value => value !== undefined && value !== null && value !== ''; - return _.every(contextFields, field => notBlank(document.fieldValues[field.name])); - } - return false; - } - - contextParams(document: FoundDocument) { - return makeContextParams(document, this.corpus); - } } diff --git a/frontend/src/app/utils/document-context.ts b/frontend/src/app/utils/document-context.ts index 160cbc88f..b4e06817d 100644 --- a/frontend/src/app/utils/document-context.ts +++ b/frontend/src/app/utils/document-context.ts @@ -7,7 +7,7 @@ const documentContextQuery = (corpus: Corpus, document: FoundDocument): QueryMod spec.contextFields.forEach(field => { const filter = field.makeSearchFilter(); - filter.setToValue(document.fieldValues[field.name]); + filter.setToValue(document.fieldValue(field)); queryModel.addFilter(filter); }); diff --git a/frontend/src/mock-data/constructor-helpers.ts b/frontend/src/mock-data/constructor-helpers.ts index 8ae7e0e64..acf8c903d 100644 --- a/frontend/src/mock-data/constructor-helpers.ts +++ b/frontend/src/mock-data/constructor-helpers.ts @@ -13,6 +13,6 @@ export const makeDocument = ( const hit: SearchHit = { _id: id, _score: relevance, _source: fieldValues, highlight }; - return new FoundDocument(mockCorpus, hit); + return new FoundDocument(corpus, hit); }; From 46cf135454da45de7f4224c7f469e70bcebabe25 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 18:14:40 +0200 Subject: [PATCH 07/98] 
draft solution for including tag service in FoundDocument --- frontend/src/app/common-test-bed.ts | 6 +++++ .../src/app/models/found-document.spec.ts | 16 +++++++++++++- frontend/src/app/models/found-document.ts | 7 +++++- .../services/elastic-search.service.spec.ts | 3 +++ .../app/services/elastic-search.service.ts | 5 +++-- frontend/src/mock-data/constructor-helpers.ts | 5 ++++- frontend/src/mock-data/tag.ts | 22 +++++++++++++++++++ 7 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 frontend/src/mock-data/tag.ts diff --git a/frontend/src/app/common-test-bed.ts b/frontend/src/app/common-test-bed.ts index 6f2ce571f..f86c8901f 100644 --- a/frontend/src/app/common-test-bed.ts +++ b/frontend/src/app/common-test-bed.ts @@ -20,6 +20,8 @@ import { WordmodelsService } from './services/wordmodels.service'; import { WordmodelsServiceMock } from '../mock-data/wordmodels'; import { VisualizationService } from './services/visualization.service'; import { visualizationServiceMock } from '../mock-data/visualization'; +import { TagService } from './services/tag.service'; +import { TagServiceMock } from '../mock-data/tag'; export const commonTestBed = () => { const filteredImports = imports.filter(value => !(value in [HttpClientModule])); @@ -63,6 +65,10 @@ export const commonTestBed = () => { { provide: VisualizationService, useValue: new visualizationServiceMock(), + }, + { + provide: TagService, + useValue: new TagServiceMock(), } ); diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts index baaa9c6a0..5ed24225d 100644 --- a/frontend/src/app/models/found-document.spec.ts +++ b/frontend/src/app/models/found-document.spec.ts @@ -1,6 +1,9 @@ +import { TestBed } from '@angular/core/testing'; import { makeDocument } from '../../mock-data/constructor-helpers'; import { mockCorpus, mockCorpus3 } from '../../mock-data/corpus'; import { FoundDocument } from './found-document'; +import { TagService } from 
'../services/tag.service'; +import { TagServiceMock } from '../../mock-data/tag'; const maxScore = 2.9113607; const mockResponse = { @@ -26,8 +29,19 @@ const mockResponse = { }; describe('FoundDocument', () => { + let tagService: TagService; + + beforeEach(() => { + TestBed.configureTestingModule({ + providers: [ + { provide: TagService, useValue: new TagServiceMock() } + ] + }); + tagService = TestBed.inject(TagService); + }); + it('should construct from an elasticsearch response', () => { - const document = new FoundDocument(mockCorpus, mockResponse, maxScore); + const document = new FoundDocument(tagService, mockCorpus, mockResponse, maxScore); expect(document.id).toBe('1994_troonrede'); expect(document.fieldValues['monarch']).toBe('Beatrix'); diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index b319a008f..b36f430fb 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -20,7 +20,12 @@ export class FoundDocument { /** highlighted strings */ highlight: HighlightResult; - constructor(public corpus: Corpus, hit: SearchHit, maxScore: number = 1) { + constructor( + private tagService: TagService, + public corpus: Corpus, + hit: SearchHit, + maxScore: number = 1 + ) { this.id = hit._id; this.relevance = hit._score / maxScore; this.fieldValues = Object.assign({ id: hit._id }, hit._source); diff --git a/frontend/src/app/services/elastic-search.service.spec.ts b/frontend/src/app/services/elastic-search.service.spec.ts index 3a54c3114..1a818457e 100644 --- a/frontend/src/app/services/elastic-search.service.spec.ts +++ b/frontend/src/app/services/elastic-search.service.spec.ts @@ -1,6 +1,8 @@ import { TestBed } from '@angular/core/testing'; import { HttpClientTestingModule } from '@angular/common/http/testing'; import { ElasticSearchService } from './elastic-search.service'; +import { TagService } from './tag.service'; +import { TagServiceMock } from 
'../../mock-data/tag'; describe('ElasticSearchService', () => { let service: ElasticSearchService; @@ -8,6 +10,7 @@ describe('ElasticSearchService', () => { TestBed.configureTestingModule({ providers: [ ElasticSearchService, + { provide: TagService, useValue: new TagServiceMock() } ], imports: [ HttpClientTestingModule ] }); diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index bbde3172a..160717f6c 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -8,6 +8,7 @@ import { EsFilter, SearchHit } from '../models/index'; import * as _ from 'lodash'; +import { TagService } from './tag.service'; @Injectable() @@ -15,7 +16,7 @@ export class ElasticSearchService { private client: Client; private resultsPerPage = 20; - constructor(private http: HttpClient) { + constructor(private http: HttpClient, private tagService: TagService) { this.client = new Client(this.http); } @@ -167,7 +168,7 @@ export class ElasticSearchService { * return the id, relevance and field values of a given document */ private hitToDocument(corpus: Corpus, hit: SearchHit, maxScore: number): FoundDocument { - return new FoundDocument(corpus, hit, maxScore); + return new FoundDocument(this.tagService, corpus, hit, maxScore); } } diff --git a/frontend/src/mock-data/constructor-helpers.ts b/frontend/src/mock-data/constructor-helpers.ts index acf8c903d..21c4a1aa1 100644 --- a/frontend/src/mock-data/constructor-helpers.ts +++ b/frontend/src/mock-data/constructor-helpers.ts @@ -2,6 +2,9 @@ import { Corpus, FieldValues, FoundDocument, HighlightResult, SearchHit } from '../app/models'; import { mockCorpus } from './corpus'; +import { TagServiceMock } from './tag'; + +const tagService = new TagServiceMock() as any; export const makeDocument = ( fieldValues: FieldValues, @@ -13,6 +16,6 @@ export const makeDocument = ( const hit: SearchHit = { _id: id, _score: relevance, 
_source: fieldValues, highlight }; - return new FoundDocument(corpus, hit); + return new FoundDocument(tagService, corpus, hit); }; diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts new file mode 100644 index 000000000..d9fce9a90 --- /dev/null +++ b/frontend/src/mock-data/tag.ts @@ -0,0 +1,22 @@ +import { Observable, of } from 'rxjs'; +import { Tag } from '../app/models'; + +export const mockTags: Tag[] = [ + { + id: 1, + name: 'fascinating', + description: 'interesting documents', + count: 2 + }, { + id: 2, + name: 'boring', + description: 'useless documents', + count: 1 + } +]; + +export class TagServiceMock { + getDocumentTags(): Observable { + return of(mockTags); + } +} From fbd6ac4ee47e46bba85fee3d26bb2a7872decc9c Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 18:16:49 +0200 Subject: [PATCH 08/98] fetch tags for documents --- frontend/src/app/models/found-document.ts | 11 +++++++++++ frontend/src/app/services/tag.service.ts | 4 ++-- frontend/src/mock-data/tag.ts | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index b36f430fb..a82123746 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -2,6 +2,9 @@ import * as _ from 'lodash'; import { makeContextParams } from '../utils/document-context'; import { Corpus, CorpusField } from './corpus'; import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; +import { Tag } from './tag'; +import { Observable } from 'rxjs'; +import { TagService } from '../services/tag.service'; export class FoundDocument { id: string; @@ -20,6 +23,9 @@ export class FoundDocument { /** highlighted strings */ highlight: HighlightResult; + /** tags created on the document */ + tags$: Observable; + constructor( private tagService: TagService, public corpus: Corpus, @@ -30,6 +36,7 @@ export class FoundDocument { this.relevance 
= hit._score / maxScore; this.fieldValues = Object.assign({ id: hit._id }, hit._source); this.highlight = hit.highlight; + this.fetchTags(); } @@ -63,4 +70,8 @@ export class FoundDocument { return this.fieldValues[field.name]; } + private fetchTags(): void { + this.tags$ = this.tagService.getDocumentTags(this); + } + } diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index 65e73b7e1..5388b07c9 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -12,7 +12,7 @@ export class TagService { constructor(private http: HttpClient) { } - getDocumentTags(corpus: Corpus, document: FoundDocument): Observable { - return this.http.get(`/api/tag/document_tags/${corpus.name}/${document.id}`); + getDocumentTags(document: FoundDocument): Observable { + return this.http.get(`/api/tag/document_tags/${document.corpus.name}/${document.id}`); } } diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts index d9fce9a90..420bfbf3a 100644 --- a/frontend/src/mock-data/tag.ts +++ b/frontend/src/mock-data/tag.ts @@ -1,5 +1,5 @@ import { Observable, of } from 'rxjs'; -import { Tag } from '../app/models'; +import { FoundDocument, Tag } from '../app/models'; export const mockTags: Tag[] = [ { @@ -16,7 +16,7 @@ export const mockTags: Tag[] = [ ]; export class TagServiceMock { - getDocumentTags(): Observable { + getDocumentTags(document: FoundDocument): Observable { return of(mockTags); } } From 9ac34630f7d791968628b938eb474ed640e59738 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Wed, 12 Jul 2023 18:25:09 +0200 Subject: [PATCH 09/98] basic tag component --- frontend/src/app/app.module.ts | 2 ++ .../document-view.component.html | 6 +++++ .../document-tags.component.html | 5 ++++ .../document-tags.component.scss | 0 .../document-tags.component.spec.ts | 23 +++++++++++++++++++ .../document-tags/document-tags.component.ts | 17 ++++++++++++++ 6 files changed, 53 insertions(+) create 
mode 100644 frontend/src/app/search/document-tags/document-tags.component.html create mode 100644 frontend/src/app/search/document-tags/document-tags.component.scss create mode 100644 frontend/src/app/search/document-tags/document-tags.component.spec.ts create mode 100644 frontend/src/app/search/document-tags/document-tags.component.ts diff --git a/frontend/src/app/app.module.ts b/frontend/src/app/app.module.ts index 0fd77585b..d3f51443f 100644 --- a/frontend/src/app/app.module.ts +++ b/frontend/src/app/app.module.ts @@ -93,6 +93,7 @@ import { CorpusFilterComponent } from './corpus-selection/corpus-filter/corpus-f import { DatePickerComponent } from './corpus-selection/corpus-filter/date-picker/date-picker.component'; import { CorpusInfoComponent } from './corpus-info/corpus-info.component'; import { FieldInfoComponent } from './corpus-info/field-info/field-info.component'; +import { DocumentTagsComponent } from './search/document-tags/document-tags.component'; export const appRoutes: Routes = [ @@ -188,6 +189,7 @@ export const declarations: any[] = [ DateFilterComponent, DialogComponent, DocumentPageComponent, + DocumentTagsComponent, DocumentViewComponent, DownloadComponent, DownloadHistoryComponent, diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html index d72dd8332..7d3e87975 100644 --- a/frontend/src/app/document-view/document-view.component.html +++ b/frontend/src/app/document-view/document-view.component.html @@ -9,6 +9,12 @@ +

+ + + diff --git a/frontend/src/app/search/document-tags/document-tags.component.html b/frontend/src/app/search/document-tags/document-tags.component.html new file mode 100644 index 000000000..c64e82752 --- /dev/null +++ b/frontend/src/app/search/document-tags/document-tags.component.html @@ -0,0 +1,5 @@ +
+ + {{tag.name}} + +
diff --git a/frontend/src/app/search/document-tags/document-tags.component.scss b/frontend/src/app/search/document-tags/document-tags.component.scss new file mode 100644 index 000000000..e69de29bb diff --git a/frontend/src/app/search/document-tags/document-tags.component.spec.ts b/frontend/src/app/search/document-tags/document-tags.component.spec.ts new file mode 100644 index 000000000..0619b9804 --- /dev/null +++ b/frontend/src/app/search/document-tags/document-tags.component.spec.ts @@ -0,0 +1,23 @@ +import { ComponentFixture, TestBed } from '@angular/core/testing'; + +import { DocumentTagsComponent } from './document-tags.component'; +import { commonTestBed } from '../../common-test-bed'; + +describe('DocumentTagsComponent', () => { + let component: DocumentTagsComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + commonTestBed().testingModule.compileComponents(); + }); + + beforeEach(() => { + fixture = TestBed.createComponent(DocumentTagsComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/search/document-tags/document-tags.component.ts b/frontend/src/app/search/document-tags/document-tags.component.ts new file mode 100644 index 000000000..b5d0aa069 --- /dev/null +++ b/frontend/src/app/search/document-tags/document-tags.component.ts @@ -0,0 +1,17 @@ +import { Component, Input, OnInit } from '@angular/core'; +import { Tag } from '../../models'; + +@Component({ + selector: 'ia-document-tags', + templateUrl: './document-tags.component.html', + styleUrls: ['./document-tags.component.scss'] +}) +export class DocumentTagsComponent implements OnInit { + @Input() tags: Tag[]; + + constructor() { } + + ngOnInit(): void { + } + +} From 8bd98ba414097425998e90e449bf5401174e8fc3 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 13:25:09 +0200 Subject: [PATCH 10/98] show controls for tags --- 
.../document-tags.component.html | 22 +++++++++++++++---- .../document-tags/document-tags.component.ts | 4 ++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/frontend/src/app/search/document-tags/document-tags.component.html b/frontend/src/app/search/document-tags/document-tags.component.html index c64e82752..1f1b9fe36 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.html +++ b/frontend/src/app/search/document-tags/document-tags.component.html @@ -1,5 +1,19 @@ -
- - {{tag.name}} - +
+
+ + {{tag.name}} + + +
+ +
+ +
+
diff --git a/frontend/src/app/search/document-tags/document-tags.component.ts b/frontend/src/app/search/document-tags/document-tags.component.ts index b5d0aa069..09280cb74 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.ts +++ b/frontend/src/app/search/document-tags/document-tags.component.ts @@ -1,5 +1,6 @@ import { Component, Input, OnInit } from '@angular/core'; import { Tag } from '../../models'; +import { faPlus, faTimes } from '@fortawesome/free-solid-svg-icons'; @Component({ selector: 'ia-document-tags', @@ -9,6 +10,9 @@ import { Tag } from '../../models'; export class DocumentTagsComponent implements OnInit { @Input() tags: Tag[]; + faTimes = faTimes; + faPlus = faPlus; + constructor() { } ngOnInit(): void { From 22cb068bef604a22d6c1f8e9dda80f4f5ae5fd58 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 13:49:25 +0200 Subject: [PATCH 11/98] start linking controls to api --- .../app/document-view/document-view.component.html | 2 +- frontend/src/app/models/found-document.ts | 4 ++++ .../document-tags/document-tags.component.html | 9 +++++---- .../document-tags/document-tags.component.spec.ts | 2 ++ .../search/document-tags/document-tags.component.ts | 12 ++++++++++-- frontend/src/app/services/tag.service.ts | 4 ++++ 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html index 7d3e87975..556ae880b 100644 --- a/frontend/src/app/document-view/document-view.component.html +++ b/frontend/src/app/document-view/document-view.component.html @@ -12,7 +12,7 @@
diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index a82123746..a72ee8a1d 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -70,6 +70,10 @@ export class FoundDocument { return this.fieldValues[field.name]; } + addTag(tag: Tag): void {} + + removeTag(tag: Tag): void {} + private fetchTags(): void { this.tags$ = this.tagService.getDocumentTags(this); } diff --git a/frontend/src/app/search/document-tags/document-tags.component.html b/frontend/src/app/search/document-tags/document-tags.component.html index 1f1b9fe36..9aca05cad 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.html +++ b/frontend/src/app/search/document-tags/document-tags.component.html @@ -1,10 +1,11 @@
-
- +
+ {{tag.name}} -
diff --git a/frontend/src/app/search/document-tags/document-tags.component.spec.ts b/frontend/src/app/search/document-tags/document-tags.component.spec.ts index 0619b9804..a497b691c 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.spec.ts +++ b/frontend/src/app/search/document-tags/document-tags.component.spec.ts @@ -2,6 +2,7 @@ import { ComponentFixture, TestBed } from '@angular/core/testing'; import { DocumentTagsComponent } from './document-tags.component'; import { commonTestBed } from '../../common-test-bed'; +import { makeDocument } from '../../../mock-data/constructor-helpers'; describe('DocumentTagsComponent', () => { let component: DocumentTagsComponent; @@ -14,6 +15,7 @@ describe('DocumentTagsComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(DocumentTagsComponent); component = fixture.componentInstance; + component.document = makeDocument({great_field: 'test'}); fixture.detectChanges(); }); diff --git a/frontend/src/app/search/document-tags/document-tags.component.ts b/frontend/src/app/search/document-tags/document-tags.component.ts index 09280cb74..0e8d20030 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.ts +++ b/frontend/src/app/search/document-tags/document-tags.component.ts @@ -1,5 +1,5 @@ import { Component, Input, OnInit } from '@angular/core'; -import { Tag } from '../../models'; +import { FoundDocument, Tag } from '../../models'; import { faPlus, faTimes } from '@fortawesome/free-solid-svg-icons'; @Component({ @@ -8,7 +8,7 @@ import { faPlus, faTimes } from '@fortawesome/free-solid-svg-icons'; styleUrls: ['./document-tags.component.scss'] }) export class DocumentTagsComponent implements OnInit { - @Input() tags: Tag[]; + @Input() document: FoundDocument; faTimes = faTimes; faPlus = faPlus; @@ -18,4 +18,12 @@ export class DocumentTagsComponent implements OnInit { ngOnInit(): void { } + addTag(tag: Tag) { + this.document.addTag(tag); + } + + removeTag(tag: Tag) { + 
this.document.removeTag(tag); + } + } diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index 5388b07c9..a6ef46ddf 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -15,4 +15,8 @@ export class TagService { getDocumentTags(document: FoundDocument): Observable { return this.http.get(`/api/tag/document_tags/${document.corpus.name}/${document.id}`); } + + removeDocumentTag(document: FoundDocument, tag: Tag) { + return this.http.delete('/api/....'); + } } From 7142d5e8ba37972eb06b5200e9854c06e3f1d05e Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 11:51:21 +0200 Subject: [PATCH 12/98] update filtering logic in tag view --- backend/tag/views.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/tag/views.py b/backend/tag/views.py index 2d897af93..9fa8cc71e 100644 --- a/backend/tag/views.py +++ b/backend/tag/views.py @@ -42,13 +42,17 @@ def list(self, *args, **kwargs): Supports filtering on a corpus by specifying the name as a query parameter. 
''' - corpus_name = check_corpus_name(self.request) filters = { 'user': self.request.user, - 'tagged_docs__corpus__name': corpus_name } + corpus_name = check_corpus_name(self.request) + if corpus_name: + filters.update({ + 'tagged_docs__corpus__name': corpus_name + }) + queryset = self.queryset.filter(**filters).distinct() serializer = self.get_serializer(queryset, many=True) return Response(serializer.data) From b882c12e68c94bfa9f722dd4b5474904d8cbfad8 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 14:57:52 +0200 Subject: [PATCH 13/98] add view for document tagging --- backend/tag/tests/test_views.py | 25 +++++++++++ backend/tag/views.py | 76 +++++++++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/backend/tag/tests/test_views.py b/backend/tag/tests/test_views.py index d68c9c281..af0126507 100644 --- a/backend/tag/tests/test_views.py +++ b/backend/tag/tests/test_views.py @@ -78,6 +78,31 @@ def test_get_document_tags(auth_user, auth_client, auth_user_tag, tagged_documen response = auth_client.get(f'/api/tag/document_tags/{mock_corpus}/{doc_id}') assert status.is_success(response.status_code) +def test_patch_document_tags(auth_client, auth_user_tag, mock_corpus, auth_user_corpus_acces): + assert auth_user_tag.count == 0 + + new_doc = 'a-new-document' + patch_request = lambda data: auth_client.patch( + f'/api/tag/document_tags/{mock_corpus}/{new_doc}', + data, + content_type='application/json' + ) + + response = patch_request([ + { 'op': 'add', 'value': auth_user_tag.id } + ]) + + assert status.is_success(response.status_code) + assert auth_user_tag.count == 1 + + response = patch_request([ + { 'op': 'remove', 'value': auth_user_tag.id } + ]) + + assert status.is_success(response.status_code) + assert auth_user_tag.count == 0 + + def search_with_tag(client, corpus_name, tag_id): route = f'/api/es/{corpus_name}/_search' query = MATCH_ALL diff --git a/backend/tag/views.py b/backend/tag/views.py index 
9fa8cc71e..ffa18f9e6 100644 --- a/backend/tag/views.py +++ b/backend/tag/views.py @@ -3,13 +3,13 @@ from rest_framework.viewsets import ModelViewSet from rest_framework.views import APIView from django.http import HttpRequest -from rest_framework.exceptions import NotFound +from rest_framework.exceptions import NotFound, PermissionDenied, ParseError from .models import Tag, TaggedDocument from .permissions import IsTagOwner from .serializers import TagSerializer from addcorpus.models import Corpus -from addcorpus.permissions import CorpusAccessPermission, corpus_name_from_request +from addcorpus.permissions import CorpusAccessPermission def check_corpus_name(request: HttpRequest): ''' @@ -65,15 +65,75 @@ def get(self, request, *args, **kwargs): Get the tags for a document ''' - tagged = TaggedDocument.objects.filter( - corpus__name=kwargs.get('corpus'), - doc_id=kwargs.get('doc_id'), - ) + doc = self._get_document(**kwargs) - if tagged: - tags =tagged.first().tags.filter(user=request.user) + if doc: + tags = doc.tags.filter(user=request.user) else: tags = [] serializer = TagSerializer(tags, many=True) return Response(serializer.data) + + def patch(self, request, *args, **kwargs): + ''' + Add or remove tags for a document + + The payload should specify a list of operations, like so: + + ``` + [ + {"op": "add", "value": 47}, + {"op": "remove", "value": 123}, + {"op": "add", "value": 12}, + ] + ``` + ''' + + doc = self._get_document(**kwargs) or self._create_document(**kwargs) + + for op in request.data: + tag_id = op.get('value') + tag = self._get_tag(request, tag_id) + action = self._get_patch_action(op, doc) + action(tag) + + return Response('done') + + def _get_document(self, **kwargs): + match = TaggedDocument.objects.filter( + corpus__name=kwargs.get('corpus'), + doc_id=kwargs.get('doc_id'), + ) + + if match.exists(): + return match.first() + + def _create_document(self, **kwargs): + corpus_name = kwargs.get('corpus') # note: corpus name is verified in
permissions + doc_id = kwargs.get('doc_id') + corpus = Corpus.objects.get(name=corpus_name) + return TaggedDocument.objects.create(corpus=corpus, doc_id=doc_id) + + def _get_tag(self, request, tag_id): + if not Tag.objects.filter(id=tag_id).exists(): + raise NotFound(f'Tag {tag_id} does not exist') + + tag = Tag.objects.get(id=tag_id) + + if not tag.user == request.user: + raise PermissionDenied(f'You do not have permission to modify tag {tag_id}') + + return tag + + def _get_patch_action(self, op, doc: TaggedDocument): + actions = { + 'add': doc.tags.add, + 'remove': doc.tags.remove + } + + action = actions.get(op.get('op', None), None) + if not action: + raise ParseError('could not parse action') + + return action From f23deede50fa55fafe2b6891108b7001a9f88ab8 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 15:22:36 +0200 Subject: [PATCH 14/98] finish removing tags from frontend --- frontend/src/app/models/found-document.ts | 13 +++++++++++-- frontend/src/app/services/tag.service.ts | 20 +++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index a72ee8a1d..3e5d73598 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -5,6 +5,7 @@ import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; import { Tag } from './tag'; import { Observable } from 'rxjs'; import { TagService } from '../services/tag.service'; +import { tap } from 'rxjs/operators'; export class FoundDocument { id: string; @@ -70,9 +71,17 @@ export class FoundDocument { return this.fieldValues[field.name]; } - addTag(tag: Tag): void {} + addTag(tag: Tag): void { + this.tagService.addDocumentTag(this, tag).pipe( + tap(this.fetchTags.bind(this)) + ).subscribe(); + } - removeTag(tag: Tag): void {} + removeTag(tag: Tag): void { + this.tagService.removeDocumentTag(this, tag).pipe( + 
tap(this.fetchTags.bind(this)) + ).subscribe(); + } private fetchTags(): void { this.tags$ = this.tagService.getDocumentTags(this); diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index a6ef46ddf..561a44812 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -4,6 +4,10 @@ import { HttpClient } from '@angular/common/http'; import { Observable } from 'rxjs'; import { Tag } from '../models'; +type TaggingActions = { + op: 'add'|'remove'; + value: number; +}[]; @Injectable({ providedIn: 'root' @@ -13,10 +17,20 @@ export class TagService { constructor(private http: HttpClient) { } getDocumentTags(document: FoundDocument): Observable { - return this.http.get(`/api/tag/document_tags/${document.corpus.name}/${document.id}`); + return this.http.get(this.documentTagUrl(document)); } - removeDocumentTag(document: FoundDocument, tag: Tag) { - return this.http.delete('/api/....'); + addDocumentTag(document: FoundDocument, tag: Tag): Observable { + const data: TaggingActions = [{op: 'add', value: tag.id}]; + return this.http.patch(this.documentTagUrl(document), data); + } + + removeDocumentTag(document: FoundDocument, tag: Tag): Observable { + const data: TaggingActions = [{op: 'remove', value: tag.id}]; + return this.http.patch(this.documentTagUrl(document), data); + } + + private documentTagUrl(document: FoundDocument): string { + return `/api/tag/document_tags/${document.corpus.name}/${document.id}`; } } From 8ee5f56987c4a97c412ee2afb00c4e0dbe58f979 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 16:02:16 +0200 Subject: [PATCH 15/98] fetch tags in tagservice expand tagservicemock --- frontend/src/app/services/tag.service.ts | 21 ++++++++++++++++++++- frontend/src/mock-data/tag.ts | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index 
561a44812..14b25117c 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -3,6 +3,7 @@ import { Corpus, FoundDocument } from '../models'; import { HttpClient } from '@angular/common/http'; import { Observable } from 'rxjs'; import { Tag } from '../models'; +import { tap } from 'rxjs/operators'; type TaggingActions = { op: 'add'|'remove'; @@ -13,8 +14,18 @@ type TaggingActions = { providedIn: 'root' }) export class TagService { + /** all tags from the user */ + tags$: Observable; - constructor(private http: HttpClient) { } + constructor(private http: HttpClient) { + this.fetch(); + } + + makeTag(name: string, description?: string): Observable { + return this.http.put(this.tagUrl(), {name, description}).pipe( + tap(this.fetch.bind(this)) + ); + } getDocumentTags(document: FoundDocument): Observable { return this.http.get(this.documentTagUrl(document)); @@ -30,6 +41,14 @@ export class TagService { return this.http.patch(this.documentTagUrl(document), data); } + private fetch() { + this.tags$ = this.http.get(this.tagUrl()); + } + + private tagUrl(tag?: Tag) { + return `/api/tag/tags${tag ? 
tag.id : ''}/`; + } + private documentTagUrl(document: FoundDocument): string { return `/api/tag/document_tags/${document.corpus.name}/${document.id}`; } diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts index 420bfbf3a..68570d848 100644 --- a/frontend/src/mock-data/tag.ts +++ b/frontend/src/mock-data/tag.ts @@ -1,5 +1,6 @@ import { Observable, of } from 'rxjs'; import { FoundDocument, Tag } from '../app/models'; +import { tap } from 'rxjs/operators'; export const mockTags: Tag[] = [ { @@ -16,7 +17,27 @@ export const mockTags: Tag[] = [ ]; export class TagServiceMock { + tags$ = of(mockTags); + getDocumentTags(document: FoundDocument): Observable { return of(mockTags); } + + makeTag(name: string, description?: string): Observable { + return of({ + id: 3, name, description, count: 0 + }).pipe(tap(this.fetch.bind(this))); + } + + addDocumentTag(document, tag): Observable { + return of(true); + } + + removeDocumentTag(document, tag): Observable { + return of(true); + } + + private fetch() { + this.tags$ = of(mockTags); + } } From e0a9f5fba6472615cba52d40da92b51539e5db24 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Thu, 13 Jul 2023 16:30:20 +0200 Subject: [PATCH 16/98] make tag selection component --- .../document-tags.component.html | 9 ++++--- .../tag/tag-select/tag-select.component.html | 21 ++++++++++++++++ .../tag/tag-select/tag-select.component.scss | 7 ++++++ .../tag/tag-select/tag-select.component.ts | 25 +++++++++++++++++++ .../src/app/tag/tag-select/tag-select.spec.ts | 25 +++++++++++++++++++ 5 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 frontend/src/app/tag/tag-select/tag-select.component.html create mode 100644 frontend/src/app/tag/tag-select/tag-select.component.scss create mode 100644 frontend/src/app/tag/tag-select/tag-select.component.ts create mode 100644 frontend/src/app/tag/tag-select/tag-select.spec.ts diff --git a/frontend/src/app/search/document-tags/document-tags.component.html 
b/frontend/src/app/search/document-tags/document-tags.component.html index 9aca05cad..b3f5cd48b 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.html +++ b/frontend/src/app/search/document-tags/document-tags.component.html @@ -11,10 +11,13 @@
-
diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html new file mode 100644 index 000000000..d21f8352f --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -0,0 +1,21 @@ +
+
+
+ +
+
+
+ +
+
+ +
+
diff --git a/frontend/src/app/tag/tag-select/tag-select.component.scss b/frontend/src/app/tag/tag-select/tag-select.component.scss new file mode 100644 index 000000000..30e27c16c --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.component.scss @@ -0,0 +1,7 @@ +.tag-select { + select { + padding-top: 0; + padding-bottom: 0; + height: 2em; + } +} diff --git a/frontend/src/app/tag/tag-select/tag-select.component.ts b/frontend/src/app/tag/tag-select/tag-select.component.ts new file mode 100644 index 000000000..86ee9162f --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.component.ts @@ -0,0 +1,25 @@ +import { Component, OnInit } from '@angular/core'; +import { TagService } from '../../services/tag.service'; +import { Observable } from 'rxjs'; +import { Tag } from '../../models'; +import { faCheck, faTimes } from '@fortawesome/free-solid-svg-icons'; + +@Component({ + selector: 'ia-tag-select', + templateUrl: './tag-select.component.html', + styleUrls: ['./tag-select.component.scss'] +}) +export class TagSelectComponent implements OnInit { + tags$: Observable; + + faCheck = faCheck; + faTimes = faTimes; + + constructor(private tagService: TagService) { + this.tags$ = this.tagService.tags$; + } + + ngOnInit(): void { + } + +} diff --git a/frontend/src/app/tag/tag-select/tag-select.spec.ts b/frontend/src/app/tag/tag-select/tag-select.spec.ts new file mode 100644 index 000000000..ec43fe473 --- /dev/null +++ b/frontend/src/app/tag/tag-select/tag-select.spec.ts @@ -0,0 +1,25 @@ + +import { ComponentFixture, TestBed } from '@angular/core/testing'; + +import { TagSelectComponent } from './tag-select.component'; +import { commonTestBed } from '../../common-test-bed'; +import { makeDocument } from '../../../mock-data/constructor-helpers'; + +describe('DocumentTagsComponent', () => { + let component: TagSelectComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + commonTestBed().testingModule.compileComponents(); + }); + + beforeEach(() 
=> { + fixture = TestBed.createComponent(TagSelectComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); From fbc0f83be071921ccff40c174e2a927e682b4625 Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Fri, 14 Jul 2023 12:45:25 +0200 Subject: [PATCH 17/98] emit events in tag-select --- .../document-tags.component.html | 6 ++--- .../tag/tag-select/tag-select.component.html | 10 ++++---- .../tag/tag-select/tag-select.component.ts | 24 ++++++++++++++++--- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/frontend/src/app/search/document-tags/document-tags.component.html b/frontend/src/app/search/document-tags/document-tags.component.html index b3f5cd48b..11c87ecdc 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.html +++ b/frontend/src/app/search/document-tags/document-tags.component.html @@ -1,5 +1,5 @@ -
-
+
+
{{tag.name}} @@ -11,7 +11,7 @@
- + + + +
diff --git a/frontend/src/app/search/document-tags/document-tags.component.ts b/frontend/src/app/search/document-tags/document-tags.component.ts index 0e8d20030..807af18c6 100644 --- a/frontend/src/app/search/document-tags/document-tags.component.ts +++ b/frontend/src/app/search/document-tags/document-tags.component.ts @@ -13,13 +13,16 @@ export class DocumentTagsComponent implements OnInit { faTimes = faTimes; faPlus = faPlus; + showAddNew = false; + constructor() { } ngOnInit(): void { } - addTag(tag: Tag) { - this.document.addTag(tag); + addTag(tagId: number) { + this.document.addTag(tagId); + this.showAddNew = false; } removeTag(tag: Tag) { diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index 14b25117c..e16e8a29c 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -31,8 +31,8 @@ export class TagService { return this.http.get(this.documentTagUrl(document)); } - addDocumentTag(document: FoundDocument, tag: Tag): Observable { - const data: TaggingActions = [{op: 'add', value: tag.id}]; + addDocumentTag(document: FoundDocument, tagId: number): Observable { + const data: TaggingActions = [{op: 'add', value: tagId}]; return this.http.patch(this.documentTagUrl(document), data); } From 0b82d54309bb34c640bd199f97b49dee52f1606c Mon Sep 17 00:00:00 2001 From: lukavdplas Date: Fri, 14 Jul 2023 13:49:46 +0200 Subject: [PATCH 19/98] fix tests --- frontend/src/app/app.module.ts | 2 ++ frontend/src/app/tag/tag-select/tag-select.spec.ts | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend/src/app/app.module.ts b/frontend/src/app/app.module.ts index d3f51443f..00e509557 100644 --- a/frontend/src/app/app.module.ts +++ b/frontend/src/app/app.module.ts @@ -94,6 +94,7 @@ import { DatePickerComponent } from './corpus-selection/corpus-filter/date-picke import { CorpusInfoComponent } from './corpus-info/corpus-info.component'; import { FieldInfoComponent } 
from './corpus-info/field-info/field-info.component'; import { DocumentTagsComponent } from './search/document-tags/document-tags.component'; +import { TagSelectComponent } from './tag/tag-select/tag-select.component'; export const appRoutes: Routes = [ @@ -237,6 +238,7 @@ export const declarations: any[] = [ SearchSortingComponent, SelectFieldComponent, SimilarityChartComponent, + TagSelectComponent, TermComparisonEditorComponent, TimeIntervalSliderComponent, TimelineComponent, diff --git a/frontend/src/app/tag/tag-select/tag-select.spec.ts b/frontend/src/app/tag/tag-select/tag-select.spec.ts index ec43fe473..0b2b6e859 100644 --- a/frontend/src/app/tag/tag-select/tag-select.spec.ts +++ b/frontend/src/app/tag/tag-select/tag-select.spec.ts @@ -3,9 +3,8 @@ import { ComponentFixture, TestBed } from '@angular/core/testing'; import { TagSelectComponent } from './tag-select.component'; import { commonTestBed } from '../../common-test-bed'; -import { makeDocument } from '../../../mock-data/constructor-helpers'; -describe('DocumentTagsComponent', () => { +describe('TagSelectComponent', () => { let component: TagSelectComponent; let fixture: ComponentFixture; From 841197a6ca94d2c62eb9d497e0fcd4531b7c0c19 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 1 Aug 2023 16:50:11 +0200 Subject: [PATCH 20/98] add tag module --- frontend/src/app/document/document.module.ts | 2 ++ .../document-tags.component.html | 0 .../document-tags.component.scss | 0 .../document-tags.component.spec.ts | 0 .../document-tags/document-tags.component.ts | 0 frontend/src/app/tag/tag.module.ts | 20 +++++++++++++++++++ 6 files changed, 22 insertions(+) rename frontend/src/app/{search => tag}/document-tags/document-tags.component.html (100%) rename frontend/src/app/{search => tag}/document-tags/document-tags.component.scss (100%) rename frontend/src/app/{search => tag}/document-tags/document-tags.component.spec.ts (100%) rename frontend/src/app/{search => 
tag}/document-tags/document-tags.component.ts (100%) create mode 100644 frontend/src/app/tag/tag.module.ts diff --git a/frontend/src/app/document/document.module.ts b/frontend/src/app/document/document.module.ts index 08f365d5a..84b110184 100644 --- a/frontend/src/app/document/document.module.ts +++ b/frontend/src/app/document/document.module.ts @@ -5,6 +5,7 @@ import { DocumentPageComponent } from '../document-page/document-page.component' import { ImageViewModule } from '../image-view/image-view.module'; import { SearchRelevanceComponent } from '../search'; import { CorpusModule } from '../corpus-header/corpus.module'; +import { TagModule } from '../tag/tag.module'; @@ -18,6 +19,7 @@ import { CorpusModule } from '../corpus-header/corpus.module'; CorpusModule, SharedModule, ImageViewModule, + TagModule, ], exports: [ DocumentViewComponent, DocumentPageComponent, diff --git a/frontend/src/app/search/document-tags/document-tags.component.html b/frontend/src/app/tag/document-tags/document-tags.component.html similarity index 100% rename from frontend/src/app/search/document-tags/document-tags.component.html rename to frontend/src/app/tag/document-tags/document-tags.component.html diff --git a/frontend/src/app/search/document-tags/document-tags.component.scss b/frontend/src/app/tag/document-tags/document-tags.component.scss similarity index 100% rename from frontend/src/app/search/document-tags/document-tags.component.scss rename to frontend/src/app/tag/document-tags/document-tags.component.scss diff --git a/frontend/src/app/search/document-tags/document-tags.component.spec.ts b/frontend/src/app/tag/document-tags/document-tags.component.spec.ts similarity index 100% rename from frontend/src/app/search/document-tags/document-tags.component.spec.ts rename to frontend/src/app/tag/document-tags/document-tags.component.spec.ts diff --git a/frontend/src/app/search/document-tags/document-tags.component.ts b/frontend/src/app/tag/document-tags/document-tags.component.ts 
similarity index 100% rename from frontend/src/app/search/document-tags/document-tags.component.ts rename to frontend/src/app/tag/document-tags/document-tags.component.ts diff --git a/frontend/src/app/tag/tag.module.ts b/frontend/src/app/tag/tag.module.ts new file mode 100644 index 000000000..1904e896a --- /dev/null +++ b/frontend/src/app/tag/tag.module.ts @@ -0,0 +1,20 @@ +import { NgModule } from '@angular/core'; +import { SharedModule } from '../shared/shared.module'; +import { TagSelectComponent } from './tag-select/tag-select.component'; +import { DocumentTagsComponent } from './document-tags/document-tags.component'; + + + +@NgModule({ + declarations: [ + DocumentTagsComponent, + TagSelectComponent, + ], + imports: [ + SharedModule + ], + exports: [ + DocumentTagsComponent, + ] +}) +export class TagModule { } From ce2326d774e9f5b35c476d3e1dc9f6fc1e2636a8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 1 Aug 2023 17:41:33 +0200 Subject: [PATCH 21/98] update to api changes --- frontend/src/app/models/found-document.ts | 13 ++------ frontend/src/app/services/tag.service.ts | 32 ++++++++++--------- .../document-tags/document-tags.component.ts | 18 +++++++++-- frontend/src/mock-data/tag.ts | 10 ++---- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index d5e821608..771447d48 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -71,16 +71,9 @@ export class FoundDocument { return this.fieldValues[field.name]; } - addTag(tagId: number): void { - this.tagService.addDocumentTag(this, tagId).pipe( - tap(this.fetchTags.bind(this)) - ).subscribe(); - } - - removeTag(tag: Tag): void { - this.tagService.removeDocumentTag(this, tag).pipe( - tap(this.fetchTags.bind(this)) - ).subscribe(); + setTags(tagIds: number[]): Observable { + this.tags$ = this.tagService.setDocumentTags(this, tagIds); + return this.tags$; 
} private fetchTags(): void { diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index e16e8a29c..82bce73e1 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -1,14 +1,17 @@ import { Injectable } from '@angular/core'; -import { Corpus, FoundDocument } from '../models'; +import { FoundDocument } from '../models'; import { HttpClient } from '@angular/common/http'; import { Observable } from 'rxjs'; import { Tag } from '../models'; -import { tap } from 'rxjs/operators'; +import { map, tap } from 'rxjs/operators'; + + +interface DocumentTagsResponse { + corpus: string; + doc_id: string; + tags: Tag[]; +}; -type TaggingActions = { - op: 'add'|'remove'; - value: number; -}[]; @Injectable({ providedIn: 'root' @@ -28,17 +31,16 @@ export class TagService { } getDocumentTags(document: FoundDocument): Observable { - return this.http.get(this.documentTagUrl(document)); - } - - addDocumentTag(document: FoundDocument, tagId: number): Observable { - const data: TaggingActions = [{op: 'add', value: tagId}]; - return this.http.patch(this.documentTagUrl(document), data); + return this.http.get(this.documentTagUrl(document)).pipe( + map(response => response.tags) + ); } - removeDocumentTag(document: FoundDocument, tag: Tag): Observable { - const data: TaggingActions = [{op: 'remove', value: tag.id}]; - return this.http.patch(this.documentTagUrl(document), data); + setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { + return this.http.patch( + this.documentTagUrl(document), + { tags: tagIds } + ).pipe(map(response => response.tags)); } private fetch() { diff --git a/frontend/src/app/tag/document-tags/document-tags.component.ts b/frontend/src/app/tag/document-tags/document-tags.component.ts index 807af18c6..8ac848366 100644 --- a/frontend/src/app/tag/document-tags/document-tags.component.ts +++ b/frontend/src/app/tag/document-tags/document-tags.component.ts @@ -1,6 
+1,8 @@ import { Component, Input, OnInit } from '@angular/core'; import { FoundDocument, Tag } from '../../models'; import { faPlus, faTimes } from '@fortawesome/free-solid-svg-icons'; +import { first, map, mergeMap } from 'rxjs/operators'; +import * as _ from 'lodash'; @Component({ selector: 'ia-document-tags', @@ -21,12 +23,22 @@ export class DocumentTagsComponent implements OnInit { } addTag(tagId: number) { - this.document.addTag(tagId); - this.showAddNew = false; + const op = (ids: number[]) => ids.concat([tagId]); + this.setTags(op); } removeTag(tag: Tag) { - this.document.removeTag(tag); + const op = (ids: number[]) => ids.filter(id => id !== tag.id); + this.setTags(op); + } + + private setTags(operation: (ids: number[]) => number[]) { + this.document.tags$.pipe( + first(), + map(tags => tags.map(tag => tag.id)), + map(operation), + mergeMap(ids => this.document.setTags(ids)) + ).subscribe(); } } diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts index 68570d848..6ceedc662 100644 --- a/frontend/src/mock-data/tag.ts +++ b/frontend/src/mock-data/tag.ts @@ -29,13 +29,9 @@ export class TagServiceMock { }).pipe(tap(this.fetch.bind(this))); } - addDocumentTag(document, tag): Observable { - return of(true); - } - - removeDocumentTag(document, tag): Observable { - return of(true); - } + setDocumentTags(document: FoundDocument, tags: Tag[]): Observable { + return of(tags); + }; private fetch() { this.tags$ = of(mockTags); From 2c4d0be129828695f827bbf77f6e0f15b025686c Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 9 Aug 2023 14:09:06 +0200 Subject: [PATCH 22/98] test for assigning multiple tags simultaneously --- backend/tag/tests/test_views.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/tag/tests/test_views.py b/backend/tag/tests/test_views.py index 6a0b82aed..2d9d579d4 100644 --- a/backend/tag/tests/test_views.py +++ b/backend/tag/tests/test_views.py @@ -105,6 +105,21 @@ def 
test_patch_document_tags(auth_client, auth_user_tag, mock_corpus, auth_user_ assert status.is_success(response.status_code) assert auth_user_tag.count == 0 +def test_assign_multiple_tags(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): + doc = 'test' + patch_request = lambda data: auth_client.patch( + f'/api/tag/document_tags/{mock_corpus}/{doc}', + data, + content_type='application/json' + ) + + response = patch_request({ + 'tags': [tag.id for tag in multiple_tags] + }) + assert status.is_success(response.status_code) + doc = TaggedDocument.objects.get(doc_id=doc) + assert doc.tags.count() == len(multiple_tags) + def test_patch_tags_contamination(auth_client, auth_user_tag, admin_user_tag, mock_corpus, mock_corpus_obj, auth_user_corpus_acces): ''' Verify that patching tags does not affect the tags of other users From 0cfca2e8a4cab3509eb8ec2a7f234c5eeccb227e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 9 Aug 2023 14:13:10 +0200 Subject: [PATCH 23/98] test for adding multiple tags sequentially --- backend/tag/tests/test_views.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/backend/tag/tests/test_views.py b/backend/tag/tests/test_views.py index 2d9d579d4..d72f38a76 100644 --- a/backend/tag/tests/test_views.py +++ b/backend/tag/tests/test_views.py @@ -105,7 +105,7 @@ def test_patch_document_tags(auth_client, auth_user_tag, mock_corpus, auth_user_ assert status.is_success(response.status_code) assert auth_user_tag.count == 0 -def test_assign_multiple_tags(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): +def test_assign_multiple_tags_at_once(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): doc = 'test' patch_request = lambda data: auth_client.patch( f'/api/tag/document_tags/{mock_corpus}/{doc}', @@ -120,6 +120,24 @@ def test_assign_multiple_tags(auth_client, multiple_tags, mock_corpus, auth_user doc = TaggedDocument.objects.get(doc_id=doc) assert doc.tags.count() 
== len(multiple_tags) +def test_assign_multiple_tags_one_by_one(auth_client, multiple_tags, mock_corpus, auth_user_corpus_acces): + doc = 'test' + patch_request = lambda data: auth_client.patch( + f'/api/tag/document_tags/{mock_corpus}/{doc}', + data, + content_type='application/json' + ) + + for i in range(len(multiple_tags)): + response = patch_request({ + 'tags': [tag.id for tag in multiple_tags][:i+1] + }) + + assert status.is_success(response.status_code) + doc = TaggedDocument.objects.get(doc_id=doc) + n_tags = doc.tags.count() + assert doc.tags.count() == i + 1 + def test_patch_tags_contamination(auth_client, auth_user_tag, admin_user_tag, mock_corpus, mock_corpus_obj, auth_user_corpus_acces): ''' Verify that patching tags does not affect the tags of other users From aad1bb08bc8edb4da4a6d90be1d38d4ad88f8609 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 9 Aug 2023 14:35:31 +0200 Subject: [PATCH 24/98] move requests to api service --- frontend/src/app/models/tag.ts | 6 +++ frontend/src/app/services/api.service.ts | 39 ++++++++++++++++++- frontend/src/app/services/tag.service.spec.ts | 7 +++- frontend/src/app/services/tag.service.ts | 32 ++++----------- frontend/src/mock-data/api.ts | 4 ++ 5 files changed, 61 insertions(+), 27 deletions(-) diff --git a/frontend/src/app/models/tag.ts b/frontend/src/app/models/tag.ts index 55a650412..b2bcaf9e2 100644 --- a/frontend/src/app/models/tag.ts +++ b/frontend/src/app/models/tag.ts @@ -4,3 +4,9 @@ export interface Tag { description: string; count: number; } + +export interface DocumentTagsResponse { + corpus: string; + doc_id: string; + tags: Tag[]; +}; diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index ad965626b..6bcf35bb6 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -2,7 +2,7 @@ import { Injectable } from '@angular/core'; import { HttpClient } from '@angular/common/http'; -import { timer } from 
'rxjs'; +import { Observable, timer } from 'rxjs'; import { filter, switchMap, take, tap } from 'rxjs/operators'; import { ImageInfo } from '../image-view/image-view.component'; import { @@ -10,6 +10,7 @@ import { AggregateTermFrequencyParameters, Corpus, DateTermFrequencyParameters, + DocumentTagsResponse, Download, DownloadOptions, FieldCoverage, @@ -18,6 +19,7 @@ import { NGramRequestParameters, QueryDb, ResultsDownloadParameters, + Tag, TaskResult, TaskSuccess, TasksOutcome, @@ -36,7 +38,9 @@ interface SolisLoginResponse { queries: QueryDb[]; } -@Injectable() +@Injectable({ + providedIn: 'root', +}) export class ApiService { private apiUrl = environment.apiUrl; @@ -44,6 +48,7 @@ export class ApiService { private visApiURL = 'visualization'; private downloadApiURL = 'download'; private corpusApiUrl = 'corpus'; + private tagApiUrl = 'tag'; private authApiRoute = (route: string): string => `/${this.authApiUrl}/${route}/`; @@ -233,6 +238,36 @@ export class ApiService { return this.http.get('/api/corpus/'); } + // Tagging + + public userTags(): Observable { + const url = this.apiRoute(this.tagApiUrl, 'tags/'); + return this.http.get(url); + } + + public createTag(name: string, description?: string): Observable { + const url = this.apiRoute(this.tagApiUrl, 'tags/'); + return this.http.put(url, { name, description }); + } + + public documentTags(document: FoundDocument): Observable { + const url = this.apiRoute( + this.tagApiUrl, + `document_tags/${document.corpus.name}/${document.id}` + ); + return this.http.get(url); + } + + public setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { + const url = this.apiRoute( + this.tagApiUrl, + `document_tags/${document.corpus.name}/${document.id}`, + ); + return this.http.patch(url, + { tags: tagIds } + ); + } + // Authentication API public login(username: string, password: string) { return this.http.post<{ key: string }>(this.authApiRoute('login'), { diff --git 
a/frontend/src/app/services/tag.service.spec.ts b/frontend/src/app/services/tag.service.spec.ts index e04f69429..912bd8df4 100644 --- a/frontend/src/app/services/tag.service.spec.ts +++ b/frontend/src/app/services/tag.service.spec.ts @@ -2,14 +2,19 @@ import { TestBed } from '@angular/core/testing'; import { TagService } from './tag.service'; import { HttpClientTestingModule } from '@angular/common/http/testing'; +import { ApiService } from './api.service'; +import { ApiServiceMock } from '../../mock-data/api'; describe('TagService', () => { let service: TagService; beforeEach(() => { TestBed.configureTestingModule({ + providers: [ + { provide: ApiService, useValue: new ApiServiceMock() }, + ], imports: [ - HttpClientTestingModule + HttpClientTestingModule, ] }); service = TestBed.inject(TagService); diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index 82bce73e1..5c1749377 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -1,16 +1,9 @@ import { Injectable } from '@angular/core'; import { FoundDocument } from '../models'; -import { HttpClient } from '@angular/common/http'; import { Observable } from 'rxjs'; import { Tag } from '../models'; import { map, tap } from 'rxjs/operators'; - - -interface DocumentTagsResponse { - corpus: string; - doc_id: string; - tags: Tag[]; -}; +import { ApiService } from './api.service'; @Injectable({ @@ -20,38 +13,29 @@ export class TagService { /** all tags from the user */ tags$: Observable; - constructor(private http: HttpClient) { + constructor(private apiService: ApiService) { this.fetch(); } makeTag(name: string, description?: string): Observable { - return this.http.put(this.tagUrl(), {name, description}).pipe( + return this.apiService.createTag(name, description).pipe( tap(this.fetch.bind(this)) ); } getDocumentTags(document: FoundDocument): Observable { - return this.http.get(this.documentTagUrl(document)).pipe( + return 
this.apiService.documentTags(document).pipe( map(response => response.tags) ); } setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { - return this.http.patch( - this.documentTagUrl(document), - { tags: tagIds } - ).pipe(map(response => response.tags)); + return this.apiService.setDocumentTags(document, tagIds).pipe( + map(response => response.tags) + ); } private fetch() { - this.tags$ = this.http.get(this.tagUrl()); - } - - private tagUrl(tag?: Tag) { - return `/api/tag/tags${tag ? tag.id : ''}/`; - } - - private documentTagUrl(document: FoundDocument): string { - return `/api/tag/document_tags/${document.corpus.name}/${document.id}`; + this.tags$ = this.apiService.userTags(); } } diff --git a/frontend/src/mock-data/api.ts b/frontend/src/mock-data/api.ts index ed635bebd..dcf8ceb4e 100644 --- a/frontend/src/mock-data/api.ts +++ b/frontend/src/mock-data/api.ts @@ -66,4 +66,8 @@ export class ApiServiceMock { requestMedia() { return Promise.resolve({}); } + + userTags() { + return of([]); + } } From 3cf6200002912317d0fcda69d9483f483197e78a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 10 Aug 2023 12:52:47 +0200 Subject: [PATCH 25/98] add unique constraint to taggeddocument model --- ...geddocument_unique_document_id_for_corpus.py | 17 +++++++++++++++++ backend/tag/models.py | 8 ++++++++ 2 files changed, 25 insertions(+) create mode 100644 backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py diff --git a/backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py b/backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py new file mode 100644 index 000000000..1af0d1928 --- /dev/null +++ b/backend/tag/migrations/0003_taggeddocument_unique_document_id_for_corpus.py @@ -0,0 +1,17 @@ +# Generated by Django 4.1.9 on 2023-08-10 10:51 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('tag', 
'0002_taggeddocument_delete_taginstance'), + ] + + operations = [ + migrations.AddConstraint( + model_name='taggeddocument', + constraint=models.UniqueConstraint(fields=('corpus', 'doc_id'), name='unique_document_ID_for_corpus'), + ), + ] diff --git a/backend/tag/models.py b/backend/tag/models.py index 1182a86a5..54e44cd93 100644 --- a/backend/tag/models.py +++ b/backend/tag/models.py @@ -42,3 +42,11 @@ class TaggedDocument(models.Model): to=Tag, related_name='tagged_docs' ) + + class Meta: + constraints = [ + UniqueConstraint( + fields=['corpus', 'doc_id'], + name='unique_document_ID_for_corpus' + ) + ] From 2bd82c7ae5422cd1cdd9fb44d5c5cdeeb02d8008 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 10 Aug 2023 13:41:46 +0200 Subject: [PATCH 26/98] use behaviorsubject for document tags --- .../src/app/models/found-document.spec.ts | 14 ++++++++++- frontend/src/app/models/found-document.ts | 24 ++++++++++++++----- .../document-tags.component.html | 6 ++--- .../document-tags/document-tags.component.ts | 16 ++----------- frontend/src/mock-data/tag.ts | 3 ++- 5 files changed, 38 insertions(+), 25 deletions(-) diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts index 5ed24225d..7f8935b3a 100644 --- a/frontend/src/app/models/found-document.spec.ts +++ b/frontend/src/app/models/found-document.spec.ts @@ -3,7 +3,9 @@ import { makeDocument } from '../../mock-data/constructor-helpers'; import { mockCorpus, mockCorpus3 } from '../../mock-data/corpus'; import { FoundDocument } from './found-document'; import { TagService } from '../services/tag.service'; -import { TagServiceMock } from '../../mock-data/tag'; +import { TagServiceMock, mockTags } from '../../mock-data/tag'; +import { Tag } from './tag'; +import * as _ from 'lodash'; const maxScore = 2.9113607; const mockResponse = { @@ -60,4 +62,14 @@ describe('FoundDocument', () => { }, mockCorpus3); expect(shouldHaveContext.hasContext).toBeTrue(); }); + + 
it('should set tags', () => { + const doc = makeDocument({ great_field: 'test' }); + expect(doc.tags$.value).toEqual(mockTags); + const tag = _.first(mockTags); + doc.removeTag(tag.id); + expect(doc.tags$.value.length).toBe(1); + doc.addTag(tag.id); + expect(doc.tags$.value.length).toBe(2); + }); }); diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index 771447d48..b7452779e 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -3,7 +3,7 @@ import { makeContextParams } from '../utils/document-context'; import { Corpus, CorpusField } from './corpus'; import { FieldValues, HighlightResult, SearchHit } from './elasticsearch'; import { Tag } from './tag'; -import { Observable } from 'rxjs'; +import { BehaviorSubject, Observable, Subject } from 'rxjs'; import { TagService } from '../services/tag.service'; import { tap } from 'rxjs/operators'; @@ -25,7 +25,7 @@ export class FoundDocument { highlight: HighlightResult; /** tags created on the document */ - tags$: Observable; + tags$ = new BehaviorSubject(undefined); constructor( private tagService: TagService, @@ -71,13 +71,25 @@ export class FoundDocument { return this.fieldValues[field.name]; } - setTags(tagIds: number[]): Observable { - this.tags$ = this.tagService.setDocumentTags(this, tagIds); - return this.tags$; + addTag(tagId: number): void { + const newTagIds = this.tags$.value.map(tag => tag.id).concat([tagId]); + this.setTags(newTagIds); + } + + removeTag(tagId: number): void { + const newTagIds = _.remove( + this.tags$.value.map(tag => tag.id), + id => id === tagId + ); + this.setTags(newTagIds); + } + + setTags(tagIds: number[]): void { + this.tagService.setDocumentTags(this, tagIds).subscribe(this.tags$); } private fetchTags(): void { - this.tags$ = this.tagService.getDocumentTags(this); + this.tagService.getDocumentTags(this).subscribe(this.tags$); } } diff --git 
a/frontend/src/app/tag/document-tags/document-tags.component.html b/frontend/src/app/tag/document-tags/document-tags.component.html index 64d771f0d..d23494d43 100644 --- a/frontend/src/app/tag/document-tags/document-tags.component.html +++ b/frontend/src/app/tag/document-tags/document-tags.component.html @@ -1,5 +1,5 @@ -
-
+
+
{{tag.name}} @@ -12,7 +12,7 @@
- + diff --git a/frontend/src/app/tag/document-tags/document-tags.component.ts b/frontend/src/app/tag/document-tags/document-tags.component.ts index 8ac848366..0f9c4e20c 100644 --- a/frontend/src/app/tag/document-tags/document-tags.component.ts +++ b/frontend/src/app/tag/document-tags/document-tags.component.ts @@ -23,22 +23,10 @@ export class DocumentTagsComponent implements OnInit { } addTag(tagId: number) { - const op = (ids: number[]) => ids.concat([tagId]); - this.setTags(op); + this.document.addTag(tagId); } removeTag(tag: Tag) { - const op = (ids: number[]) => ids.filter(id => id !== tag.id); - this.setTags(op); + this.document.removeTag(tag.id); } - - private setTags(operation: (ids: number[]) => number[]) { - this.document.tags$.pipe( - first(), - map(tags => tags.map(tag => tag.id)), - map(operation), - mergeMap(ids => this.document.setTags(ids)) - ).subscribe(); - } - } diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts index 6ceedc662..2195e17fc 100644 --- a/frontend/src/mock-data/tag.ts +++ b/frontend/src/mock-data/tag.ts @@ -29,7 +29,8 @@ export class TagServiceMock { }).pipe(tap(this.fetch.bind(this))); } - setDocumentTags(document: FoundDocument, tags: Tag[]): Observable { + setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { + const tags = mockTags.filter(tag => tagIds.includes(tag.id)); return of(tags); }; From aab468a890105325715209c4030173ec6b3b6ab8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 10 Aug 2023 14:19:43 +0200 Subject: [PATCH 27/98] fix all the things --- frontend/src/app/models/found-document.ts | 12 ++++++++---- frontend/src/app/services/tag.service.ts | 2 +- .../tag/document-tags/document-tags.component.html | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index b7452779e..035eef36e 100644 --- a/frontend/src/app/models/found-document.ts +++ 
b/frontend/src/app/models/found-document.ts @@ -77,19 +77,23 @@ export class FoundDocument { } removeTag(tagId: number): void { - const newTagIds = _.remove( + const newTagIds = _.without( this.tags$.value.map(tag => tag.id), - id => id === tagId + tagId, ); this.setTags(newTagIds); } setTags(tagIds: number[]): void { - this.tagService.setDocumentTags(this, tagIds).subscribe(this.tags$); + this.tagService.setDocumentTags(this, tagIds).subscribe( + value => this.tags$.next(value) + ); } private fetchTags(): void { - this.tagService.getDocumentTags(this).subscribe(this.tags$); + this.tagService.getDocumentTags(this).subscribe( + value => this.tags$.next(value) + ); } } diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index 5c1749377..f6c4b573e 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -19,7 +19,7 @@ export class TagService { makeTag(name: string, description?: string): Observable { return this.apiService.createTag(name, description).pipe( - tap(this.fetch.bind(this)) + tap(() => this.fetch()) ); } diff --git a/frontend/src/app/tag/document-tags/document-tags.component.html b/frontend/src/app/tag/document-tags/document-tags.component.html index d23494d43..dc8d37ad8 100644 --- a/frontend/src/app/tag/document-tags/document-tags.component.html +++ b/frontend/src/app/tag/document-tags/document-tags.component.html @@ -1,5 +1,5 @@ -
-
+
+
{{tag.name}} @@ -12,7 +12,7 @@
- + From 21df1c6fa019fd7649623c373fa5a851103c93b9 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 21 Aug 2023 14:20:55 +0200 Subject: [PATCH 28/98] fix import --- frontend/src/app/services/elastic-search.service.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/app/services/elastic-search.service.spec.ts b/frontend/src/app/services/elastic-search.service.spec.ts index 0c7e23b5b..b24021381 100644 --- a/frontend/src/app/services/elastic-search.service.spec.ts +++ b/frontend/src/app/services/elastic-search.service.spec.ts @@ -4,7 +4,7 @@ import { ElasticSearchService, SearchResponse } from './elastic-search.service'; import { Aggregator, QueryModel } from '../models'; import { mockCorpus, mockField, mockField2 } from '../../mock-data/corpus'; import { TagService } from './tag.service'; -import { TagServiceMock } from 'src/mock-data/tag'; +import { TagServiceMock } from '../../mock-data/tag'; const mockResponse: SearchResponse = { took: 4, From 07a66e52c4d9cef4c3dbee40c002f67c068a5344 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Oct 2023 16:51:28 +0200 Subject: [PATCH 29/98] add Peaceportal corpora and test data --- .../corpora/peaceportal/FIJI/XMLtemplate.j2 | 71 +++ backend/corpora/peaceportal/FIJI/fiji.py | 284 ++++++++++ .../peaceportal/FIJI/fiji_converter.py | 277 ++++++++++ backend/corpora/peaceportal/__init__.py | 0 backend/corpora/peaceportal/epidat.py | 405 ++++++++++++++ backend/corpora/peaceportal/fiji_separate.py | 17 + backend/corpora/peaceportal/iis.py | 361 +++++++++++++ .../peaceportal/iis_corpus_preprocessor.py | 100 ++++ backend/corpora/peaceportal/peaceportal.py | 501 ++++++++++++++++++ .../tests/data/epidat/blr/blr-4.xml | 216 ++++++++ .../tests/data/epidat/hlh/hlh-12.xml | 302 +++++++++++ .../peaceportal/tests/data/fiji/299.xml | 64 +++ .../peaceportal/tests/data/fiji/687.xml | 61 +++ .../peaceportal/tests/data/fiji/759.xml | 65 +++ .../data/iis/transcription_txts/akld0002.xml | 5 + 
.../data/iis/transcription_txts/beth0042.xml | 5 + .../data/iis/transcription_txts/jeru0014.xml | 5 + .../tests/data/iis/xml/akld0002.xml | 196 +++++++ .../tests/data/iis/xml/beth0042.xml | 143 +++++ .../tests/data/iis/xml/jeru0014.xml | 140 +++++ .../peaceportal/tests/data/safed/safed.csv | 10 + .../peaceportal/tests/data/tol/tol-11.xml | 214 ++++++++ .../peaceportal/tests/data/tol/tol-27.xml | 189 +++++++ .../peaceportal/tests/data/tol/tol-36.xml | 197 +++++++ backend/corpora/peaceportal/tol.py | 390 ++++++++++++++ 25 files changed, 4218 insertions(+) create mode 100644 backend/corpora/peaceportal/FIJI/XMLtemplate.j2 create mode 100644 backend/corpora/peaceportal/FIJI/fiji.py create mode 100644 backend/corpora/peaceportal/FIJI/fiji_converter.py create mode 100644 backend/corpora/peaceportal/__init__.py create mode 100644 backend/corpora/peaceportal/epidat.py create mode 100644 backend/corpora/peaceportal/fiji_separate.py create mode 100644 backend/corpora/peaceportal/iis.py create mode 100644 backend/corpora/peaceportal/iis_corpus_preprocessor.py create mode 100644 backend/corpora/peaceportal/peaceportal.py create mode 100644 backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml create mode 100644 backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml create mode 100644 backend/corpora/peaceportal/tests/data/fiji/299.xml create mode 100644 backend/corpora/peaceportal/tests/data/fiji/687.xml create mode 100644 backend/corpora/peaceportal/tests/data/fiji/759.xml create mode 100644 backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml create mode 100644 backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml create mode 100644 backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml create mode 100644 backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml create mode 100644 backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml create mode 100644 
backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml create mode 100644 backend/corpora/peaceportal/tests/data/safed/safed.csv create mode 100644 backend/corpora/peaceportal/tests/data/tol/tol-11.xml create mode 100644 backend/corpora/peaceportal/tests/data/tol/tol-27.xml create mode 100644 backend/corpora/peaceportal/tests/data/tol/tol-36.xml create mode 100644 backend/corpora/peaceportal/tol.py diff --git a/backend/corpora/peaceportal/FIJI/XMLtemplate.j2 b/backend/corpora/peaceportal/FIJI/XMLtemplate.j2 new file mode 100644 index 000000000..702a8d0ef --- /dev/null +++ b/backend/corpora/peaceportal/FIJI/XMLtemplate.j2 @@ -0,0 +1,71 @@ + + + + + {{ title }} + + + + + + + + + {{ presentLocation }} + {%- if publications %} + + {%- for publication in publications %} + {{ publication }} + {%- endfor %} + + {% endif -%} + + + + {{ provenance }} + {{ date }} + {{ remarksOnDate }} + + + + + + + {%- if persons %} + + + {%- for person in persons %} + + {{ person.name }} + + {%- endfor %} + + + {% endif -%} + + {%- for language in languages %} + {{ language }} + {%- endfor %} + + + + + {{ facsimile }} + {{ photosLeonard }} + {{ image3D }} + + + + {{ transcription }} + {{ inscriptionType }} + {{ iconographyType }} + {{ iconographyDescription }} + {{ material }} + {{ incipit }} + {{ age }} + {{ ageComments }} + {{ commentary }} + + + diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py new file mode 100644 index 000000000..e4f1235e7 --- /dev/null +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -0,0 +1,284 @@ +import re +import os +import os.path as op +import logging +from flask import current_app + +from addcorpus.extract import XML, Constant, Combined +from addcorpus.corpus import Field +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, join_commentaries, get_text_in_language + + +class FIJI(PeacePortal): + ''' + This is a fresh version of Ortal-Paz Saar's 'Funerary Inscriptions of Jews from 
Italy' corpus, + updated to align with the PEACE portal index. This mostly implies that there are less fields + than in the earlier version (i.e. the one under corpora/jewishinscriptions). + ''' + + data_directory = current_app.config['PEACEPORTAL_FIJI_DATA'] + es_index = current_app.config['PEACEPORTAL_FIJI_ES_INDEX'] + es_alias = current_app.config['PEACEPORTAL_ALIAS'] + filename_pattern = re.compile('\d+') + + def sources(self, start, end): + logger = logging.getLogger(__name__) + for directory, _, filenames in os.walk(self.data_directory): + for filename in filenames: + name, extension = op.splitext(filename) + full_path = op.join(directory, filename) + if extension != '.xml': + logger.debug(self.non_xml_msg.format(full_path)) + continue + match = self.filename_pattern.match(name) + if not match: + logger.warning(self.non_match_msg.format(full_path)) + continue + inscriptionID = match.groups() + yield full_path, { + 'inscriptionID': inscriptionID + } + + def __init__(self): + self.source_database.extractor = Constant( + value='Funerary Inscriptions of Jews from Italy (Utrecht University)' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'titleStmt', 'title'], + toplevel=False, + ) + + self.url.extractor = Constant( + value=None + ) + + # the year is commented out: need to have not before / not after fields + # this is advisable since often we only roughly now the century + # self.year.extractor = XML( + # tag=['teiHeader', 'fileDesc', 'sourceDesc', + # 'msDesc', 'history', 'origin', 'origDate'], + # toplevel=False + # ) + + self.transcription.extractor = XML( + tag=['text', 'body', 'transcription'], + toplevel=False, + flatten=True + ) + + self.names.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + flatten=True, + multiple=True, + toplevel=False, + ) + + self.sex.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + attribute='sex', + multiple=True, + 
toplevel=False, + ) + + self.age.extractor = XML( + tag=['text', 'body', 'age'], + toplevel=False, + transform=lambda age: transform_age_integer(age) + ) + + self.country.extractor = Constant( + value='Italy' + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'history', 'origin', 'provenance'], + toplevel=False, + ) + + self.material.extractor = XML( + tag=['text', 'body', 'material'], + toplevel=False, + transform=lambda x: categorize_material(x) + ) + + self.material_details = XML( + tag=['text', 'body', 'material'], + toplevel=False, + ) + + self.language.extractor = XML( + tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], + toplevel=False, + multiple=True, + transform=lambda x: normalize_language(x) + ) + + self.comments.extractor = Combined( + XML( + tag=['text', 'body', 'commentary'], + toplevel=False, + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'history', 'origin', 'remarksOnDate'], + toplevel=False, + transform=lambda x: 'DATE:\n{}\n'.format(x) if x else x + ), + XML( + tag=['text', 'body', 'ageComments'], + toplevel=False, + transform=lambda x: 'AGE:\n{}\n'.format(x) if x else x + ), + XML( + tag=['text', 'body', 'iconographyDescription'], + toplevel=False, + transform=lambda x: 'ICONOGRAPHY:\n{}\n'.format(x) if x else x + ), + transform=lambda x: join_commentaries(x) + ) + + + self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.location_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'location'], + toplevel=False + ) + + self.iconography.extractor = XML( + tag=['text', 'body', 'iconographyType'], + toplevel=False + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + 
self.transcription_latin.extractor = Combined( + self.transcription.extractor, + Constant('la'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_greek.extractor = Combined( + self.transcription.extractor, + Constant('el'), + transform=lambda x: get_text_in_language(x) + ) + + +def transform_age(age): + if age in ['?', 'none', 'none?']: + return 'Unknown' + return age + + +def transform_age_integer(age): + try: + return int(age) + except: + return None + + +def normalize_language(languages): + results = [] + for lang in languages: + if not lang: + results.append('Unknown') + continue + + ltext = lang.lower().strip() + if 'greek' in ltext or 'greeek' in ltext: + results.append(select_greek(lang)) + if 'latin' in ltext: + results.append(select_latin(lang)) + if 'hebrew' in ltext: + results.append(select_hebrew(lang)) + if ltext == 'aramaic' or ltext == 'samaritan': + return lang + if '?' in ltext or ltext == 'x' or ltext == 'none': + results.append('Unknown') + return results + + +def select_greek(text): + text = text.strip() + if text in [ + "Greek", "Greek (?)", "Greeek", + "Greek (some Latin characters)", + "Latin (some Greek characters)", + "Greek or Latin", "Latin and Greek (?)", + "Latin in Greek characters" + "Greek (transliterated Latin?)", + "Greek with transliterated Latin (?)", + "Greek with transliterated Latin formula", + ]: + return 'Greek' + if text in [ + "Greek (in Hebrew characters)", + "Greek in Latin characters (?)", + "Latin (including transliterated Greek)", + "transliterated Greek" + ]: + return 'Greek (transliterated)' + +def select_latin(text): + text = text.strip() + if text in [ + "Latin", "Latin (?)", + "Greek (some Latin characters)", + "Latin (some Greek characters)", + "Latin (including transliterated Greek)", + "Greek or Latin", "Latin and Greek (?)", + "Latin (transliterated Hebrew)" + ]: + return "Latin" + + if text in [ + "Latin in Greek characters", + "Greek (transliterated Latin?)", + "Greek with 
transliterated Latin (?)", + "Greek with transliterated Latin formula", + ]: + return "Latin (transliterated)" + + +def select_hebrew(text): + text = text.strip() + + if text in [ + "Hebrew", "Hebrew (?)" + ]: + return "Hebrew" + + if text in [ + "Latin (transliterated Hebrew)", + "Hebrew (transliterated)", + ]: + return "Hebrew (transliterated)" + + + + + # TODO: new fields + + # TODO: move to a comments field: + + + + # excluded (for now): + # 3D_image + # inscription_type + + # TODO: discuss + # fascimile + # photos_leonard diff --git a/backend/corpora/peaceportal/FIJI/fiji_converter.py b/backend/corpora/peaceportal/FIJI/fiji_converter.py new file mode 100644 index 000000000..3fe108f39 --- /dev/null +++ b/backend/corpora/peaceportal/FIJI/fiji_converter.py @@ -0,0 +1,277 @@ +''' +This script is based on the convertDatabase.py Jelmer van Nuss wrote to extract +FIJI data from Ortal-Paz Saar's excelsheet. As opposed to that script (which seemed to have +worked only with a manually edited source file), it is explicit in the changes required +to extract the data. This hopefully secures that the script can be re-used when Ortal-Paz +sends us a updated excelsheet (e.g. with translations added). 
+''' +import os +import sys +import openpyxl +import argparse +from jinja2 import Template + + +def main(sys_args): + args = parse_arguments(sys_args) + out_folder = args.out_folder + + if not os.path.exists(out_folder): + os.makedirs(out_folder) + + wb = openpyxl.load_workbook(args.input) + sheet = wb['Sheet1'] + headers = list(list(sheet.values)[0]) + preprocess_headers(headers) + for row in sheet.values: + row_dict = {headers[i]: row[i] for i in range(len(row))} + record = extract_record(row_dict) + if record: + export(out_folder, record) + + +def preprocess_headers(headers): + for index, header in enumerate(headers): + if header == 'Date (add 68 to the year of Temple destruction)': + headers[index] = 'Date' + if header == 'Sex ': + headers[index] = 'Sex' + if header == 'Iconography': + headers[index] = 'Iconography type' + if header == 'Iconography details': + headers[index] = 'Iconography description' + + +def extract_record(row): + if not row['Inscription no.']: + return None + return dict( + title=row["Inscription no."], + date=row["Date"], + remarksOnDate=preprocess_text(row["Remarks on date"]), + provenance=row["Provenance"], + presentLocation=row["Present location"], + publications=get_publications(row), + facsimile=row["Photo / Facsimile from publication"], + photosLeonard=row["Photos by Leonard"], + image3D=row["3D image"], + transcription=get_transcription(row), + inscriptionType=row["Inscription type"], + persons=get_persons(row), + age=row['Age'], + ageComments=preprocess_text(row["Remarks on age"]), + iconographyType=row["Iconography type"], + iconographyDescription=preprocess_text(row["Iconography description"]), + material=row["Material"], + languages=get_languages(row), + incipit=row["Incipit"], + commentary=get_commentary(row) + ) + + +def export(out_folder, record): + export_path = os.path.join(out_folder, '{}.xml'.format(record['title'])) + with open('XMLtemplate.j2') as file_: + template = Template(file_.read()) + + with open(export_path, 
'w+', encoding='utf-8') as xmlFile: + xmlFile.write(template.render(record)) + + +def get_publications(row): + results = [] + publication_nos = str(row["No. in publication"]).split(';') + publications = row["Publication"] + if not publications: + return results + publications = publications.split(';') + + for index, pub in enumerate(publications): + publication = pub.replace('\n', '') + try: + publication_no = publication_nos[index].replace('\n', '').strip() + publication = "{} ({})".format(publication, publication_no) + except IndexError: + pass # ignore adding pub_no if it doesn't exist + results.append(publication) + return results + + +def get_transcription(row): + transcription = preprocess_text(row["Transcription"]) + return transcription.replace('\n', '\n\n') + + +def get_languages(row): + value = row["Language"] + if not value: + return "" + langs = value.split(',') + if len(langs) > 1: + cleaned = [] + for lang in langs: + cleaned.append(lang.strip()) + return cleaned + else: + return langs + + +def get_commentary(row): + commentary = row["Open questions / Remarks"] + # add number of lines surviving (if it exists) + # Note that at the time of writing, there is only 1 (!) record + # that has data in this field + additional = row['Number of lines (s=surviving, o=original)'] + if additional: + period = commentary.endswith('.') + commentary = '{}{} There are {} surviving lines.'.format( + commentary, '.' if not period else '', additional + ) + if commentary: + return commentary + else: + return "" + + +def preprocess_text(text): + ''' + Preprocess a text field. + For now replaces < and > with html entities. 
+ ''' + if not text: + return "" + return text.replace('<', '<').replace('>', '>') + + +def get_persons(row): + persons = [] + inscription_id = row['Inscription no.'] + names = get_names_from_field(row, "Names mentioned") + namesHebrew = get_names_from_field(row, "Names mentioned (original language)") + sexes = get_sexes(row) + + if len(names) == 1 and len(namesHebrew) > 1 and len(sexes) == 1: + # if we have multiple Hebrew names, simply join them together + # TODO: check with Ortal-Paz if this is ok + persons.append(create_person( + names[0], " ".join(namesHebrew), sexes[0])) + elif len(names) == 1 and len(namesHebrew) == 1 and len(sexes) > 1 or inscription_id == '368': + # if we have multiple sexes, store name(s) once and create a person entry to record each sex + # also handles one special case (ID 368) + for index, sex in enumerate(sexes): + if index == 0: + persons.append(create_person( + names[0], namesHebrew[0], sexes[0])) + else: + persons.append(create_person('', '', sexes[index])) + elif len(names) > 1 or len(namesHebrew) > 1 or len(sexes) > 1: + # TODO: discuss the three remaining cases with Ortal-Paz + # custom cases for some rows + # if row['Inscription no.'] == 33: + # persons.append(create_person(" ".join(names), + # " ".join(namesHebrew), sexes[0])) + # else: + # pass + # print(row['Inscription no.']) + # print(names, namesHebrew, sexes) + pass + elif len(names) > 1 and len(namesHebrew) > 1 and len(sexes) > 1: + # if we get here there are multiple people and we assume they are complete + for index, name in enumerate(names): + persons.append(create_person( + name, namesHebrew[index], sexes[index])) + else: + # simple case of a single person + name = first_or_empty(names) + nameHebrew = first_or_empty(namesHebrew) + sex = sexes[0] + persons.append(create_person(name, nameHebrew, sex)) + + return persons + + +def first_or_empty(_list): + if len(_list) > 0: + return _list[0] + else: + return '' + + +def get_names_from_field(row, field): + results = [] + 
names_raw = extract_multifield(row, field, '\n') + for name in names_raw: + if name == 'X' or name == 'Χ': + # Note that the second character is not a 'X', but one copy-pasted from the commandline (and which looks a lot like one) + results.append('') + else: + results.append(name) + return results + + +def get_sexes(row): + results = [] + sexes_raw = extract_multifield(row, "Sex", '\n') + for sex in sexes_raw: + if '?' in sex: + results.append('Unknown') + elif 'M' in sex and 'F' in sex: + results.append('M') + results.append('F') + else: + results.append(sex) + return results + + +def create_person(name, nameHebrew, sex): + if not name: + return { + 'name': '', 'sex': sex + } + else: + return { + 'name': "{} ({})".format(name, preprocess_text(nameHebrew)), 'sex': sex + } + + +def extract_multifield(row, fieldname, splitter): + ''' + Extract the values from a single field that (might) contains multiple values. + Returns an array that will not contain empty strings or None. + ''' + results = [] + content = row[fieldname] + if not content: + return results + values = content.split(splitter) + for value in values: + if value: + results.append(value) + return results + + +def parse_arguments(sys_args): + ''' + Parse the supplied arguments. + ''' + parser = argparse.ArgumentParser( + description='Preprocess FIJI csv (from excelsheet)') + + parser.add_argument( + '--input', '-in', dest='input', required=False, default='FIJI_full.csv', + help='Path to the CSV file that contains the data. Defaults to \'FIJI_full.csv\' (i.e. in the script\'s folder') + + parser.add_argument( + '--delimiter', '-d', dest='delimiter', required=False, default=';', + help='Character that delimits fields in the CSV. Defaults to \';\'') + + parser.add_argument( + '--out_folder', '-out', dest='out_folder', required=False, default="FIJI", + help='''Path to the folder where the output should end up. + Will be created if it doesn\'t exist. Defaults to \'FIJI\' (i.e. 
in the script\'s folder)''') + + parsedArgs = parser.parse_args() + return parsedArgs + +if __name__ == "__main__": + main(sys.argv) diff --git a/backend/corpora/peaceportal/__init__.py b/backend/corpora/peaceportal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py new file mode 100644 index 000000000..bbd49191d --- /dev/null +++ b/backend/corpora/peaceportal/epidat.py @@ -0,0 +1,405 @@ +import re +from copy import copy + +from django.conf import settings + +from addcorpus.extract import XML, Constant, HTML, Combined +from addcorpus.es_settings import es_settings +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language + + +class Epidat(PeacePortal): + + data_directory = settings.PEACEPORTAL_EPIDAT_DATA + es_index = settings.PEACEPORTAL_EPIDAT_ES_INDEX + es_alias = settings.PEACEPORTAL_ALIAS + + languages = ['german', 'hebrew', 'english', 'dutch'] + + def es_settings(self): + return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) + + def __init__(self): + self.source_database.extractor = Constant( + value='Epidat (Steinheim Institute)' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True + ) + + self.url.extractor = HTML( + tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + attribute_filter={ + 'attribute': 'type', + 'value': 'url' + } + ) + + self.year.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + transform=lambda x: get_year(x), + ) + + self.not_before.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + 
toplevel=False, + attribute='notBefore', + transform=lambda x: get_year(x), + ) + + self.not_after.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notAfter', + transform=lambda x: get_year(x), + ) + + self.transcription.extractor = XML( + tag=['text', 'body', 'div'], + toplevel=False, + multiple=False, + flatten=True, + transform=lambda x: clean_newline_characters(x), + transform_soup_func=extract_transcript + ) + + self.transcription_german.extractor = XML( + tag=['text', 'body', ], + toplevel=False, + multiple=False, + flatten=True, + transform=lambda x: clean_newline_characters(x), + transform_soup_func=extract_translation + ) + + self.names.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + flatten=True, + multiple=True, + toplevel=False, + ) + + self.sex.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + attribute='sex', + multiple=True, + toplevel=False, + transform=lambda x: convert_sex(x) + ) + + self.dates_of_death.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson'], + transform_soup_func=extract_death, + attribute='when', + multiple=False, + toplevel=False, + ) + + self.country.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country'], + toplevel=False, + transform_soup_func=extract_country, + transform=lambda x: clean_country(x), + flatten=True, + ) + + self.region.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country', 'region'], + toplevel=False, + flatten=True + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement'], + toplevel=False, + flatten=True, + transform_soup_func=extract_settlement, + ) + + 
self.location_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName'], + toplevel=False, + flatten=True, + transform_soup_func=extract_location_details, + ) + + self.material.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True, + transform=lambda x: categorize_material(x) + ) + + self.material_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True + ) + + self.language.extractor = XML( + tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], + toplevel=False, + multiple=True, + transform=lambda x: get_language(x) + ) + + self.comments.extractor = Combined( + XML( + tag=['text', 'body'], + toplevel=False, + transform_soup_func=extract_commentary, + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'condition'], + toplevel=False, + flatten=True, + transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p'], + toplevel=False, + transform_soup_func=extract_support_comments, + ), + transform=lambda x: join_commentaries(x) + ) + + self.images.extractor = XML( + tag=['facsimile', 'graphic'], + multiple=True, + attribute='url', + toplevel=False + ) + + self.coordinates.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'], + toplevel=False, + multiple=False, + flatten=True + ) + + self.iconography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], + toplevel=False, + multiple=False + ) + + 
self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_english.extractor = Combined( + self.transcription.extractor, + Constant('en'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_dutch.extractor = Combined( + self.transcription.extractor, + Constant('nl'), + transform=lambda x: get_text_in_language(x) + ) + + +def convert_sex(values): + if not values: + return ['Unknown'] + result = [] + for value in values: + if value == '1': + result.append('M') + elif value == '2': + result.append('F') + else: + result.append('Unknown') + return result + + +def clean_country(text): + if not text: + return 'Unknown' + if text.lower().strip() == 'tobedone': + return 'Unknown' + return text + + +def get_year(text): + if not text or text == '--': + return + matches = re.search('[1-2]{0,1}[0-9]{3}', text) + if matches: + return matches[0] + + +def get_language(values): + if not values: + return ['Unknown'] + if 'German in Hebrew letters' in values: + return ['German (transliterated)', 'Hebrew'] + return values + + +def extract_transcript(soup): + ''' + Helper function to ensure correct extraction of the transcripts. + Note that there are multiple formats in which these are stored, + but the text that we need is always in the `` children of + `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one). 
+ ''' + if not soup: + return + return soup.find_all('ab') + + +def extract_translation(soup): + ''' + Helper function to extract translation from the tag + ''' + if not soup: + return + translation = soup.find('div', {'type': 'translation'}) + if translation: + return translation.find_all('ab') + else: + return + + +def extract_commentary(soup): + ''' + Helper function to extract all commentaries from the tag. + A single element will be returned with the commentaries found as text content. + ''' + if not soup: return + found = [] + commentaries = soup.find_all('div', {'type': 'commentary'}) + + for commentary in commentaries: + if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: + p = commentary.find('p') + if p: + text = p.get_text() + if text: + text = clean_commentary(text) + found.append('{}:\n{}\n'.format(commentary['subtype'].strip().upper(), text)) + + if len(found) > 1: + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = "\n".join(found) + return cloned_soup + else: + return None + +def extract_support_comments(soup): + if not soup: return + cloned_soup = copy(soup) + cloned_soup.clear() + + commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS') + commentaries = add_support_comment(soup, commentaries, 'objectType', 'OBJECTTYPE') + + # add any additional text from the

element, + # i.e. if there is text it is the very last node + contents = soup.contents + text = contents[len(contents) - 1].strip() + if text: + text = clean_commentary(text) + commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text) + + cloned_soup.string = commentaries + return cloned_soup + + +def add_support_comment(soup, existing_commentaries, elem_name, commentary_name): + elem = soup.find(elem_name) + if elem: + text = elem.get_text() + if text: + text = clean_commentary(text) + return '{}{}:\n{}\n\n'.format(existing_commentaries, commentary_name, text) + return existing_commentaries + + +def extract_death(soup): + ''' + Helper function to extract date of death from multiple person tags. + ''' + if not soup: + return + return soup.find_all('death') + + +def extract_country(soup): + ''' + Helper function to extract country. + This is needed because the output of `flatten` would otherwise include the text contents + of the ``. + ''' + return clone_soup_extract_child(soup, 'region') + + +def extract_settlement(soup): + return clone_soup_extract_child(soup, 'geogName') + + +def extract_location_details(soup): + return clone_soup_extract_child(soup, 'geo') + + +def clone_soup_extract_child(soup, to_extract): + ''' + Helper function to clone the soup and extract a child element. + This is useful when the output of `flatten` would otherwise include the text contents + of the child. + ''' + if not soup: + return + cloned_soup = copy(soup) + child = cloned_soup.find(to_extract) + if child: + child.extract() + return cloned_soup + + # TODO: add field + + # TODO: move to a comments field: + + # excluded (for now): + # title + # organization (incl details, e.g. address) + # licence + # taxonomy (i.e. 
things like foto1, foto2 -> no working links to actual images) + diff --git a/backend/corpora/peaceportal/fiji_separate.py b/backend/corpora/peaceportal/fiji_separate.py new file mode 100644 index 000000000..e2b3f564f --- /dev/null +++ b/backend/corpora/peaceportal/fiji_separate.py @@ -0,0 +1,17 @@ +from django.conf import settings + +from corpora.peaceportal.peaceportal import PeacePortal + +class FIJISEPARATE(PeacePortal): + + es_index = settings.FIJI_ALIAS + + # all fields listed here will be ignored if they are + # in the PeacePortal base class definition. Ideal for excluding + # filters that are irrelevant + redundant_fields = ['source_database', 'region'] + + def __init__(self): + for field in self.fields: + if field.name in self.redundant_fields: + self.fields.remove(field) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py new file mode 100644 index 000000000..26b9c0669 --- /dev/null +++ b/backend/corpora/peaceportal/iis.py @@ -0,0 +1,361 @@ +from copy import copy + +from django.conf import settings + +from addcorpus.extract import XML, Constant, HTML, ExternalFile, Combined +from addcorpus.corpus import Field +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language + + +class IIS(PeacePortal): + data_directory = settings.PEACEPORTAL_IIS_DATA + external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA + es_index = settings.PEACEPORTAL_IIS_ES_INDEX + es_alias = settings.PEACEPORTAL_ALIAS + + def __init__(self): + self.source_database.extractor = Constant( + value='Inscriptions of Israel/Palestine (Brown University)' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + transform=lambda x: ''.join(x.lower().split()) + ) + + self.url.extractor = HTML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 
'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format( + ''.join(x.lower().split())) + ) + + # quick and dirty for now: extract value for 'notBefore' + self.year.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'date'], + toplevel=False, + attribute='notBefore' + ) + + self.not_before.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'date'], + toplevel=False, + attribute='notBefore' + ) + + self.not_after.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'date'], + toplevel=False, + attribute='notAfter', + ) + + self.transcription.extractor = ExternalFile( + stream_handler=extract_transcript + ) + + self.transcription_english.extractor = HTML( + tag=['div'], + toplevel=True, + multiple=False, + flatten=True, + attribute_filter={ + 'attribute': 'type', + 'value': 'translation' + }, + transform_soup_func=extract_paragraph, + transform=lambda x: ' '.join(x.split()) if x else None + ) + + # is not present in IIS data + # self.names.extractor = XML( + # tag=['teiHeader', 'profileDesc', + # 'particDesc', 'listPerson', 'person'], + # flatten=True, + # multiple=True, + # toplevel=False, + # ) + + self.iconography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], + toplevel=False, + multiple=True, + flatten=True + ) + + # is not present in IIS data + self.sex.extractor = Constant( + value='Unknown' + ) + + self.country.extractor = Constant( + value='Israel/Palestine' + ) + + self.region.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'placeName', 'region'], + toplevel=False, + flatten=True + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'placeName', 
'settlement'], + toplevel=False, + flatten=True + ) + + self.location_details.extractor = Combined( + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'placeName'], + toplevel=False, + flatten=True + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'p'], + toplevel=False, + flatten=True + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'provenance'], + toplevel=False, + flatten=True + ) + ) + + self.material.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc'], + attribute='ana', + toplevel=False, + flatten=True, + transform=lambda x: categorize_material(x) + ) + + self.material_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc'], + attribute='ana', + toplevel=False, + flatten=True + ) + + self.language.extractor = Combined( + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='mainLang', + toplevel=False, + transform=lambda x: normalize_language(x) + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='otherLangs', + toplevel=False, + transform=lambda x: normalize_language(x) + ) + ) + + self.comments.extractor = Combined( + XML( + tag=['text'], + toplevel=False, + multiple=False, + flatten=True, + transform_soup_func=extract_comments, + transform=lambda x: clean_commentary(x) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'condition'], + toplevel=False, + transform_soup_func=extract_condition + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'layoutDesc', 'layout', 'p'], + toplevel=False, + transform=lambda x: 'LAYOUT:\n{}\n\n'.format(clean_commentary(x)) if x else None + ), + XML( + 
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc'], + toplevel=False, + attribute='ana', + transform=lambda x: 'OBJECTTYPE:\n{}\n\n'.format(x[1:]) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'dimensions'], + toplevel=False, + transform_soup_func=extract_dimensions, + transform=lambda x: 'DIMENSIONS:\n{}\n\n'.format( + x) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p'], + toplevel=False, + flatten=True, + transform=lambda x: 'SUPPORT:\n{}\n\n'.format( + clean_commentary(x)) if x else None + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', 'handDesc', 'handNote'], + toplevel=False, + transform_soup_func=extract_handnotes + ), + transform=lambda x: join_commentaries(x) + ) + + self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_latin.extractor = Combined( + self.transcription.extractor, + Constant('la'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_greek.extractor = Combined( + self.transcription.extractor, + Constant('el'), + transform=lambda x: get_text_in_language(x) + ) + + +def extract_transcript(filestream): + text = filestream.read().strip() + filestream.close() + # remove the tabs and spaces inherited from xml + text = clean_newline_characters(text) + if text: + text = text.replace('\t', '') + return text + + +def extract_paragraph(soup): + ''' + Extract first

element from `soup`, ignore the rest. + Ideal for ignoring

headers in the HTML versions of the body. + ''' + if not soup: + return + return soup.find('p') + + +def extract_comments(soup): + ''' + Helper function to extract the commentary from either or (siblings under ) + ''' + if not soup: + return + commentary_div = soup.find('div', {'type': 'commentary'}) + return extract_paragraph(commentary_div) + + +def extract_attribute_and_child_p(soup, field_header): + ''' + Extract value for 'ana' attribute from soup, + as well as the text from a

child. Will be returned + in a new soup, i.e. a single element with text content + in the following format `textcontent (attrivubtevalue)` + ''' + result = '' + text = '' + ana = None + if 'ana' in soup.attrs: + ana = soup['ana'] + p = extract_paragraph(soup) + if p: + text = p.get_text() + if text: + result = clean_commentary(text) + if ana: + result = '{} ({})'.format(result, ana) + + if result: + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = '{}:\n{}\n\n'.format(field_header, result) + return cloned_soup + + +def extract_condition(soup): + return extract_attribute_and_child_p(soup, 'CONDITION') + + +def extract_handnotes(soup): + if not soup: return + return extract_attribute_and_child_p(soup, 'HANDNOTES') + + +def extract_dimensions(soup): + result = '' + height_elem = soup.find('height') + if height_elem: + height = height_elem.get_text() + if height: + result = "H: {} ".format(height) + + width_elem = soup.find('width') + if width_elem: + width = width_elem.get_text() + if width: + result = "{}W: {}".format(result, width) + + depth_elem = soup.find('depth') + if depth_elem: + depth = depth_elem.get_text() + if depth: + result = "{} D: {}".format(result, depth) + + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = result + return cloned_soup + + +def normalize_language(text): + if not text: + return + ltext = text.lower().strip() + if ltext in ['grc']: + return 'Greek' + if ltext in ['he', 'heb']: + return 'Hebrew' + if ltext in ['arc']: + return 'Aramaic' + if ltext in ['la', 'latin']: + return 'Latin' + + # excluded (for now): + # revision history + + # MISSING (i.e. 
present in Epidat and Fiji) + # person(s) - names (profileDesc is completely missing) diff --git a/backend/corpora/peaceportal/iis_corpus_preprocessor.py b/backend/corpora/peaceportal/iis_corpus_preprocessor.py new file mode 100644 index 000000000..9be08fa47 --- /dev/null +++ b/backend/corpora/peaceportal/iis_corpus_preprocessor.py @@ -0,0 +1,100 @@ +import os +import sys +import glob +import argparse +from bs4 import BeautifulSoup + + +def main(sys_args): + args = parse_arguments(sys_args) + prepare_out_folder(args.out_folder) + preprocess(args.xml_folder, args.out_folder) + +def prepare_out_folder(out_folder): + if not os.path.exists(out_folder): + os.makedirs(out_folder) + else: + files = glob.glob('{}/*'.format(out_folder)) + for f in files: + os.remove(f) + +def preprocess(in_folder, out_folder): + + for filepath in glob.iglob('{}/*.xml'.format(in_folder)): + with open(filepath, 'r') as xml: + soup = BeautifulSoup(xml.read(), 'xml') + + filename = os.path.basename(filepath) + keep_only_transcription(filename, soup, out_folder) + # TODO: add extraction of foreigns + + +def keep_only_transcription(filename, soup, out_folder): + out_file = os.path.join(get_subfolder(out_folder, 'tei_with_transcription_only'), filename) + + text_tag = soup.find('text') + transcription = get_transcription(filename, text_tag) + text_tag.clear() + if transcription: + text_tag.append(transcription) + + with open(out_file, 'w') as f_out: + f_out.write(str(soup)) + + +## TODO: extract foreign and export them to separate file. 
+# def do_something_with_foreign(filename, soup): +# text_tag = soup.find('text') + # transcription = get_transcription(filename, text_tag) + # if transcription: + # foreigns = text_tag.find_all('foreign') + # # print(foreigns) + + # for f in foreigns: + # if f.findChild(): + # print(f) + + +def get_transcription(filename, text_tag): + transcription = text_tag.find('div', { 'subtype': 'transcription'}) + + # if there is no transcription, fallback to diplomatic + if not transcription: + transcription = text_tag.find('div', { 'subtype': 'diplomatic'}) + + if not transcription: + print('No transcription found in {}'.format(filename)) + return transcription + + +def get_subfolder(folder, subfoldername): + ''' + Get a subfolder with `subfoldername` in `folder`. + Will be created if it doesn't exist. + ''' + path = os.path.join(folder, subfoldername) + if not os.path.exists(path): + os.makedirs(path) + return path + + +def parse_arguments(sys_args): + ''' + Parse the supplied arguments. + ''' + parser = argparse.ArgumentParser( + description='Preprocess EpiDoc scrapes, i.e. extract Leiden') + + parser.add_argument( + '--xml_folder', '-xml', dest='xml_folder', required=True, + help='Path to the folder where the .xml files reside.') + + parser.add_argument( + '--out_folder', '-out', dest='out_folder', required=True, + help='Path to the folder where the output should end up. 
Will be created if it doesn\'t exist or emptied out if it does.') + + parsedArgs = parser.parse_args() + return parsedArgs + +if __name__ == "__main__": + main(sys.argv) diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py new file mode 100644 index 000000000..3f02a842a --- /dev/null +++ b/backend/corpora/peaceportal/peaceportal.py @@ -0,0 +1,501 @@ +import os +import os.path as op +import logging +from datetime import datetime +from langdetect import detect +from langdetect.lang_detect_exception import LangDetectException + +from django.conf import settings + +from addcorpus.corpus import XMLCorpus, Field +from addcorpus.es_mappings import int_mapping, keyword_mapping, main_content_mapping, text_mapping +from addcorpus.es_settings import es_settings +from addcorpus.extract import XML, Constant +from addcorpus.filters import MultipleChoiceFilter, RangeFilter + + + +class PeacePortal(XMLCorpus): + ''' + Base class for corpora in the PEACE portal. + + This supplies the frontend with the information it needs. + Child corpora should only provide extractors for each field. + Consequently, create indices (with alias 'peaceportal') from + the corpora specific definitions, and point the application + to this base corpus. 
+ ''' + + title = "PEACE Portal" + description = "A collection of inscriptions on Jewish burial sites" + # store min_year as int, since datetime does not support BCE dates + min_year = -530 + max_date = datetime(year=1950, month=12, day=31) + visualize = [] + es_index = current_app.config['PEACEPORTAL_ALIAS'] + scan_image_type = 'image/png' + # fields below are required by code but not actually used + min_date = datetime(year=746, month=1, day=1) + image = 'bogus' + data_directory = 'bogus' + + # Data overrides from .common.XMLCorpus + tag_toplevel = '' + tag_entry = 'TEI' + + # New data members + non_xml_msg = 'Skipping non-XML file {}' + non_match_msg = 'Skipping XML file with nonmatching name {}' + # overwrite below in child class if you need to extract the (converted) transcription + # from external files. See README. + external_file_folder = '.' + languages = [] + + def es_settings(self): + return es_settings() + + def sources(self, start, end): + logger = logging.getLogger(__name__) + for directory, _, filenames in os.walk(self.data_directory): + for filename in filenames: + name, extension = op.splitext(filename) + full_path = op.join(directory, filename) + + if extension != '.xml': + logger.debug(self.non_xml_msg.format(full_path)) + continue + + yield full_path, { + # applies only to iis corpus + 'associated_file': os.path.join(self.external_file_folder, filename) + } + + def request_media(self, document): + images = document['fieldValues']['images'] + if not images: + images = [] + return { 'media': images } + + source_database = Field( + name='source_database', + display_name='Source database', + description='The database a record originates from.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these databases.', + option_count=4, + ), + csv_core=True + ) + + _id = Field( + name='id', + display_name='ID', + description='ID of the inscription entry.', + csv_core=True, + 
es_mapping=keyword_mapping(), + search_field_core=True + ) + + url = Field( + name='url', + display_name='URL', + description='URL of the inscription entry.', + es_mapping=keyword_mapping(), + search_field_core=True + ) + + year = Field( + name='year', + display_name='Year', + description='Year of origin of the inscription.', + es_mapping=int_mapping(), + search_filter=RangeFilter( + description='Restrict the years from which search results will be returned.', + lower=min_year, + upper=max_date.year, + ), + csv_core=True, + sortable=True, + visualization_type='term_frequency', + visualization_sort='key', + results_overview=True + ) + + not_before = Field( + name='not_before', + display_name='Not before', + description='Inscription is dated not earlier than this year.', + es_mapping=int_mapping(), + hidden=True + ) + + not_after = Field( + name='not_after', + display_name='Not after', + description='Inscription is dated not later than this year.', + es_mapping=int_mapping(), + hidden=True + ) + + transcription = Field( + name='transcription', + es_mapping=main_content_mapping(), + display_name='Transcription', + description='Text content of the inscription.', + search_field_core=True, + results_overview=True, + display_type='text_content' + ) + + transcription_german = Field( + name='transcription_german', + es_mapping={'type': 'text', 'analyzer': 'german' }, + hidden=True + ) + + transcription_english = Field( + name='transcription_english', + es_mapping={'type': 'text', 'analyzer': 'english'}, + hidden=True + ) + + transcription_hebrew = Field( + name='transcription_hebrew', + es_mapping={'type': 'text'}, + hidden=True + ) + + transcription_latin = Field( + name='transcription_latin', + es_mapping={'type': 'text'}, + hidden=True + ) + + transcription_greek = Field( + name='transcription_greek', + es_mapping={'type': 'text', 'analyzer': 'greek'}, + hidden=True + ) + + transcription_dutch = Field( + name='transcription_dutch', + es_mapping={'type': 'text', 
'analyzer': 'dutch'}, + hidden=True + ) + + age = Field( + name='age', + display_name='Age', + description='Age of the buried person(s)', + es_mapping=int_mapping(), + search_filter=RangeFilter( + description='Filter by age of the buried persons.', + lower=0, + upper=100, + ), + extractor=Constant( + value=None + ) + ) + + # A string with all the names occuring in the source + names = Field( + name='names', + es_mapping=text_mapping(), + display_name='Names', + description='Names of the buried persons.', + search_field_core=True + ) + + # Should be an array with potentially multiple values from these: 'M', 'F', or None. + sex = Field( + name='sex', + display_name='Sex', + description='Gender(s) of the buried person(s). None if the sex is unknown.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these genders.', + option_count=3, + ), + csv_core=True + ) + + country = Field( + name='country', + display_name='Country', + description='Country where the inscription was found.', + es_mapping=keyword_mapping(True), + search_filter=MultipleChoiceFilter( + description='Search only within these countries.', + option_count=5 + ), + visualization_type='term_frequency', + results_overview=True + ) + + settlement = Field( + name='settlement', + display_name='Settlement', + description='The settlement where the inscription was found.', + es_mapping=keyword_mapping(True), + search_filter=MultipleChoiceFilter( + description='Search only within these settlements.', + option_count=29 + ), + visualization_type='term_frequency' + ) + + region = Field( + name='region', + display_name='Region', + description='The region where the inscription was found.', + es_mapping=keyword_mapping(True), + search_filter=MultipleChoiceFilter( + description='Search only within these regions.', + option_count=29 + ), + visualization_type='term_frequency' + ) + + location_details = Field( + name='location_details', + display_name='Location details', 
+ description='Details about the location of the inscription', + es_mapping=text_mapping() + ) + + material = Field( + name='material', + display_name='Material', + description='Type of material the inscription is written on.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these material types.', + option_count=39 + ), + visualization_type='term_frequency' + ) + + material_details = Field( + name='material_details', + display_name='Material details', + description='Details about the material the inscription is written on.', + es_mapping=text_mapping(), + search_field_core=True + ) + + language = Field( + name='language', + display_name='Language', + description='Language written on the inscription.', + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search only within these languages.', + option_count=10 + ), + csv_core=True, + visualization_type='term_frequency' + ) + + bibliography = Field( + name='bibliography', + es_mapping=keyword_mapping(), + display_name='Bibliography', + description='Reference(s) to who edited and published this funerary inscription.' + ) + + comments = Field( + name='comments', + es_mapping=text_mapping(), + display_name='Commentary', + description='Extra comments, questions or remarks on this inscription.', + search_field_core=True, + ) + + images = Field( + name='images', + es_mapping=keyword_mapping(), + display_name='Images', + description='Links to image(s) of the inscription.', + hidden=True + ) + + coordinates = Field( + name='coordinates', + es_mapping=keyword_mapping(), + display_name='Coordinates', + description='GIS coordinates for the inscription.' 
+ ) + + iconography = Field( + name='iconography', + es_mapping=text_mapping(), + display_name='Iconography', + description='Description of the icons used in the inscription.', + search_field_core=True + ) + + dates_of_death = Field( + name='dates_of_death', + es_mapping=keyword_mapping(), + display_name='Date of death', + ) + + fields = [ + _id, + url, + year, + not_before, + not_after, + source_database, + transcription, + names, + sex, + dates_of_death, + age, + country, + region, + settlement, + location_details, + language, + iconography, + images, + coordinates, + material, + material_details, + bibliography, + comments, + transcription_german, + transcription_hebrew, + transcription_latin, + transcription_greek, + transcription_english, + transcription_dutch + ] + + +def clean_newline_characters(text): + ''' + Remove all spaces surrounding newlines in `text`. + Also removes multiple newline characters in a row. + ''' + if not text: return + parts = text.split('\n') + cleaned = [] + for part in parts: + if not '\n' in part: + stripped = part.strip() + if stripped: + cleaned.append(part.strip()) + return '\n'.join(cleaned) + + +def clean_commentary(commentary): + ''' + Clean a commentary by removing all whitespaces characters between words, + except for one space. + ''' + return ' '.join(commentary.split()) + +def join_commentaries(commentaries): + ''' + Helper function to join the result of a Combined extractor + into one string, separating items by a newline + ''' + results = [] + for comm in commentaries: + if comm: + results.append(comm) + return "\n".join(results) + +def categorize_material(text): + ''' + Helper function to (significantly) reduce the material field to a set of categories. + The Epidat corpus in particular has mainly descriptions of the material. + Returns a list of categories, i.e. those that appear in `text`. 
+ ''' + if not text: return ['Unknown'] + + categories = ['Sandstein', 'Kalkstein', 'Stein', 'Granit', 'Kunststein', + 'Lavatuff', 'Marmor', 'Kalk', 'Syenit', 'Labrador', 'Basalt', 'Beton', + 'Glas', 'Rosenquarz', 'Gabbro', 'Diorit', 'Bronze', + # below from FIJI and IIS + 'Limestone', 'Stone', 'Clay', 'Plaster', 'Glass', 'Kurkar', 'Granite', + 'Marble', 'Metal', 'Bone', 'Lead' ] + result = [] + ltext = text.lower() + + for c in categories: + if c.lower() in ltext: + result.append(translate_category(c)) + + if len(result) == 0: + # reduce unknown, other and ? to Unknown + # 'schrifttafel' removes some clutter from Epidat + if 'unknown' in ltext or 'other' in ltext or '?' in ltext or 'schrifttafel': + result.append('Unknown') + else: + result.append(text) + + return result + +def translate_category(category): + ''' + Helper function to translate non-English categories of material into English + ''' + pairs = { + 'Sandstein': 'Sandstone', + 'Kalkstein': 'Limestone', + 'Stein': 'Stone', + 'Granit': 'Granite', + 'Kunststein': 'Artificial stone', + 'Lavatuff': 'Tufa', + 'Marmor': 'Marble', + 'Kalk': 'Limestone', + 'Syenit': 'Syenite', + 'Labrador': 'Labradorite', + 'Beton': 'Concrete', + 'Glas': 'Glass', + 'Rosenquarz': 'Rose quartz', + 'Diorit': 'Diorite' + } + + for original, translation in pairs.items(): + if category == original: + return translation + return category + + +def get_text_in_language(_input): + ''' + Get all the lines from a transcription that are in a certain language + (according to the `langdetect` package). Note that `transcription` will + be split on newlines to create lines that will be fed to langdetect one by one. + All lines that are in `language_code` will be collected and returned as one string, + i.e. they will be joined with a space (no newlines!). + + Parameters: + _input -- A tuple or list with (transcription, language_code). Will typically be the output + of a Combined extractor, i.e. 
one for the transcript and a Constant extractor with the language code. + For a list of language codes detected by langdetect, see https://pypi.org/project/langdetect/ + ''' + results = [] + if len(_input) != 2 or not _input[0]: + return results + lines = _input[0].split('\n') + language_code = _input[1] + + for line in lines: + if not line: continue + detected_code = None + try: + # note that Aramaic is detected as Hebrew + detected_code = detect(line) + except LangDetectException: + # sometimes langdetect isn't happy with some stuff like + # very short strings with mainly numbers in it + pass + if detected_code and detected_code == language_code: + results.append(line) + return ' '.join(results) diff --git a/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml b/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml new file mode 100644 index 000000000..90136bb1c --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/epidat/blr/blr-4.xml @@ -0,0 +1,216 @@ + + + + + +epidat, blr-4 + + + + + + + + + + Salomon Ludwig Steinheim-Institut +

+Edmund-Körner-Platz 2 +D-45127 Essen +
+ + +blr-4 +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4 + + +Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4 +

+
+
+ + + + + +born digital + + +epidat +blr-4 + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4 + + + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4-t + + + + + + + +

+stone +sepulchral monument +

+
+
+ + + +
+
+ + + +1865-02-28 + + + + Germany + Thuringa + + + Bleicherode + + Jewish Cemetery + 51.434387 10.571183 + + + + + +
+
+ + + +EpiDoc: TEI XML for epigraphic Documents Schema + + + + +Julia Buchmann, Nicola Wiemann, Maike Schlotterhose; Bleicherode + + + +World Geodetic System + + + + + +Natan Schönfeld (Nathan Schönfeld) + + + + + + +Hebrew +German + + + + + + + + + + + + recto + + + + + + + Detail + + + + + + + verso + + + + + +
+Edition +
+ + + Hier ruhet + + der Kaufmann + + Nathan Schönfeld + + geb. d. 4. April 1812 + + gest. d. [28.] Februar 1865 + +
+
+ + + ‎‏פ״נ‏‎ + + ‎‏איש חמדות יקר רוח אוהב‏‎ + + ‎‏צדק ופועל טוב כ״ה נתן‏‎ + + ‎‏שאנפעלד נולד ח׳ של פסח‏‎ + + ‎‏תקע״ב ונפטר בשם טוב יום ג׳‏‎ + + ‎‏ב׳ אדר תרכ״ה לפ״ק‏‎ + + ‎‏תנצב״ה‏‎ + +
+
+
+Übersetzung +
+ +
+ + Hier ist begraben + + #.:ein werter Mann#.;, #.:edelmütig#.;, Wohltat + + liebend und Gutes wirkend, der geehrte Herr Natan + + Schönfeld, geboren 8. (Tag) von Pessach 572 + + und verschieden #.:mit gutem Namen#.; Tag 3, + + 2. Adar 625 der kleinen Zählung. + + Seine Seele sei eingebunden in das Bündel des Lebens +
+
+Zitate +

Zl 7: Dan 10,11 | Zl 7: Spr 17,27

+

Zl 10: bBer 17a

+
+
+Prosopographie +
+
+Bibliographie +
+ +
+ \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml b/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml new file mode 100644 index 000000000..63a21a51d --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/epidat/hlh/hlh-12.xml @@ -0,0 +1,302 @@ + + + + + +epidat, hlh-12 + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+Edmund-Körner-Platz 2 +D-45127 Essen +
+
+
+hlh-12 +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12 + + +Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12 +

+
+
+
+ + + + +born digital + + +epidat +hlh-12 + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12 + + + +http://www.steinheim-institut.de:80/cgi-bin/epidat?id=hlh-12-t + + +Stadt Mülheim an der Ruhr, Sterberegister Broich 1891 (1196/5/14), Nr. 247.Kaufhold, Barbara, Jüdischen Leben in Mülheim an der Ruhr, Essen 2004. + + + + +

+stone +sepulchral monument +

+
+ +

+2013 + Der Zustand des Steins hat sich seit 1986 kaum verändert +

+
+
+ + + +
+ + +sechzackiger Stern + +
+ + + +1891-12-06 + + + + Germany + North Rhine-Westphalia + + + Kettwig (Neuer Friedhof in Heiligenhaus) + + Jewish Cemetery + 51.346014 6.924709 + + + + + +
+
+
+ + +EpiDoc: TEI XML for epigraphic Documents Schema + + + + + + Epigraphisches Bildarchiv, + Steinheim-Institut + + + +Nathanja Hüttenmeister, Carmen Wedemeyer + + + +World Geodetic System + + + + + +Gitle bat Mosche (Clara Leffmann) + + + + + + + + + +Hebrew +German + + + + + +
+ + + + + + recto + + + + + + + recto + + + + + + + recto + + + + + + + Detail + + + + + + + Detail + + + + + + + Detail + + + + + + + + + + + + + + + + + + + + + + + + + +
+Edition +
+ + + ‎‏פ״ט‏‎ + + ‎‏הבתולה צנועה וחמודה‏‎ + + ‎‏מ׳ גיטלא בת משה‏‎ + + ‎‏ה״ה ראשנה שנקברה בבית‏‎ + + ‎‏החיים החדשה בק״ק‏‎ + + ‎‏קעטטוויג ומתה בשם ט׳‏‎ + + ‎‏ביום א׳ ה׳ כסלו תרנ״ב ל׳‏‎ + + ‎‏תנצב״ה‏‎ + + Hier ruht die Jungfrau + + Clara Leffmann + + Sie starb erst 19 + + Jahre alt, gottergeben und + + tief betrauert von den ihrigen, + + den 8. Dezbr. 1891 + +
+
+ + + Friede ihrer Asche. + +
+
+
+Übersetzung +
+ + + Hier ist geborgen + + die züchtige und liebliche Jungfrau, + + Frau Gitle, Tochter des Mosche, + + sie ist die Erste, die begraben wurde auf dem neuen + + Friedhof der heiligen Gemeinde + + Kettwig, und sie starb #.:mit gutem Namen#.; + + am Tag 1, 5. Kislev 652 der Zählung. + + Ihre Seele sei eingebunden in das Bündel des Lebens + +
+
+ +
+
+
+Zitate +

Zl 6: bBer 17a

+
+
+Zeilenkommentar +

Zl 5: Friedhof, wörtl. "Haus des Lebens".

+
+
+Endkommentar +

Vermutlich handelt es sich bei der Angabe des Sterbedatums in der deutschen Inschrift um das Begräbnisdatum. Dieser Stein ist der erste des Friedhofes am Görscheider Weg.

+

Zwischen den jüdischen Familien aus Kettwig vor der Brücke und Saarn gab es verwandtschaftliche Verhältnisse, so stammte die Familie Leffmann, deren Angehörige z. T. hier bestattet sind, aus Saarn (Kaufhold, Jüdisches Leben in Mülheim a. d. R., S. ).

+
+
+Prosopographie +

Clara Leffmann war die Tochter des Saarner Metzgers Moritz Leffmann und seiner Frau Sara geb. Herz. Der Bruder Artur fiel 1917 im Krieg (Engelhardt, Chronik, S. 81).

+
+
+Bibliographie + +Systematische bauhistorische Beschreibung + durch + Bau- und Stadtbaugeschichte, Fakultät 6, Institut für Architektur, TU Berlin + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/fiji/299.xml b/backend/corpora/peaceportal/tests/data/fiji/299.xml new file mode 100644 index 000000000..622c94738 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/fiji/299.xml @@ -0,0 +1,64 @@ + + + + + 299 + + + + + + + + + Museo Vaticano, lapidario ebraico ex-Lateranense; inv.no.30762 + + Noy 1995, p. 69-70 (83) + + + + + Rome, Monteverde + 3rd-4th century + Uncertain + + + + + + + + + + Felicissima ( the commemorator) Emarantus ( the decaesed) (Φη<λ>ικίσσιμα Ἠμαράντῳ) + + + + + + + + Greek + + + + + CIJ i 1936, 266 no.339 + None + None + + + + Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν. + Epitaph + none + + Stone (white marble plaque) + Φη<λ>ικίσσιμα + not mentioned + not mentioned + Found on the 3rd of December 1904 in Cub.XL. The lower third of the plaque was left unused. There are poits between the syllables. Ferrua thought it might be pagan. + + + \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/fiji/687.xml b/backend/corpora/peaceportal/tests/data/fiji/687.xml new file mode 100644 index 000000000..d860857cb --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/fiji/687.xml @@ -0,0 +1,61 @@ + + + + + 687 + + + + + + + + + In the catacomb + + Noy 1995, p. 351 (417) + + + + + Rome, Villa Torlonia (lower cat.) + 3rd- 4th century + + + + + + + + + + + + + + + + Greek + + + + + not available + None + None + + + + ἐνθάδε [κεῖται--] + Εpitaph + ? + + Stone (marble fragment) + ἐνθάδε [κεῖται--] + not mentioned or lost + not mentioned or lost + + + + \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/fiji/759.xml b/backend/corpora/peaceportal/tests/data/fiji/759.xml new file mode 100644 index 000000000..74441bf40 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/fiji/759.xml @@ -0,0 +1,65 @@ + + + + + 759 + + + + + + + + + Formerly in Villa Torlonia stables + + Noy 1995, p. 
390-1 (489) + + + + + Rome, Villa Torlonia (lower cat.) + 3rd- 4th century + + + + + + + + + + + Irene (Εἰρήνη) + + + + + Greek + + + + + CIJ i 1936, p. 19-20 no.21 + None + None + + + + Εἰρήνη τρεζπτὴ προσήλυτος πατρὸς καὶ μητρὸς Εἰουδε͂α + +Ἰσδραηλίτης ἔζησεν ἤτ(η) γ΄ μ(ῆνας) ζ΄ vac.ἡμ(έ)ρ(αν) α΄. + + Εpitaph + none + + Stone (grey-blue marble plaque) + Εἰρήνη + 3 + The precise age was 3 years, 7 months and 1 day. + + + + \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml new file mode 100644 index 000000000..de749a662 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/akld0002.xml @@ -0,0 +1,5 @@ + + + + + Χάρητος Χάρητος Χάρητος Χάρητος \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml new file mode 100644 index 000000000..235b943e8 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/beth0042.xml @@ -0,0 +1,5 @@ + + + + + Ἀβρᾶ καὶ Σαμῆ \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml new file mode 100644 index 000000000..b4ac3b202 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/transcription_txts/jeru0014.xml @@ -0,0 +1,5 @@ + + + + + אמא \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml b/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml new file mode 100644 index 000000000..5f7921f49 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/xml/akld0002.xml @@ -0,0 +1,196 @@ + + + + + + +Inscriptions of Israel/Palestine + +Prinicipal Investigator +Michael Satlow + + + + + + +

ERROR-could not find publication information which should appear in this space.

+
+
+ +
+ + + + + +Akld 0002 +Shadmi, T. (1996). The Ossuaries and the Sarcophagus. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 41–55). Jerusalem: Israel Antiquities Authority. (page 52)Ilan, T. (1996). The Ossuary and Sarcophagus Inscriptions. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 57–72). Jerusalem: Israel Antiquities Authority. (page 58) + + + + + + +

Jerusalem Akeldama Caves confluence of Kidron and Hinnom Valleys, + First century CE. Ossuary. Funerary.

+
+
+ + + + + + +64 +29 +35 + + + +

+ + + + +

once on each side

+ + +
+ + + +

+ + + + + +Painted Red + + + + + +

+ + + +First century CE + +Judaea +Jerusalem +Akeldama +Cave 2 chamber B + + +

+ + + + + + + + + + + + +

Taxonomies for IIP controlled values

+ + + + + + + +Initial Entry +Normalized objectDesc/@ana +Adding Pleiades IDs to origin/placenames + + adding period attribute to date element, with Periodo value. + + +
+ + + + + + + + + + +
+

ΧΑΡΗΤΟϹΧΑ ΡΗ ΤΟ Ϲ ΧΑΡΗΤΟϹΧΑΡΗΤΟϹ

+
+
+

ΧάρητοςΧάρητοςΧάρητοςΧάρητος

+
+
+

of Chares

+
+
+

+

+ + + +
+ + + +52 + + + +58 + + +
+
+
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml b/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml new file mode 100644 index 000000000..f61d5a5d2 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/xml/beth0042.xml @@ -0,0 +1,143 @@ + + + + + +Inscriptions of Israel/Palestine + +Prinicipal Investigator +Michael Satlow + + + + + +

ERROR-could not find publication information which should appear in this space.

+
+
+beth0042 + +
+ + + + +Beth 0042 +Frey, J. B. (1952). Corpus Inscriptionum Iudaicarum (Vol. II (Asie-Afrique)). Roma: Pontificio Istituto di Archeologia Cristiana. (insc)Schwabe, M., & Lifshitz, B. (1974). Beth She’arim. Vol. 2, The Greek Inscriptions. Massada Press on behalf of the Israel Exploration Society. (page 25-26) + + + + +

Galilee. Beth Shearim. 250 CE to 350 CE. Red painted wall of arcosolium. Funerary.

+
+
+ + + + + + +60 + + + +

+ + + + + +

+ + + + + + + + + + + + + + + + + + + + +250 CE to 350 CE + +Galilee +Beth Shearim + + +

+ + + + + + + + + + + + + +

ERROR: could not find taxonomies file, which should appear in this space.

+ + + + + + +Creation +Revision +Changed graphic element to facsimile and kept existing url +Adding Pleiades IDs to origin/placenames + + adding period attribute to date element, with Periodo value. + + +
+ + + + + + + + + +
+

Catacomb 1, Hall G, room IV, arcosolium 1

+
+
+

Ἀβρᾶ καὶ Σαμῆ

+
+
+

+Abra and Same +

+
+ + +
+ + + + + + + +25-26 + + +
+
+
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml b/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml new file mode 100644 index 000000000..d188209a8 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/iis/xml/jeru0014.xml @@ -0,0 +1,140 @@ + + + + + +Inscriptions of Israel/Palestine + +Prinicipal Investigator +Michael Satlow + + + + + +

ERROR-could not find publication information which should appear in this space.

+
+
+jeru0014 + +
+ + + + +jeru0014 +Rahmani, L. Y. (1994). A Catalogue of Jewish Ossuaries in the Collections of the State of Israel. (A. Sussmann, Ed.). Israel Antiquities Authority: Israel Academy of Sciences and Humanities. (page 80, plate 4, fig. 21) + + + + +

Judaea. Jerusalem. 20 BCE to 70 CE. Soft limestone ossuary. Funerary.

+
+
+ + + + + + +29.5 +52 +23 + + + +

+ + + + + +

+ + + + + + + + + + + + + + + + + + +20 BCE to 70 CE + +Judaea +Jerusalem +Kidron Valley +southeast of 'En Rogel + +

Judaea. Jerusalem. Kidron Valley, southeast of Ἑn Rogel.

+ + + + + + +
+
+
+ + + + +

ERROR: could not find taxonomies file, which should appear in this space.

+
+
+
+ + + +Creation +Normalized objectDesc/@ana +Adding Pleiades IDs to origin/placenames + + adding period attribute to date element, with Periodo value. + + +
+ + + + + + + + +
+

אמא

+
+
+

mother (or Imma)

+
+ + +
+

The ossuary has an inner ledge on three sides and a flat, sliding lid with a small fingergrip on its outer edge. The word אמא could be a name or the word meaning "mother." Several examples of a name occuring along with this word support the second interpretation.

+
+
+ + + +80 + + + +plate 4, fig. 21 + + +
+
+
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/safed/safed.csv b/backend/corpora/peaceportal/tests/data/safed/safed.csv new file mode 100644 index 000000000..769adb10d --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/safed/safed.csv @@ -0,0 +1,10 @@ +MISPAR;ADDITION;Map;Px;Who helped;First Name;First Name (hebrew);Middle Name;Middle Name (hebrew);Title;Parent / First;Parent / First (hebrew);Parent Middle Name;Parent Middle Name (hebrew);Family Name;Family Name (hebrew);City;City (hebrew);CHELKA;AREA;NOTES;YOM;CHODESH;SHANA;DAY;MONTH;YEAR; +1;;;;;Avraham;אברהם;;;;;;;;Harbon;חרבון;;;א;A;החכם הרופא;ה;;רכו;;;1466; +1;A;;;;Lechichl;לחיחל;;;;;;;;;;;;א;A;;י;שבט;תשי;28;1;1950; +2;;;;;Pinchas;פנחס;;;;Zvi;צבי;;;;;;;א;A;;כט;טבת;תשכב;05;01;1962; +3;;;;;Melech;מלך;;;;Meir;מאיר; Yisrael; ישראל;;;;;א;A;;ט;טבת;תשכב;16;12;1961; +4;;;;;Rachel;רחל;;;;;;;;Negrenik Bahagen;נגריניק בהגן;;;א;A;;טו;טבת;תשכא;03;01;1961; +5;;m;px;;Eliyahu;אליהו;Manig;מאניג;;Zev;זאב;;;Katz;כץ;;;א;A;age 68;א;ניסן;תשכ;29;03;1960; +5;A;m;p-x;;Yitzhak;יצחק;;;;Moshe;משה ;David;דוד;Rozenthal HaCohen;רוזנטל הכהן;;;א;A;age 73;כח;חשון;תשכא;;;1960; +6;;m;px;;Dvasi;דוואסי;;;;Zvi;צבי;;;Masiroka ?;מסירוקא ?;Siruka;סירוקא;א;A;above Mik-Ari Path;א;אייר;תשכ;28;04;1960; +7;;m;px;;Sima;סימה;;;;Avraham;אברהם;;;Reuven;רובין;Batshan;באטשן;א;A;above Mik-Ari Path;כג;שבט;תשכ;;;1960; \ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-11.xml b/backend/corpora/peaceportal/tests/data/tol/tol-11.xml new file mode 100644 index 000000000..9259da682 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/tol/tol-11.xml @@ -0,0 +1,214 @@ + + + + + + + + epidat, tol-11 + + edited by + Elíshabá Mata + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+ Edmund-Körner-Platz 2 + D-45127 Essen +
+
+
+ tol-11 + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11 + + + Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11 +

+
+
+
+ + + + + born digital + + + epidat + tol-11 + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11 + + + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11-t + + + + + + + +

+ stone (material not specified) + sepulchral monument +

+
+
+ + + +
+
+ + + + + Spain + + Toledo + + Jewish Cemetery + 39.871036 -4.022968 + + + + + +
+
+
+ + + + + + Israel + Moshe + Israel + + Hypothetical date + Other transcription: YIŚRA#[2019]EL BEN MOŠEH BEN YIŚRA#[2019]EL #[000D]#[000A]Young murdered person + + + + + Hebrew + + + + + +
+ + +
+ Edition +
+ + + ‎‏מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר‏‎ + + ‎‏לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר‏‎ + + ‎‏עַל מוֹת לַבֵּן בָּחוּר וָטוֹב‏‎ + + ‎‏כְּגַן רָטוֹב‏‎ + + ‎‏קָם עָלָיו כַּזְּדוֹנִים‏‎ + + ‎‏גּוֹי עַז פָּנִים‏‎ + + ‎‏הִשְׁקֵהוּ מֵי רוֹשׁ‏‎ + + ‎‏בָּא עַד הָרֹאשׁ‏‎ + + ‎‏וַיַּכֵּהוּ בִצְדִיָּה‏‎ + + ‎‏מַכָּה טְרִיָּה‏‎ + + ‎‏לָאָרֶץ חַיְתוֹ דִכָּה‏‎ + + ‎‏וַיִּצֶק דַּם הַמַּכָּה‏‎ + + ‎‏נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל‏‎ + + ‎‏נַעַר יִשְׂרָאֵל‏‎ + + ‎‏הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה‏‎ + + ‎‏בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה‏‎ + + ‎‏הַצְּבִי יִשְׂרָאֵל חָלָל‏‎ + + ‎‏בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל‏‎ + + ‎‏אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]‏‎ + + ‎‏וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ‏‎ + + ‎‏עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ‏‎ + + ‎‏בְּפֶסַח וַיָּמֶת אוֹתוֹ‏‎ + + ‎‏תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ‏‎ + + ‎‏וַיֵּאָסֵף אֶל עַמּוֹ‏‎ + + ‎‏תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים‏‎ + + ‎‏צְרוּרָה בִּצְרוֹר הַחַיִּים‏‎ + + ‎‏יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל‏‎ + + ‎‏אֱלֹהֵי יִשְׂרָאֵל‏‎ + + +
+
+
+ Übersetzung +
+ + + + +
+
+
+ Prosopographie +
+
+ Bibliographie + + + 61-62 + 62 + + + + + 174-175 + 17 + + + + + 83-84 + 41 + + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-27.xml b/backend/corpora/peaceportal/tests/data/tol/tol-27.xml new file mode 100644 index 000000000..0c710ec92 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/tol/tol-27.xml @@ -0,0 +1,189 @@ + + + + + + + + epidat, tol-27 + + edited by + Elíshabá Mata + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+ Edmund-Körner-Platz 2 + D-45127 Essen +
+
+
+ tol-27 + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27 + + + Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27 +

+
+
+
+ + + + + born digital + + + epidat + tol-27 + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27 + + + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-27-t + + + + + + + +

+ stone (material not specified) + sepulchral monument +

+
+
+ + + +
+
+ + + + + Spain + + Toledo + + Jewish Cemetery + 39.871036 -4.022968 + + + + + +
+
+
+ + + + + + Moshe + Yizhaq ben Elfats + + + Other transcription of the name: MOŠEH BEN YIṢḤAQ BEN #[2019]ELFAṬS#[000D]#[000A]Young man + + + + + Hebrew + + + + + +
+ + +
+ Edition +
+ + + ‎‏בְּקֶבֶר זֶה נִטְמָן‏‎ + + ‎‏בָּחוּר נֶטַע נַעֲמָן‏‎ + + ‎‏לְדֵרֶךְ מוּסָר סָר‏‎ + + ‎‏וּמִדֵּרֶךְ יָשָׁר לֹא סָר‏‎ + + ‎‏ז״ךְ שָׁנִים חָיָה‏‎ + + ‎‏וְזַךְ לֵבָב הָיָה‏‎ + + ‎‏וּבז״ךְ בְּמַרְחֶשׁוָן פָּנָה‏‎ + + ‎‏וְעָזַב אֶת אָבִיו בֶּן שִׁבְעִים שָׁנָה‏‎ + + ‎‏נֶאֱנַח מַשְׁ#[05בּצּ]מִים‏‎ + + ‎‏כִּי אָרְכוּ לוֹ אַחֲרָיו הַיָּמִים‏‎ + + ‎‏וּבִשְׁנַת חֲמֵשֶׁת אֲלָפִים וְתִשִׁעִים וְשָׁלֹש‏‎ + + ‎‏נִלְכַּד בְּפַח וּפַחַת‏‎ + + ‎‏וּמִבֵּין רֵעָיו נֶאֱסַף וְנִכְתַּשׁ בְּתֹךְ מַכְתֵּשׁ‏‎ + + ‎‏הוּא מֹשֶה נ״ע בר׳ יִצְחָק נ״ע בֶּן אֵלְפַטְשׂ‏‎ + + ‎‏אֱלֹהָיו יְרַחֵם עָלָיו‏‎ + + ‎‏וְיָנוּחַ וְיַעֲמוֹד לְקֵץ הַיָּמִין לְגוֹרָלוֹ‏‎ + + +
+
+
+ Übersetzung +
+ + + + +
+
+
+ Prosopographie +
+
+ Bibliographie + + + 41-42 + 39 + + + + + 182-183 + 25 + + + + + 94-95 + 49 + + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tests/data/tol/tol-36.xml b/backend/corpora/peaceportal/tests/data/tol/tol-36.xml new file mode 100644 index 000000000..b8d7a8be5 --- /dev/null +++ b/backend/corpora/peaceportal/tests/data/tol/tol-36.xml @@ -0,0 +1,197 @@ + + + + + + + + epidat, tol-36 + + edited by + Elíshabá Mata + + + + + + + + + + Salomon Ludwig Steinheim-Institut +
+ Edmund-Körner-Platz 2 + D-45127 Essen +
+
+
+ tol-36 + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36 + + + Distributed under a Creative Commons licence Attribution-BY 4.0 +

+ All reuse or distribution of this work must contain somewhere a link back to the URL + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36 +

+
+
+
+ + + + + born digital + + + epidat + tol-36 + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36 + + + + http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-36-t + + + + + + + +

+ stone (material not specified) + sepulchral monument +

+
+
+ + + +
+
+ + + + + Spain + + Toledo + + Jewish Cemetery + 39.871036 -4.022968 + + + + + +
+
+
+ + + + + + Yaakov + Yizhaq + + + Other transcription of the name: YA#[2018]ĂQŌḆ BEN YIṢḤAQ BEN AL-SARAQOSTAN#[000D]#[000A]Occupation: Physician and counselor#[000D]#[000A]Death in the Black Death + + + + + Hebrew + + + + + +
+ + +
+ Edition +
+ + + ‎‏בְּקֶבֶר זֶה נִקְבַּר‏‎ + + ‎‏אִישׁ שֵׂכֶל וּנְבוֹן דָּבָר‏‎ + + ‎‏נְקִי כַפָיִם וּבַר‏‎ + + ‎‏מָלֵא הוֹד וְחָכְמָה‏‎ + + ‎‏וְדַעַת וּמְזִמָּה‏‎ + + ‎‏יוֹעֵץ וַחֲכָם חֲרָשִׁים‏‎ + + ‎‏טוֹב עִם ה׳ וְעִם אֲנָשִׁים‏‎ + + ‎‏רוֹפֵא מַחֲלִים הַנְפָשִׁים‏‎ + + ‎‏וּמִזְּרַע קְדוֹשִׁים‏‎ + + ‎‏שְׁמוֹ ר׳ יַעֲקֹב בר׳ יִצְחָק נ׳ע ן׳ אַלְסָארַקֹסְטַן‏‎ + + ‎‏נָתַן כָּל־יָמָיו אֶל לִבּוֹ‏‎ + + ‎‏לֶאֱהוֹב אֶת ה׳ וּלְדָבְקָה בוֹ‏‎ + + ‎‏וְכַאֲשֶׁר בָּאָרֶץ פָּרַץ פֶּרֶץ‏‎ + + ‎‏בִּקְדוֹשִׂים אֲשֶׁר בָּאָרֶץ‏‎ + + ‎‏וַתִּפְרֹץ בָּם הַמַּגֵּפָה‏‎ + + ‎‏נֶאֱסַף אֶל עַמּוֹ‏‎ + + ‎‏וְעָזַב אֶת הָאָרֶץ וְעָלָה לִשְׁכוֹן מְרוֹמוֹ‏‎ + + ‎‏ובי׳׳ב בְּתַמּוּז שְׁנַת מְנוּחָה הָיְתָה יַד אֱלֹהָיו עָלָיו‏‎ + + ‎‏לְשׁוֹבֵב יַעֲקֹב אֵלָיו‏‎ + + ‎‏לָתֵּת לוֹ יָד בֵּין חֲסִידָיו וּלַעֲבוֹד בְּרֹאשָׁם‏‎ + + ‎‏וַיֹּאמֶר ה׳ אֶל יַעֲקֹב קוּם עֲלֵה בֵית אֵל וְשֶׁב שָׁם‏‎ + + ‎‏וְיַעֲקֹב הָלַךְ לְדַרְכּוֹ לִרְאוֹת פְּנֵי דָר נְגָהִים‏‎ + + ‎‏וַיִּפְגְּעוּ בוֹ מַלְאֲכֵי אֱלֹהִים‏‎ + + +
+
+
+ Übersetzung +
+ + + + +
+
+
+ Prosopographie +
+
+ Bibliographie + + + 65-66 + 70 + + + + + 209-210;C/M (82),135-138 + 58 + + +
+ +
+
\ No newline at end of file diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py new file mode 100644 index 000000000..29b80a14e --- /dev/null +++ b/backend/corpora/peaceportal/tol.py @@ -0,0 +1,390 @@ +import re +from copy import copy + +from django.conf import settings + +from addcorpus.extract import XML, Constant, HTML, Combined +from addcorpus.corpus import Field +from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language + + +class TOL(PeacePortal): + data_directory = settings.PEACEPORTAL_TOL_DATA + es_index = settings.PEACEPORTAL_TOL_ES_INDEX + es_alias = settings.PEACEPORTAL_ALIAS + + def __init__(self): + self.source_database.extractor = Constant( + value='Medieval funerary inscriptions from Toledo' + ) + + self._id.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'msIdentifier', 'idno'], + multiple=False, + toplevel=False, + flatten=True + ) + + self.url.extractor = HTML( + tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], + multiple=False, + toplevel=False, + flatten=True, + attribute_filter={ + 'attribute': 'type', + 'value': 'url' + } + ) + + self.year.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + transform=lambda x: get_year(x), + ) + + self.not_before.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notBefore', + transform=lambda x: get_year(x), + ) + + self.not_after.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origDate', 'date'], + toplevel=False, + attribute='notAfter', + transform=lambda x: get_year(x), + ) + + self.transcription.extractor = XML( + tag=['text', 'body', 'div'], + toplevel=False, + multiple=False, + flatten=True, + 
transform=lambda x: clean_newline_characters(x), + transform_soup_func=extract_transcript + ) + + self.names.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + flatten=True, + multiple=True, + toplevel=False, + ) + + self.sex.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson', 'person'], + attribute='sex', + multiple=True, + toplevel=False, + transform=lambda x: convert_sex(x) + ) + + self.dates_of_death.extractor = XML( + tag=['teiHeader', 'profileDesc', + 'particDesc', 'listPerson'], + transform_soup_func=extract_death, + attribute='when', + multiple=False, + toplevel=False, + ) + + self.country.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country'], + toplevel=False, + transform_soup_func=extract_country, + transform=lambda x: clean_country(x), + flatten=True, + ) + + self.region.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'country', 'region'], + toplevel=False, + flatten=True + ) + + self.settlement.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement'], + toplevel=False, + flatten=True, + transform_soup_func=extract_settlement, + ) + + self.location_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName'], + toplevel=False, + flatten=True, + transform_soup_func=extract_location_details, + ) + + self.material.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + toplevel=False, + flatten=True, + transform=lambda x: categorize_material(x) + ) + + self.material_details.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p', 'material'], + 
toplevel=False, + flatten=True + ) + + self.language.extractor = XML( + tag=['teiHeader', 'profileDesc', 'langUsage', 'language'], + toplevel=False, + multiple=True, + transform=lambda x: get_language(x) + ) + + self.comments.extractor = Combined( + XML( + tag=['text', 'body'], + toplevel=False, + transform_soup_func=extract_commentary, + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'condition'], + toplevel=False, + flatten=True, + transform=lambda x: 'CONDITION:\n{}\n'.format(x) if x else x + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', + 'objectDesc', 'supportDesc', 'support', 'p'], + toplevel=False, + transform_soup_func=extract_support_comments, + ), + transform=lambda x: join_commentaries(x) + ) + + self.images.extractor = XML( + tag=['facsimile', 'graphic'], + multiple=True, + attribute='url', + toplevel=False + ) + + self.coordinates.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'history', 'origin', 'origPlace', 'settlement', 'geogName', 'geo'], + toplevel=False, + multiple=False, + flatten=True + ) + + self.iconography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', + 'msDesc', 'physDesc', 'decoDesc', 'decoNote'], + toplevel=False, + multiple=False + ) + + self.bibliography.extractor = XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', + 'msIdentifier', 'publications', 'publication'], + toplevel=False, + multiple=True + ) + + self.transcription_hebrew.extractor = Combined( + self.transcription.extractor, + Constant('he'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_english.extractor = Combined( + self.transcription.extractor, + Constant('en'), + transform=lambda x: get_text_in_language(x) + ) + + self.transcription_dutch.extractor = Combined( + self.transcription.extractor, + Constant('nl'), + transform=lambda x: get_text_in_language(x) + ) + + +def convert_sex(values): + if not 
values: + return ['Unknown'] + result = [] + for value in values: + if value == '1': + result.append('M') + elif value == '2': + result.append('F') + else: + result.append('Unknown') + return result + + +def clean_country(text): + if not text: + return 'Unknown' + if text.lower().strip() == 'tobedone': + return 'Unknown' + return text + + +def get_year(text): + if not text or text == '--': + return + matches = re.search('[1-2]{0,1}[0-9]{3}', text) + if matches: + return matches[0] + + +def get_language(values): + if not values: + return ['Unknown'] + if 'German in Hebrew letters' in values: + return ['German (transliterated)', 'Hebrew'] + return values + + +def extract_transcript(soup): + ''' + Helper function to ensure correct extraction of the transcripts. + Note that there are multiple formats in which these are stored, + but the text that we need is always in the `` children of + `['text', 'body', 'div']` (where div has `type=edition`, this is always the first one). + ''' + if not soup: + return + return soup.find_all('ab') + + +def extract_translation(soup): + ''' + Helper function to extract translation from the tag + ''' + if not soup: + return + translation = soup.find('div', {'type': 'translation'}) + if translation: + return translation.find_all('ab') + else: + return + + +def extract_commentary(soup): + ''' + Helper function to extract all commentaries from the tag. + A single element will be returned with the commentaries found as text content. 
+ ''' + if not soup: return + found = [] + commentaries = soup.find_all('div', {'type': 'commentary'}) + + for commentary in commentaries: + if commentary['subtype'] in ['Zitate', 'Zeilenkommentar', 'Prosopographie', 'Abkürzung', 'Endkommentar', 'Stilmittel']: + p = commentary.find('p') + if p: + text = p.get_text() + if text: + text = clean_commentary(text) + found.append('{}:\n{}\n'.format(commentary['subtype'].strip().upper(), text)) + + if len(found) > 1: + cloned_soup = copy(soup) + cloned_soup.clear() + cloned_soup.string = "\n".join(found) + return cloned_soup + else: + return None + +def extract_support_comments(soup): + if not soup: return + cloned_soup = copy(soup) + cloned_soup.clear() + + commentaries = add_support_comment(soup, '', 'dim', 'DIMENSIONS') + commentaries = add_support_comment(soup, commentaries, 'objectType', 'OBJECTTYPE') + + # add any additional text from the

element, + # i.e. if there is text it is the very last node + contents = soup.contents + text = contents[len(contents) - 1].strip() + if text: + text = clean_commentary(text) + commentaries = '{}{}:\n{}\n'.format(commentaries, 'SUPPORT', text) + + cloned_soup.string = commentaries + return cloned_soup + + +def add_support_comment(soup, existing_commentaries, elem_name, commentary_name): + elem = soup.find(elem_name) + if elem: + text = elem.get_text() + if text: + text = clean_commentary(text) + return '{}{}:\n{}\n\n'.format(existing_commentaries, commentary_name, text) + return existing_commentaries + + +def extract_death(soup): + ''' + Helper function to extract date of death from multiple person tags. + ''' + if not soup: + return + return soup.find_all('death') + + +def extract_country(soup): + ''' + Helper function to extract country. + This is needed because the output of `flatten` would otherwise include the text contents + of the ``. + ''' + return clone_soup_extract_child(soup, 'region') + + +def extract_settlement(soup): + return clone_soup_extract_child(soup, 'geogName') + + +def extract_location_details(soup): + return clone_soup_extract_child(soup, 'geo') + + +def clone_soup_extract_child(soup, to_extract): + ''' + Helper function to clone the soup and extract a child element. + This is useful when the output of `flatten` would otherwise include the text contents + of the child. + ''' + if not soup: + return + cloned_soup = copy(soup) + child = cloned_soup.find(to_extract) + if child: + child.extract() + return cloned_soup + + # TODO: add field + + # TODO: move to a comments field: + + # excluded (for now): + # title + # organization (incl details, e.g. address) + # licence + # taxonomy (i.e. 
things like foto1, foto2 -> no working links to actual images) + From b8181812dca823369c3aa1462a43e76ef0bec085 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Oct 2023 17:24:10 +0200 Subject: [PATCH 30/98] add language specific analyzers --- backend/addcorpus/es_mappings.py | 12 +++--- backend/addcorpus/es_settings.py | 66 +++++++++++++++++++------------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index a2f58418f..3c5aedc12 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -1,4 +1,4 @@ -def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False): +def main_content_mapping(token_counts=True, stopword_analyzer=None, stemming_analyzer=None, updated_highlighting=False): ''' Mapping for the main content field. Options: @@ -17,23 +17,23 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an 'term_vector': 'with_positions_offsets' # include char positions on _source (in addition to the multifields) for highlighting }) - if any([token_counts, stopword_analysis, stemming_analysis]): + if any([token_counts, stopword_analyzer, stemming_analyzer]): multifields = {} if token_counts: multifields['length'] = { "type": "token_count", "analyzer": "standard" } - if stopword_analysis: + if stopword_analyzer: multifields['clean'] = { "type": "text", - "analyzer": "clean", + "analyzer": stopword_analyzer, "term_vector": "with_positions_offsets" # include character positions for highlighting } - if stemming_analysis: + if stemming_analyzer: multifields['stemmed'] = { "type": "text", - "analyzer": "stemmed", + "analyzer": stemming_analyzer, "term_vector": "with_positions_offsets", } mapping['fields'] = multifields diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 3bf3c25f5..0f90c22f6 100644 --- 
a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -29,30 +29,28 @@ def get_nltk_stopwords(language_code): raise NotImplementedError('language {} has no nltk stopwords list'.format(language)) -def es_settings(language=None, stopword_analyzer=False, stemming_analyzer=False): +def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): ''' Make elasticsearch settings json for a corpus index. Options: - - `language`: string with the language code. See addcorpus.constants for options, and which languages support stopwords/stemming - - `stopword_analyzer`: define an analyser that removes stopwords. - - `stemming_analyzer`: define an analyser that removes stopwords and performs stemming. + - `language`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming + - `stopword_analyzer`: define an analyzer that removes stopwords. + - `stemming_analyzer`: define an analyzer that removes stopwords and performs stemming. 
''' settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 1}} - - if stopword_analyzer or stemming_analyzer: - settings["analysis"] = { - "analyzer": {}, - "char_filter":{ "number_filter": number_filter() }, - 'filter': { - "stopwords": make_stopword_filter(language) - } - } - - if stopword_analyzer: - settings["analysis"]['analyzer']['clean'] = make_stopword_analyzer() - - if stemming_analyzer: - settings['analysis']['filter']['stemmer'] = make_stemmer_filter(language) - settings["analysis"]['analyzer']['stemmed'] = make_stemmed_analyzer() + stopword_filter_name = 'stopwords' + clean_analyzer_name = 'clean' + stemmer_filter_name = 'stemmer' + stemmed_analyzer_name = 'stemmed' + + for language in languages: + add_language_string = lambda name: '{}_{}'.format(language, name) if len(languages) > 0 else name + if stopword_analyzer or stemming_analyzer: + set_stopword_filter(language, add_language_string(stopword_filter_name)) + + if stopword_analyzer: + set_clean_analyzer(language, add_language_string(stopword_filter_name), add_language_string(clean_analyzer_name)) + if stemming_analyzer: + set_stemmed_analyzer(language, add_language_string(stemmer_filter_name), add_language_string(stemmed_analyzer_name)) return settings @@ -63,18 +61,18 @@ def number_filter(): "replacement":"" } -def make_stopword_filter(language): +def make_stopword_filter(language, stopword_filter_name): stopwords = get_nltk_stopwords(language) return { "type": "stop", - "stopwords": stopwords + stopword_filter_name: stopwords } -def make_stopword_analyzer(): +def make_clean_analyzer(stopword_filter_name): return { "tokenizer": "standard", "char_filter": ["number_filter"], - "filter": ["lowercase", "stopwords"] + "filter": ["lowercase", stopword_filter_name] } def make_stemmer_filter(language): @@ -84,11 +82,11 @@ def make_stemmer_filter(language): "language": stemmer_language } -def make_stemmed_analyzer(): +def make_stemmed_analyzer(stemmer_filter_name): return { "tokenizer": 
"standard", "char_filter": ["number_filter"], - "filter": ["lowercase", "stopwords", "stemmer"] + "filter": ["lowercase", "stopwords", stemmer_filter_name] } def get_stopwords_from_settings(es_settings): @@ -99,3 +97,19 @@ def get_stopwords_from_settings(es_settings): stopwords = None return stopwords + +def set_stemmed_analyzer(settings, language, stemmer_filter_name, stemmed_analyzer_name): + settings['analysis']['filter'][stemmer_filter_name] = make_stemmer_filter(language) + settings["analysis"]['analyzer'][stemmed_analyzer_name] = make_stemmed_analyzer(stemmer_filter_name) + +def set_stopword_filter(settings, language, stopword_filter_name): + settings["analysis"] = { + "analyzer": {}, + "char_filter":{ "number_filter": number_filter() }, + 'filter': { + "stopwords": make_stopword_filter(language, stopword_filter_name) + } + } + +def set_clean_analyzer(settings, language, stopword_filter_name, clean_analyzer_name): + settings["analysis"]['analyzer'][clean_analyzer_name] = make_clean_analyzer(language, stopword_filter_name) \ No newline at end of file From 9470689c104fee685b7854541088df97b01d97f6 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 25 Oct 2023 17:28:51 +0200 Subject: [PATCH 31/98] updates for tests --- backend/corpora/peaceportal/FIJI/fiji.py | 9 +-- backend/corpora/peaceportal/conftest.py | 18 +++++ backend/corpora/peaceportal/peaceportal.py | 62 ++++++++--------- .../corpora/peaceportal/tests/test_import.py | 69 +++++++++++++++++++ 4 files changed, 123 insertions(+), 35 deletions(-) create mode 100644 backend/corpora/peaceportal/conftest.py create mode 100644 backend/corpora/peaceportal/tests/test_import.py diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index e4f1235e7..bbe807bd6 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -2,7 +2,8 @@ import os import os.path as op import logging -from flask import current_app + +from django.conf 
import settings from addcorpus.extract import XML, Constant, Combined from addcorpus.corpus import Field @@ -16,9 +17,9 @@ class FIJI(PeacePortal): than in the earlier version (i.e. the one under corpora/jewishinscriptions). ''' - data_directory = current_app.config['PEACEPORTAL_FIJI_DATA'] - es_index = current_app.config['PEACEPORTAL_FIJI_ES_INDEX'] - es_alias = current_app.config['PEACEPORTAL_ALIAS'] + data_directory = settings.PEACEPORTAL_FIJI_DATA + es_index = settings.PEACEPORTAL_FIJI_ES_INDEX + es_alias = settings.PEACEPORTAL_ALIAS filename_pattern = re.compile('\d+') def sources(self, start, end): diff --git a/backend/corpora/peaceportal/conftest.py b/backend/corpora/peaceportal/conftest.py new file mode 100644 index 000000000..6bcd8732d --- /dev/null +++ b/backend/corpora/peaceportal/conftest.py @@ -0,0 +1,18 @@ +import pytest +import os + +here = os.path.abspath(os.path.dirname(__file__)) + +@pytest.fixture() +def peace_corpus_settings(settings): + settings.CORPORA = { + 'peaceportal-epidat': os.path.join(here, 'epidat.py'), + 'peaceportal-fiji': os.path.join(here, 'FIJI', 'fiji.py'), + 'peaceportal-iis': os.path.join(here, 'iis.py'), + 'peaceportal-tol': os.path.join(here, 'tol.py'), + } + + settings.PEACEPORTAL_EPIDAT_DATA= os.path.join(here, 'tests', 'data', 'epidat') + settings.PEACEPORTAL_FIJI_DATA= os.path.join(here, 'tests', 'data', 'fiji') + settings.PEACEPORTAL_IIS_DATA = os.path.join(here, 'tests', 'data', 'iis') + settings.PEACEPORTAL_TOL_DATA = os.path.join(here, 'tests', 'data', 'tol') \ No newline at end of file diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 3f02a842a..c3b3d7933 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -7,7 +7,7 @@ from django.conf import settings -from addcorpus.corpus import XMLCorpus, Field +from addcorpus.corpus import XMLCorpus, FieldDefinition from addcorpus.es_mappings import int_mapping, 
keyword_mapping, main_content_mapping, text_mapping from addcorpus.es_settings import es_settings from addcorpus.extract import XML, Constant @@ -32,7 +32,7 @@ class PeacePortal(XMLCorpus): min_year = -530 max_date = datetime(year=1950, month=12, day=31) visualize = [] - es_index = current_app.config['PEACEPORTAL_ALIAS'] + es_index = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal') scan_image_type = 'image/png' # fields below are required by code but not actually used min_date = datetime(year=746, month=1, day=1) @@ -76,7 +76,7 @@ def request_media(self, document): images = [] return { 'media': images } - source_database = Field( + source_database = FieldDefinition( name='source_database', display_name='Source database', description='The database a record originates from.', @@ -88,7 +88,7 @@ def request_media(self, document): csv_core=True ) - _id = Field( + _id = FieldDefinition( name='id', display_name='ID', description='ID of the inscription entry.', @@ -97,7 +97,7 @@ def request_media(self, document): search_field_core=True ) - url = Field( + url = FieldDefinition( name='url', display_name='URL', description='URL of the inscription entry.', @@ -105,7 +105,7 @@ def request_media(self, document): search_field_core=True ) - year = Field( + year = FieldDefinition( name='year', display_name='Year', description='Year of origin of the inscription.', @@ -122,7 +122,7 @@ def request_media(self, document): results_overview=True ) - not_before = Field( + not_before = FieldDefinition( name='not_before', display_name='Not before', description='Inscription is dated not earlier than this year.', @@ -130,7 +130,7 @@ def request_media(self, document): hidden=True ) - not_after = Field( + not_after = FieldDefinition( name='not_after', display_name='Not after', description='Inscription is dated not later than this year.', @@ -138,7 +138,7 @@ def request_media(self, document): hidden=True ) - transcription = Field( + transcription = FieldDefinition( name='transcription', 
es_mapping=main_content_mapping(), display_name='Transcription', @@ -148,43 +148,43 @@ def request_media(self, document): display_type='text_content' ) - transcription_german = Field( + transcription_german = FieldDefinition( name='transcription_german', es_mapping={'type': 'text', 'analyzer': 'german' }, hidden=True ) - transcription_english = Field( + transcription_english = FieldDefinition( name='transcription_english', es_mapping={'type': 'text', 'analyzer': 'english'}, hidden=True ) - transcription_hebrew = Field( + transcription_hebrew = FieldDefinition( name='transcription_hebrew', es_mapping={'type': 'text'}, hidden=True ) - transcription_latin = Field( + transcription_latin = FieldDefinition( name='transcription_latin', es_mapping={'type': 'text'}, hidden=True ) - transcription_greek = Field( + transcription_greek = FieldDefinition( name='transcription_greek', es_mapping={'type': 'text', 'analyzer': 'greek'}, hidden=True ) - transcription_dutch = Field( + transcription_dutch = FieldDefinition( name='transcription_dutch', es_mapping={'type': 'text', 'analyzer': 'dutch'}, hidden=True ) - age = Field( + age = FieldDefinition( name='age', display_name='Age', description='Age of the buried person(s)', @@ -200,7 +200,7 @@ def request_media(self, document): ) # A string with all the names occuring in the source - names = Field( + names = FieldDefinition( name='names', es_mapping=text_mapping(), display_name='Names', @@ -209,7 +209,7 @@ def request_media(self, document): ) # Should be an array with potentially multiple values from these: 'M', 'F', or None. - sex = Field( + sex = FieldDefinition( name='sex', display_name='Sex', description='Gender(s) of the buried person(s). 
None if the sex is unknown.', @@ -221,7 +221,7 @@ def request_media(self, document): csv_core=True ) - country = Field( + country = FieldDefinition( name='country', display_name='Country', description='Country where the inscription was found.', @@ -234,7 +234,7 @@ def request_media(self, document): results_overview=True ) - settlement = Field( + settlement = FieldDefinition( name='settlement', display_name='Settlement', description='The settlement where the inscription was found.', @@ -246,7 +246,7 @@ def request_media(self, document): visualization_type='term_frequency' ) - region = Field( + region = FieldDefinition( name='region', display_name='Region', description='The region where the inscription was found.', @@ -258,14 +258,14 @@ def request_media(self, document): visualization_type='term_frequency' ) - location_details = Field( + location_details = FieldDefinition( name='location_details', display_name='Location details', description='Details about the location of the inscription', es_mapping=text_mapping() ) - material = Field( + material = FieldDefinition( name='material', display_name='Material', description='Type of material the inscription is written on.', @@ -277,7 +277,7 @@ def request_media(self, document): visualization_type='term_frequency' ) - material_details = Field( + material_details = FieldDefinition( name='material_details', display_name='Material details', description='Details about the material the inscription is written on.', @@ -285,7 +285,7 @@ def request_media(self, document): search_field_core=True ) - language = Field( + language = FieldDefinition( name='language', display_name='Language', description='Language written on the inscription.', @@ -298,14 +298,14 @@ def request_media(self, document): visualization_type='term_frequency' ) - bibliography = Field( + bibliography = FieldDefinition( name='bibliography', es_mapping=keyword_mapping(), display_name='Bibliography', description='Reference(s) to who edited and published this 
funerary inscription.' ) - comments = Field( + comments = FieldDefinition( name='comments', es_mapping=text_mapping(), display_name='Commentary', @@ -313,7 +313,7 @@ def request_media(self, document): search_field_core=True, ) - images = Field( + images = FieldDefinition( name='images', es_mapping=keyword_mapping(), display_name='Images', @@ -321,14 +321,14 @@ def request_media(self, document): hidden=True ) - coordinates = Field( + coordinates = FieldDefinition( name='coordinates', es_mapping=keyword_mapping(), display_name='Coordinates', description='GIS coordinates for the inscription.' ) - iconography = Field( + iconography = FieldDefinition( name='iconography', es_mapping=text_mapping(), display_name='Iconography', @@ -336,7 +336,7 @@ def request_media(self, document): search_field_core=True ) - dates_of_death = Field( + dates_of_death = FieldDefinition( name='dates_of_death', es_mapping=keyword_mapping(), display_name='Date of death', diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py new file mode 100644 index 000000000..a9a57a6f1 --- /dev/null +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -0,0 +1,69 @@ +import os +import warnings +import pytest +from datetime import datetime + +from addcorpus.load_corpus import load_corpus_definition + +CORPUS_TEST_DATA = [ + { + 'name': 'peaceportal-epidat', + 'docs': [], + 'n_documents': 2 + }, + { + 'name': 'peaceportal-fiji', + 'docs': [], + 'n_documents': 3 + }, + { + 'name': 'peaceportal-iis', + 'docs': [], + 'n_documents': 3 + }, + { + 'name': 'peaceportal-tol', + 'docs': [], + 'n_documents': 3 + } +] + +def corpus_test_name(corpus_spec): + return corpus_spec['name'] + +@pytest.mark.parametrize("corpus_object", CORPUS_TEST_DATA, ids=corpus_test_name) +def test_imports(peace_corpus_settings, corpus_object): + corpus = load_corpus_definition(corpus_object.get('name')) + assert len(os.listdir(os.path.abspath(corpus.data_directory))) != 0 + + start 
= corpus_object['start'] if 'start' in corpus_object else corpus.min_date + end = corpus_object['end'] if 'end' in corpus_object else corpus.max_date + + tested_fields = set() + resulted_fields = set() + + docs = get_documents(corpus, start, end) + for target in corpus_object.get('docs'): + doc = next(docs) + for key in target: + tested_fields.add(key) + assert key in doc + assert doc[key] == target[key] + + for key in doc: + resulted_fields.add(key) + + for key in resulted_fields: + if not key in tested_fields: + message = 'Key "{}" is included the result for {} but has no specification'.format(key, corpus_object.get('name')) + warnings.warn(message) + + docs = get_documents(corpus, start, end) + assert len(list(docs)) == corpus_object.get('n_documents') + +def get_documents(corpus, start, end): + sources = corpus.sources( + start=start, + end=end + ) + return corpus.documents(sources) From 3b73cbcbd75b84c12723c03715dca6e65421962b Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 26 Oct 2023 10:15:43 +0200 Subject: [PATCH 32/98] add langdetect dependency --- backend/requirements.in | 1 + backend/requirements.txt | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/backend/requirements.in b/backend/requirements.in index ab5812765..884fc7c85 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -5,6 +5,7 @@ django-livereload-server # django-revproxy, see https://github.com/UUDigitalHumanitieslab/cookiecutter-webapp-deluxe/issues/35 git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a djangosaml2 +langdetect psycopg2 pytest pytest-django diff --git a/backend/requirements.txt b/backend/requirements.txt index 943b2d568..80293e02e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -12,6 +12,8 @@ async-timeout==4.0.2 # via redis attrs==22.2.0 # via pytest +backports-zoneinfo==0.2.1 + # via django beautifulsoup4==4.11.1 # via # -r requirements.in @@ -45,22 
+47,27 @@ click-repl==0.2.0 # via celery cryptography==39.0.1 # via + # pyjwt # pyopenssl # pysaml2 defusedxml==0.7.1 # via # djangosaml2 # pysaml2 + # python3-openid dj-rest-auth[with_social]==2.2.7 # via -r requirements.in django==4.1.10 # via # -r requirements.in # dj-rest-auth + # django-allauth # django-livereload-server # django-revproxy # djangorestframework # djangosaml2 +django-allauth==0.52.0 + # via dj-rest-auth django-livereload-server==0.4 # via -r requirements.in django-revproxy @ git+https://github.com/jazzband/django-revproxy.git@1defbb2dad5c0632391d54bcd3dbdaeabf46266a @@ -79,6 +86,8 @@ elementpath==4.1.1 # via xmlschema et-xmlfile==1.1.0 # via openpyxl +exceptiongroup==1.1.3 + # via pytest execnet==1.9.0 # via pytest-xdist fst-pso==1.8.1 @@ -89,6 +98,8 @@ gensim==4.3.0 # via -r requirements.in idna==3.4 # via requests +importlib-resources==6.1.0 + # via pysaml2 iniconfig==2.0.0 # via pytest joblib==1.2.0 @@ -99,6 +110,8 @@ kombu==5.2.4 # via celery langcodes==3.3.0 # via -r requirements.in +langdetect==1.0.9 + # via -r requirements.in language-data==1.1 # via -r requirements.in lxml==4.9.1 @@ -121,6 +134,8 @@ numpy==1.24.1 # scikit-learn # scipy # simpful +oauthlib==3.2.2 + # via requests-oauthlib openpyxl==3.1.2 # via -r requirements.in packaging==23.0 @@ -139,6 +154,8 @@ pycparser==2.21 # via cffi pyfume==0.2.25 # via fuzzytm +pyjwt[crypto]==2.8.0 + # via django-allauth pyopenssl==23.1.1 # via pysaml2 pypdf2==3.0.1 @@ -160,6 +177,8 @@ python-dateutil==2.8.2 # via # pandas # pysaml2 +python3-openid==3.2.0 + # via django-allauth pytz==2022.7 # via # celery @@ -172,8 +191,12 @@ regex==2022.10.31 # via nltk requests==2.31.0 # via + # django-allauth # pysaml2 + # requests-oauthlib # simpful +requests-oauthlib==1.3.1 + # via django-allauth scikit-learn==1.2.1 # via -r requirements.in scipy==1.10.0 @@ -190,6 +213,7 @@ six==1.16.0 # via # click-repl # django-livereload-server + # langdetect # python-dateutil smart-open==6.3.0 # via gensim @@ -201,12 
+225,16 @@ textdistance==4.5.0 # via -r requirements.in threadpoolctl==3.1.0 # via scikit-learn +tomli==2.0.1 + # via pytest tornado==6.3.3 # via django-livereload-server tqdm==4.64.1 # via # -r requirements.in # nltk +typing-extensions==4.8.0 + # via pypdf2 urllib3==1.26.17 # via # django-revproxy @@ -221,6 +249,8 @@ wcwidth==0.2.6 # via prompt-toolkit xmlschema==2.2.3 # via pysaml2 +zipp==3.17.0 + # via importlib-resources # The following packages are considered to be unsafe in a requirements file: # setuptools From feeadf487eef3daa11894a2bcb67ce41168eda83 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 26 Oct 2023 16:43:42 +0200 Subject: [PATCH 33/98] add language-specific analyzers --- backend/addcorpus/es_mappings.py | 8 ++++-- backend/addcorpus/es_settings.py | 28 ++++++++++++------- .../dutchnewspapers/dutchnewspapers_public.py | 2 +- backend/corpora/ecco/ecco.py | 2 +- .../guardianobserver/guardianobserver.py | 2 +- backend/corpora/parliament/parliament.py | 2 +- .../parliament/utils/field_defaults.py | 2 +- backend/corpora/peaceportal/FIJI/fiji.py | 5 ++-- backend/corpora/peaceportal/conftest.py | 6 ++-- backend/corpora/peaceportal/epidat.py | 4 +-- backend/corpora/peaceportal/iis.py | 5 ++-- backend/corpora/peaceportal/peaceportal.py | 20 ++++++------- backend/corpora/peaceportal/tol.py | 5 ++-- backend/corpora/periodicals/periodicals.py | 2 +- backend/corpora/rechtspraak/rechtspraak.py | 2 +- backend/corpora/times/times.py | 2 +- backend/corpora/troonredes/troonredes.py | 2 +- 17 files changed, 55 insertions(+), 44 deletions(-) diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index 3c5aedc12..b5465c4f1 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -3,8 +3,8 @@ def main_content_mapping(token_counts=True, stopword_analyzer=None, stemming_ana Mapping for the main content field. Options: - `token_counts`: enables aggregations for the total number of words. 
Used for relative term frequencies. - - `stopword_analysis`: enables analysis using stopword removal. Requires setting a `clean` analyser in the `es_settings` of the corpus. - - `stemming_analysis`: enables analysis using stemming. Requires a `stemmed` analyser in the `es_settings` for the corpus. + - `stopword_analyzer`: enables analysis using stopword removal. Can be a string specifying `clean-{language}` analyser in the `es_settings` of the corpus, or True for `clean` + - `stemming_analysis`: enables analysis using stemming. Can be a string specifying a `stemmed-{}` analyser in the `es_settings` for the corpus, or Truem for `stemmed` - 'updated_highlighting': enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'. ''' @@ -25,12 +25,16 @@ def main_content_mapping(token_counts=True, stopword_analyzer=None, stemming_ana "analyzer": "standard" } if stopword_analyzer: + if type(stopword_analyzer)==bool: + stopword_analyzer = 'clean' multifields['clean'] = { "type": "text", "analyzer": stopword_analyzer, "term_vector": "with_positions_offsets" # include character positions for highlighting } if stemming_analyzer: + if type(stemming_analyzer)==bool: + stemming_analyzer = 'stemmed' multifields['stemmed'] = { "type": "text", "analyzer": stemming_analyzer, diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 0f90c22f6..3251719ae 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -45,12 +45,13 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): for language in languages: add_language_string = lambda name: '{}_{}'.format(language, name) if len(languages) > 0 else name if stopword_analyzer or stemming_analyzer: - set_stopword_filter(language, add_language_string(stopword_filter_name)) - + if not set_stopword_filter(settings, language, add_language_string(stopword_filter_name)): + continue # skip 
languages for which we do not have a stopword list + if stopword_analyzer: - set_clean_analyzer(language, add_language_string(stopword_filter_name), add_language_string(clean_analyzer_name)) + set_clean_analyzer(settings, language, add_language_string(stopword_filter_name), add_language_string(clean_analyzer_name)) if stemming_analyzer: - set_stemmed_analyzer(language, add_language_string(stemmer_filter_name), add_language_string(stemmed_analyzer_name)) + set_stemmed_analyzer(settings, language, add_language_string(stemmer_filter_name), add_language_string(stemmed_analyzer_name)) return settings @@ -62,11 +63,14 @@ def number_filter(): } def make_stopword_filter(language, stopword_filter_name): - stopwords = get_nltk_stopwords(language) - return { - "type": "stop", - stopword_filter_name: stopwords - } + try: + stopwords = get_nltk_stopwords(language) + return { + "type": "stop", + stopword_filter_name: stopwords + } + except: + return None def make_clean_analyzer(stopword_filter_name): return { @@ -103,13 +107,17 @@ def set_stemmed_analyzer(settings, language, stemmer_filter_name, stemmed_analyz settings["analysis"]['analyzer'][stemmed_analyzer_name] = make_stemmed_analyzer(stemmer_filter_name) def set_stopword_filter(settings, language, stopword_filter_name): + stopword_filter = make_stopword_filter(language, stopword_filter_name) + if not stopword_filter: + return False settings["analysis"] = { "analyzer": {}, "char_filter":{ "number_filter": number_filter() }, 'filter': { - "stopwords": make_stopword_filter(language, stopword_filter_name) + "stopwords": stopword_filter } } + return True def set_clean_analyzer(settings, language, stopword_filter_name, clean_analyzer_name): settings["analysis"]['analyzer'][clean_analyzer_name] = make_clean_analyzer(language, stopword_filter_name) \ No newline at end of file diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index f326ced2a..167597f03 
100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -40,7 +40,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) tag_toplevel = 'text' tag_entry = 'p' diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index d23ef196b..b56c616c2 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -32,7 +32,7 @@ class Ecco(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) data_directory = settings.ECCO_DATA es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco') diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index b700e82c1..f6b60a348 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -46,7 +46,7 @@ class GuardianObserver(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) tag_toplevel = 'Record' diff --git a/backend/corpora/parliament/parliament.py b/backend/corpora/parliament/parliament.py index 94a557b5d..9d5af6096 100644 --- a/backend/corpora/parliament/parliament.py +++ b/backend/corpora/parliament/parliament.py @@ -38,7 +38,7 @@ class Parliament(CorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], 
stopword_analyzer=True, stemming_analyzer=True) # overwrite below in child class if you need to extract the (converted) transcription diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index cf74a6c10..360b485c0 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ -284,7 +284,7 @@ def speech(): display_name='Speech', description='The transcribed speech', # each index has its own definition of the 'clean' and 'stemmed' analyzer, based on language - es_mapping = main_content_mapping(token_counts=True, stopword_analysis=True, stemming_analysis=True, updated_highlighting=True), + es_mapping = main_content_mapping(token_counts=True, stopword_analyzer=True, stemming_analyzer=True, updated_highlighting=True), results_overview=True, search_field_core=True, display_type='text_content', diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index bbe807bd6..83ef4b5c1 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -6,11 +6,10 @@ from django.conf import settings from addcorpus.extract import XML, Constant, Combined -from addcorpus.corpus import Field from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, join_commentaries, get_text_in_language -class FIJI(PeacePortal): +class PeaceportalFIJI(PeacePortal): ''' This is a fresh version of Ortal-Paz Saar's 'Funerary Inscriptions of Jews from Italy' corpus, updated to align with the PEACE portal index. 
This mostly implies that there are less fields @@ -18,7 +17,7 @@ class FIJI(PeacePortal): ''' data_directory = settings.PEACEPORTAL_FIJI_DATA - es_index = settings.PEACEPORTAL_FIJI_ES_INDEX + es_index = getattr(settings, 'PEACEPORTAL_FIJI_ES_INDEX', 'peaceportal-fiji') es_alias = settings.PEACEPORTAL_ALIAS filename_pattern = re.compile('\d+') diff --git a/backend/corpora/peaceportal/conftest.py b/backend/corpora/peaceportal/conftest.py index 6bcd8732d..403909f0e 100644 --- a/backend/corpora/peaceportal/conftest.py +++ b/backend/corpora/peaceportal/conftest.py @@ -14,5 +14,7 @@ def peace_corpus_settings(settings): settings.PEACEPORTAL_EPIDAT_DATA= os.path.join(here, 'tests', 'data', 'epidat') settings.PEACEPORTAL_FIJI_DATA= os.path.join(here, 'tests', 'data', 'fiji') - settings.PEACEPORTAL_IIS_DATA = os.path.join(here, 'tests', 'data', 'iis') - settings.PEACEPORTAL_TOL_DATA = os.path.join(here, 'tests', 'data', 'tol') \ No newline at end of file + settings.PEACEPORTAL_IIS_DATA = os.path.join(here, 'tests', 'data', 'iis', 'xml') + settings.PEACEPORTAL_IIS_TXT_DATA = os.path.join(here, 'tests', 'data', 'iis', 'transcription_txts') + settings.PEACEPORTAL_TOL_DATA = os.path.join(here, 'tests', 'data', 'tol') + settings.PEACEPORTAL_ALIAS = 'peaceportal' \ No newline at end of file diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index bbd49191d..e7a26a449 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -8,10 +8,10 @@ from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language -class Epidat(PeacePortal): +class PeaceportalEpidat(PeacePortal): data_directory = settings.PEACEPORTAL_EPIDAT_DATA - es_index = settings.PEACEPORTAL_EPIDAT_ES_INDEX + es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 'peaceportal-epidat') es_alias = settings.PEACEPORTAL_ALIAS languages = 
['german', 'hebrew', 'english', 'dutch'] diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index 26b9c0669..7d682a9d4 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -3,14 +3,13 @@ from django.conf import settings from addcorpus.extract import XML, Constant, HTML, ExternalFile, Combined -from addcorpus.corpus import Field from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language -class IIS(PeacePortal): +class PeaceportalIIS(PeacePortal): data_directory = settings.PEACEPORTAL_IIS_DATA external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA - es_index = settings.PEACEPORTAL_IIS_ES_INDEX + es_index = getattr(settings, 'PEACEPORTAL_IIS_ES_INDEX', 'peaceportal-iis') es_alias = settings.PEACEPORTAL_ALIAS def __init__(self): diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index c3b3d7933..9b7526fd7 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -7,15 +7,15 @@ from django.conf import settings -from addcorpus.corpus import XMLCorpus, FieldDefinition +from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition from addcorpus.es_mappings import int_mapping, keyword_mapping, main_content_mapping, text_mapping from addcorpus.es_settings import es_settings -from addcorpus.extract import XML, Constant +from addcorpus.extract import Constant from addcorpus.filters import MultipleChoiceFilter, RangeFilter -class PeacePortal(XMLCorpus): +class PeacePortal(XMLCorpusDefinition): ''' Base class for corpora in the PEACE portal. 
@@ -52,7 +52,7 @@ class PeacePortal(XMLCorpus): languages = [] def es_settings(self): - return es_settings() + return es_settings(self.languages, True, True) def sources(self, start, end): logger = logging.getLogger(__name__) @@ -150,37 +150,37 @@ def request_media(self, document): transcription_german = FieldDefinition( name='transcription_german', - es_mapping={'type': 'text', 'analyzer': 'german' }, + es_mapping=main_content_mapping(stopword_analyzer='clean_german', stemming_analyzer='stemmed_german'), hidden=True ) transcription_english = FieldDefinition( name='transcription_english', - es_mapping={'type': 'text', 'analyzer': 'english'}, + es_mapping=main_content_mapping(stopword_analyzer='clean_english', stemming_analyzer='stemmed_english'), hidden=True ) transcription_hebrew = FieldDefinition( - name='transcription_hebrew', + name='transcription_hebrew', # no stopwords / stemmers available es_mapping={'type': 'text'}, hidden=True ) transcription_latin = FieldDefinition( name='transcription_latin', - es_mapping={'type': 'text'}, + es_mapping={'type': 'text'}, # no stopwords / stemmers available hidden=True ) transcription_greek = FieldDefinition( name='transcription_greek', - es_mapping={'type': 'text', 'analyzer': 'greek'}, + es_mapping=main_content_mapping(stopword_analyzer='clean_greek', stemming_analyzer='stemmed_greek'), hidden=True ) transcription_dutch = FieldDefinition( name='transcription_dutch', - es_mapping={'type': 'text', 'analyzer': 'dutch'}, + es_mapping=main_content_mapping(stopword_analyzer='clean_dutch', stemming_analyzer='stemmed_dutch'), hidden=True ) diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py index 29b80a14e..774447ad6 100644 --- a/backend/corpora/peaceportal/tol.py +++ b/backend/corpora/peaceportal/tol.py @@ -4,13 +4,12 @@ from django.conf import settings from addcorpus.extract import XML, Constant, HTML, Combined -from addcorpus.corpus import Field from corpora.peaceportal.peaceportal import 
PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language -class TOL(PeacePortal): +class PeaceportalTOL(PeacePortal): data_directory = settings.PEACEPORTAL_TOL_DATA - es_index = settings.PEACEPORTAL_TOL_ES_INDEX + es_index = getattr(settings, 'PEACEPORTAL_TOL_ES_INDEX', 'peaceportal-tol') es_alias = settings.PEACEPORTAL_ALIAS def __init__(self): diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index 3b905c4d7..da4bce105 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -38,7 +38,7 @@ class Periodicals(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) tag_toplevel = 'articles' tag_entry = 'artInfo' diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index b8b6d0892..d4e4dac80 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -45,7 +45,7 @@ class Rechtspraak(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) tag_toplevel = 'open-rechtspraak' diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index 1e0ff0d87..3f6e02514 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -39,7 +39,7 @@ class Times(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) tag_toplevel = 'issue' tag_entry = 'article' diff --git 
a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py index b8d416530..e37223c00 100644 --- a/backend/corpora/troonredes/troonredes.py +++ b/backend/corpora/troonredes/troonredes.py @@ -44,7 +44,7 @@ class Troonredes(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) tag_toplevel = 'doc' tag_entry = 'entry' From 082db1a603ce3791eff48c8f5e23832bbbff582e Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 26 Oct 2023 16:51:28 +0200 Subject: [PATCH 34/98] remove HTML from rejected XMLCorpus extractors --- backend/addcorpus/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index 810fc4c70..775c3f961 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -309,7 +309,7 @@ def source2dicts(self, source): default implementation for XML layouts; may be subclassed if more ''' # Make sure that extractors are sensible - self._reject_extractors(extract.HTML, extract.CSV) + self._reject_extractors(extract.CSV) # extract information from external xml files first, if applicable metadata = {} From 1a6e210a01a54b7095b414a64c1c6cc3810d8db4 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 31 Oct 2023 15:10:47 +0100 Subject: [PATCH 35/98] change bug report template to form --- .github/ISSUE_TEMPLATE/bug_report.md | 36 -------------- .github/ISSUE_TEMPLATE/bug_report.yaml | 66 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 36 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 5a8565682..000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null 
@@ -1,36 +0,0 @@ ---- -name: Bug report -about: Let us know about something that isn't working right -title: '' -labels: bug -assignees: '' - ---- - -### What went wrong? - -Describe what happened. - -### Expected behavior - -What did you expect to happen? - -### Screenshots - -If applicable, please add a screenshot of the problem! - -### Which version? - -Please specify where you encountered the issue: - -- [ ] https://ianalyzer.hum.uu.nl -- [ ] https://peopleandparliament.hum.uu.nl -- [ ] https://peace.sites.uu.nl/ -- [ ] a server hosted elsewhere (i.e. not by the research software lab) -- [ ] a local server - -If this happened on local or third-party server, it helps if you can be more specific about the version. Please include the version number (e.g. "3.2.4") or a commit hash if you know it! - -### To reproduce - -How can a developer replicate the issue? Please provide any information you can. For example: "I went to https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then clicked on *Download CSV*. I pressed *cancel* and then I clicked *Download CSV* again." diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 000000000..82fed0b25 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,66 @@ +--- +name: Bug report +description: Let us know that something isn't working right +labels: + - bug +body: + - type: markdown + attributes: + value: | + Thank you for making a bug report! Please fill in this information so we can get to the + bottom of your issue. + - type: textarea + id: what-happened + attributes: + label: What went wrong? + description: Please describe what happened. + validations: + required: true + - type: textarea + id: expected + attributes: + label: What did you expect to happen? 
+ validations: + required: true + - type: textarea + id: screenshot + attributes: + label: Screenshot + description: If you can make a screenshot of the issue, please include it! + validations: + required: false + - type: checkboxes + id: instance + attributes: + label: Where did you find the bug? + description: Please add where you found the bug. + options: + - label: https://ianalyzer.hum.uu.nl + - label: https://peopleandparliament.hum.uu.nl + - label: https://peace.sites.uu.nl + - label: a server hosted elsewhere (i.e. not by the research software lab) + - label: a local server + validations: + required: true + - type: input + id: version + attributes: + label: Version + description: | + For third-party and local servers, please add information about the version of the + software, if you know it. A version number (e.g "1.2.3") is great. For a pre-release + build, you can provide the branch or commit hash. + validations: + required: false + - type: textarea + id: to-reproduce + attributes: + label: Steps to reproduce + description: | + How can a developer replicate the issue? Please provide any information you can. For + example: "I went to + https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then + clicked on Download CSV. I pressed cancel and then I clicked Download CSV again." + validations: + required: true +--- From 022bc2ac030a472642a683062124b02b5c4af88f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:15:26 +0000 Subject: [PATCH 36/98] Bump @babel/traverse from 7.17.0 to 7.23.2 in /frontend Bumps [@babel/traverse](https://github.com/babel/babel/tree/HEAD/packages/babel-traverse) from 7.17.0 to 7.23.2. 
- [Release notes](https://github.com/babel/babel/releases) - [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md) - [Commits](https://github.com/babel/babel/commits/v7.23.2/packages/babel-traverse) --- updated-dependencies: - dependency-name: "@babel/traverse" dependency-type: indirect ... Signed-off-by: dependabot[bot] --- frontend/yarn.lock | 124 +++++++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 44 deletions(-) diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 3b1ee901a..3d00346ea 100644 --- a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -327,7 +327,7 @@ dependencies: "@babel/highlight" "^7.10.3" -"@babel/code-frame@^7.16.7", "@babel/code-frame@^7.22.10", "@babel/code-frame@^7.22.5": +"@babel/code-frame@^7.16.7", "@babel/code-frame@^7.22.5": version "7.22.10" resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.22.10.tgz#1c20e612b768fefa75f6e90d6ecb86329247f0a3" integrity sha512-/KKIMG4UEL35WmI9OlvMhurwtytjvXoFcGNrOvyG9zIzA8YmPjVtIZUf7b05+TPO7G7/GEmLHDaoCgACHl9hhA== @@ -335,6 +335,14 @@ "@babel/highlight" "^7.22.10" chalk "^2.4.2" +"@babel/code-frame@^7.22.13": + version "7.22.13" + resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.22.13.tgz#e3c1c099402598483b7a8c46a721d1038803755e" + integrity sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w== + dependencies: + "@babel/highlight" "^7.22.13" + chalk "^2.4.2" + "@babel/compat-data@^7.13.11", "@babel/compat-data@^7.16.8": version "7.17.0" resolved "https://registry.yarnpkg.com/@babel/compat-data/-/compat-data-7.17.0.tgz#86850b8597ea6962089770952075dcaabb8dba34" @@ -426,7 +434,7 @@ jsesc "^2.5.1" source-map "^0.5.0" -"@babel/generator@^7.17.0", "@babel/generator@^7.22.10", "@babel/generator@^7.22.7": +"@babel/generator@^7.17.0": version "7.22.10" resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.22.10.tgz#c92254361f398e160645ac58831069707382b722" integrity 
sha512-79KIf7YiWjjdZ81JnLujDRApWtl7BxTqWD88+FFdQEIOG8LJ0etDOM7CXuIgGJa55sGOwZVwuEsaLEm0PJ5/+A== @@ -446,6 +454,16 @@ "@jridgewell/trace-mapping" "^0.3.17" jsesc "^2.5.1" +"@babel/generator@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.23.0.tgz#df5c386e2218be505b34837acbcb874d7a983420" + integrity sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g== + dependencies: + "@babel/types" "^7.23.0" + "@jridgewell/gen-mapping" "^0.3.2" + "@jridgewell/trace-mapping" "^0.3.17" + jsesc "^2.5.1" + "@babel/helper-annotate-as-pure@7.16.7", "@babel/helper-annotate-as-pure@^7.16.7": version "7.16.7" resolved "https://registry.yarnpkg.com/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.16.7.tgz#bb2339a7534a9c128e3102024c60760a3a7f3862" @@ -549,6 +567,11 @@ resolved "https://registry.yarnpkg.com/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.5.tgz#f06dd41b7c1f44e1f8da6c4055b41ab3a09a7e98" integrity sha512-XGmhECfVA/5sAt+H+xpSg0mfrHq6FzNr9Oxh7PSEBBRUb/mL7Kz3NICXb194rCqAEdxkhPT1a88teizAFyvk8Q== +"@babel/helper-environment-visitor@^7.22.20": + version "7.22.20" + resolved "https://registry.yarnpkg.com/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz#96159db61d34a29dba454c959f5ae4a649ba9167" + integrity sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA== + "@babel/helper-explode-assignable-expression@^7.16.7": version "7.16.7" resolved "https://registry.yarnpkg.com/@babel/helper-explode-assignable-expression/-/helper-explode-assignable-expression-7.16.7.tgz#12a6d8522fdd834f194e868af6354e8650242b7a" @@ -556,7 +579,7 @@ dependencies: "@babel/types" "^7.16.7" -"@babel/helper-function-name@^7.16.7", "@babel/helper-function-name@^7.22.5": +"@babel/helper-function-name@^7.16.7": version "7.22.5" resolved 
"https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.22.5.tgz#ede300828905bb15e582c037162f99d5183af1be" integrity sha512-wtHSq6jMRE3uF2otvfuD3DIvVhOsSNshQl0Qrd7qC9oQJzHvOL4qQXlQn2916+CXGywIjpGuIkoyZRRxHPiNQQ== @@ -564,6 +587,14 @@ "@babel/template" "^7.22.5" "@babel/types" "^7.22.5" +"@babel/helper-function-name@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz#1f9a3cdbd5b2698a670c30d2735f9af95ed52759" + integrity sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw== + dependencies: + "@babel/template" "^7.22.15" + "@babel/types" "^7.23.0" + "@babel/helper-hoist-variables@^7.16.7", "@babel/helper-hoist-variables@^7.22.5": version "7.22.5" resolved "https://registry.yarnpkg.com/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz#c01a007dac05c085914e8fb652b339db50d823bb" @@ -693,6 +724,11 @@ resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.5.tgz#9544ef6a33999343c8740fa51350f30eeaaaf193" integrity sha512-aJXu+6lErq8ltp+JhkJUfk1MTGyuA4v7f3pA+BJ5HLfNC6nAQ0Cpi9uOquUj8Hehg0aUiHzWQbOVJGao6ztBAQ== +"@babel/helper-validator-identifier@^7.22.20": + version "7.22.20" + resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz#c4ae002c61d2879e724581d96665583dbc1dc0e0" + integrity sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A== + "@babel/helper-validator-option@^7.16.7", "@babel/helper-validator-option@^7.22.5": version "7.22.5" resolved "https://registry.yarnpkg.com/@babel/helper-validator-option/-/helper-validator-option-7.22.5.tgz#de52000a15a177413c8234fa3a8af4ee8102d0ac" @@ -744,16 +780,30 @@ chalk "^2.4.2" js-tokens "^4.0.0" +"@babel/highlight@^7.22.13": + version "7.22.20" + resolved 
"https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.22.20.tgz#4ca92b71d80554b01427815e06f2df965b9c1f54" + integrity sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg== + dependencies: + "@babel/helper-validator-identifier" "^7.22.20" + chalk "^2.4.2" + js-tokens "^4.0.0" + "@babel/parser@^7.14.7", "@babel/parser@^7.16.12": version "7.17.0" resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.17.0.tgz#f0ac33eddbe214e4105363bb17c3341c5ffcc43c" integrity sha512-VKXSCQx5D8S04ej+Dqsr1CzYvvWgf20jIw2D+YhQCrIlr2UZGaDds23Y0xg75/skOxpLCRpUZvk/1EAVkGoDOw== -"@babel/parser@^7.16.7", "@babel/parser@^7.17.0", "@babel/parser@^7.22.10", "@babel/parser@^7.22.5", "@babel/parser@^7.22.7": +"@babel/parser@^7.16.7", "@babel/parser@^7.17.0", "@babel/parser@^7.22.5", "@babel/parser@^7.22.7": version "7.22.10" resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.22.10.tgz#e37634f9a12a1716136c44624ef54283cabd3f55" integrity sha512-lNbdGsQb9ekfsnjFGhEiF4hfFqGgfOP3H3d27re3n+CGhNuTSUEQdfWk556sTLNTloczcdM5TYF2LhzmDQKyvQ== +"@babel/parser@^7.22.15", "@babel/parser@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.0.tgz#da950e622420bf96ca0d0f2909cdddac3acd8719" + integrity sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw== + "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression@^7.16.7": version "7.16.7" resolved "https://registry.yarnpkg.com/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.16.7.tgz#4eda6d6c2a0aa79c70fa7b6da67763dfe2141050" @@ -1412,51 +1462,28 @@ "@babel/parser" "^7.22.5" "@babel/types" "^7.22.5" -"@babel/traverse@^7.13.0", "@babel/traverse@^7.16.10", "@babel/traverse@^7.16.8": - version "7.17.0" - resolved 
"https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.17.0.tgz#3143e5066796408ccc880a33ecd3184f3e75cd30" - integrity sha512-fpFIXvqD6kC7c7PUNnZ0Z8cQXlarCLtCUpt2S1Dx7PjoRtCFffvOkHHSom+m5HIxMZn5bIBVb71lhabcmjEsqg== +"@babel/template@^7.22.15": + version "7.22.15" + resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.22.15.tgz#09576efc3830f0430f4548ef971dde1350ef2f38" + integrity sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w== dependencies: - "@babel/code-frame" "^7.16.7" - "@babel/generator" "^7.17.0" - "@babel/helper-environment-visitor" "^7.16.7" - "@babel/helper-function-name" "^7.16.7" - "@babel/helper-hoist-variables" "^7.16.7" - "@babel/helper-split-export-declaration" "^7.16.7" - "@babel/parser" "^7.17.0" - "@babel/types" "^7.17.0" - debug "^4.1.0" - globals "^11.1.0" - -"@babel/traverse@^7.16.7", "@babel/traverse@^7.22.8": - version "7.22.8" - resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.22.8.tgz#4d4451d31bc34efeae01eac222b514a77aa4000e" - integrity sha512-y6LPR+wpM2I3qJrsheCTwhIinzkETbplIgPBbwvqPKc+uljeA5gP+3nP8irdYt1mjQaDnlIcG+dw8OjAco4GXw== - dependencies: - "@babel/code-frame" "^7.22.5" - "@babel/generator" "^7.22.7" - "@babel/helper-environment-visitor" "^7.22.5" - "@babel/helper-function-name" "^7.22.5" - "@babel/helper-hoist-variables" "^7.22.5" - "@babel/helper-split-export-declaration" "^7.22.6" - "@babel/parser" "^7.22.7" - "@babel/types" "^7.22.5" - debug "^4.1.0" - globals "^11.1.0" + "@babel/code-frame" "^7.22.13" + "@babel/parser" "^7.22.15" + "@babel/types" "^7.22.15" -"@babel/traverse@^7.17.0", "@babel/traverse@^7.22.10", "@babel/traverse@^7.22.6": - version "7.22.10" - resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.22.10.tgz#20252acb240e746d27c2e82b4484f199cf8141aa" - integrity sha512-Q/urqV4pRByiNNpb/f5OSv28ZlGJiFiiTh+GAHktbIrkPhPbl90+uW6SmpoLyZqutrg9AEaEf3Q/ZBRHBXgxig== +"@babel/traverse@^7.13.0", "@babel/traverse@^7.16.10", 
"@babel/traverse@^7.16.7", "@babel/traverse@^7.16.8", "@babel/traverse@^7.17.0", "@babel/traverse@^7.22.10", "@babel/traverse@^7.22.6", "@babel/traverse@^7.22.8": + version "7.23.2" + resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.23.2.tgz#329c7a06735e144a506bdb2cad0268b7f46f4ad8" + integrity sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw== dependencies: - "@babel/code-frame" "^7.22.10" - "@babel/generator" "^7.22.10" - "@babel/helper-environment-visitor" "^7.22.5" - "@babel/helper-function-name" "^7.22.5" + "@babel/code-frame" "^7.22.13" + "@babel/generator" "^7.23.0" + "@babel/helper-environment-visitor" "^7.22.20" + "@babel/helper-function-name" "^7.23.0" "@babel/helper-hoist-variables" "^7.22.5" "@babel/helper-split-export-declaration" "^7.22.6" - "@babel/parser" "^7.22.10" - "@babel/types" "^7.22.10" + "@babel/parser" "^7.23.0" + "@babel/types" "^7.23.0" debug "^4.1.0" globals "^11.1.0" @@ -1486,6 +1513,15 @@ "@babel/helper-validator-identifier" "^7.22.5" to-fast-properties "^2.0.0" +"@babel/types@^7.22.15", "@babel/types@^7.23.0": + version "7.23.0" + resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.23.0.tgz#8c1f020c9df0e737e4e247c0619f58c68458aaeb" + integrity sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg== + dependencies: + "@babel/helper-string-parser" "^7.22.5" + "@babel/helper-validator-identifier" "^7.22.20" + to-fast-properties "^2.0.0" + "@colors/colors@1.5.0": version "1.5.0" resolved "https://registry.yarnpkg.com/@colors/colors/-/colors-1.5.0.tgz#bb504579c1cae923e6576a4f5da43d25f97bdbd9" From b34ee3f61bcfcf0d458190742489fd88149db878 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 1 Nov 2023 09:46:19 +0100 Subject: [PATCH 37/98] remove tag_toplevel from Peaceportal base --- backend/corpora/peaceportal/peaceportal.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git 
a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 9b7526fd7..5e0f95aa9 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -40,7 +40,6 @@ class PeacePortal(XMLCorpusDefinition): data_directory = 'bogus' # Data overrides from .common.XMLCorpus - tag_toplevel = '' tag_entry = 'TEI' # New data members @@ -48,7 +47,7 @@ class PeacePortal(XMLCorpusDefinition): non_match_msg = 'Skipping XML file with nonmatching name {}' # overwrite below in child class if you need to extract the (converted) transcription # from external files. See README. - external_file_folder = '.' + external_file_folder = None languages = [] def es_settings(self): @@ -57,6 +56,7 @@ def es_settings(self): def sources(self, start, end): logger = logging.getLogger(__name__) for directory, _, filenames in os.walk(self.data_directory): + print(filenames) for filename in filenames: name, extension = op.splitext(filename) full_path = op.join(directory, filename) @@ -65,11 +65,16 @@ def sources(self, start, end): logger.debug(self.non_xml_msg.format(full_path)) continue - yield full_path, { + metadata = {} + + if self.external_file_folder: + metadata = { # applies only to iis corpus 'associated_file': os.path.join(self.external_file_folder, filename) } + yield full_path, metadata + def request_media(self, document): images = document['fieldValues']['images'] if not images: From cdd45997188dbca98202de13ffe02e396eb7c0a0 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Fri, 3 Nov 2023 14:09:11 +0100 Subject: [PATCH 38/98] reorganize analyzer and filter formatting --- backend/addcorpus/corpus.py | 19 +++++ backend/addcorpus/es_mappings.py | 18 ++-- backend/addcorpus/es_settings.py | 37 +++++---- backend/addcorpus/tests/test_es_settings.py | 0 .../corpora/parliament/tests/test_import.py | 7 +- backend/corpora/peaceportal/FIJI/fiji.py | 6 +- backend/corpora/peaceportal/epidat.py | 8 +- 
backend/corpora/peaceportal/iis.py | 8 +- backend/corpora/peaceportal/peaceportal.py | 83 +++++++++---------- .../corpora/peaceportal/tests/test_import.py | 3 +- backend/corpora/peaceportal/tol.py | 36 +++++++- backend/es/conftest.py | 1 - backend/es/tests/test_es_index.py | 1 - docker-compose.yaml | 4 + 14 files changed, 147 insertions(+), 84 deletions(-) create mode 100644 backend/addcorpus/tests/test_es_settings.py diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index 775c3f961..e1e6f59bb 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -274,6 +274,25 @@ def _reject_extractors(self, *inapplicable_extractors): if isinstance(field.extractor, inapplicable_extractors): raise RuntimeError( "Specified extractor method cannot be used with this type of data") + +class ParentCorpusDefinition(CorpusDefinition): + ''' A class from which other corpus definitions can inherit + ''' + #define fields property so it can be set in __init__ + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, value): + self._fields = value + + def __init__(self): + ''' specify a list of fields here which all subclasses share + should be overwritten in subclasses + ''' + self.fields = [] + class XMLCorpusDefinition(CorpusDefinition): ''' diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index b5465c4f1..85a0d9806 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -1,4 +1,6 @@ -def main_content_mapping(token_counts=True, stopword_analyzer=None, stemming_analyzer=None, updated_highlighting=False): +from addcorpus.es_settings import add_language_string + +def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=False): ''' Mapping for the main content field. 
Options: @@ -17,27 +19,23 @@ def main_content_mapping(token_counts=True, stopword_analyzer=None, stemming_ana 'term_vector': 'with_positions_offsets' # include char positions on _source (in addition to the multifields) for highlighting }) - if any([token_counts, stopword_analyzer, stemming_analyzer]): + if any([token_counts, stopword_analysis, stemming_analysis]): multifields = {} if token_counts: multifields['length'] = { "type": "token_count", "analyzer": "standard" } - if stopword_analyzer: - if type(stopword_analyzer)==bool: - stopword_analyzer = 'clean' + if stopword_analysis: multifields['clean'] = { "type": "text", - "analyzer": stopword_analyzer, + "analyzer": add_language_string('clean', language), "term_vector": "with_positions_offsets" # include character positions for highlighting } - if stemming_analyzer: - if type(stemming_analyzer)==bool: - stemming_analyzer = 'stemmed' + if stemming_analysis: multifields['stemmed'] = { "type": "text", - "analyzer": stemming_analyzer, + "analyzer": add_language_string('stemmed', language), "term_vector": "with_positions_offsets", } mapping['fields'] = multifields diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 3251719ae..94d2366e3 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -28,6 +28,8 @@ def get_nltk_stopwords(language_code): else: raise NotImplementedError('language {} has no nltk stopwords list'.format(language)) +def add_language_string(name, language): + return '{}_{}'.format(name, language) if language else name def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): ''' @@ -43,15 +45,17 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): stemmed_analyzer_name = 'stemmed' for language in languages: - add_language_string = lambda name: '{}_{}'.format(language, name) if len(languages) > 0 else name + # do not attach language isocodes if there is just one language + 
language_string = language if len(languages) > 1 else None + if stopword_analyzer or stemming_analyzer: - if not set_stopword_filter(settings, language, add_language_string(stopword_filter_name)): + if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language_string), language): continue # skip languages for which we do not have a stopword list if stopword_analyzer: - set_clean_analyzer(settings, language, add_language_string(stopword_filter_name), add_language_string(clean_analyzer_name)) + set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name, language) if stemming_analyzer: - set_stemmed_analyzer(settings, language, add_language_string(stemmer_filter_name), add_language_string(stemmed_analyzer_name)) + set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language) return settings @@ -62,7 +66,7 @@ def number_filter(): "replacement":"" } -def make_stopword_filter(language, stopword_filter_name): +def make_stopword_filter(stopword_filter_name, language): try: stopwords = get_nltk_stopwords(language) return { @@ -86,11 +90,11 @@ def make_stemmer_filter(language): "language": stemmer_language } -def make_stemmed_analyzer(stemmer_filter_name): +def make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name): return { "tokenizer": "standard", "char_filter": ["number_filter"], - "filter": ["lowercase", "stopwords", stemmer_filter_name] + "filter": ["lowercase", stopword_filter_name, stemmer_filter_name] } def get_stopwords_from_settings(es_settings): @@ -102,22 +106,27 @@ def get_stopwords_from_settings(es_settings): return stopwords -def set_stemmed_analyzer(settings, language, stemmer_filter_name, stemmed_analyzer_name): +def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language): + stopword_filter_name = add_language_string(stopword_filter_name, language) + stemmer_filter_name = add_language_string(stemmer_filter_name, 
language) + stemmed_analyzer_name = add_language_string(stemmed_analyzer_name, language) settings['analysis']['filter'][stemmer_filter_name] = make_stemmer_filter(language) - settings["analysis"]['analyzer'][stemmed_analyzer_name] = make_stemmed_analyzer(stemmer_filter_name) + settings["analysis"]['analyzer'][stemmed_analyzer_name] = make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name) -def set_stopword_filter(settings, language, stopword_filter_name): - stopword_filter = make_stopword_filter(language, stopword_filter_name) +def set_stopword_filter(settings, stopword_filter_name, language): + stopword_filter = make_stopword_filter(stopword_filter_name, language) if not stopword_filter: return False settings["analysis"] = { "analyzer": {}, "char_filter":{ "number_filter": number_filter() }, 'filter': { - "stopwords": stopword_filter + stopword_filter_name: stopword_filter } } return True -def set_clean_analyzer(settings, language, stopword_filter_name, clean_analyzer_name): - settings["analysis"]['analyzer'][clean_analyzer_name] = make_clean_analyzer(language, stopword_filter_name) \ No newline at end of file +def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name, language): + settings["analysis"]['analyzer'][ + add_language_string(clean_analyzer_name, language) + ] = make_clean_analyzer(add_language_string(stopword_filter_name, language)) \ No newline at end of file diff --git a/backend/addcorpus/tests/test_es_settings.py b/backend/addcorpus/tests/test_es_settings.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/corpora/parliament/tests/test_import.py b/backend/corpora/parliament/tests/test_import.py index fd1b907c6..398e85c6b 100644 --- a/backend/corpora/parliament/tests/test_import.py +++ b/backend/corpora/parliament/tests/test_import.py @@ -16,6 +16,7 @@ 'debate_id': 'ca.proc.d.2015-02-02', 'chamber': 'House of Commons', 'party': 'New Democratic Party', + 'role': 'Interjection', 'speaker': 'Jack Harris', 
'speaker_id': 'c846297d-8bc7-4e69-b6eb-31d0e19f7ec1', 'speaker_constituency': 'St. John\'s East', @@ -198,6 +199,7 @@ 'Boorsma, wegens verblijf buitenslands.', ]), 'id': 'nl.proc.ob.d.h-ek-19992000-493-493.1.5.1', + 'source_archive': 'PoliticalMashup', 'speaker': 'De voorzitter Jurgens', 'speaker_id': 'nl.m.01992', 'speaker_gender': None, @@ -282,6 +284,7 @@ 'date': '2021-09-14', 'date_is_estimate': None, 'chamber': 'Riksdag', + 'country': 'Sweden', 'speech': 'Ärade ledamöter! Varmt välkomna tillbaka till riksdagen! Det känns stort att få välkomna er här på tröskeln till det fjärde riksmötet den här mandatperioden. Vi har ännu ett mycket speciellt arbetsår bakom oss, till stor del präglat av pandemin. Även om vi visste att det inte var helt över för ett år sedan tror jag att vi var många som hoppades att en tydligare vändning var på väg. Så blev det inte. I stället fick vi ytterligare ett riksdagsår med ett reducerat antal ledamöter vid voteringar och utskottsarbete till stor del på distans. Men förhoppningsvis börjar vi nu gå tillbaka mot mer normala arbetsformer. Ett tydligt tecken på detta är att alla 349 ledamöter kommer att vara med vid riksmötets öppnande i eftermiddag. Jag tycker att det är angeläget att riksdagens och regeringens alla ledamöter kan vara på plats vid denna högtidliga och viktiga ceremoni, särskilt som detta är det sista öppnandet under den här mandatperioden. Däremot genomförs inget upprop nu på förmiddagen, och vi vidtar den försiktighetsåtgärden att drygt en tredjedel av ledamöterna och statsråden får sitta på läktaren under ceremonin. Formerna beslutades av mig efter diskussion med gruppledarna och de vice talmännen redan i början av augusti, alltså långt innan det blev bestämt att alla ledamöter får delta i voteringar efter riksmötets öppnande. Jag såg inget skäl att med kort varsel börja ändra i planeringen för riksmötets öppnande, så just denna speciella dag får inte alla ledamöter sitta nere på golvet här i kammaren . 
M en från och med riksmötets första votering sitter var och en på sin plats och röstar igen på vanligt sätt. Även om pandemin inte är över är situationen i Sverige ändå en helt annan nu än för ett år sedan. Därför har vi – talmanspresidiet och gruppledarna – gjort bedömningen att det är möjligt att samla fler personer än förra året men ändå långt färre än ett vanligt år. Vi har försökt finna en så god balans som möjligt mellan nödvändiga säkerhetsåtgärder, riksdagsordningens bestämmelser och respekt för traditionen. Den sedvanliga mottagningen i Sammanbindningsbanan är som bekant inställd, och det genomförs heller inte någon konsert i Konserthuset. Jag är glad över att vi också kommer att få hjälp att minnas dessa föregångare och förebilder genom att de får en permanent plats på Riksplan i form av en staty. Här tillkommer det att det i trapphallen i Östra riksdagshuset kommer att invigas en tavla som föreställer de här fem pionjärerna. Statyn dröjer ett tag – den kommer att invigas nästa år – men redan i kväll vill riksdagen på dagen för riksmötets öppnande, denna demokratins högtidsdag, uppmärksamma demokratijubileet med att lysa upp Stockholmsnatten med ett ljusspel. Jag kommer att tända en fasadbelysning på Östra riksdagshuset vid en webbsänd ceremoni klockan 20. Ljusspelet kan sedan ses varje kväll till och med den 20 september. Men demokratifirandet tar inte slut där. Vad passar väl bättre på FN:s demokratidag den 15 september än att fira med ett seminarium? I morgon anordnar riksdag och regering seminariet 100 år av demokrati – vilka lärdomar tar vi med oss? Se det gärna på riksdagen.se! Efter riksmötets öppnande tror jag att vi alla ser fram emot ett nytt arbetsår i riksdagen under något mer normala former. Jag har ju, som ni alla vet, tillsammans med gruppledarna slutit en ny överenskommelse om arbetsformerna under hösten, och gruppledarna har också beslutat att inte förlänga överenskommelsen om 55 närvarande ledamöter vid voteringar. 
Alla ledamöter kan alltså delta vid voteringarna, men vi behåller möjligheten att delta på distans vid utskottens sammanträden. Varje utskott avgör när det är motiverat att hålla fysiska sammanträden, och när man deltar fysiskt planerar vi för att det ska gå att hålla avstånd. Vi ska däremot fortsätta hjälpas åt att hålla antalet externa besök i riksdagens hus nere. Externa åhörare vid olika arrangemang bör undvikas liksom guidade visningar och mingelsituationer. Pandemin är inte över. Vi fortsätter att anpassa verksamheten när och om det behövs, men förhoppningsvis går vi mot ett mer normalt läge. Ärade ledamöter! Det här har varit en mandatperiod som ingen annan. Jag tror inte att någon hade kunnat förutse de många olika, oväntade och delvis dramatiska händelser som har inträffat. Jag tänker naturligtvis i första hand på pandemin och alla dess konsekvenser men även på de två regeringsbildningarna. Och då är det ändå ett helt år kvar av mandatperio ­ den. Jag tror att vi alla kan se fram emot ännu ett händelserikt och spännan ­ de riksdagsår fram till valet. Vi vet i alla fall att det i början av november blir den tredje regeringsbildningen under den här mandatperioden. Oavsett hur man ser på det politiska läget vill jag framhålla, apropå just demokratijubileet, att regeringsbildningarna inte har inneburit någon kris för demokratin. Svensk demokrati står stark, och den är värd att fira. Alla aktörer har i regeringsbildningsprocesserna använt de olika verktyg som finns i den demokratiska, parlamentariska verktygslådan. Misstroendeomröstning, beslut att inte utlysa extraval och talmansrundor – allt sådant följer av de lagar som vi har skapat för vår demokrati. Skeendet må vara turbulent i vissa stycken, men det följer demokratins spelregler. Ärade ledamöter! Jag vill avsluta med några rader ut dikten Sommaren i Sverige av Werner Aspenström. Den skildrar på ett fint sätt vemodet och skönheten när sommaren går mot sitt slut. 
Då landar på min hand den förgänglighetens tanke som vi kallar trollslända. Ett gult löv lösgör sig och faller klingande mot marken. Sommaren måste hastigt bärgas. … Ty hösten närmar sig med toppeld i asparna. Låt mig nu önska er en fin höst och ett produktivt arbetsår. På återseende här i kammaren klockan 14! Stockholms kommun Stockholms län Södermanlands län Jönköpings län Kronobergs län Blekinge län Hallands län Göteborgs kommun Värmlands län Jämtlands län Norrbottens län EU-dokument Åttaveckorsfristen för att avge ett motiverat yttrande skulle gå ut den 5 november . EU-dokument Följande frågor för skriftliga svar hade framställts: 2020/21:3636 Amorteringskravet och ojämställd bostadsmarknad 2020/21:3637 Den kinesiske ambassadörens agerande 2020/21:3638 Vaccin 2020/21:3639 Lukasjenkos tillgång till 1 miljard dollar från IMF 2020/21:3640 Markering mot Irans idrottsminister 2020/21:3642 Kriminalitet på bostadsmarknaden Skriftliga svar på följande frågor hade kommit in: 2020/21:3535 Barns rätt till säkerställda skyddade boenden 2020/21:3537 Elbrist som hotar investeringar i Sverige 2020/21:3538 Åtgärder för att trygga boende', 'sequence': '0', 'id': 'i-2a00eff84ce04676-0', @@ -304,6 +307,7 @@ { 'book_id': 'bn_1828-30_1__01', 'book_label': 'Hederwärda bonde-ståndets protokoller wid lagtima riksdagen i Stockholm åren 1828 och 1829. 
Första bandet.', + 'country': 'Sweden', 'era': 'Ståndsriksdagen', 'chamber': 'Bönder', 'date_earliest': '1828-01-01', @@ -497,6 +501,7 @@ 1878.""", 'id': 'Adeln_Prot_1877_III.pdf_0', + 'speech_type': 'minutes', 'chamber': 'nobility', 'date_earliest': '1877-01-01', 'date_latest': '1877-12-31', @@ -634,7 +639,7 @@ def test_imports(parliament_corpora_settings, corpus_object): for key in resulted_fields: if not key in tested_fields: - message = 'Key "{}" is included the result for {} but has no specification'.format(key, corpus_object.get('name')) + message = 'Key "{}" is included in the result for {} but has no specification'.format(key, corpus_object.get('name')) warnings.warn(message) docs = get_documents(corpus, start, end) diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 83ef4b5c1..44095c1d6 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -157,19 +157,19 @@ def __init__(self): toplevel=False ) - self.transcription_hebrew.extractor = Combined( + self.transcription_he.extractor = Combined( self.transcription.extractor, Constant('he'), transform=lambda x: get_text_in_language(x) ) - self.transcription_latin.extractor = Combined( + self.transcription_la.extractor = Combined( self.transcription.extractor, Constant('la'), transform=lambda x: get_text_in_language(x) ) - self.transcription_greek.extractor = Combined( + self.transcription_el.extractor = Combined( self.transcription.extractor, Constant('el'), transform=lambda x: get_text_in_language(x) diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index e7a26a449..461b06b24 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -75,7 +75,7 @@ def __init__(self): transform_soup_func=extract_transcript ) - self.transcription_german.extractor = XML( + self.transcription_de.extractor = XML( tag=['text', 'body', ], toplevel=False, 
multiple=False, @@ -215,19 +215,19 @@ def __init__(self): multiple=True ) - self.transcription_hebrew.extractor = Combined( + self.transcription_he.extractor = Combined( self.transcription.extractor, Constant('he'), transform=lambda x: get_text_in_language(x) ) - self.transcription_english.extractor = Combined( + self.transcription_en.extractor = Combined( self.transcription.extractor, Constant('en'), transform=lambda x: get_text_in_language(x) ) - self.transcription_dutch.extractor = Combined( + self.transcription_nl.extractor = Combined( self.transcription.extractor, Constant('nl'), transform=lambda x: get_text_in_language(x) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index 7d682a9d4..67699e0e1 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -62,7 +62,7 @@ def __init__(self): stream_handler=extract_transcript ) - self.transcription_english.extractor = HTML( + self.transcription_en.extractor = HTML( tag=['div'], toplevel=True, multiple=False, @@ -229,19 +229,19 @@ def __init__(self): multiple=True ) - self.transcription_hebrew.extractor = Combined( + self.transcription_he.extractor = Combined( self.transcription.extractor, Constant('he'), transform=lambda x: get_text_in_language(x) ) - self.transcription_latin.extractor = Combined( + self.transcription_la.extractor = Combined( self.transcription.extractor, Constant('la'), transform=lambda x: get_text_in_language(x) ) - self.transcription_greek.extractor = Combined( + self.transcription_el.extractor = Combined( self.transcription.extractor, Constant('el'), transform=lambda x: get_text_in_language(x) diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 5e0f95aa9..0f27e08f3 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -7,7 +7,7 @@ from django.conf import settings -from addcorpus.corpus import XMLCorpusDefinition, 
FieldDefinition +from addcorpus.corpus import ParentCorpusDefinition, FieldDefinition, XMLCorpusDefinition from addcorpus.es_mappings import int_mapping, keyword_mapping, main_content_mapping, text_mapping from addcorpus.es_settings import es_settings from addcorpus.extract import Constant @@ -15,7 +15,7 @@ -class PeacePortal(XMLCorpusDefinition): +class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): ''' Base class for corpora in the PEACE portal. @@ -48,7 +48,7 @@ class PeacePortal(XMLCorpusDefinition): # overwrite below in child class if you need to extract the (converted) transcription # from external files. See README. external_file_folder = None - languages = [] + languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-) def es_settings(self): return es_settings(self.languages, True, True) @@ -56,7 +56,6 @@ def es_settings(self): def sources(self, start, end): logger = logging.getLogger(__name__) for directory, _, filenames in os.walk(self.data_directory): - print(filenames) for filename in filenames: name, extension = op.splitext(filename) full_path = op.join(directory, filename) @@ -66,7 +65,6 @@ def sources(self, start, end): continue metadata = {} - if self.external_file_folder: metadata = { # applies only to iis corpus @@ -154,37 +152,37 @@ def request_media(self, document): ) transcription_german = FieldDefinition( - name='transcription_german', + name='transcription_de', es_mapping=main_content_mapping(stopword_analyzer='clean_german', stemming_analyzer='stemmed_german'), hidden=True ) transcription_english = FieldDefinition( - name='transcription_english', + name='transcription_en', es_mapping=main_content_mapping(stopword_analyzer='clean_english', stemming_analyzer='stemmed_english'), hidden=True ) transcription_hebrew = FieldDefinition( - name='transcription_hebrew', # no stopwords / stemmers available + name='transcription_he', # no stopwords / stemmers available es_mapping={'type': 'text'}, hidden=True ) 
transcription_latin = FieldDefinition( - name='transcription_latin', + name='transcription_la', es_mapping={'type': 'text'}, # no stopwords / stemmers available hidden=True ) transcription_greek = FieldDefinition( - name='transcription_greek', + name='transcription_el', es_mapping=main_content_mapping(stopword_analyzer='clean_greek', stemming_analyzer='stemmed_greek'), hidden=True ) transcription_dutch = FieldDefinition( - name='transcription_dutch', + name='transcription_nl', es_mapping=main_content_mapping(stopword_analyzer='clean_dutch', stemming_analyzer='stemmed_dutch'), hidden=True ) @@ -347,37 +345,38 @@ def request_media(self, document): display_name='Date of death', ) - fields = [ - _id, - url, - year, - not_before, - not_after, - source_database, - transcription, - names, - sex, - dates_of_death, - age, - country, - region, - settlement, - location_details, - language, - iconography, - images, - coordinates, - material, - material_details, - bibliography, - comments, - transcription_german, - transcription_hebrew, - transcription_latin, - transcription_greek, - transcription_english, - transcription_dutch - ] + def __init__(self): + self.fields = [ + self._id, + self.url, + self.year, + self.not_before, + self.not_after, + self.source_database, + self.transcription, + self.names, + self.sex, + self.dates_of_death, + self.age, + self.country, + self.region, + self.settlement, + self.location_details, + self.language, + self.iconography, + self.images, + self.coordinates, + self.material, + self.material_details, + self.bibliography, + self.comments, + self.transcription_german, + self.transcription_hebrew, + self.transcription_latin, + self.transcription_greek, + self.transcription_english, + self.transcription_dutch + ] def clean_newline_characters(text): diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index a9a57a6f1..f59c54b43 100644 --- a/backend/corpora/peaceportal/tests/test_import.py 
+++ b/backend/corpora/peaceportal/tests/test_import.py @@ -43,6 +43,7 @@ def test_imports(peace_corpus_settings, corpus_object): resulted_fields = set() docs = get_documents(corpus, start, end) + print(list(docs)) for target in corpus_object.get('docs'): doc = next(docs) for key in target: @@ -55,7 +56,7 @@ def test_imports(peace_corpus_settings, corpus_object): for key in resulted_fields: if not key in tested_fields: - message = 'Key "{}" is included the result for {} but has no specification'.format(key, corpus_object.get('name')) + message = 'Key "{}" is included in the result for {} but has no specification'.format(key, corpus_object.get('name')) warnings.warn(message) docs = get_documents(corpus, start, end) diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py index 774447ad6..e859529d4 100644 --- a/backend/corpora/peaceportal/tol.py +++ b/backend/corpora/peaceportal/tol.py @@ -12,6 +12,8 @@ class PeaceportalTOL(PeacePortal): es_index = getattr(settings, 'PEACEPORTAL_TOL_ES_INDEX', 'peaceportal-tol') es_alias = settings.PEACEPORTAL_ALIAS + languages = ['en', 'nl', 'he'] + def __init__(self): self.source_database.extractor = Constant( value='Medieval funerary inscriptions from Toledo' @@ -199,24 +201,52 @@ def __init__(self): multiple=True ) - self.transcription_hebrew.extractor = Combined( + self.transcription_he.extractor = Combined( self.transcription.extractor, Constant('he'), transform=lambda x: get_text_in_language(x) ) - self.transcription_english.extractor = Combined( + self.transcription_en.extractor = Combined( self.transcription.extractor, Constant('en'), transform=lambda x: get_text_in_language(x) ) - self.transcription_dutch.extractor = Combined( + self.transcription_nl.extractor = Combined( self.transcription.extractor, Constant('nl'), transform=lambda x: get_text_in_language(x) ) + self.fields = [ + self.bibliography, + self.comments, + self.coordinates, + self.country, + self.dates_of_death, + self.iconography, + 
self._id, + self.images, + self.language, + self.location_details, + self.material, + self.material_details, + self.names, + self.not_after, + self.not_before, + self.region, + self.settlement, + self.sex, + self.source_database, + self.transcription, + self.transcription_nl, + self.transcription_en, + self.transcription_he, + self.url, + self.year + ] + def convert_sex(values): if not values: diff --git a/backend/es/conftest.py b/backend/es/conftest.py index 8c817a8f7..406d285a6 100644 --- a/backend/es/conftest.py +++ b/backend/es/conftest.py @@ -3,7 +3,6 @@ from django.contrib.auth.models import Group from addcorpus.load_corpus import load_corpus_definition -from ianalyzer.elasticsearch import elasticsearch from es import es_index from addcorpus.models import Corpus diff --git a/backend/es/tests/test_es_index.py b/backend/es/tests/test_es_index.py index 96eb57ab1..6f69f3611 100644 --- a/backend/es/tests/test_es_index.py +++ b/backend/es/tests/test_es_index.py @@ -2,7 +2,6 @@ from datetime import datetime from time import sleep -from addcorpus.load_corpus import load_corpus_definition from es.es_index import perform_indexing start = datetime.strptime('1970-01-01','%Y-%m-%d') diff --git a/docker-compose.yaml b/docker-compose.yaml index 0904061d2..eedbd49a6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -55,6 +55,10 @@ services: - cluster.name=ianalizer-es-data-cluster - bootstrap.memory_lock=true - xpack.security.enabled=false + - logger.org.elasticsearch.discovery=ERROR + - logger.org.elasticsearch.transport=ERROR + - logger.org.elasticsearch.http=ERROR + - logger.org.elasticsearch.cluster=ERROR - "ES_JAVA_OPTS=-Xms2g -Xmx2g" - ELASTIC_PASSWORD=$ELASTIC_ROOT_PASSWORD ulimits: From e770fa3178c8265115f126100d2dd6f11c42cf34 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 8 Nov 2023 14:08:51 +0100 Subject: [PATCH 39/98] Use p-dropdown for tag select --- frontend/src/app/models/found-document.ts | 38 +++++++++---------- 
frontend/src/app/services/tag.service.ts | 23 +++++------ .../document-tags/document-tags.component.ts | 17 ++++----- .../tag/tag-select/tag-select.component.html | 9 +---- .../tag/tag-select/tag-select.component.ts | 12 ++---- frontend/src/app/tag/tag.module.ts | 16 +++----- 6 files changed, 48 insertions(+), 67 deletions(-) diff --git a/frontend/src/app/models/found-document.ts b/frontend/src/app/models/found-document.ts index 035eef36e..be705bbd7 100644 --- a/frontend/src/app/models/found-document.ts +++ b/frontend/src/app/models/found-document.ts @@ -40,7 +40,6 @@ export class FoundDocument { this.fetchTags(); } - /** * whether the document has a "context" that it belongs to * @@ -53,8 +52,11 @@ export class FoundDocument { return false; } - const notBlank = value => value !== undefined && value !== null && value !== ''; - const contextValues = spec.contextFields.map(this.fieldValue.bind(this)); + const notBlank = (value) => + value !== undefined && value !== null && value !== ''; + const contextValues = spec.contextFields.map( + this.fieldValue.bind(this) + ); return _.every(contextValues, notBlank); } @@ -71,29 +73,25 @@ export class FoundDocument { return this.fieldValues[field.name]; } - addTag(tagId: number): void { - const newTagIds = this.tags$.value.map(tag => tag.id).concat([tagId]); - this.setTags(newTagIds); + addTag(tag: Tag): void { + const newTags = this.tags$.value.concat([tag]); + this.setTags(newTags); } - removeTag(tagId: number): void { - const newTagIds = _.without( - this.tags$.value.map(tag => tag.id), - tagId, - ); - this.setTags(newTagIds); + removeTag(tag: Tag): void { + const newTags = _.without(this.tags$.value, tag); + this.setTags(newTags); } - setTags(tagIds: number[]): void { - this.tagService.setDocumentTags(this, tagIds).subscribe( - value => this.tags$.next(value) - ); + setTags(tags: Tag[]): void { + this.tagService + .setDocumentTags(this, tags) + .subscribe((value) => this.tags$.next(value)); } private fetchTags(): void { - 
this.tagService.getDocumentTags(this).subscribe( - value => this.tags$.next(value) - ); + this.tagService + .getDocumentTags(this) + .subscribe((value) => this.tags$.next(value)); } - } diff --git a/frontend/src/app/services/tag.service.ts b/frontend/src/app/services/tag.service.ts index f6c4b573e..167ff57c1 100644 --- a/frontend/src/app/services/tag.service.ts +++ b/frontend/src/app/services/tag.service.ts @@ -7,7 +7,7 @@ import { ApiService } from './api.service'; @Injectable({ - providedIn: 'root' + providedIn: 'root', }) export class TagService { /** all tags from the user */ @@ -18,21 +18,22 @@ export class TagService { } makeTag(name: string, description?: string): Observable { - return this.apiService.createTag(name, description).pipe( - tap(() => this.fetch()) - ); + return this.apiService + .createTag(name, description) + .pipe(tap(() => this.fetch())); } getDocumentTags(document: FoundDocument): Observable { - return this.apiService.documentTags(document).pipe( - map(response => response.tags) - ); + return this.apiService + .documentTags(document) + .pipe(map((response) => response.tags)); } - setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { - return this.apiService.setDocumentTags(document, tagIds).pipe( - map(response => response.tags) - ); + setDocumentTags(document: FoundDocument, tags: Tag[]): Observable { + const tagIds = tags.map((t) => t.id); + return this.apiService + .setDocumentTags(document, tagIds) + .pipe(map((response) => response.tags)); } private fetch() { diff --git a/frontend/src/app/tag/document-tags/document-tags.component.ts b/frontend/src/app/tag/document-tags/document-tags.component.ts index 0f9c4e20c..89a0682fc 100644 --- a/frontend/src/app/tag/document-tags/document-tags.component.ts +++ b/frontend/src/app/tag/document-tags/document-tags.component.ts @@ -5,9 +5,9 @@ import { first, map, mergeMap } from 'rxjs/operators'; import * as _ from 'lodash'; @Component({ - selector: 'ia-document-tags', - 
templateUrl: './document-tags.component.html', - styleUrls: ['./document-tags.component.scss'] + selector: 'ia-document-tags', + templateUrl: './document-tags.component.html', + styleUrls: ['./document-tags.component.scss'], }) export class DocumentTagsComponent implements OnInit { @Input() document: FoundDocument; @@ -17,16 +17,15 @@ export class DocumentTagsComponent implements OnInit { showAddNew = false; - constructor() { } + constructor() {} - ngOnInit(): void { - } + ngOnInit(): void {} - addTag(tagId: number) { - this.document.addTag(tagId); + addTag(tag: Tag) { + this.document.addTag(tag); } removeTag(tag: Tag) { - this.document.removeTag(tag.id); + this.document.removeTag(tag); } } diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html index 758c2ba0a..35cce6bba 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.html +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -1,13 +1,6 @@

-
- -
+
-
diff --git a/frontend/src/app/tag/tag-select/tag-select.component.scss b/frontend/src/app/tag/tag-select/tag-select.component.scss index 30e27c16c..e69de29bb 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.scss +++ b/frontend/src/app/tag/tag-select/tag-select.component.scss @@ -1,7 +0,0 @@ -.tag-select { - select { - padding-top: 0; - padding-bottom: 0; - height: 2em; - } -} From 7055265ef725ab96968da21b2830c33671bfc266 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 8 Nov 2023 14:55:51 +0100 Subject: [PATCH 42/98] Disable add tag button when no selected tag --- frontend/src/app/tag/tag-select/tag-select.component.html | 2 +- frontend/src/app/tag/tag-select/tag-select.component.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html index e9565e380..ed5fb866d 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.html +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -8,7 +8,7 @@
-
diff --git a/frontend/src/app/tag/tag-select/tag-select.component.ts b/frontend/src/app/tag/tag-select/tag-select.component.ts index 9d7291759..7a41e3d9b 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.ts +++ b/frontend/src/app/tag/tag-select/tag-select.component.ts @@ -35,5 +35,6 @@ export class TagSelectComponent { confirm() { this.selection.emit(this.selectedTag); + this.selectedTag = undefined; } } From c31d3bb0364c31100a04ea4e9e3520c001e3dc1f Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 8 Nov 2023 15:41:01 +0100 Subject: [PATCH 43/98] Create tag interface --- frontend/src/app/services/api.service.ts | 2 +- .../tag/tag-select/tag-select.component.html | 23 ++++++++- .../tag/tag-select/tag-select.component.ts | 49 ++++++++++++++++--- 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index b4607cd90..4acf5b75a 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -253,7 +253,7 @@ export class ApiService { public createTag(name: string, description?: string): Observable { const url = this.apiRoute(this.tagApiUrl, 'tags/'); - return this.http.put(url, { name, description }); + return this.http.post(url, { name, description }); } public documentTags(document: FoundDocument): Observable { diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html index ed5fb866d..9284fa404 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.html +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -1,17 +1,36 @@
-
+ +
+ + +
- +
+
+ +
+ +
+
+
+
+
diff --git a/frontend/src/app/dropdown/dropdown.component.scss b/frontend/src/app/dropdown/dropdown.component.scss index e69de29bb..865791e8a 100644 --- a/frontend/src/app/dropdown/dropdown.component.scss +++ b/frontend/src/app/dropdown/dropdown.component.scss @@ -0,0 +1,7 @@ +@import "_utilities"; + +.dropdown-item { + // this prevents the .dropdown item rule from being overwritten + // when the dropdown is used within a

{{field.displayName}}:
Your tags + +
{{field.displayName}}
Your tags - +
+ color: $text !important; +} diff --git a/frontend/src/app/dropdown/dropdown.component.ts b/frontend/src/app/dropdown/dropdown.component.ts index 0ce576a3d..7e08c16c6 100644 --- a/frontend/src/app/dropdown/dropdown.component.ts +++ b/frontend/src/app/dropdown/dropdown.component.ts @@ -17,6 +17,12 @@ export class DropdownComponent implements OnDestroy { @Input() public value: T | undefined = undefined; + @Input() + public disabled = false; + + @Input() + public styleClass: string; + @Input() public options: T[] = []; diff --git a/frontend/src/app/tag/document-tags/document-tags.component.html b/frontend/src/app/tag/document-tags/document-tags.component.html index dc8d37ad8..a38716edf 100644 --- a/frontend/src/app/tag/document-tags/document-tags.component.html +++ b/frontend/src/app/tag/document-tags/document-tags.component.html @@ -10,7 +10,7 @@ -
+
diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html index 9284fa404..8c477204a 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.html +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -1,39 +1,47 @@ -
- -
+
+
-
- -
- - -
- +
+
-
- -
-
-
+ diff --git a/frontend/src/app/tag/tag-select/tag-select.component.scss b/frontend/src/app/tag/tag-select/tag-select.component.scss index e69de29bb..6ec645475 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.scss +++ b/frontend/src/app/tag/tag-select/tag-select.component.scss @@ -0,0 +1,3 @@ +.tag-input { + height: 2em; +} diff --git a/frontend/src/app/tag/tag.module.ts b/frontend/src/app/tag/tag.module.ts index a612e8311..fccbf4184 100644 --- a/frontend/src/app/tag/tag.module.ts +++ b/frontend/src/app/tag/tag.module.ts @@ -2,13 +2,12 @@ import { NgModule } from '@angular/core'; import { SharedModule } from '../shared/shared.module'; import { TagSelectComponent } from './tag-select/tag-select.component'; import { DocumentTagsComponent } from './document-tags/document-tags.component'; -import { DropdownModule } from 'primeng/dropdown'; @NgModule({ declarations: [DocumentTagsComponent, TagSelectComponent], - imports: [SharedModule, DropdownModule], + imports: [SharedModule], exports: [DocumentTagsComponent], }) export class TagModule {} diff --git a/frontend/src/styles.scss b/frontend/src/styles.scss index 112864b5a..149750a77 100644 --- a/frontend/src/styles.scss +++ b/frontend/src/styles.scss @@ -44,3 +44,16 @@ .is-loading:not(.button) { @extend %content-loader; } + +.tag-button { + align-items: center; + border-radius: 3px; + display: inline-flex; + font-size: 0.75rem; + height: 2em; + justify-content: center; + line-height: 1.5; + padding-left: 0.75em; + padding-right: 0.75em; + white-space: nowrap; +} From 91bd385b9430c0440f8f4b68ecce55399c4c2ce8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 Nov 2023 17:00:42 +0100 Subject: [PATCH 50/98] fix foundDocument spec --- frontend/src/app/models/found-document.spec.ts | 4 ++-- frontend/src/mock-data/tag.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/frontend/src/app/models/found-document.spec.ts b/frontend/src/app/models/found-document.spec.ts index 
7f8935b3a..161ee64f2 100644 --- a/frontend/src/app/models/found-document.spec.ts +++ b/frontend/src/app/models/found-document.spec.ts @@ -67,9 +67,9 @@ describe('FoundDocument', () => { const doc = makeDocument({ great_field: 'test' }); expect(doc.tags$.value).toEqual(mockTags); const tag = _.first(mockTags); - doc.removeTag(tag.id); + doc.removeTag(tag); expect(doc.tags$.value.length).toBe(1); - doc.addTag(tag.id); + doc.addTag(tag); expect(doc.tags$.value.length).toBe(2); }); }); diff --git a/frontend/src/mock-data/tag.ts b/frontend/src/mock-data/tag.ts index 2195e17fc..7d08e7a13 100644 --- a/frontend/src/mock-data/tag.ts +++ b/frontend/src/mock-data/tag.ts @@ -29,8 +29,8 @@ export class TagServiceMock { }).pipe(tap(this.fetch.bind(this))); } - setDocumentTags(document: FoundDocument, tagIds: number[]): Observable { - const tags = mockTags.filter(tag => tagIds.includes(tag.id)); + setDocumentTags(document: FoundDocument, tagIds: Tag[]): Observable { + const tags = mockTags.filter(tag => tagIds.includes(tag)); return of(tags); }; From a9cbc7bf2360bf4285686d37df39f6a7acdb61e7 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 8 Nov 2023 17:05:01 +0100 Subject: [PATCH 51/98] do not disable tag dropdown this prevented new tags from being added when there no existing tags to choose from --- frontend/src/app/tag/tag-select/tag-select.component.html | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/frontend/src/app/tag/tag-select/tag-select.component.html b/frontend/src/app/tag/tag-select/tag-select.component.html index 8c477204a..9b1d6a3cf 100644 --- a/frontend/src/app/tag/tag-select/tag-select.component.html +++ b/frontend/src/app/tag/tag-select/tag-select.component.html @@ -4,9 +4,8 @@
From e5169de05aa99722243c56f7ededc1b1f3df8595 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 8 Nov 2023 17:45:13 +0100 Subject: [PATCH 52/98] remove regular expression user warning --- backend/corpora/peaceportal/FIJI/fiji.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index b350bce83..5302f537c 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -19,7 +19,7 @@ class PeaceportalFIJI(PeacePortal): data_directory = settings.PEACEPORTAL_FIJI_DATA es_index = getattr(settings, 'PEACEPORTAL_FIJI_ES_INDEX', 'peaceportal-fiji') es_alias = settings.PEACEPORTAL_ALIAS - filename_pattern = re.compile('\d+') + filename_pattern = re.compile(r'\d+') def sources(self, start, end): logger = logging.getLogger(__name__) From 5492847602ddffb4517c75450752fa47002b48ec Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 8 Nov 2023 17:53:10 +0100 Subject: [PATCH 53/98] add language variable on FieldDefintion --- backend/addcorpus/corpus.py | 2 ++ backend/corpora/peaceportal/peaceportal.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index e1e6f59bb..b11f279bd 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -712,6 +712,7 @@ def __init__(self, visualizations=[], visualization_sort=None, es_mapping={'type': 'text'}, + language=None, search_filter=None, extractor=extract.Constant(None), sortable=None, @@ -735,6 +736,7 @@ def __init__(self, self.visualizations = visualizations self.visualization_sort = visualization_sort self.es_mapping = es_mapping + self.language = language self.indexed = indexed self.hidden = not indexed or hidden self.extractor = extractor diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index f4217dc63..1f721d50b 100644 --- 
a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -154,36 +154,42 @@ def request_media(self, document): transcription_german = FieldDefinition( name='transcription_de', es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='de'), + language='de', hidden=True ) transcription_english = FieldDefinition( name='transcription_en', es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='en'), + language='en', hidden=True ) transcription_hebrew = FieldDefinition( name='transcription_he', # no stopwords / stemmers available es_mapping={'type': 'text'}, + language='he', hidden=True ) transcription_latin = FieldDefinition( name='transcription_la', es_mapping={'type': 'text'}, # no stopwords / stemmers available + language='la', hidden=True ) transcription_greek = FieldDefinition( name='transcription_el', es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='el'), + language='el', hidden=True ) transcription_dutch = FieldDefinition( name='transcription_nl', es_mapping=main_content_mapping(stopword_analysis=True, stemming_analysis=True, language='nl'), + language='nl', hidden=True ) From d4fefca034a13059f6b650b559d3a09f12f2443a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 2 Nov 2023 16:48:47 +0100 Subject: [PATCH 54/98] clarify search relevance meter close #1260 --- .../src/app/search/search-relevance.component.html | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/frontend/src/app/search/search-relevance.component.html b/frontend/src/app/search/search-relevance.component.html index 3732d8097..bc325a4bd 100644 --- a/frontend/src/app/search/search-relevance.component.html +++ b/frontend/src/app/search/search-relevance.component.html @@ -1 +1,9 @@ -{{value}} +{{value}} + From 8fe8d244f56613fe0e577ab45b51ad74be32caf9 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 2 Nov 2023 
16:51:47 +0100 Subject: [PATCH 55/98] use th for row headers --- frontend/src/app/search/search-results.component.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/src/app/search/search-results.component.html b/frontend/src/app/search/search-results.component.html index b9781907d..6da05070c 100644 --- a/frontend/src/app/search/search-results.component.html +++ b/frontend/src/app/search/search-results.component.html @@ -41,9 +41,9 @@

- + + @@ -22,15 +23,22 @@ + - + + diff --git a/frontend/src/app/history/search-history/search-history.component.scss b/frontend/src/app/history/search-history/search-history.component.scss index 564cc1024..f43e315f6 100644 --- a/frontend/src/app/history/search-history/search-history.component.scss +++ b/frontend/src/app/history/search-history/search-history.component.scss @@ -1,26 +1,3 @@ -@import "../../../_utilities"; - -table { - margin-left: 100px; - margin-bottom: 50px; -} - -th, td { - padding-right: 10px; - padding-left: 10px; -} - th { white-space: nowrap; } - -tr { - border: 1px solid black; - border-collapse: separate; -} - -tr:hover { - background-color: $contrast-primary-color; - cursor: pointer; - color: $text-primary-color; -} diff --git a/frontend/src/app/history/search-history/search-history.component.ts b/frontend/src/app/history/search-history/search-history.component.ts index 7e6addc15..731790dc6 100644 --- a/frontend/src/app/history/search-history/search-history.component.ts +++ b/frontend/src/app/history/search-history/search-history.component.ts @@ -1,11 +1,12 @@ import { Component, OnInit } from '@angular/core'; -import { Router } from '@angular/router'; +import { Params, Router } from '@angular/router'; import * as _ from 'lodash'; import { esQueryToQueryModel } from '../../utils/es-query'; import { QueryDb } from '../../models/index'; import { CorpusService, QueryService } from '../../services/index'; import { HistoryDirective } from '../history.directive'; import { findByName } from '../../utils/utils'; +import { faLink } from '@fortawesome/free-solid-svg-icons'; @Component({ selector: 'search-history', @@ -23,6 +24,8 @@ export class SearchHistoryComponent extends HistoryDirective implements OnInit { super(corpusService); } + linkIcon = faLink; + async ngOnInit() { this.retrieveCorpora(); this.queryService.retrieveQueries().then( @@ -39,11 +42,11 @@ export class SearchHistoryComponent extends HistoryDirective implements OnInit { return 
query; } - returnToSavedQuery(query: QueryDb) { - this.router.navigate(['/search', query.corpus], - {queryParams: query.queryModel.toQueryParams()}); - if (window) { - window.scrollTo(0, 0); - } + routerLink(query: QueryDb): string[] { + return ['/search', query.corpus]; + } + + queryParams(query: QueryDb): Params { + return query.queryModel.toQueryParams(); } } From fc063bc003dad76c24b635474dbddbd68bb1102f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 3 Nov 2023 11:16:27 +0100 Subject: [PATCH 64/98] corpus info page: use tabs component enables keyboard interaction on tabs --- .../corpus-info/corpus-info.component.html | 38 +++++++++---------- .../app/corpus-info/corpus-info.component.ts | 23 +---------- 2 files changed, 20 insertions(+), 41 deletions(-) diff --git a/frontend/src/app/corpus-info/corpus-info.component.html b/frontend/src/app/corpus-info/corpus-info.component.html index d97f0a875..929f52d88 100644 --- a/frontend/src/app/corpus-info/corpus-info.component.html +++ b/frontend/src/app/corpus-info/corpus-info.component.html @@ -21,28 +21,28 @@
-
- - - -
+
+ + +
+
-
+
- + +
-
+ -
- + +
+
+
diff --git a/frontend/src/app/corpus-info/corpus-info.component.ts b/frontend/src/app/corpus-info/corpus-info.component.ts index a1fe9b065..2542be144 100644 --- a/frontend/src/app/corpus-info/corpus-info.component.ts +++ b/frontend/src/app/corpus-info/corpus-info.component.ts @@ -16,25 +16,6 @@ export class CorpusInfoComponent implements OnInit { wordModelDocumentation: string; fieldCoverage: FieldCoverage; - tabs = [ - { - name: 'general', - title: 'General information', - property: 'descriptionpage', - }, { - name: 'fields', - title: 'Fields', - property: 'fields', - }, { - name: 'models', - title: 'Word models', - property: 'word_models_present', - } - ]; - - currentTab = new BehaviorSubject<'general'|'fields'|'models'>( - 'general' - ); constructor(private corpusService: CorpusService, private apiService: ApiService, private wordModelsService: WordmodelsService) { } @@ -47,9 +28,7 @@ export class CorpusInfoComponent implements OnInit { if (corpus.descriptionpage) { this.apiService.corpusdescription({filename: corpus.descriptionpage, corpus: corpus.name}) .then(marked.parse) - .then(doc => this.description = doc); - } else { - this.currentTab.next('fields'); + .then(doc => this.description = doc); } this.apiService.fieldCoverage(corpus.name).then( result => this.fieldCoverage = result From 7053c591f1e526dc6f5d5499964b1a386dc272fe Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 3 Nov 2023 11:05:17 +0100 Subject: [PATCH 65/98] corpus index: show focus on corpus action link --- .../corpus-selector/corpus-selector.component.scss | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss index 22dbd5ffb..0c9d62e0d 100644 --- a/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss +++ 
b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss @@ -45,7 +45,11 @@ &:hover { border-color: $text-primary-color; - background-color: $contrast-primary-accent-color; + background-color: $contrast-primary-accent-color; + } + + &:focus { + border-width: 3px; } } From 6b8e7a27af393d6a7e6ff77c5e2ea905d7907c29 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 3 Nov 2023 12:30:25 +0100 Subject: [PATCH 66/98] search/download history: no empty header cells --- .../download-history.component.html | 21 +++++++------------ .../search-history.component.html | 15 +++++-------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/frontend/src/app/history/download-history/download-history.component.html b/frontend/src/app/history/download-history/download-history.component.html index 9c471e6a1..51f893d7a 100644 --- a/frontend/src/app/history/download-history/download-history.component.html +++ b/frontend/src/app/history/download-history/download-history.component.html @@ -7,28 +7,21 @@
- - + + - - - - - + + + + + - - - - - - - diff --git a/frontend/src/app/history/search-history/search-history.component.html b/frontend/src/app/history/search-history/search-history.component.html index dd6eb1c8f..da851a4b8 100644 --- a/frontend/src/app/history/search-history/search-history.component.html +++ b/frontend/src/app/history/search-history/search-history.component.html @@ -7,23 +7,18 @@ - - - - + + + + - + - - - - - From ae9ddd77b63475157d14fa565778c336c5fa803f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 3 Nov 2023 12:31:37 +0100 Subject: [PATCH 67/98] use columnFilter component for column filters --- .../download-history/download-history.component.html | 10 +++++++--- .../search-history/search-history.component.html | 9 +++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/frontend/src/app/history/download-history/download-history.component.html b/frontend/src/app/history/download-history/download-history.component.html index 51f893d7a..cd8258925 100644 --- a/frontend/src/app/history/download-history/download-history.component.html +++ b/frontend/src/app/history/download-history/download-history.component.html @@ -18,9 +18,13 @@ diff --git a/frontend/src/app/history/search-history/search-history.component.html b/frontend/src/app/history/search-history/search-history.component.html index da851a4b8..d73e01c91 100644 --- a/frontend/src/app/history/search-history/search-history.component.html +++ b/frontend/src/app/history/search-history/search-history.component.html @@ -16,8 +16,13 @@ From 1e108cae8f70ede0d8aa2dc8097c140324268569 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 3 Nov 2023 12:37:50 +0100 Subject: [PATCH 68/98] Add header to search history and downloads page --- .../download-history/download-history.component.html | 4 +++- .../search-history/search-history.component.html | 12 ++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git 
a/frontend/src/app/history/download-history/download-history.component.html b/frontend/src/app/history/download-history/download-history.component.html index cd8258925..e24627184 100644 --- a/frontend/src/app/history/download-history/download-history.component.html +++ b/frontend/src/app/history/download-history/download-history.component.html @@ -1,7 +1,9 @@
+

Downloads

+ -
+
Downloads
diff --git a/frontend/src/app/history/search-history/search-history.component.html b/frontend/src/app/history/search-history/search-history.component.html index d73e01c91..ee9fe7977 100644 --- a/frontend/src/app/history/search-history/search-history.component.html +++ b/frontend/src/app/history/search-history/search-history.component.html @@ -1,8 +1,16 @@
+

Search history

+ +

+ Here you can see an overview of your search history. + You can turn search history on or off in + settings. +

+ -
- Search History +
+ Search history
From 82099d01c70989499fa48673b10f8097037cdf1c Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 3 Nov 2023 12:53:35 +0100 Subject: [PATCH 69/98] fix test teardown errors --- frontend/src/app/visualization/wordcloud/wordcloud.component.ts | 2 +- .../app/word-models/related-words/related-words.component.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts index 8f40567be..dac74ec42 100644 --- a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts +++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts @@ -78,7 +78,7 @@ export class WordcloudComponent implements OnChanges, OnDestroy { } ngOnDestroy(): void { - this.chart.destroy(); + this.chart?.destroy(); } loadData() { diff --git a/frontend/src/app/word-models/related-words/related-words.component.ts b/frontend/src/app/word-models/related-words/related-words.component.ts index 559b00390..9f0580e6a 100644 --- a/frontend/src/app/word-models/related-words/related-words.component.ts +++ b/frontend/src/app/word-models/related-words/related-words.component.ts @@ -31,7 +31,7 @@ export class RelatedWordsComponent extends ParamDirective implements OnChanges { zoomedInData: WordSimilarity[][]; // data when focusing on a single time interval: shows nearest neighbours from that period faCheck = faCheck; - nullableParameters: ['neighbours']; + nullableParameters = ['neighbours']; constructor( route: ActivatedRoute, From 9c0e40c5d36bbcae68ac51576bf77ecddffac98d Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 9 Nov 2023 14:07:41 +0100 Subject: [PATCH 70/98] fix problems with es_settings --- backend/addcorpus/es_settings.py | 48 +++++++++---- backend/addcorpus/save_corpus.py | 5 +- backend/addcorpus/tests/test_es_settings.py | 70 +++++++++++++++++++ backend/corpora/ecco/ecco.py | 6 +- backend/corpora/peaceportal/epidat.py | 5 +- 
backend/corpora/peaceportal/peaceportal.py | 10 +-- .../corpora/peaceportal/tests/test_import.py | 1 - 7 files changed, 115 insertions(+), 30 deletions(-) diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 010ddc757..4bc194496 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -5,6 +5,15 @@ HERE = os.path.abspath(os.path.dirname(__file__)) NLTK_DATA_PATH = os.path.join(HERE, 'nltk_data') +# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html] +AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian', + 'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch', + 'english', 'estonian', 'finnish', 'french', 'galician', + 'german', 'greek', 'hindi', 'hungarian', 'indonesian', + 'irish', 'italian', 'latvian', 'lithuanian', 'norwegian', + 'persian', 'portuguese', 'romanian', 'russian', 'sorani', + 'spanish', 'swedish', 'turkish', 'thai'] + def get_language_key(language_code): ''' Get the nltk stopwords file / elasticsearch stemmer name for a language code @@ -44,6 +53,8 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): stemmer_filter_name = 'stemmer' stemmed_analyzer_name = 'stemmed' + set_char_filter(settings) + for language in languages: # do not attach language isocodes if there is just one language language_string = language if len(languages) > 1 else None @@ -57,9 +68,8 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): settings, add_language_string(stopword_filter_name, language_string), add_language_string(clean_analyzer_name, language_string), - language ) - if stemming_analyzer: + if stemming_analyzer and get_language_key(language) in AVAILABLE_ES_STEMMERS: set_stemmed_analyzer( settings, add_language_string(stopword_filter_name, language_string), @@ -118,23 +128,31 @@ def get_stopwords_from_settings(es_settings): return 
stopwords def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language): - settings['analysis']['filter'][stemmer_filter_name] = make_stemmer_filter(language) - settings["analysis"]['analyzer'][stemmed_analyzer_name] = make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name) + filters = settings['analysis'].get('filter', {}) + filters.update({stemmer_filter_name: make_stemmer_filter(language)}) + settings['analysis']['filter'] = filters + analyzers = settings['analysis'].get('analyzer') + analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name)}) + settings['analysis']['analyzer'] = analyzers + +def set_char_filter(settings): + settings["analysis"] = { + "char_filter": { "number_filter": number_filter() } + } def set_stopword_filter(settings, stopword_filter_name, language): stopword_filter = make_stopword_filter(stopword_filter_name, language) if not stopword_filter: return False - settings["analysis"] = { - "analyzer": {}, - "char_filter":{ "number_filter": number_filter() }, - 'filter': { - stopword_filter_name: stopword_filter - } - } + filters = settings['analysis'].get('filter', {}) + filters.update({ + stopword_filter_name: stopword_filter + }) + settings['analysis']['filter'] = filters return True -def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name, language): - settings["analysis"]['analyzer'][clean_analyzer_name] = make_clean_analyzer( - stopword_filter_name - ) \ No newline at end of file +def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name): + clean_analyzer = make_clean_analyzer(stopword_filter_name) + analyzers = settings['analysis'].get('analyzer', {}) + analyzers.update({clean_analyzer_name: clean_analyzer}) + settings["analysis"]['analyzer'] = analyzers \ No newline at end of file diff --git a/backend/addcorpus/save_corpus.py b/backend/addcorpus/save_corpus.py index 1c9010754..82a0db368 100644 --- 
a/backend/addcorpus/save_corpus.py +++ b/backend/addcorpus/save_corpus.py @@ -48,7 +48,10 @@ def _copy_corpus_attributes(corpus_definition: CorpusDefinition, configuration: 'word_models_present', ] - defined = get_defined_attributes(corpus_definition, attributes_to_copy) + try: + defined = get_defined_attributes(corpus_definition, attributes_to_copy) + except Exception as e: + raise e for attr, value in defined.items(): configuration.__setattr__(attr, value) diff --git a/backend/addcorpus/tests/test_es_settings.py b/backend/addcorpus/tests/test_es_settings.py index e69de29bb..9f94b1b69 100644 --- a/backend/addcorpus/tests/test_es_settings.py +++ b/backend/addcorpus/tests/test_es_settings.py @@ -0,0 +1,70 @@ +import pytest + +from addcorpus.es_settings import es_settings + +char_filter_tokenizer = {'char_filter': ['number_filter'], 'tokenizer': 'standard'} + +test_cases = { + 'single_language': { + 'languages': ['en'], + 'stopword': True, + 'stemming': True, + 'expected': { + 'filter': { + 'stemmer': {'type': 'stemmer', 'language': 'english'}, + 'stopwords': {'type': 'stop', 'stopwords': list()} + }, + 'analyzer': { + 'clean': { + 'filter': ['lowercase', 'stopwords'], + **char_filter_tokenizer + }, + 'stemmed': { + 'filter': ['lowercase', 'stopwords', 'stemmer'], + **char_filter_tokenizer + } + } + } + }, + 'multiple_languages': { + 'languages': ['en', 'de'], + 'stopword': True, + 'stemming': True, + 'expected': { + 'filter': { + 'stemmer_de': {'type': 'stemmer', 'language': 'german'}, + 'stopwords_de': {'type': 'stop', 'stopwords': list()}, + 'stemmer_en': {'type': 'stemmer', 'language': 'english'}, + 'stopwords_en': {'type': 'stop', 'stopwords': list()}, + }, + 'analyzer': { + 'clean_de': { + 'filter': ['lowercase', 'stopwords_de'], + **char_filter_tokenizer + }, + 'stemmed_de': { + 'filter': ['lowercase', 'stopwords_de', 'stemmer_de'], + **char_filter_tokenizer + }, + 'clean_en': { + 'filter': ['lowercase', 'stopwords_en'], + **char_filter_tokenizer + }, + 
'stemmed_en': { + 'filter': ['lowercase', 'stopwords_en', 'stemmer_en'], + **char_filter_tokenizer + } + } + } + } +} + +@pytest.mark.parametrize('test_config', list(test_cases.values())) +def test_es_settings(test_config): + settings = es_settings(test_config['languages'], test_config['stopword'], test_config['stemming']) + assert settings['analysis']['filter'].keys() == test_config['expected']['filter'].keys() + assert settings['analysis']['analyzer'].keys() == test_config['expected']['analyzer'].keys() + for analyzer in settings['analysis']['analyzer'].keys(): + assert settings['analysis']['analyzer'][analyzer]['filter'][1] in settings['analysis']['filter'] + if analyzer.startswith('stemmed'): + assert settings['analysis']['analyzer'][analyzer]['filter'][2] in settings['analysis']['filter'] \ No newline at end of file diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index 341970e5e..0a97b25d6 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -30,10 +30,6 @@ class Ecco(XMLCorpusDefinition): min_date = datetime(year=1700, month=1, day=1) max_date = datetime(year=1800, month=12, day=31) - @property - def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) - data_directory = settings.ECCO_DATA es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco') image = 'ecco.jpg' @@ -49,7 +45,7 @@ def es_settings(self): @property def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) def sources(self, start=min_date, end=max_date): logging.basicConfig(filename='ecco.log', level=logging.INFO) diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index 917f7c59c..c0c4b42cb 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -14,10 +14,7 @@ class 
PeaceportalEpidat(PeacePortal): es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 'peaceportal-epidat') es_alias = settings.PEACEPORTAL_ALIAS - languages = ['german', 'hebrew', 'english', 'dutch'] - - def es_settings(self): - return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) + languages = ['de', 'he', 'en', 'nl'] def __init__(self): super().__init__() diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 1f721d50b..c81f9fa06 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -36,7 +36,8 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): scan_image_type = 'image/png' # fields below are required by code but not actually used min_date = datetime(year=746, month=1, day=1) - image = 'bogus' + image = 'bogus.jpg' + category = 'inscription' data_directory = 'bogus' # Data overrides from .common.XMLCorpus @@ -50,8 +51,9 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): external_file_folder = None languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-) + @property def es_settings(self): - return es_settings(self.languages, True, True) + return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) def sources(self, start, end): logger = logging.getLogger(__name__) @@ -166,8 +168,8 @@ def request_media(self, document): ) transcription_hebrew = FieldDefinition( - name='transcription_he', # no stopwords / stemmers available - es_mapping={'type': 'text'}, + name='transcription_he', # no stemmers available + es_mapping=main_content_mapping(stopword_analysis=True, language='he'), language='he', hidden=True ) diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index f59c54b43..5790a2b9e 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ 
b/backend/corpora/peaceportal/tests/test_import.py @@ -43,7 +43,6 @@ def test_imports(peace_corpus_settings, corpus_object): resulted_fields = set() docs = get_documents(corpus, start, end) - print(list(docs)) for target in corpus_object.get('docs'): doc = next(docs) for key in target: From 8efab51f7a580df81f628c77001f8d63bbe0940f Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 9 Nov 2023 14:08:02 +0100 Subject: [PATCH 71/98] error string for NotImplementedError --- backend/addcorpus/corpus.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index b11f279bd..4c8cd4256 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -33,37 +33,37 @@ class CorpusDefinition(object): @property def title(self): ''' - Path to source data directory. + Title of the corpus ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing title') @property def description(self): ''' Short description of the corpus ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing description') @property def data_directory(self): ''' Path to source data directory. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing data_directory') @property def min_date(self): ''' Minimum timestamp for data files. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing min_date') @property def max_date(self): ''' Maximum timestamp for data files. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing max_date') ''' @@ -81,14 +81,14 @@ def category(self): See addcorpus.constants.CATEGORIES for options ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing category') @property def es_index(self): ''' ElasticSearch index name. 
''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing category') ''' Elasticsearch alias. Defaults to None. @@ -111,7 +111,7 @@ def fields(self): the `Field` class, containing information about each attribute. MUST include a field with `name='id'`. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing fields') ''' @@ -139,7 +139,7 @@ def image(self): Name of the corpus image. Should be relative path from a directory 'images' in the same directory as the corpus definition file. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing image') ''' MIME type of scanned documents (images) @@ -241,7 +241,7 @@ def sources(self, start=datetime.min, end=datetime.max): empty or contains only a timestamp; but any data that is to be extracted without reading the file itself can be specified there. ''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing sources') def source2dicts(self, sources): ''' @@ -249,7 +249,7 @@ def source2dicts(self, sources): The dictionaries are created from this corpus' `Field`s. 
''' - raise NotImplementedError() + raise NotImplementedError('CorpusDefinition missing source2dicts') def documents(self, sources=None): ''' From 8609858d83b8b93763c08a7f81a6e855d30cb098 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 9 Nov 2023 16:21:38 +0100 Subject: [PATCH 72/98] remove XMLCorpus restriction --- backend/corpora/peaceportal/FIJI/fiji.py | 7 ++- backend/corpora/peaceportal/conftest.py | 1 + backend/corpora/peaceportal/epidat.py | 6 ++- backend/corpora/peaceportal/iis.py | 16 +++++-- backend/corpora/peaceportal/peaceportal.py | 43 ++++++++----------- .../corpora/peaceportal/tests/test_import.py | 6 +++ backend/corpora/peaceportal/tol.py | 34 +++------------ backend/corpora/utils/exclude_fields.py | 9 ++++ backend/corpora/utils/test_corpora_utils.py | 17 ++++++++ 9 files changed, 78 insertions(+), 61 deletions(-) create mode 100644 backend/corpora/utils/exclude_fields.py create mode 100644 backend/corpora/utils/test_corpora_utils.py diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 5302f537c..0b925f08c 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -5,11 +5,12 @@ from django.conf import settings +from addcorpus.corpus import XMLCorpusDefinition from addcorpus.extract import XML, Constant, Combined from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor - -class PeaceportalFIJI(PeacePortal): +class PeaceportalFIJI(PeacePortal, XMLCorpusDefinition): ''' This is a fresh version of Ortal-Paz Saar's 'Funerary Inscriptions of Jews from Italy' corpus, updated to align with the PEACE portal index. 
This mostly implies that there are less fields @@ -176,6 +177,8 @@ def __init__(self): transform=lambda x: get_text_in_language(x) ) + self.fields = exclude_fields_without_extractor(self.fields) + def transform_age(age): if age in ['?', 'none', 'none?']: diff --git a/backend/corpora/peaceportal/conftest.py b/backend/corpora/peaceportal/conftest.py index 403909f0e..0584ac323 100644 --- a/backend/corpora/peaceportal/conftest.py +++ b/backend/corpora/peaceportal/conftest.py @@ -6,6 +6,7 @@ @pytest.fixture() def peace_corpus_settings(settings): settings.CORPORA = { + 'peaceportal': os.path.join(here, 'peaceportal.py'), 'peaceportal-epidat': os.path.join(here, 'epidat.py'), 'peaceportal-fiji': os.path.join(here, 'FIJI', 'fiji.py'), 'peaceportal-iis': os.path.join(here, 'iis.py'), diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index c0c4b42cb..adae33d26 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -3,12 +3,14 @@ from django.conf import settings +from addcorpus.corpus import XMLCorpusDefinition from addcorpus.extract import XML, Constant, HTML, Combined from addcorpus.es_settings import es_settings from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor -class PeaceportalEpidat(PeacePortal): +class PeaceportalEpidat(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_EPIDAT_DATA es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 'peaceportal-epidat') @@ -231,6 +233,8 @@ def __init__(self): transform=lambda x: get_text_in_language(x) ) + self.fields = exclude_fields_without_extractor(self.fields) + def convert_sex(values): if not values: diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index c51211a8a..e078cf483 100644 --- 
a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -1,19 +1,27 @@ from copy import copy +from os.path import join from django.conf import settings +from addcorpus.corpus import XMLCorpusDefinition from addcorpus.extract import XML, Constant, HTML, ExternalFile, Combined from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor - -class PeaceportalIIS(PeacePortal): +class PeaceportalIIS(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_IIS_DATA - external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA es_index = getattr(settings, 'PEACEPORTAL_IIS_ES_INDEX', 'peaceportal-iis') es_alias = settings.PEACEPORTAL_ALIAS + def add_metadata(self, filename): + external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA + return { + 'associated_file': join(external_file_folder, filename) + } + def __init__(self): super().__init__() + self.external_file_folder = settings.PEACEPORTAL_IIS_TXT_DATA self.source_database.extractor = Constant( value='Inscriptions of Israel/Palestine (Brown University)' ) @@ -248,6 +256,8 @@ def __init__(self): transform=lambda x: get_text_in_language(x) ) + self.fields = exclude_fields_without_extractor(self.fields) + def extract_transcript(filestream): text = filestream.read().strip() diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index c81f9fa06..e7ba937f6 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -7,15 +7,13 @@ from django.conf import settings -from addcorpus.corpus import ParentCorpusDefinition, FieldDefinition, XMLCorpusDefinition +from addcorpus.corpus import ParentCorpusDefinition, FieldDefinition from addcorpus.es_mappings import int_mapping, keyword_mapping, main_content_mapping, text_mapping from 
addcorpus.es_settings import es_settings from addcorpus.extract import Constant from addcorpus.filters import MultipleChoiceFilter, RangeFilter - - -class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): +class PeacePortal(ParentCorpusDefinition): ''' Base class for corpora in the PEACE portal. @@ -48,7 +46,6 @@ class PeacePortal(ParentCorpusDefinition, XMLCorpusDefinition): non_match_msg = 'Skipping XML file with nonmatching name {}' # overwrite below in child class if you need to extract the (converted) transcription # from external files. See README. - external_file_folder = None languages = ['en', 'de', 'nl', 'he', 'la', 'el'] # el stands for modern Greek (1500-) @property @@ -56,24 +53,27 @@ def es_settings(self): return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) def sources(self, start, end): - logger = logging.getLogger(__name__) for directory, _, filenames in os.walk(self.data_directory): for filename in filenames: name, extension = op.splitext(filename) full_path = op.join(directory, filename) - - if extension != '.xml': - logger.debug(self.non_xml_msg.format(full_path)) + if not self.validate_extension(extension, full_path): continue - - metadata = {} - if self.external_file_folder: - metadata = { - # applies only to iis corpus - 'associated_file': os.path.join(self.external_file_folder, filename) - } - + metadata = self.add_metadata(filename) yield full_path, metadata + + def add_metadata(self, filename): + return {} + + def validate_extension(self, extension, full_path): + ''' + Check that the file is valid for this corpus. 
+ So far, all PeacePortal corpora are XML, but may include CSV corpora in the future + ''' + logger = logging.getLogger(__name__) + if extension == 'xml': + return True + logger.debug(self.non_xml_msg.format(full_path)) def request_media(self, document): images = document['fieldValues']['images'] @@ -353,15 +353,6 @@ def request_media(self, document): display_name='Date of death', ) - #define fields property so it can be set in __init__ - @property - def fields(self): - return self._fields - - @fields.setter - def fields(self, value): - self._fields = value - def __init__(self): self.fields = [ self._id, diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index 5790a2b9e..17822b616 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -33,8 +33,14 @@ def corpus_test_name(corpus_spec): @pytest.mark.parametrize("corpus_object", CORPUS_TEST_DATA, ids=corpus_test_name) def test_imports(peace_corpus_settings, corpus_object): + parent_corpus = load_corpus_definition('peaceportal') corpus = load_corpus_definition(corpus_object.get('name')) + print(corpus.add_metadata('somefile.txt')) assert len(os.listdir(os.path.abspath(corpus.data_directory))) != 0 + fully_specified = ['peaceportal-iis', 'peaceportal-tol'] + if corpus_object.get('name') not in fully_specified: + # only IIS / TOL have all fields + assert len(corpus.fields) != len(parent_corpus.fields) start = corpus_object['start'] if 'start' in corpus_object else corpus.min_date end = corpus_object['end'] if 'end' in corpus_object else corpus.max_date diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py index be1d850b5..2c5a82e3f 100644 --- a/backend/corpora/peaceportal/tol.py +++ b/backend/corpora/peaceportal/tol.py @@ -3,11 +3,12 @@ from django.conf import settings +from addcorpus.corpus import XMLCorpusDefinition from addcorpus.extract import XML, Constant, 
HTML, Combined from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language +from corpora.utils.exclude_fields import exclude_fields_without_extractor - -class PeaceportalTOL(PeacePortal): +class PeaceportalTOL(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_TOL_DATA es_index = getattr(settings, 'PEACEPORTAL_TOL_ES_INDEX', 'peaceportal-tol') es_alias = settings.PEACEPORTAL_ALIAS @@ -15,6 +16,7 @@ class PeaceportalTOL(PeacePortal): languages = ['en', 'nl', 'he'] def __init__(self): + super().__init__() self.source_database.extractor = Constant( value='Medieval funerary inscriptions from Toledo' ) @@ -219,33 +221,7 @@ def __init__(self): transform=lambda x: get_text_in_language(x) ) - self.fields = [ - self.bibliography, - self.comments, - self.coordinates, - self.country, - self.dates_of_death, - self.iconography, - self._id, - self.images, - self.language, - self.location_details, - self.material, - self.material_details, - self.names, - self.not_after, - self.not_before, - self.region, - self.settlement, - self.sex, - self.source_database, - self.transcription, - self.transcription_dutch, - self.transcription_english, - self.transcription_hebrew, - self.url, - self.year - ] + self.fields = exclude_fields_without_extractor(self.fields) def convert_sex(values): diff --git a/backend/corpora/utils/exclude_fields.py b/backend/corpora/utils/exclude_fields.py new file mode 100644 index 000000000..bccc58792 --- /dev/null +++ b/backend/corpora/utils/exclude_fields.py @@ -0,0 +1,9 @@ +from addcorpus import extract + +def has_extractor(field): + if type(field.extractor) != extract.Constant: + return True + return field.extractor.apply() != None + +def exclude_fields_without_extractor(fields): + return list(filter(has_extractor, fields)) \ No newline at end of file diff --git a/backend/corpora/utils/test_corpora_utils.py 
b/backend/corpora/utils/test_corpora_utils.py new file mode 100644 index 000000000..960381186 --- /dev/null +++ b/backend/corpora/utils/test_corpora_utils.py @@ -0,0 +1,17 @@ +from addcorpus.corpus import FieldDefinition +from addcorpus.extract import Constant + +from corpora.utils import exclude_fields + +def test_exclude_fields(): + fields = [ + FieldDefinition( + name='test1', + extractor=Constant('some value') + ), + FieldDefinition( + name='test2' + ) + ] + new_fields = exclude_fields.exclude_fields_without_extractor(fields) + assert new_fields != fields From e2051b913dfbef304486d598eb866f226f23bd31 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 9 Nov 2023 16:33:22 +0100 Subject: [PATCH 73/98] fix tests --- backend/corpora/peaceportal/peaceportal.py | 2 +- backend/corpora/peaceportal/tests/test_import.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index e7ba937f6..75196b709 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -71,7 +71,7 @@ def validate_extension(self, extension, full_path): So far, all PeacePortal corpora are XML, but may include CSV corpora in the future ''' logger = logging.getLogger(__name__) - if extension == 'xml': + if extension == '.xml': return True logger.debug(self.non_xml_msg.format(full_path)) diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index 17822b616..8b9ec4424 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -12,12 +12,12 @@ 'n_documents': 2 }, { - 'name': 'peaceportal-fiji', + 'name': 'peaceportal-iis', 'docs': [], 'n_documents': 3 }, { - 'name': 'peaceportal-iis', + 'name': 'peaceportal-fiji', 'docs': [], 'n_documents': 3 }, @@ -35,7 +35,6 @@ def corpus_test_name(corpus_spec): def 
test_imports(peace_corpus_settings, corpus_object): parent_corpus = load_corpus_definition('peaceportal') corpus = load_corpus_definition(corpus_object.get('name')) - print(corpus.add_metadata('somefile.txt')) assert len(os.listdir(os.path.abspath(corpus.data_directory))) != 0 fully_specified = ['peaceportal-iis', 'peaceportal-tol'] if corpus_object.get('name') not in fully_specified: From a8b887053db997c5094f6573b622d3cf02e945ff Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 9 Nov 2023 16:48:24 +0100 Subject: [PATCH 74/98] add expected values in import test --- backend/corpora/peaceportal/FIJI/fiji.py | 2 +- .../corpora/peaceportal/tests/test_import.py | 212 +++++++++++++++++- 2 files changed, 209 insertions(+), 5 deletions(-) diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 0b925f08c..8dd033df5 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -56,7 +56,7 @@ def __init__(self): ) # the year is commented out: need to have not before / not after fields - # this is advisable since often we only roughly now the century + # this is advisable since often we only roughly know the century # self.year.extractor = XML( # tag=['teiHeader', 'fileDesc', 'sourceDesc', # 'msDesc', 'history', 'origin', 'origDate'], diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index 8b9ec4424..9ba54de97 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -8,22 +8,226 @@ CORPUS_TEST_DATA = [ { 'name': 'peaceportal-epidat', - 'docs': [], + 'docs': [{ + "id": "blr-4", + "url": "http://www.steinheim-institut.de:80/cgi-bin/epidat?id=blr-4", + "year": "1865", + "not_before": "1865", + "not_after": None, + "source_database": "Epidat (Steinheim Institute)", + "transcription": """Hier ruhet +der Kaufmann +Nathan Schönfeld +geb. d. 4. April 1812 +gest. d. 
[28.] Februar 1865 +‎‏פ״נ‏‎ +‎‏איש חמדות יקר רוח אוהב‏‎ +‎‏צדק ופועל טוב כ״ה נתן‏‎ +‎‏שאנפעלד נולד ח׳ של פסח‏‎ +‎‏תקע״ב ונפטר בשם טוב יום ג׳‏‎ +‎‏ב׳ אדר תרכ״ה לפ״ק‏‎ +‎‏תנצב״ה‏‎""", + "names": "Natan Schönfeld (Nathan Schönfeld)", + "sex": [ + "M" + ], + "dates_of_death": [ + "1865-02-28" + ], + "country": "Germany", + "region": "Thuringa", + "settlement": "Bleicherode", + "location_details": "Jewish Cemetery", + "language": [ + "Hebrew", + "German" + ], + "iconography": None, + "images": [ + "http://steinheim-institut.de/daten/picsblr/xl/0004_blr_2012.jpg", + "http://steinheim-institut.de/daten/picsblr/xl/0004rblr_2012.jpg", + "http://steinheim-institut.de/daten/picsblr/xl/0004dblr_2012.jpg" + ], + "coordinates": "51.434387 10.571183", + "material": [ + "Stone" + ], + "material_details": "stone", + "bibliography": None, + "comments": """OBJECTTYPE: +sepulchral monument + +""", + "transcription_de": None, + "transcription_he": "‎‏פ״נ‏‎ ‎‏איש חמדות יקר רוח אוהב‏‎ ‎‏צדק ופועל טוב כ״ה נתן‏‎ ‎‏שאנפעלד נולד ח׳ של פסח‏‎ ‎‏תקע״ב ונפטר בשם טוב יום ג׳‏‎ ‎‏ב׳ אדר תרכ״ה לפ״ק‏‎ ‎‏תנצב״ה‏‎", + "transcription_en": "", + "transcription_nl": "Hier ruhet" + }], 'n_documents': 2 }, { 'name': 'peaceportal-iis', - 'docs': [], + 'docs': [{ + "id": "akld0002", + "url": "https://library.brown.edu/iip/viewinscr/akld0002", + "year": "0001", + "not_before": "0001", + "not_after": "0100", + "source_database": "Inscriptions of Israel/Palestine (Brown University)", + "transcription": """Χάρητος +Χάρητος +Χάρητος +Χάρητος""", + "sex": "Unknown", + "country": "Israel/Palestine", + "region": "Judaea", + "settlement": "Jerusalem", + "location_details": [ + "Judaea Jerusalem Akeldama Cave 2 chamber B", + "", + "" + ], + "language": [ + "Greek", + None + ], + "iconography": "Painted Red", + "material": [ + "Limestone", + "Stone" + ], + "material_details": "#limestone", + "bibliography": [ + "Shadmi, T. (1996). The Ossuaries and the Sarcophagus. In G. Avni & Z. 
Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 41–55). Jerusalem: Israel Antiquities Authority. (page 52)", + "Ilan, T. (1996). The Ossuary and Sarcophagus Inscriptions. In G. Avni & Z. Greenhut (Eds.), The Akeldama Tombs: Three Burial Caves in the Kidron Valley, Jerusalem (pp. 57–72). Jerusalem: Israel Antiquities Authority. (page 58)" + ], + "comments": """CONDITION: + (#complete.intact) + + +LAYOUT: +once on each side + + +OBJECTTYPE: +ossuary + + +DIMENSIONS: +H: 64 W: 29 D: 35 + + +HANDNOTES: + (#impressed.inscribed) + +""", + "transcription_he": "", + "transcription_la": "", + "transcription_el": "Χάρητος Χάρητος Χάρητος Χάρητος", + "transcription_en": "of Chares" + }], 'n_documents': 3 }, { 'name': 'peaceportal-fiji', - 'docs': [], + 'docs': [{ + "id": "299", + "source_database": "Funerary Inscriptions of Jews from Italy (Utrecht University)", + "transcription": "Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν.", + "names": "Felicissima ( the commemorator) Emarantus ( the decaesed) (Φη<λ>ικίσσιμα Ἠμαράντῳ)", + "sex": [ + "M", + "F" + ], + "age": None, + "country": "Italy", + "settlement": "Rome, Monteverde", + "location_details": "Museo Vaticano, lapidario ebraico ex-Lateranense; inv.no.30762", + "language": [ + "Greek" + ], + "iconography": "none", + "material": [ + "Stone", + "Marble" + ], + "bibliography": [ + "Noy 1995, p. 69-70 (83)" + ], + "comments": """DATE: +Uncertain +""", + "transcription_he": "", + "transcription_la": "", + "transcription_el": "Φη<λ>ικίσσιμα Ἠμαράντῳ ἐποίησεν." 
+ }], 'n_documents': 3 }, { 'name': 'peaceportal-tol', - 'docs': [], + 'docs': [{ + "id": "tol-11", + "url": "http://www.steinheim-institut.de:80/cgi-bin/epidat?id=tol-11", + "year": None, + "not_before": None, + "not_after": None, + "source_database": "Medieval funerary inscriptions from Toledo", + "transcription": """‎‏מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר‏‎ +‎‏לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר‏‎ +‎‏עַל מוֹת לַבֵּן בָּחוּר וָטוֹב‏‎ +‎‏כְּגַן רָטוֹב‏‎ +‎‏קָם עָלָיו כַּזְּדוֹנִים‏‎ +‎‏גּוֹי עַז פָּנִים‏‎ +‎‏הִשְׁקֵהוּ מֵי רוֹשׁ‏‎ +‎‏בָּא עַד הָרֹאשׁ‏‎ +‎‏וַיַּכֵּהוּ בִצְדִיָּה‏‎ +‎‏מַכָּה טְרִיָּה‏‎ +‎‏לָאָרֶץ חַיְתוֹ דִכָּה‏‎ +‎‏וַיִּצֶק דַּם הַמַּכָּה‏‎ +‎‏נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל‏‎ +‎‏נַעַר יִשְׂרָאֵל‏‎ +‎‏הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה‏‎ +‎‏בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה‏‎ +‎‏הַצְּבִי יִשְׂרָאֵל חָלָל‏‎ +‎‏בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל‏‎ +‎‏אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]‏‎ +‎‏וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ‏‎ +‎‏עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ‏‎ +‎‏בְּפֶסַח וַיָּמֶת אוֹתוֹ‏‎ +‎‏תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ‏‎ +‎‏וַיֵּאָסֵף אֶל עַמּוֹ‏‎ +‎‏תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים‏‎ +‎‏צְרוּרָה בִּצְרוֹר הַחַיִּים‏‎ +‎‏יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל‏‎ +‎‏אֱלֹהֵי יִשְׂרָאֵל‏‎""", + "names": None, + "sex": [ + "Unknown" + ], + "dates_of_death": None, + "country": "Spain", + "region": None, + "settlement": "Toledo", + "location_details": "Jewish Cemetery", + "language": [ + "Hebrew" + ], + "iconography": None, + "images": None, + "coordinates": "39.871036 -4.022968", + "material": [ + "Stone" + ], + "material_details": "stone (material not specified)", + "bibliography": None, + "comments": """OBJECTTYPE: +sepulchral monument + +""", + "transcription_he": "‎‏מִקְנֶה הַשַׂ#[05בּצּ]דֶה וְהַמְּעָרָה אֲשֶׁר בּוֹ לְאֲחֻזַת קֶבֶר‏‎ ‎‏לָאִישׁ מְצָאהוּ שׁוֹד וָשֶׁבֶר‏‎ ‎‏עַל מוֹת לַבֵּן בָּחוּר וָטוֹב‏‎ ‎‏כְּגַן רָטוֹב‏‎ ‎‏קָם עָלָיו כַּזְּדוֹנִים‏‎ ‎‏גּוֹי עַז פָּנִים‏‎ ‎‏הִשְׁקֵהוּ מֵי 
רוֹשׁ‏‎ ‎‏בָּא עַד הָרֹאשׁ‏‎ ‎‏וַיַּכֵּהוּ בִצְדִיָּה‏‎ ‎‏מַכָּה טְרִיָּה‏‎ ‎‏לָאָרֶץ חַיְתוֹ דִכָּה‏‎ ‎‏וַיִּצֶק דַּם הַמַּכָּה‏‎ ‎‏נַתַּנְהוּ בְדַמּוֹ מִתְגָּאֵל‏‎ ‎‏נַעַר יִשְׂרָאֵל‏‎ ‎‏הוּא ר׳ יִשְׂרָאֵל בר׳ מֹשֶה‏‎ ‎‏בֶּן יִשְׂרָאֵל, דַמּוֹ יְחַשֵּׁב כְּדַם קָרְבָּן אִשֶׁ#[05בּצּ]ה‏‎ ‎‏הַצְּבִי יִשְׂרָאֵל חָלָל‏‎ ‎‏בִּשְׁנַת עַל בָּמוֹתֶיךָ חֻלָל‏‎ ‎‏אֹי נִיסָן [נֵס לָקַחְהוּ חֲבָל ?]‏‎ ‎‏וְרֹאשׁ לֹא נִשָּׂא מִיּוֹם נְפַלוֹ‏‎ ‎‏עַד בָּא הַמַּשְׁחִית אֶל בֵּיתוֹ‏‎ ‎‏בְּפֶסַח וַיָּמֶת אוֹתוֹ‏‎ ‎‏תְּהִי מִיתָתוֹ כַפָּרָה לְנִשְׁמָתוֹ‏‎ ‎‏וַיֵּאָסֵף אֶל עַמּוֹ‏‎ ‎‏תִּהְיֶה נַפְשׁוֹ בְסוֹד נְקִיִּים‏‎ ‎‏צְרוּרָה בִּצְרוֹר הַחַיִּים‏‎ ‎‏יִפְרוֹשׁ כְּנָפָיו עָלָיו הָאֵל‏‎ ‎‏אֱלֹהֵי יִשְׂרָאֵל‏‎", + "transcription_en": "", + "transcription_nl": "" + }], 'n_documents': 3 } ] From 609c1c86d43c52fd75f285f480a929a76ad97acb Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Tue, 14 Nov 2023 17:38:55 +0100 Subject: [PATCH 75/98] fix import test --- backend/corpora/peaceportal/tests/test_import.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index 9ba54de97..e9dc175fb 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -82,15 +82,15 @@ "country": "Israel/Palestine", "region": "Judaea", "settlement": "Jerusalem", - "location_details": [ + "location_details": ( "Judaea Jerusalem Akeldama Cave 2 chamber B", "", "" - ], - "language": [ + ), + "language": ( "Greek", None - ], + ), "iconography": "Painted Red", "material": [ "Limestone", From 5fab243717b0cf50333fc3fef342a4ff09c91143 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 12:29:29 +0100 Subject: [PATCH 76/98] remove unused imports --- backend/addcorpus/tests/test_times.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/addcorpus/tests/test_times.py 
b/backend/addcorpus/tests/test_times.py index 5d232ab03..6a2790250 100644 --- a/backend/addcorpus/tests/test_times.py +++ b/backend/addcorpus/tests/test_times.py @@ -1,6 +1,5 @@ -from os.path import expanduser, realpath, join, dirname, relpath, abspath +from os.path import join, dirname, abspath from datetime import datetime -from importlib import reload import pytest From efbcaeff3383831e06b7ae0b8cc13995191f8d40 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 12:29:50 +0100 Subject: [PATCH 77/98] update field exclusion test --- backend/corpora/utils/test_corpora_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/utils/test_corpora_utils.py b/backend/corpora/utils/test_corpora_utils.py index 960381186..5b8274bf5 100644 --- a/backend/corpora/utils/test_corpora_utils.py +++ b/backend/corpora/utils/test_corpora_utils.py @@ -14,4 +14,4 @@ def test_exclude_fields(): ) ] new_fields = exclude_fields.exclude_fields_without_extractor(fields) - assert new_fields != fields + assert len(new_fields) == 1 From f1be252b714efe227aa14eba38f52a3a19f4888d Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 12:30:08 +0100 Subject: [PATCH 78/98] move NLTK corpus info to Django settings --- backend/addcorpus/es_settings.py | 11 ++++--- .../parliament/tests/test_es_settings.py | 30 +++++-------------- backend/ianalyzer/common_settings.py | 2 ++ 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 4bc194496..9b140e1d1 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -1,9 +1,8 @@ -import nltk import os -from langcodes import Language -HERE = os.path.abspath(os.path.dirname(__file__)) -NLTK_DATA_PATH = os.path.join(HERE, 'nltk_data') +from django.conf import settings +from langcodes import Language +import nltk # available Elasticsearch stemmers 
[https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html] AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian', @@ -24,8 +23,8 @@ def get_language_key(language_code): return Language.make(language_code).display_name().lower() def get_nltk_stopwords(language_code): - nltk.download('stopwords', NLTK_DATA_PATH) - stopwords_dir = os.path.join(NLTK_DATA_PATH, 'corpora', 'stopwords') + nltk.download('stopwords', settings.NLTK_DATA_PATH) + stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords') languages = os.listdir(stopwords_dir) language = get_language_key(language_code) diff --git a/backend/corpora/parliament/tests/test_es_settings.py b/backend/corpora/parliament/tests/test_es_settings.py index 11d446416..4a5891035 100644 --- a/backend/corpora/parliament/tests/test_es_settings.py +++ b/backend/corpora/parliament/tests/test_es_settings.py @@ -3,11 +3,11 @@ import os import shutil -def test_stopwords(clean_nltk_data_directory): +def test_stopwords(clean_nltk_data_directory, settings): """ Check that stopwords results are valid and all languages are included """ - + settings.NLTK_DATA_PATH = clean_nltk_data_directory cases = [ { 'language': 'en', @@ -50,30 +50,14 @@ def test_stopwords(clean_nltk_data_directory): @pytest.fixture -def clean_nltk_data_directory(): +def clean_nltk_data_directory(settings): """ Temporarily move already downloaded nltk_data if it was already downloaded, and restore the nltk_data directory after testing. If no nltk_data folder existed, data downloaded during testing will also be removed when done. 
""" - data_path = es_settings.NLTK_DATA_PATH - - if os.path.isdir(data_path): - # remove already downloaded data - temp_path = os.path.join(es_settings.HERE, '_nltk_data_temp') - shutil.move(data_path, temp_path) - - yield data_path - - # clear test data - if os.path.exists(data_path): - shutil.rmtree(data_path) - - # move the old data back - shutil.move(temp_path, data_path) - else: - yield data_path + here = os.path.dirname(os.path.abspath(__file__)) + data_path = os.path.join(here, '_nltk_data_temp') + yield data_path - # clear test data - if os.path.isdir(data_path): - shutil.rmtree(data_path) + shutil.rmtree(data_path) diff --git a/backend/ianalyzer/common_settings.py b/backend/ianalyzer/common_settings.py index 06b8fcaf0..f78775222 100644 --- a/backend/ianalyzer/common_settings.py +++ b/backend/ianalyzer/common_settings.py @@ -131,3 +131,5 @@ } LOGO_LINK = 'https://dhstatic.hum.uu.nl/logo-cdh/png/UU_CDH_logo_EN_whiteFC.png' + +NLTK_DATA_PATH = os.path.join(BASE_DIR, 'addcorpus', 'nltk_data') \ No newline at end of file From 2aa54e450da84dcce368801b2740c2cb3d2e3051 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 13:43:15 +0100 Subject: [PATCH 79/98] skip parliament es_settings test if no internet connection --- backend/conftest.py | 18 +++++++++++++++--- .../parliament/tests/test_es_settings.py | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/conftest.py b/backend/conftest.py index 47b7e430f..09a24fd5c 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -1,6 +1,9 @@ +from time import sleep + import pytest +import requests from allauth.account.models import EmailAddress -from time import sleep + from ianalyzer.elasticsearch import elasticsearch from addcorpus.load_corpus import load_corpus_definition from addcorpus.save_corpus import load_and_save_all_corpora @@ -60,8 +63,18 @@ def admin_client(client, admin_user, admin_credentials): yield client client.logout() -# elasticsearch 
+@pytest.fixture(scope='session') +def connected_to_internet(): + """ + Check if there is internet connection. Skip if no connection can be made. + """ + try: + requests.get("https://1.1.1.1") + except: + pytest.skip('Cannot connect to internet') + +# elasticsearch @pytest.fixture(scope='session') def es_client(): """ @@ -78,7 +91,6 @@ def es_client(): return client # mock corpora - @pytest.fixture(autouse=True) def add_mock_corpora_to_db(db): #add mock corpora to the database at the start of each test diff --git a/backend/corpora/parliament/tests/test_es_settings.py b/backend/corpora/parliament/tests/test_es_settings.py index 4a5891035..f554fae15 100644 --- a/backend/corpora/parliament/tests/test_es_settings.py +++ b/backend/corpora/parliament/tests/test_es_settings.py @@ -3,7 +3,7 @@ import os import shutil -def test_stopwords(clean_nltk_data_directory, settings): +def test_stopwords(clean_nltk_data_directory, settings, connected_to_internet): """ Check that stopwords results are valid and all languages are included """ From e7e23ac5cc6f960acf15841375e3ebf8245bcb2a Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 14:21:56 +0100 Subject: [PATCH 80/98] sort filenames and suppress test warnings --- backend/corpora/peaceportal/peaceportal.py | 2 +- backend/corpora/peaceportal/tests/test_import.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 75196b709..9df4b2c16 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -54,7 +54,7 @@ def es_settings(self): def sources(self, start, end): for directory, _, filenames in os.walk(self.data_directory): - for filename in filenames: + for filename in sorted(filenames): name, extension = op.splitext(filename) full_path = op.join(directory, filename) if not self.validate_extension(extension, full_path): diff --git 
a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index e9dc175fb..d0ef24da8 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -262,11 +262,6 @@ def test_imports(peace_corpus_settings, corpus_object): for key in doc: resulted_fields.add(key) - for key in resulted_fields: - if not key in tested_fields: - message = 'Key "{}" is included in the result for {} but has no specification'.format(key, corpus_object.get('name')) - warnings.warn(message) - docs = get_documents(corpus, start, end) assert len(list(docs)) == corpus_object.get('n_documents') From 3e615e264099e02f04945ae2186ecb6b6a1f8af6 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 18:15:17 +0100 Subject: [PATCH 81/98] add documentation for ParentCorpusDefinition --- backend/addcorpus/corpus.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index 4c8cd4256..ad817cde2 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -276,7 +276,12 @@ def _reject_extractors(self, *inapplicable_extractors): "Specified extractor method cannot be used with this type of data") class ParentCorpusDefinition(CorpusDefinition): - ''' A class from which other corpus definitions can inherit + ''' A class from which other corpus definitions can inherit. + This class is in charge of setting fields, usually without defining an extractor. + The subclassed CorpusDefinitions will set extractors on the fields - + this way, CorpusDefinitions can share the same mappings and filters, + while the logic to collect sources and populate the fields can be different. + The ParentCorpusDefinition can also be used to allow cross-corpus search and filtering. 
''' #define fields property so it can be set in __init__ @property @@ -288,8 +293,9 @@ def fields(self, value): self._fields = value def __init__(self): - ''' specify a list of fields here which all subclasses share - should be overwritten in subclasses + ''' Specify a list of fields which all subclasses share + A subclass of ParentCorpusDefinition will provide extractors for the fields, + and potentially prune done the list of fields to those which have an extractor ''' self.fields = [] From b79c9af6a22a439154ff26a35dbdd747cb2212a3 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 18:40:07 +0100 Subject: [PATCH 82/98] rename extract.HTML -> extract.FilterAttribute --- backend/addcorpus/corpus.py | 7 ++----- backend/addcorpus/extract.py | 6 +++--- backend/corpora/peaceportal/epidat.py | 5 ++--- backend/corpora/peaceportal/iis.py | 6 +++--- backend/corpora/peaceportal/tol.py | 4 ++-- 5 files changed, 12 insertions(+), 16 deletions(-) diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index ad817cde2..76b05e81b 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -11,9 +11,6 @@ from os.path import isdir from django.conf import settings -from langcodes import Language, standardize_tag - -from addcorpus.constants import CATEGORIES import logging @@ -544,7 +541,7 @@ def source2dicts(self, source): ''' (filename, metadata) = source - self._reject_extractors(extract.XML, extract.CSV) + self._reject_extractors(extract.CSV) # Loading HTML logger.info('Reading HTML file {} ...'.format(filename)) @@ -619,7 +616,7 @@ class CSVCorpusDefinition(CorpusDefinition): def source2dicts(self, source): # make sure the field size is as big as the system permits csv.field_size_limit(sys.maxsize) - self._reject_extractors(extract.XML, extract.HTML) + self._reject_extractors(extract.XML, extract.FilterAttribute) if isinstance(source, str): filename = source diff --git a/backend/addcorpus/extract.py b/backend/addcorpus/extract.py 
index fba462923..4f4b38187 100644 --- a/backend/addcorpus/extract.py +++ b/backend/addcorpus/extract.py @@ -320,14 +320,14 @@ def _attr(self, soup): ] -class HTML(XML): +class FilterAttribute(XML): ''' This extractor extracts attributes or contents from a BeautifulSoup node. - It is an extension of XML class + It is an extension of the XML extractor ''' def __init__(self, - attribute_filter={ # Whether to search other xml files for this field, and the file tag these files should have + attribute_filter={ # Specify an attribute / value pair by which to select content 'attribute': None, 'value': None}, *nargs, diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index adae33d26..d4b7d9008 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -4,8 +4,7 @@ from django.conf import settings from addcorpus.corpus import XMLCorpusDefinition -from addcorpus.extract import XML, Constant, HTML, Combined -from addcorpus.es_settings import es_settings +from addcorpus.extract import XML, Constant, Combined, FilterAttribute from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language from corpora.utils.exclude_fields import exclude_fields_without_extractor @@ -32,7 +31,7 @@ def __init__(self): flatten=True ) - self.url.extractor = HTML( + self.url.extractor = FilterAttribute( tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], multiple=False, toplevel=False, diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index e078cf483..361facfe2 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -4,7 +4,7 @@ from django.conf import settings from addcorpus.corpus import XMLCorpusDefinition -from addcorpus.extract import XML, Constant, HTML, ExternalFile, Combined +from addcorpus.extract import Combined, Constant, ExternalFile, 
FilterAttribute, XML from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language from corpora.utils.exclude_fields import exclude_fields_without_extractor @@ -35,7 +35,7 @@ def __init__(self): transform=lambda x: ''.join(x.lower().split()) ) - self.url.extractor = HTML( + self.url.extractor = FilterAttribute( tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'idno'], multiple=False, @@ -71,7 +71,7 @@ def __init__(self): stream_handler=extract_transcript ) - self.transcription_english.extractor = HTML( + self.transcription_english.extractor = FilterAttribute( tag=['div'], toplevel=True, multiple=False, diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py index 2c5a82e3f..3325076d6 100644 --- a/backend/corpora/peaceportal/tol.py +++ b/backend/corpora/peaceportal/tol.py @@ -4,7 +4,7 @@ from django.conf import settings from addcorpus.corpus import XMLCorpusDefinition -from addcorpus.extract import XML, Constant, HTML, Combined +from addcorpus.extract import XML, Constant, Combined, FilterAttribute from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language from corpora.utils.exclude_fields import exclude_fields_without_extractor @@ -29,7 +29,7 @@ def __init__(self): flatten=True ) - self.url.extractor = HTML( + self.url.extractor = FilterAttribute( tag=['teiHeader', 'fileDesc', 'publicationStmt', 'idno'], multiple=False, toplevel=False, From bb85a9b59353e6457c8a743d5285a0a9ee96bc7e Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 18:43:16 +0100 Subject: [PATCH 83/98] correct docstring --- backend/addcorpus/es_mappings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index 7eca9f6e4..72662dfa8 100644 --- 
a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -5,9 +5,9 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an Mapping for the main content field. Options: - `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies. - - `stopword_analyzer`: enables analysis using stopword removal. Can be a string specifying `clean-{language}` analyser in the `es_settings` of the corpus, or True for `clean` - - `stemming_analysis`: enables analysis using stemming. Can be a string specifying a `stemmed-{}` analyser in the `es_settings` for the corpus, or Truem for `stemmed` - - 'updated_highlighting': enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'. + - `stopword_analysis`: enables analysis using stopword removal. + - `stemming_analysis`: enables analysis using stemming. + - `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'. 
''' mapping = { From 35e36130d9b339068749b9feb211c1d3e41130f2 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 18:57:27 +0100 Subject: [PATCH 84/98] remove special rule for monolingual corpora --- backend/addcorpus/es_settings.py | 13 ++++++------- documentation/Defining-corpus-fields.md | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 9b140e1d1..aaee0ed00 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -56,24 +56,23 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): for language in languages: # do not attach language isocodes if there is just one language - language_string = language if len(languages) > 1 else None if stopword_analyzer or stemming_analyzer: - if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language_string), language): + if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language), language): continue # skip languages for which we do not have a stopword list if stopword_analyzer: set_clean_analyzer( settings, - add_language_string(stopword_filter_name, language_string), - add_language_string(clean_analyzer_name, language_string), + add_language_string(stopword_filter_name, language), + add_language_string(clean_analyzer_name, language), ) if stemming_analyzer and get_language_key(language) in AVAILABLE_ES_STEMMERS: set_stemmed_analyzer( settings, - add_language_string(stopword_filter_name, language_string), - add_language_string(stemmer_filter_name, language_string), - add_language_string(stemmed_analyzer_name, language_string), + add_language_string(stopword_filter_name, language), + add_language_string(stemmer_filter_name, language), + add_language_string(stemmed_analyzer_name, language), language ) diff --git a/documentation/Defining-corpus-fields.md b/documentation/Defining-corpus-fields.md index 
53dc30d89..c6c9445e6 100644 --- a/documentation/Defining-corpus-fields.md +++ b/documentation/Defining-corpus-fields.md @@ -26,8 +26,8 @@ Elasticsearch supports specifying a `fields` parameter to a field to define subf The one way in which multifields _are_ used is to allow different analyzers on the same text field. Text fields typically use the default analyzer, which performs basic tokenisation and converts text to lowercase. For more extensive analysis, subfields can be added. I-analyzer uses the following naming convention: -- `*.clean`: uses a language-specific analyzer to filter stopwords. -- `*.stemmed`: uses a language-specific analyzer to filter stopwords and stem words. +- `*.clean-{iso-language-code}`: uses a language-specific analyzer to filter stopwords. +- `*.stemmed-{iso-language-code}`: uses a language-specific analyzer to filter stopwords and stem words. - `*.length`: specifies the token count of the text, which is useful for aggregations. - `*.text`: a field with text mapping. Can be added to a keyword field to support full-text search in the field. 
From f06d92bc56d05a204f0bf89c1c0825c7799e63a3 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 18:59:42 +0100 Subject: [PATCH 85/98] move es_alias to parent class --- backend/corpora/peaceportal/FIJI/fiji.py | 1 - backend/corpora/peaceportal/epidat.py | 1 - backend/corpora/peaceportal/iis.py | 1 - backend/corpora/peaceportal/peaceportal.py | 1 + backend/corpora/peaceportal/tol.py | 1 - 5 files changed, 1 insertion(+), 4 deletions(-) diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 8dd033df5..85536e8dc 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -19,7 +19,6 @@ class PeaceportalFIJI(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_FIJI_DATA es_index = getattr(settings, 'PEACEPORTAL_FIJI_ES_INDEX', 'peaceportal-fiji') - es_alias = settings.PEACEPORTAL_ALIAS filename_pattern = re.compile(r'\d+') def sources(self, start, end): diff --git a/backend/corpora/peaceportal/epidat.py b/backend/corpora/peaceportal/epidat.py index d4b7d9008..a0f1e8b53 100644 --- a/backend/corpora/peaceportal/epidat.py +++ b/backend/corpora/peaceportal/epidat.py @@ -13,7 +13,6 @@ class PeaceportalEpidat(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_EPIDAT_DATA es_index = getattr(settings, 'PEACEPORTAL_EPIDAT_ES_INDEX', 'peaceportal-epidat') - es_alias = settings.PEACEPORTAL_ALIAS languages = ['de', 'he', 'en', 'nl'] diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index 361facfe2..a27ccc1ea 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -11,7 +11,6 @@ class PeaceportalIIS(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_IIS_DATA es_index = getattr(settings, 'PEACEPORTAL_IIS_ES_INDEX', 'peaceportal-iis') - es_alias = settings.PEACEPORTAL_ALIAS def add_metadata(self, filename): external_file_folder = 
settings.PEACEPORTAL_IIS_TXT_DATA diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 9df4b2c16..72537cfee 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -31,6 +31,7 @@ class PeacePortal(ParentCorpusDefinition): max_date = datetime(year=1950, month=12, day=31) visualize = [] es_index = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal') + es_alias = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal') scan_image_type = 'image/png' # fields below are required by code but not actually used min_date = datetime(year=746, month=1, day=1) diff --git a/backend/corpora/peaceportal/tol.py b/backend/corpora/peaceportal/tol.py index 3325076d6..4d75f4cd4 100644 --- a/backend/corpora/peaceportal/tol.py +++ b/backend/corpora/peaceportal/tol.py @@ -11,7 +11,6 @@ class PeaceportalTOL(PeacePortal, XMLCorpusDefinition): data_directory = settings.PEACEPORTAL_TOL_DATA es_index = getattr(settings, 'PEACEPORTAL_TOL_ES_INDEX', 'peaceportal-tol') - es_alias = settings.PEACEPORTAL_ALIAS languages = ['en', 'nl', 'he'] From 70683b332a9ab83929471d1641386e406dd668c6 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 15 Nov 2023 19:12:17 +0100 Subject: [PATCH 86/98] remove / edit transform functions --- backend/corpora/peaceportal/FIJI/fiji.py | 6 ------ backend/corpora/peaceportal/iis.py | 14 +++----------- backend/corpora/peaceportal/tests/test_import.py | 4 ++-- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 85536e8dc..96667cc6c 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -179,12 +179,6 @@ def __init__(self): self.fields = exclude_fields_without_extractor(self.fields) -def transform_age(age): - if age in ['?', 'none', 'none?']: - return 'Unknown' - return age - - def transform_age_integer(age): try: return 
int(age) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index a27ccc1ea..1bbe30af6 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -5,6 +5,7 @@ from addcorpus.corpus import XMLCorpusDefinition from addcorpus.extract import Combined, Constant, ExternalFile, FilterAttribute, XML +from addcorpus.serializers import LanguageField from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language from corpora.utils.exclude_fields import exclude_fields_without_extractor @@ -351,17 +352,8 @@ def extract_dimensions(soup): def normalize_language(text): - if not text: - return - ltext = text.lower().strip() - if ltext in ['grc']: - return 'Greek' - if ltext in ['he', 'heb']: - return 'Hebrew' - if ltext in ['arc']: - return 'Aramaic' - if ltext in ['la', 'latin']: - return 'Latin' + serializer = LanguageField() + return serializer.to_representation(text) # excluded (for now): # revision history diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_import.py index d0ef24da8..c19ba2dc0 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_import.py @@ -88,8 +88,8 @@ "" ), "language": ( - "Greek", - None + "Ancient Greek", + "Unknown" ), "iconography": "Painted Red", "material": [ From 9a16d3edc5e758077c2ede02c5fa2675bcfe601e Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 10:18:57 +0100 Subject: [PATCH 87/98] remove exceptions for monolingual corpora; adjust tests --- backend/addcorpus/es_settings.py | 36 ++++++++++--------- backend/addcorpus/tests/test_es_settings.py | 12 +++---- .../dutchannualreports/dutchannualreports.py | 4 +-- .../dutchnewspapers/dutchnewspapers_public.py | 4 +-- backend/corpora/ecco/ecco.py | 4 +-- .../guardianobserver/guardianobserver.py | 4 +-- 
backend/corpora/parliament/parliament.py | 2 +- .../parliament/utils/field_defaults.py | 2 +- backend/corpora/peaceportal/peaceportal.py | 2 +- backend/corpora/periodicals/periodicals.py | 4 +-- backend/corpora/rechtspraak/rechtspraak.py | 4 +-- backend/corpora/times/times.py | 4 +-- backend/corpora/troonredes/troonredes.py | 4 +-- .../mock_corpora/multilingual_mock_corpus.py | 12 +++---- .../tests/mock_corpora/large_mock_corpus.py | 12 +++---- .../tests/mock_corpora/small_mock_corpus.py | 35 +++++------------- .../visualization/tests/test_termvectors.py | 4 +-- backend/visualization/tests/test_wordcloud.py | 1 - backend/visualization/wordcloud.py | 15 +++++--- documentation/Defining-corpus-fields.md | 4 +-- 20 files changed, 79 insertions(+), 90 deletions(-) diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index aaee0ed00..4268665b3 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -39,12 +39,12 @@ def get_nltk_stopwords(language_code): def add_language_string(name, language): return '{}_{}'.format(name, language) if language else name -def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): +def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False): ''' Make elasticsearch settings json for a corpus index. Options: - - `language`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming - - `stopword_analyzer`: define an analyzer that removes stopwords. - - `stemming_analyzer`: define an analyzer that removes stopwords and performs stemming. + - `languages`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming + - `stopword_analysis`: set to True to add an analyzer that removes stopwords. + - `stemming_analysis`: set to True to add an analyzer that removes stopwords and performs stemming. 
''' settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 1}} stopword_filter_name = 'stopwords' @@ -57,17 +57,20 @@ def es_settings(languages=[], stopword_analyzer=False, stemming_analyzer=False): for language in languages: # do not attach language isocodes if there is just one language - if stopword_analyzer or stemming_analyzer: + if stopword_analysis or stemming_analysis: if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language), language): continue # skip languages for which we do not have a stopword list - if stopword_analyzer: + if stopword_analysis: set_clean_analyzer( settings, add_language_string(stopword_filter_name, language), add_language_string(clean_analyzer_name, language), ) - if stemming_analyzer and get_language_key(language) in AVAILABLE_ES_STEMMERS: + if stemming_analysis: + if not get_language_key(language) in AVAILABLE_ES_STEMMERS: + raise UserWarning('You specified `stemming_analysis=True`, but \ + there is no stemmer available for this language') set_stemmed_analyzer( settings, add_language_string(stopword_filter_name, language), @@ -85,12 +88,12 @@ def number_filter(): "replacement":"" } -def make_stopword_filter(stopword_filter_name, language): +def make_stopword_filter(language): try: stopwords = get_nltk_stopwords(language) return { "type": "stop", - stopword_filter_name: stopwords + 'stopwords': stopwords } except: return None @@ -116,14 +119,15 @@ def make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name): "filter": ["lowercase", stopword_filter_name, stemmer_filter_name] } -def get_stopwords_from_settings(es_settings): +def get_stopwords_from_settings(es_settings, analyzer): try: - token_filter = es_settings["analysis"]['filter']['stopwords'] - stopwords = token_filter['stopwords'] + # the name of the stopword filter is second in the list, after "lowercase" + stopword_filter_name = es_settings['analysis']['analyzer'].get( + analyzer).get('filter')[-1] + token_filter = 
es_settings["analysis"]['filter'][stopword_filter_name] + return token_filter['stopwords'] except: - stopwords = None - - return stopwords + return [] def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language): filters = settings['analysis'].get('filter', {}) @@ -139,7 +143,7 @@ def set_char_filter(settings): } def set_stopword_filter(settings, stopword_filter_name, language): - stopword_filter = make_stopword_filter(stopword_filter_name, language) + stopword_filter = make_stopword_filter(language) if not stopword_filter: return False filters = settings['analysis'].get('filter', {}) diff --git a/backend/addcorpus/tests/test_es_settings.py b/backend/addcorpus/tests/test_es_settings.py index 9f94b1b69..0f178f321 100644 --- a/backend/addcorpus/tests/test_es_settings.py +++ b/backend/addcorpus/tests/test_es_settings.py @@ -11,16 +11,16 @@ 'stemming': True, 'expected': { 'filter': { - 'stemmer': {'type': 'stemmer', 'language': 'english'}, - 'stopwords': {'type': 'stop', 'stopwords': list()} + 'stemmer_en': {'type': 'stemmer', 'language': 'english'}, + 'stopwords_en': {'type': 'stop', 'stopwords': list()}, }, 'analyzer': { - 'clean': { - 'filter': ['lowercase', 'stopwords'], + 'clean_en': { + 'filter': ['lowercase', 'stopwords_en'], **char_filter_tokenizer }, - 'stemmed': { - 'filter': ['lowercase', 'stopwords', 'stemmer'], + 'stemmed_en': { + 'filter': ['lowercase', 'stopwords_en', 'stemmer_en'], **char_filter_tokenizer } } diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py index 884ce066b..02d92c432 100644 --- a/backend/corpora/dutchannualreports/dutchannualreports.py +++ b/backend/corpora/dutchannualreports/dutchannualreports.py @@ -50,7 +50,7 @@ class DutchAnnualReports(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return 
es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f: reader = csv.DictReader(f) @@ -180,7 +180,7 @@ def sources(self, start=min_date, end=max_date): ), FieldDefinition( name='content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), display_name='Content', display_type='text_content', visualizations=['wordcloud'], diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index 167597f03..ba366c289 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -40,7 +40,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'text' tag_entry = 'p' @@ -315,7 +315,7 @@ def fields(self): display_name='Content', display_type='text_content', description='Text content.', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), results_overview=True, search_field_core=True, extractor=XML(tag='p', multiple=True, diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index 0a97b25d6..00c96593f 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -45,7 +45,7 @@ class Ecco(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) def sources(self, start=min_date, end=max_date): logging.basicConfig(filename='ecco.log', level=logging.INFO) @@ -150,7 +150,7 @@ def 
fields(self): name='content', display_name='Content', display_type='text_content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), description='Text content.', results_overview=True, search_field_core=True, diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index f6b60a348..5d08bf104 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -46,7 +46,7 @@ class GuardianObserver(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'Record' @@ -170,7 +170,7 @@ def sources(self, start=datetime.min, end=datetime.max): ), FieldDefinition( name='content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), display_name='Content', display_type='text_content', visualizations=['wordcloud'], diff --git a/backend/corpora/parliament/parliament.py b/backend/corpora/parliament/parliament.py index 9d5af6096..6f3be976f 100644 --- a/backend/corpora/parliament/parliament.py +++ b/backend/corpora/parliament/parliament.py @@ -38,7 +38,7 @@ class Parliament(CorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) # overwrite below in child class if you need to extract the (converted) transcription diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index cf74a6c10..eb8da607e 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ 
-284,7 +284,7 @@ def speech(): display_name='Speech', description='The transcribed speech', # each index has its own definition of the 'clean' and 'stemmed' analyzer, based on language - es_mapping = main_content_mapping(token_counts=True, stopword_analysis=True, stemming_analysis=True, updated_highlighting=True), + es_mapping = main_content_mapping(token_counts=True, stopword_analysis=True, stemming_analysis=True, language='en', updated_highlighting=True), results_overview=True, search_field_core=True, display_type='text_content', diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index 72537cfee..d3483f5d0 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -51,7 +51,7 @@ class PeacePortal(ParentCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages, stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages, stopword_analysis=True, stemming_analysis=True) def sources(self, start, end): for directory, _, filenames in os.walk(self.data_directory): diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index da4bce105..72882bc41 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -38,7 +38,7 @@ class Periodicals(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'articles' tag_entry = 'artInfo' @@ -145,7 +145,7 @@ def sources(self, start=min_date, end=max_date): display_name='Content', display_type='text_content', description='Text content.', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), results_overview=True, extractor=extract.XML(tag='ocrText', 
flatten=True), search_field_core=True, diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index d4e4dac80..2404ee06b 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -45,7 +45,7 @@ class Rechtspraak(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'open-rechtspraak' @@ -287,7 +287,7 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None name='content', display_name='Content', display_type='text_content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), extractor=extract.Backup( extract.XML('uitspraak', flatten=True), extract.XML('conclusie', flatten=True), diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index 3f6e02514..35e56ff0f 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -39,7 +39,7 @@ class Times(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'issue' tag_entry = 'article' @@ -424,7 +424,7 @@ def sources(self, start=datetime.min, end=datetime.max): name='content', display_name='Content', display_type='text_content', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'en'), visualizations=['wordcloud'], description='Raw OCR\'ed text (content).', results_overview=True, diff --git a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py index e37223c00..0bc8cbc2c 100644 --- a/backend/corpora/troonredes/troonredes.py +++ 
b/backend/corpora/troonredes/troonredes.py @@ -44,7 +44,7 @@ class Troonredes(XMLCorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) tag_toplevel = 'doc' tag_entry = 'entry' @@ -136,7 +136,7 @@ def sources(self, start=min_date, end=max_date): display_name='Content', display_type='text_content', description='Text content.', - es_mapping=main_content_mapping(True, True, True), + es_mapping=main_content_mapping(True, True, True, 'nl'), results_overview=True, search_field_core=True, visualizations=['wordcloud', 'ngram'], diff --git a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py index 39eb62ce0..ffb8e046a 100644 --- a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py +++ b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py @@ -1,7 +1,9 @@ from datetime import datetime +import os + from addcorpus.corpus import FieldDefinition, CSVCorpusDefinition +from addcorpus.es_mappings import keyword_mapping, text_mapping from addcorpus.extract import CSV -import os # Fake corpus class for unit tests @@ -26,17 +28,13 @@ def sources(self, start=min_date, end=max_date): content = FieldDefinition( name = 'content', - es_mapping= { - 'type': 'text', - }, + es_mapping = text_mapping(), extractor = CSV('content') ) language = FieldDefinition( name = 'language', - es_mapping= { - 'type': 'keyword' - }, + es_mapping = keyword_mapping(), extractor = CSV('language') ) diff --git a/backend/visualization/tests/mock_corpora/large_mock_corpus.py b/backend/visualization/tests/mock_corpora/large_mock_corpus.py index e15652945..466ceb8a6 100644 --- a/backend/visualization/tests/mock_corpora/large_mock_corpus.py +++ b/backend/visualization/tests/mock_corpora/large_mock_corpus.py @@ -1,7 +1,9 @@ from datetime import 
datetime -from addcorpus.corpus import CorpusDefinition, FieldDefinition import random +from addcorpus.corpus import CorpusDefinition, FieldDefinition +from addcorpus.es_mappings import date_mapping, text_mapping + TOTAL_DOCUMENTS = 11000 # some constants for generating data @@ -48,16 +50,12 @@ def source2dicts(self, source): date = FieldDefinition( name = 'date', - es_mapping = { - 'type': 'date', - } + es_mapping = date_mapping() ) content = FieldDefinition( name = 'content', - es_mapping = { - 'type': 'text' - } + es_mapping = text_mapping() ) fields = [date, content] diff --git a/backend/visualization/tests/mock_corpora/small_mock_corpus.py b/backend/visualization/tests/mock_corpora/small_mock_corpus.py index a3ad7fd2a..f97c42121 100644 --- a/backend/visualization/tests/mock_corpora/small_mock_corpus.py +++ b/backend/visualization/tests/mock_corpora/small_mock_corpus.py @@ -1,9 +1,12 @@ from datetime import datetime +import os + from addcorpus.corpus import FieldDefinition, CSVCorpusDefinition from addcorpus.extract import CSV -import os +from addcorpus.es_mappings import date_mapping, keyword_mapping, main_content_mapping, text_mapping from addcorpus.es_settings import es_settings + # Fake corpus class for unit tests here = os.path.abspath(os.path.dirname(__file__)) @@ -20,7 +23,7 @@ class SmallMockCorpus(CSVCorpusDefinition): languages = ['en'] category = 'book' - es_settings = es_settings('en', stopword_analyzer=True) + es_settings = es_settings(['en'], stopword_analysis=True) def sources(self, start=min_date, end=max_date): for csv_file in os.listdir(os.path.join(here, 'source_files')): @@ -28,45 +31,25 @@ def sources(self, start=min_date, end=max_date): date = FieldDefinition( name = 'date', - es_mapping = { - 'type': 'date', - }, + es_mapping = date_mapping(), extractor = CSV('date') ) title_field = FieldDefinition( name = 'title', - es_mapping = { - 'type': 'text', - }, + es_mapping = text_mapping(), extractor = CSV('title') ) content = FieldDefinition( 
name = 'content', - es_mapping= { - 'type': 'text', - "fields": { - "clean": { - "type": "text", - }, - "stemmed": { - "type": "text", - }, - "length": { - "type": "token_count", - 'analyzer': 'standard', - } - } - }, + es_mapping = main_content_mapping(True, True, False, 'en'), extractor = CSV('content') ) genre = FieldDefinition( name = 'genre', - es_mapping= { - 'type': 'keyword' - }, + es_mapping = keyword_mapping(), extractor = CSV('genre') ) diff --git a/backend/visualization/tests/test_termvectors.py b/backend/visualization/tests/test_termvectors.py index 967102b53..ea4f6fe4c 100644 --- a/backend/visualization/tests/test_termvectors.py +++ b/backend/visualization/tests/test_termvectors.py @@ -67,7 +67,7 @@ def test_find_matches(es_client, termvectors_result, small_mock_corpus): }, { 'query_text': 'regarded with such "evil forebodings"', 'components': ['regarded', 'with', 'such', 'evil forebodings'], - 'analyzed': [['regarded'], ['with'], ['such'], ['evil', 'forebodings']] + 'analyzed': [['regarded'], ['evil', 'forebodings']] }, { 'query_text': 'evil + forebodings', 'components': ['evil', '+', 'forebodings'], @@ -83,7 +83,7 @@ def test_find_matches(es_client, termvectors_result, small_mock_corpus): }, { 'query_text': 'rejoice~1 to hear', 'components': ['rejoice~1', 'to', 'hear'], - 'analyzed': [['rejoice~1'], ['to'], ['hear']] + 'analyzed': [['rejoice~1'], ['hear']] } ] diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py index 32dc21190..5bb5e6c54 100644 --- a/backend/visualization/tests/test_wordcloud.py +++ b/backend/visualization/tests/test_wordcloud.py @@ -127,7 +127,6 @@ def test_wordcloud_counts(small_mock_corpus): def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud): stopwords = ['the', 'and', 'of'] - for stopword in stopwords: match = any( item['key'] == stopword for item in small_mock_corpus_complete_wordcloud) diff --git 
a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index 786929240..68ad5b543 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -1,12 +1,19 @@ from collections import Counter from sklearn.feature_extraction.text import CountVectorizer + from addcorpus.load_corpus import load_corpus_definition from addcorpus.es_settings import get_stopwords_from_settings from es import download as download -def corpus_stopwords(corpus_name): +def field_stopwords(corpus_name, field): corpus = load_corpus_definition(corpus_name) - return get_stopwords_from_settings(corpus.es_settings) + field_definition = next((f for f in corpus.fields if f.name == field), None) + mapping = field_definition.es_mapping + analyzer = mapping.get( + 'fields', {}).get('clean', {}).get('analyzer') + if not analyzer: + return [] + return get_stopwords_from_settings(corpus.es_settings, analyzer) def make_wordcloud_data(documents, field, corpus): texts = [] @@ -14,8 +21,8 @@ def make_wordcloud_data(documents, field, corpus): content = document['_source'][field] if content and content != '': texts.append(content) - - stopwords = corpus_stopwords(corpus) or [] + + stopwords = field_stopwords(corpus, field) cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords) cvtexts = cv.fit_transform(texts) counts = cvtexts.sum(axis=0).A1 diff --git a/documentation/Defining-corpus-fields.md b/documentation/Defining-corpus-fields.md index c6c9445e6..53dc30d89 100644 --- a/documentation/Defining-corpus-fields.md +++ b/documentation/Defining-corpus-fields.md @@ -26,8 +26,8 @@ Elasticsearch supports specifying a `fields` parameter to a field to define subf The one way in which multifields _are_ used is to allow different analyzers on the same text field. Text fields typically use the default analyzer, which performs basic tokenisation and converts text to lowercase. 
For more extensive analysis, subfields can be added. I-analyzer uses the following naming convention: -- `*.clean-{iso-language-code}`: uses a language-specific analyzer to filter stopwords. -- `*.stemmed-{iso-language-code}`: uses a language-specific analyzer to filter stopwords and stem words. +- `*.clean`: uses a language-specific analyzer to filter stopwords. +- `*.stemmed`: uses a language-specific analyzer to filter stopwords and stem words. - `*.length`: specifies the token count of the text, which is useful for aggregations. - `*.text`: a field with text mapping. Can be added to a keyword field to support full-text search in the field. From 01b59c89ce93e9ec4de1c478488ed63df057db79 Mon Sep 17 00:00:00 2001 From: Berit Date: Thu, 16 Nov 2023 10:25:20 +0100 Subject: [PATCH 88/98] Update backend/corpora/peaceportal/FIJI/fiji.py Co-authored-by: Luka van der Plas <43678097+lukavdplas@users.noreply.github.com> --- backend/corpora/peaceportal/FIJI/fiji.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/peaceportal/FIJI/fiji.py b/backend/corpora/peaceportal/FIJI/fiji.py index 96667cc6c..6aafed152 100644 --- a/backend/corpora/peaceportal/FIJI/fiji.py +++ b/backend/corpora/peaceportal/FIJI/fiji.py @@ -13,7 +13,7 @@ class PeaceportalFIJI(PeacePortal, XMLCorpusDefinition): ''' This is a fresh version of Ortal-Paz Saar's 'Funerary Inscriptions of Jews from Italy' corpus, - updated to align with the PEACE portal index. This mostly implies that there are less fields + updated to align with the PEACE portal index. This mostly implies that there are fewer fields than in the earlier version (i.e. the one under corpora/jewishinscriptions). 
''' From 4914e1c53c61e7c9c0dfc47a5400a93ed28749c2 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 10:39:01 +0100 Subject: [PATCH 89/98] use get_language_key function --- backend/corpora/peaceportal/iis.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index 1bbe30af6..dbf06d086 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -4,6 +4,7 @@ from django.conf import settings from addcorpus.corpus import XMLCorpusDefinition +from addcorpus.es_settings import get_language_key from addcorpus.extract import Combined, Constant, ExternalFile, FilterAttribute, XML from addcorpus.serializers import LanguageField from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language @@ -168,14 +169,14 @@ def __init__(self): 'textLang'], attribute='mainLang', toplevel=False, - transform=lambda x: normalize_language(x) + transform=lambda x: get_language_key(x) ), XML( tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', 'textLang'], attribute='otherLangs', toplevel=False, - transform=lambda x: normalize_language(x) + transform=lambda x: get_language_key(x) ) ) @@ -351,10 +352,6 @@ def extract_dimensions(soup): return cloned_soup -def normalize_language(text): - serializer = LanguageField() - return serializer.to_representation(text) - # excluded (for now): # revision history From 6a2f41b0803e6d595cf5267e5dc4577643ea2eb5 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 11:09:01 +0100 Subject: [PATCH 90/98] Revert "use get_language_key function" This reverts commit 4914e1c53c61e7c9c0dfc47a5400a93ed28749c2. 
--- backend/corpora/peaceportal/iis.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index dbf06d086..1bbe30af6 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -4,7 +4,6 @@ from django.conf import settings from addcorpus.corpus import XMLCorpusDefinition -from addcorpus.es_settings import get_language_key from addcorpus.extract import Combined, Constant, ExternalFile, FilterAttribute, XML from addcorpus.serializers import LanguageField from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries, get_text_in_language @@ -169,14 +168,14 @@ def __init__(self): 'textLang'], attribute='mainLang', toplevel=False, - transform=lambda x: get_language_key(x) + transform=lambda x: normalize_language(x) ), XML( tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', 'textLang'], attribute='otherLangs', toplevel=False, - transform=lambda x: get_language_key(x) + transform=lambda x: normalize_language(x) ) ) @@ -352,6 +351,10 @@ def extract_dimensions(soup): return cloned_soup +def normalize_language(text): + serializer = LanguageField() + return serializer.to_representation(text) + # excluded (for now): # revision history From d117cd5a7886a1b0dcfb0ce6fc5949578063dd38 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 11:19:47 +0100 Subject: [PATCH 91/98] remove unused import --- frontend/src/app/visualization/wordcloud/wordcloud.component.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts index dac74ec42..8138f30b8 100644 --- a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts +++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts @@ -4,7 +4,6 @@ import { import { 
AggregateResult, CorpusField, QueryModel, Corpus, FreqTableHeaders } from '../../models/index'; -import { ApiService } from '../../services/index'; import { BehaviorSubject } from 'rxjs'; import { VisualizationService } from '../../services/visualization.service'; import { showLoading } from '../../utils/utils'; From a8c8d056b7944f1a3f753087cde02b8f84f925e6 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 11:19:56 +0100 Subject: [PATCH 92/98] add loadcorpora to Docker startup --- docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 78bbf5648..19f75ef4d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -36,7 +36,7 @@ services: - type: bind source: $DATA_DIR target: /corpora - command: bash -c "python manage.py migrate && python manage.py runserver 0.0.0.0:8000" + command: bash -c "python manage.py migrate && python manage.py loadcorpora && python manage.py runserver 0.0.0.0:8000" frontend: build: context: ./frontend From 89d8f8ec6d57e39045b580bab49dfac93290587c Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 11:35:55 +0100 Subject: [PATCH 93/98] add language code --- backend/corpora/peaceportal/iis.py | 14 ++++++++++++++ backend/corpora/peaceportal/peaceportal.py | 10 +++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/backend/corpora/peaceportal/iis.py b/backend/corpora/peaceportal/iis.py index 1bbe30af6..e9cd78a84 100644 --- a/backend/corpora/peaceportal/iis.py +++ b/backend/corpora/peaceportal/iis.py @@ -178,6 +178,20 @@ def __init__(self): transform=lambda x: normalize_language(x) ) ) + self.language_code.extractor = Combined( + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='mainLang', + toplevel=False + ), + XML( + tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents', + 'textLang'], + attribute='otherLangs', + toplevel=False + ) + ) 
self.comments.extractor = Combined( XML( diff --git a/backend/corpora/peaceportal/peaceportal.py b/backend/corpora/peaceportal/peaceportal.py index d3483f5d0..da8653927 100644 --- a/backend/corpora/peaceportal/peaceportal.py +++ b/backend/corpora/peaceportal/peaceportal.py @@ -300,7 +300,7 @@ def request_media(self, document): language = FieldDefinition( name='language', display_name='Language', - description='Language written on the inscription.', + description='Language of the inscription.', es_mapping=keyword_mapping(), search_filter=MultipleChoiceFilter( description='Search only within these languages.', @@ -310,6 +310,13 @@ def request_media(self, document): visualization_type='term_frequency' ) + language_code = FieldDefinition( + name='language_code', + display_name='Language code', + description='ISO 639 code for the language of the inscription.', + es_mapping=keyword_mapping() + ) + bibliography = FieldDefinition( name='bibliography', es_mapping=keyword_mapping(), @@ -372,6 +379,7 @@ def __init__(self): self.settlement, self.location_details, self.language, + self.language_code, self.iconography, self.images, self.coordinates, From dc3fab2f5169edf9f8ff8622f444f17ca6312741 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 16 Nov 2023 14:48:08 +0100 Subject: [PATCH 94/98] add load_corpus test --- backend/corpora/peaceportal/conftest.py | 4 ++-- .../tests/{test_import.py => test_peace.py} | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) rename backend/corpora/peaceportal/tests/{test_import.py => test_peace.py} (96%) diff --git a/backend/corpora/peaceportal/conftest.py b/backend/corpora/peaceportal/conftest.py index 0584ac323..15c2e1626 100644 --- a/backend/corpora/peaceportal/conftest.py +++ b/backend/corpora/peaceportal/conftest.py @@ -4,7 +4,7 @@ here = os.path.abspath(os.path.dirname(__file__)) @pytest.fixture() -def peace_corpus_settings(settings): +def peace_test_settings(settings): settings.CORPORA = { 'peaceportal': 
os.path.join(here, 'peaceportal.py'), 'peaceportal-epidat': os.path.join(here, 'epidat.py'), @@ -18,4 +18,4 @@ def peace_corpus_settings(settings): settings.PEACEPORTAL_IIS_DATA = os.path.join(here, 'tests', 'data', 'iis', 'xml') settings.PEACEPORTAL_IIS_TXT_DATA = os.path.join(here, 'tests', 'data', 'iis', 'transcription_txts') settings.PEACEPORTAL_TOL_DATA = os.path.join(here, 'tests', 'data', 'tol') - settings.PEACEPORTAL_ALIAS = 'peaceportal' \ No newline at end of file + settings.PEACEPORTAL_ALIAS = 'peaceportal' diff --git a/backend/corpora/peaceportal/tests/test_import.py b/backend/corpora/peaceportal/tests/test_peace.py similarity index 96% rename from backend/corpora/peaceportal/tests/test_import.py rename to backend/corpora/peaceportal/tests/test_peace.py index c19ba2dc0..54db50ef3 100644 --- a/backend/corpora/peaceportal/tests/test_import.py +++ b/backend/corpora/peaceportal/tests/test_peace.py @@ -1,9 +1,9 @@ import os -import warnings import pytest -from datetime import datetime from addcorpus.load_corpus import load_corpus_definition +from addcorpus.save_corpus import load_and_save_all_corpora +from addcorpus.models import Corpus CORPUS_TEST_DATA = [ { @@ -236,7 +236,7 @@ def corpus_test_name(corpus_spec): return corpus_spec['name'] @pytest.mark.parametrize("corpus_object", CORPUS_TEST_DATA, ids=corpus_test_name) -def test_imports(peace_corpus_settings, corpus_object): +def test_imports(peace_test_settings, corpus_object): parent_corpus = load_corpus_definition('peaceportal') corpus = load_corpus_definition(corpus_object.get('name')) assert len(os.listdir(os.path.abspath(corpus.data_directory))) != 0 @@ -271,3 +271,10 @@ def get_documents(corpus, start, end): end=end ) return corpus.documents(sources) + +def test_peaceportal_validation(db, peace_test_settings): + load_and_save_all_corpora() + corpus_names = [case['name'] for case in CORPUS_TEST_DATA] + for corpus_name in corpus_names: + corpus = Corpus.objects.get(name=corpus_name) + assert 
corpus.active \ No newline at end of file From 2fdf10986b2e5baead7b17aaff9d889752b1b2dd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:04:32 +0000 Subject: [PATCH 95/98] Bump urllib3 from 1.26.17 to 1.26.18 in /backend Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.17 to 1.26.18. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.17...1.26.18) --- updated-dependencies: - dependency-name: urllib3 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- backend/requirements.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 80293e02e..254cb5960 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -55,7 +55,7 @@ defusedxml==0.7.1 # djangosaml2 # pysaml2 # python3-openid -dj-rest-auth[with_social]==2.2.7 +dj-rest-auth[with-social,with_social]==2.2.7 # via -r requirements.in django==4.1.10 # via @@ -155,7 +155,9 @@ pycparser==2.21 pyfume==0.2.25 # via fuzzytm pyjwt[crypto]==2.8.0 - # via django-allauth + # via + # django-allauth + # pyjwt pyopenssl==23.1.1 # via pysaml2 pypdf2==3.0.1 @@ -235,7 +237,7 @@ tqdm==4.64.1 # nltk typing-extensions==4.8.0 # via pypdf2 -urllib3==1.26.17 +urllib3==1.26.18 # via # django-revproxy # elastic-transport From 25f5c7ca0ed44e2f54623bb499651f1ec7b6e256 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:05:08 +0000 Subject: [PATCH 96/98] Bump django from 4.1.10 to 4.1.13 in /backend Bumps [django](https://github.com/django/django) from 4.1.10 to 4.1.13. - [Commits](https://github.com/django/django/compare/4.1.10...4.1.13) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- backend/requirements.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 80293e02e..7235cf209 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -55,9 +55,9 @@ defusedxml==0.7.1 # djangosaml2 # pysaml2 # python3-openid -dj-rest-auth[with_social]==2.2.7 +dj-rest-auth[with-social,with_social]==2.2.7 # via -r requirements.in -django==4.1.10 +django==4.1.13 # via # -r requirements.in # dj-rest-auth @@ -155,7 +155,9 @@ pycparser==2.21 pyfume==0.2.25 # via fuzzytm pyjwt[crypto]==2.8.0 - # via django-allauth + # via + # django-allauth + # pyjwt pyopenssl==23.1.1 # via pysaml2 pypdf2==3.0.1 From 8eeba13766a407d6caa6800dac652ffdf6c4c4b3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:38:22 +0000 Subject: [PATCH 97/98] Bump axios from 1.1.3 to 1.6.2 in /frontend Bumps [axios](https://github.com/axios/axios) from 1.1.3 to 1.6.2. - [Release notes](https://github.com/axios/axios/releases) - [Changelog](https://github.com/axios/axios/blob/v1.x/CHANGELOG.md) - [Commits](https://github.com/axios/axios/compare/v1.1.3...v1.6.2) --- updated-dependencies: - dependency-name: axios dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- frontend/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 3d00346ea..4a54260f0 100644 --- a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -2660,9 +2660,9 @@ autoprefixer@^10.4.2: postcss-value-parser "^4.2.0" axios@^1.0.0: - version "1.1.3" - resolved "https://registry.yarnpkg.com/axios/-/axios-1.1.3.tgz#8274250dada2edf53814ed7db644b9c2866c1e35" - integrity sha512-00tXVRwKx/FZr/IDVFt4C+f9FYairX517WoGCL6dpOntqLkZofjhu43F/Xl44UOpqa+9sLFDrG/XAnFsUYgkDA== + version "1.6.2" + resolved "https://registry.yarnpkg.com/axios/-/axios-1.6.2.tgz#de67d42c755b571d3e698df1b6504cde9b0ee9f2" + integrity sha512-7i24Ri4pmDRfJTR7LDBhsOTtcm+9kjX5WiY1X3wIisx6G9So3pfMkEiU7emUBe46oceVImccTEM3k6C5dbVW8A== dependencies: follow-redirects "^1.15.0" form-data "^4.0.0" From f99068f9a0f751e4f9d493a99288f7ce49836355 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 16 Nov 2023 17:37:46 +0100 Subject: [PATCH 98/98] enforce colour for active dropdown item close #1320 --- frontend/src/app/dropdown/dropdown.component.scss | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/app/dropdown/dropdown.component.scss b/frontend/src/app/dropdown/dropdown.component.scss index 865791e8a..c5d28eccf 100644 --- a/frontend/src/app/dropdown/dropdown.component.scss +++ b/frontend/src/app/dropdown/dropdown.component.scss @@ -4,4 +4,8 @@ // this prevents the .dropdown item rule from being overwritten // when the dropdown is used within a
- {{field.displayName}}: - + {{field.displayName}}: + From a7ed79fee177af6ef630341ca3ebb07674cc43cb Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 2 Nov 2023 17:00:57 +0100 Subject: [PATCH 56/98] labelled nav elements --- frontend/src/app/corpus-header/corpus-header.component.html | 6 +++--- frontend/src/app/menu/menu.component.html | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/frontend/src/app/corpus-header/corpus-header.component.html b/frontend/src/app/corpus-header/corpus-header.component.html index 6902541c1..57de216d5 100644 --- a/frontend/src/app/corpus-header/corpus-header.component.html +++ b/frontend/src/app/corpus-header/corpus-header.component.html @@ -1,7 +1,7 @@
-
+

Search @@ -12,7 +12,7 @@

-
+
+
diff --git a/frontend/src/app/menu/menu.component.html b/frontend/src/app/menu/menu.component.html index 02e733ce0..dd2ef146b 100644 --- a/frontend/src/app/menu/menu.component.html +++ b/frontend/src/app/menu/menu.component.html @@ -1,4 +1,4 @@ -
Filters Results CorpusLink
{{query.started | date:'medium'}} {{query.queryModel | formatQueryText }} {{query.total_results}} {{corpusTitle(query.corpus)}} + + + + +
TimestampTypeTimestampType CorpusQueryFiltersField(s)StatusDownloadQueryFiltersField(s)StatusDownload
TimestampQueryFiltersResultsTimestampQueryFiltersResults CorpusLinkLink
- + + + + + +
- + + + + + +
color: $text !important; + + &.is-active { + color: $text-invert !important; + } }