Skip to content

Commit

Permalink
Merge branch 'develop' into feature/n-gram-params
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Oct 11, 2023
2 parents f4f1b47 + e14bed8 commit 71496cb
Show file tree
Hide file tree
Showing 31 changed files with 206 additions and 94 deletions.
2 changes: 1 addition & 1 deletion backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def main_content_mapping(token_counts = True, stopword_analysis = False, stemming_analysis = False, updated_highlighting = False):
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
'''
Mapping for the main content field. Options:
Expand Down
2 changes: 1 addition & 1 deletion backend/addcorpus/es_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_nltk_stopwords(language_code):
raise NotImplementedError('language {} has no nltk stopwords list'.format(language))


def es_settings(language = None, stopword_analyzer = False, stemming_analyzer = False):
def es_settings(language=None, stopword_analyzer=False, stemming_analyzer=False):
'''
Make elasticsearch settings json for a corpus index. Options:
- `language`: string with the language code. See addcorpus.constants for options, and which languages support stopwords/stemming
Expand Down
2 changes: 2 additions & 0 deletions backend/corpora/parliament/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def parliament_corpora_settings(settings):
'parliament-denmark': os.path.join(here, 'denmark.py'),
'parliament-denmark-new': os.path.join(here, 'denmark-new.py'),
'parliament-finland': os.path.join(here, 'finland.py'),
'parliament-finland-old': os.path.join(here, 'finland-old.py'),
'parliament-norway': os.path.join(here, 'norway.py'),
'parliament-norway-new': os.path.join(here, 'norway-new.py'),
'parliament-ireland': os.path.join(here, 'ireland.py')
Expand All @@ -34,6 +35,7 @@ def parliament_corpora_settings(settings):
settings.PP_SWEDEN_DATA = os.path.join(here, 'tests', 'data', 'sweden')
settings.PP_SWEDEN_OLD_DATA = os.path.join(here, 'tests', 'data', 'sweden-old')
settings.PP_FINLAND_DATA = os.path.join(here, 'tests', 'data', 'finland')
settings.PP_FINLAND_OLD_DATA = os.path.join(here, 'tests', 'data', 'finland-old')
settings.PP_NORWAY_DATA = os.path.join(here, 'tests', 'data', 'norway')
settings.PP_NORWAY_NEW_DATA = os.path.join(here, 'tests', 'data', 'norway-new')
settings.PP_DENMARK_DATA = os.path.join(here, 'tests', 'data', 'denmark')
Expand Down
12 changes: 2 additions & 10 deletions backend/corpora/parliament/denmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,6 @@
import corpora.parliament.utils.field_defaults as field_defaults
import corpora.utils.formatting as formatting

def get_date_from_year(value, limit='earliest'):
    '''
    Convert a year string into an ISO-style date string (`YYYY-MM-DD`).

    - `value`: string holding the year, e.g. `'1877'`.
    - `limit`: `'earliest'` gives 1 January of that year; any other value
      gives 31 December.

    Returns `None` when `value` is empty, is not a plain decimal number, or
    lies outside the range of years that `datetime` supports.
    '''
    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as '²' or '½', which int() cannot parse and would raise ValueError.
    if not value or not value.isdecimal():
        return None

    year = int(value)
    try:
        if limit == 'earliest':
            date = datetime(year=year, month=1, day=1)
        else:
            date = datetime(year=year, month=12, day=31)
    except ValueError:
        # year outside datetime's supported range (e.g. 0 or > 9999):
        # treat like any other unusable value instead of crashing extraction
        return None

    return date.strftime('%Y-%m-%d')

def get_book_id(page_id):
if page_id:
Expand Down Expand Up @@ -81,15 +73,15 @@ def sources(self, start, end):
date_earliest = field_defaults.date_earliest()
date_earliest.extractor = CSV(
field='year',
transform= lambda value: get_date_from_year(value, 'earliest')
transform= lambda value: formatting.get_date_from_year(value, 'earliest')
)
date_earliest.search_filter.lower = min_date
date_earliest.search_filter.upper = max_date

date_latest = field_defaults.date_latest()
date_latest.extractor = CSV(
field='year',
transform= lambda value: get_date_from_year(value, 'latest')
transform= lambda value: formatting.get_date_from_year(value, 'latest')
)
date_latest.primary_sort = True
date_latest.search_filter.lower = min_date
Expand Down
4 changes: 0 additions & 4 deletions backend/corpora/parliament/description/canada.md
Original file line number Diff line number Diff line change
@@ -1,5 +1 @@
The debates of the House of Commons.

#### Language
English

3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/denmark-new.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the unicameral parliament Folketinget starting from 2009. The dataset has been compiled with generous assistance by Folketinget but is not yet complete due to ongoing improvement of optical character recognition.

#### Language
Danish
2 changes: 0 additions & 2 deletions backend/corpora/parliament/description/denmark.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
The debates of the two chambers of the Danish bicameral parliament Rigsdag, Landstinget and Folketinget, until 1953 and the debates of the unicameral parliament Folketinget since 1953. The dataset has been compiled with generous assistance by Folketinget but is not yet complete due to ongoing improvement of optical character recognition.

#### Language
Danish
1 change: 1 addition & 0 deletions backend/corpora/parliament/description/finland-old.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Minutes from the debates of the multicameral Diet of Finland. The chambers - Nobility, Clergy, Bourgeoisie, and Peasants - met at irregular intervals between 1863 and the Parliament Act of 1906, which founded the modern unicameral Parliament of Finland.
5 changes: 1 addition & 4 deletions backend/corpora/parliament/description/finland.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the unicameral parliament Eduskunta are expected to be included during 2023 with generous assistance by the Semantic Computing Research Group, Aalto University. Experiments with data from 1919-20, 1999 and 2015 are possible.

#### Language
Finnish and Swedish
The debates of the unicameral parliament Eduskunta, included with generous assistance by the Semantic Computing Research Group, Aalto University.
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/france.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the two chambers of the bicameral parliament of the Third Republic, Sénat and Chambre des députés, the debates of the two chambers of the bicameral parliament of the Fourth Republic, Conseil de la République and Assemblée Nationale, and the debates of the two chambers of the bicameral parliament of the Fifth Republic, Sénat and Assemblée Nationale. Only limited metadata is available. The datasets are accessible to AP project members only.

#### Language
French
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/germany-new.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the Bundestag from 1949 onwards, with metadata provided by Open Discourse.

#### Language
German
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/germany-old.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the Reichstag from 1867 to 1942 (in practice to 1934), compiled with generous assistance by the Bavarian State Library. Limited metadata is provided and the dataset suffers from OCR errors and missing data.

#### Language
German
4 changes: 0 additions & 4 deletions backend/corpora/parliament/description/ireland.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
The debates of the Irish houses of parliament (Oireachtas) from 1919 to 2020. Debates up to 2013 are from the Dáil; debates from 2014 onwards are from the Dáil and Seanad.

### Language

Debates are in English or Irish.

### Metadata

The data up to 2013 and from 2014 onwards were extracted from different source data. This means that some metadata fields are only available up to 2013, or only from 2014. Specifically:
Expand Down
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/netherlands.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the First and Second Chamber of the bicameral parliament, enriched until the early 2010s by Maarten Marx for the Political Mashup project, and 2014-2020 by ParlaMINT. Metadata is provided.

#### Language
Dutch
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/norway-new.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the unicameral parliament Stortinget compiled with generous assistance by the Norwegian National Library. Limited metadata is provided. The dataset is accessible to AP project members only.

#### Language
Norwegian
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/norway.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the unicameral parliament Stortinget compiled with generous assistance by the Norwegian National Library. Limited metadata is provided. The dataset is accessible to AP project members only.

#### Language
Norwegian
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/sweden-old.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the four estates (Nobility, Clergy, Burghers, Peasants) of the estate diet Riksdag until 1866 and the debates of the First and Second Chambers of the bicameral parliament Riksdag until 1919. Very limited metadata is available until 1920.

#### Language
Swedish
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/sweden.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the First and Second Chambers of the bicameral parliament Riksdag until 1971, and the debates of the unicameral parliament Riksdag since 1971, enriched for the period 1920-2020 by Fredrik Norén and his team at HumLab, Umeå University, Sweden.

#### Language
Swedish
3 changes: 0 additions & 3 deletions backend/corpora/parliament/description/uk.md
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
The debates of the two chambers of the British bicameral Parliament, the House of Lords and the House of Commons, based on Hansard, enriched especially regarding the nineteenth century by Jo Guldi and Steph Buongiorno, Southern Methodist University, Dallas, USA. Metadata is provided.

#### Language
English
92 changes: 92 additions & 0 deletions backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from datetime import datetime
from glob import glob

from addcorpus.corpus import CSVCorpusDefinition
from addcorpus.extract import CSV, Combined, Constant
from addcorpus.filters import MultipleChoiceFilter
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults
from corpora.utils.constants import document_context
from corpora.utils import formatting

from django.conf import settings

class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
    '''
    Corpus definition for the minutes of the Finnish estates, 1863-1905.

    Documents are read from CSV files found under `data_directory`; each
    field below states which CSV column it is extracted from.
    '''

    # general corpus metadata
    title = 'People and Parliament (Finland, 1863-1905)'
    description = 'Speeches from the early Finnish estates'
    description_page = 'finland-old.md'
    image = 'finland-old.jpg'
    languages = ['sv', 'fi']

    # date range covered by the corpus; also used as date filter bounds below
    min_date = datetime(year=1863, month=1, day=1)
    max_date = datetime(year=1905, month=12, day=31)

    # data location and index name (index name can be overridden in settings)
    data_directory = settings.PP_FINLAND_OLD_DATA
    es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')

    document_context = document_context()

    def sources(self, start, end):
        '''
        Yield a `(path, metadata)` tuple for every CSV file found
        recursively under the data directory. The metadata is always empty;
        `start` and `end` are not used to select files.
        '''
        pattern = '{}/**/*.csv'.format(self.data_directory)
        for path in glob(pattern, recursive=True):
            yield path, {}

    # the 'estate' column of the CSV is used as the chamber
    chamber = field_defaults.chamber()
    chamber.extractor = CSV(field='estate')
    chamber.search_filter = MultipleChoiceFilter(
        description='Search only in debates from the selected chamber(s)',
        option_count=4
    )

    country = field_defaults.country()
    country.extractor = Constant('Finland')

    # 'year_start' is converted to a date by formatting.get_date_from_year
    date_earliest = field_defaults.date_earliest()
    date_earliest.extractor = CSV(
        field='year_start',
        transform=lambda year: formatting.get_date_from_year(year, 'earliest')
    )
    date_earliest.search_filter.lower = min_date
    date_earliest.search_filter.upper = max_date

    # 'year_end' is converted likewise, with the 'latest' limit
    date_latest = field_defaults.date_latest()
    date_latest.extractor = CSV(
        field='year_end',
        transform=lambda year: formatting.get_date_from_year(year, 'latest')
    )
    date_latest.primary_sort = True
    date_latest.search_filter.lower = min_date
    date_latest.search_filter.upper = max_date

    language = field_defaults.language()
    language.extractor = CSV(field='language')

    page = field_defaults.page()
    page.extractor = CSV(field='page')

    source_archive = field_defaults.source_archive()
    source_archive.extractor = CSV(field='file')

    speech = field_defaults.speech()
    speech.extractor = CSV(field='text')

    # the id joins the file name and the page value with an underscore
    speech_id = field_defaults.speech_id()
    speech_id.extractor = Combined(
        CSV(field='file'),
        CSV(field='page'),
        transform='_'.join
    )

    speech_type = field_defaults.speech_type()
    speech_type.extractor = CSV(field='type')

    def __init__(self):
        '''Set the list of fields that make up this corpus.'''
        corpus_fields = [
            self.chamber,
            self.country,
            self.date_earliest,
            self.date_latest,
            self.language,
            self.page,
            self.source_archive,
            self.speech,
            self.speech_id,
            self.speech_type,
        ]
        self.fields = corpus_fields
4 changes: 1 addition & 3 deletions backend/corpora/parliament/finland.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
from addcorpus.corpus import XMLCorpusDefinition
from addcorpus.extract import XML, Combined, Constant, Metadata
from corpora.parliament.parliament import Parliament
import corpora.utils.formatting as formatting
import corpora.parliament.utils.field_defaults as field_defaults
from corpora.utils.constants import document_context
import re
from bs4 import BeautifulSoup
from corpora.parliament.utils.parlamint import extract_all_party_data, extract_people_data, extract_role_data, party_attribute_extractor, person_attribute_extractor, clean_value

Expand Down Expand Up @@ -39,7 +37,7 @@ def find_date(speech_node):


class ParliamentFinland(Parliament, XMLCorpusDefinition):
title = 'People and Parliament (Finland)'
title = 'People and Parliament (Finland, 1907-)'
description = 'Speeches from the eduskunta'
min_date = datetime(year=1907, month=1, day=1)
data_directory = settings.PP_FINLAND_DATA
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion backend/corpora/parliament/sweden-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ParliamentSwedenOld(Parliament, CSVCorpusDefinition):
max_date = datetime(year=1919, month=12, day=31)
data_directory = settings.PP_SWEDEN_OLD_DATA
es_index = getattr(settings, 'PP_SWEDEN_OLD_INDEX', 'parliament-sweden-old')

word_model_path = getattr(settings, 'PP_SWEDEN_WM', None)

document_context = constants.document_context(
context_fields=['chamber', 'date_earliest', 'date_latest']
Expand Down
1 change: 1 addition & 0 deletions backend/corpora/parliament/sweden.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ParliamentSweden(Parliament, CSVCorpusDefinition):
min_date = datetime(year=1920, month=1, day=1)
data_directory = settings.PP_SWEDEN_DATA
es_index = getattr(settings, 'PP_SWEDEN_INDEX', 'parliament-sweden')
word_model_path = getattr(settings, 'PP_SWEDEN_WM', None)

def sources(self, start, end):
for csv_file in glob('{}/**/*.csv'.format(self.data_directory), recursive=True):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
text,page,file,type,estate,language,year_start,year_end,volume
"FÖUDT HOS

FINLANDS RIDDERSKAP OCR ADEL

VID

LANDTDAGEN ÅR 1877.

TREDJE HÄFTET.

Från den 1 till den 31 Oktober.

FINSKA LITTERATUR-SÄLLBKAPETS TRYCKERl,

1878.",0,Adeln_Prot_1877_III.pdf,minutes,nobility,swe,1877,1877,3
"488 Presteståndet. 1867. 5 April.

tanke, som låge till grund derför, att nemligen förkasta allt våld, emedan det vore me 1'0. till skada än tilI gagn. Men målsmän borde aldrig medgifvas rättighet att afhålla barn från nya födel- sens bad och delaktighet i det eviga lifvet. Annan vore föräldrar- nes rätt. Men de föräldrar, som icke ville låta döpa sina barn, erkände derigenom, att de icke mera vore lutheraner. Sådana föräldrar borde derföre kunna tvingas att antingen utträda ur kyr- kans gemenskap eller låta döpa sina barn. Ty hvad skulle man göra med sådana hednabarn? Skulle familjerna fyllas med dylika giftiga svampar? Föräldrar borde derföre kunna tvingas att, ifall de ej ville låta döpa sina barn, utträda ur församlingen.

Kyrkoherden J u s eli u s anmärkte att tredskande föräldrar kunde behandlas enligt kap. 13, dit § hänvisade. Föröfrigt före- ej nade sig talaren i harr erkebiskopens reservation, men om denna vunne ståndets bifal1, sä borde åtminstone orden: ""eller måls- mäns"" utgå.

Kontraktsprosten Sir en förenade sig i 0.110 med doktor Bergh, men ansåg dock bäst att hela slutmeningen skulle utgå, emedan den vore oluthersk och uppmuntrade tilI förakt för dopet, i hvilket afseende talaren äberopade art. 9 i Confessio augustana,

Domprosten B 0 r g yttrade, att då han, såsom medlem både i granskningskomiten och kyrkolagsutskottet, icke reserverat sig mot denna §, så syntes deraf att han gilIade densamma. Luther- ska kyrkans karakter vore, att ej med tvång göra någon delaktig af nådemedlen. Den af en värd talare åberopade 9:de art. i con- fessio augustana syftade på barndopets giltighet i allmänhet, men stadgade icke att kyrkan skulle låta döpa alla barn, som hon finge tag i. När lutherska kyrkan således icke tilläte tvång i af- seende å nådemedlen, så borde sådant icke heller här komma i fräga, Om man kallade det andligt mord, att en fader afhölle sitt barn från dopet, hvad skulle man då kallo. det, att ett barn först döptes och sedan öfverlemnades att uppfostras åt en person, som ej alls erkände kyrkans lära, Om man återginge till tvångs- dopet, så skulle man frångå hela principen för kyrkolagsförslaget. Bättre vore att föräldrar, som ej läte döpa sina barn, underkasta- des kyrklig tukt. Efter detta förslag kunde kyrkan icke för sina ändamål använda polismagtens armo

Kyrkoherden B ä c k v 0. 11 öfverensstämde med dornprosten Borg deri att det strede mot den kyrkliga friheten att låta döpa barn annorlunda än med faders eIler målsrnans samtycke. Det kunde ju t, ex. i en grekisk församling inträffa det motsatta för- bållandet, att ett barn, hvars föräldrar gått undan, skulle af en grekisk målsman föras till dopet i den grekiska kyrkan. Något tvång borde icke användas.",3,Praster_Prot_1867_II.pdf,minutes,clergy,swe,1867,1867,2
"Enligt uppdmg af Finlands Ständers Bankfullmäktige hafva dessa protokoll utgifvits af

N. B. Grotenfelt.",4,Borgare_Prot_1867_I.pdf,minutes,burghers,swe,1867,1867,1
"1066 2 p. Tonkok.

tamme, sillä tällä teolla ovat. hallitus ja säädyt parhaimmalla ta- valla täyttäneet velvollisuutensa ei ainoastaan noita kovan onnen ohjaamia lampuoteja, vaan koko kansaamme kohtaan, saattamalla lahjoitusmaiden asukkaat kiitollisina muun Suomen yhteyteen. Tämä hallituksen ja säätyjen hyvänsuopaisuus sai lisä-todistetta niistä helpoituksista, jotka viime valtiopäivillä hallituksen esityk- sestä lampuodeille vuosilunastuksessa suotiin ja jotka helpoitukset näinä ahtaina raha-aikoina ovat olleet sangen suuresta merkityk- sestä, tehden mahdolliseksi monelle pysyä tilallansa, joka muuten olisi ollut mahdotonta. Aikaista on sanoa, missä määrin kansa yleensä kykenee näillä lahjoitusmaa-alueilla käyttämään eduksensa sitä itsenäistä asemaa, jonka valtiomahtimme ovat niille suoneet, vaan, sikäli kuin minä tunnen, on vireys ja työ-into aavistamatto- massa määrässä kohonnut niillä alueilla, joissa isojako on saatu toimitetuksi, niin että, jos samallaista melloa edelleen kestää, ei suinkaan nämä seudut ole kauankaan jälellä muusta Suomesta, vaan mahdollisesti menevät monessa kohden edellekin. Ettei tämmöistä laajaperäistä asiaa toimeen pantaessa yhtä ja toista tekoa hallituksen puolelta aina saata sanoa täysin onnistu- neeksi, ei liene kummeksittava; kumminkin täytyy myöntää, että hallitukselta ei suinkaan hyvää tahtoa ole puuttunut. Pahimpana valituksena kansan puolelta on ollut isojaon viivytys, jota monessa paikassa on saatu odottaa pitkälle toistakymmentä vuotta sano- mattomaksi vahingoksi maanviljelykselle. Viime aikoina on kui- tenkin hallitus tässäkin kohden käynyt kaikella tarmolla asiaan käsiksi. - Asia, joka Pyhäjärven kunnassa on herättänyt suurta tyytymättömyyttä, kuten minulle lähetetty kunnallislautakunnan pöytäkhja osoittaa, on seuraava: Hallitus, saatuaan tietää parooni Friedrichsin vaatimuksen Pyhäjärven alueen hinnasta, lähetti asia- miehensä tiedustamaan lampuodeiIta, suostuvatko he määrätyllä hinnalla tilojansa lunastamaan. 
Lampuotien tekemään kysymyk- seen, mitä etuja heille annetaan, vastasi kruunun asiamies: teille jaetaan kaikki maa ilman erotuksetta. Lampuodit suostui- vatkin kauppaan tietysti ilolla. Sittemmin on kumminkin kruunu ottanut itselleen kaikki kantatilat ja piirittänyt niille parhaat met- sät hovien ympäriltä, sekä lisäksi eroittanut noin 15 tuho tynny- rinalaa kruununpuistoksi. Tätä tekoa pitävät lampuodit lupauk- sen rikkomisena, ja vaikka lakimiehet selittävät heille että kruu- nulla on ollut täysi laillinen oikeus tekoonsa, niin ei tämä selitys mahdu heidän päähänsä. Mitä erittäin tulee valiokunnan puheenalaisessa mietinnössä ehdoittamaan ponsilauseesen, niin hyväksyn sen täydellisesti. Päätös:

Valiokunnan mietintö hyväksyttiin yksimielisesti.",2,Talonpojat_PTK_1888_III.pdf,minutes,peasants,fin,1888,1888,3
31 changes: 31 additions & 0 deletions backend/corpora/parliament/tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,37 @@
],
'n_documents': 22,
},
{
'name': 'parliament-finland-old',
'docs': [
{
'country': 'Finland',
'speech': """FÖUDT HOS
FINLANDS RIDDERSKAP OCR ADEL
VID
LANDTDAGEN ÅR 1877.
TREDJE HÄFTET.
Från den 1 till den 31 Oktober.
FINSKA LITTERATUR-SÄLLBKAPETS TRYCKERl,
1878.""",
'id': 'Adeln_Prot_1877_III.pdf_0',
'chamber': 'nobility',
'date_earliest': '1877-01-01',
'date_latest': '1877-12-31',
'page': '0',
'language': 'swe',
'source_archive': 'Adeln_Prot_1877_III.pdf'
},
],
'n_documents': 4,
},
{
'name': 'parliament-ireland',
'end': datetime(1999, 12, 31),
Expand Down
Loading

0 comments on commit 71496cb

Please sign in to comment.