Skip to content

Commit

Permalink
Merge pull request #1631 from UUDigitalHumanitieslab/feature/euparl
Browse files Browse the repository at this point in the history
Feature/euparl
  • Loading branch information
BeritJanssen authored Sep 25, 2024
2 parents a512140 + 78bdc5a commit 1e89cd4
Show file tree
Hide file tree
Showing 12 changed files with 1,873 additions and 640 deletions.
8 changes: 2 additions & 6 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ ianalyzer/config.py
# csv downloads
download/csv_files/

# word models
corpora/*/wm/*
!corpora/*/wm/documentation.md

# file storage
test_data/
data/
/test_data/
/data/
6 changes: 6 additions & 0 deletions backend/addcorpus/python_corpora/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ianalyzer_readers.readers.xml import XMLReader
from ianalyzer_readers.readers.csv import CSVReader
from ianalyzer_readers.readers.html import HTMLReader
from ianalyzer_readers.readers.rdf import RDFReader
from ianalyzer_readers.readers.xlsx import XLSXReader

from addcorpus.python_corpora.filters import Filter
Expand Down Expand Up @@ -340,6 +341,11 @@ def source2dicts(self, source, *nargs, **kwargs):

yield field_dict

class RDFCorpusDefinition(CorpusDefinition, RDFReader):
    '''
    An RDFCorpus is any corpus that extracts its data from Linked Data files.
    '''

# Fields ######################################################################


Expand Down
689 changes: 674 additions & 15 deletions backend/corpora/parliament/conftest.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions backend/corpora/parliament/description/euparl.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Debates from the European Parliament in English (either originally spoken in English or translated to English), as provided by the [Talk of Europe](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x62-ew3m&version=1.0) dataset. The dataset covers debates from July 1999 to July 2017.
238 changes: 238 additions & 0 deletions backend/corpora/parliament/euparl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
from datetime import datetime
from itertools import chain
import logging
import os
from typing import Tuple, Union

from django.conf import settings
from langcodes import standardize_tag, Language
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import DCTERMS, FOAF, RDFS, RDF as RDFNS
from ianalyzer_readers.extract import Backup, Combined, Metadata, RDF

from addcorpus.es_mappings import keyword_mapping
from addcorpus.python_corpora.corpus import FieldDefinition, RDFCorpusDefinition
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults

# Names of the Turtle files of the dataset; each is joined with the corpus'
# `data_directory` before being parsed.
# Events/structure metadata, parsed into the same graph as the speeches
EVENTS_METADATA = 'Events_and_structure.ttl'
# Background data on members of parliament, read by `add_speaker_metadata`
MP_METADATA = 'MembersOfParliament_background.ttl'
# The speeches themselves; this is the file yielded as the corpus source
SPEECHES = 'English.ttl'

# Namespaces of Linked Politics (NB: the purl links resolve to dead sites)
LP_EU = Namespace('http://purl.org/linkedpolitics/eu/plenary/')
LPV_EU = Namespace('http://purl.org/linkedpolitics/vocabulary/eu/plenary/')
LP = Namespace('http://purl.org/linkedpolitics/')
LPV = Namespace('http://purl.org/linkedpolitics/vocabulary/')

def add_speaker_metadata(filename: str) -> dict:
    '''Parse all relevant metadata out of the MembersOfParliament ttl file.

    Parameters:
        filename: path of the file describing the members of parliament.

    Returns:
        A dict keyed by speaker node. Each value is a dict with:
        - `name`: the name of the speaker
        - `country`: the label of the represented country, or None if the
          graph holds no such label for the speaker
        - `parties`: a list of dicts, one per EU party function, with the keys
          `party_acronym` and `party_name` (label nodes; the shortest and the
          longest label of the party) and `date_start` and `date_end`

    Speakers for whom no name can be found are logged and left out.
    '''
    logger = logging.getLogger("indexing")
    speaker_dict = {}
    speaker_graph = Graph()
    speaker_graph.parse(filename)
    for speaker in speaker_graph.subjects(object=LPV.MemberOfParliament):
        try:
            name = speaker_graph.value(speaker, FOAF.name).value
        except AttributeError:
            logger.info("Cannot find name of speaker subject %s", speaker)
            continue

        # the country is optional metadata: a missing node or label results
        # in `None` rather than aborting the whole import
        country_node = speaker_graph.value(speaker, LPV.countryOfRepresentation)
        country_label = (
            speaker_graph.value(country_node, RDFS.label)
            if country_node is not None
            else None
        )
        country_name = getattr(country_label, 'value', None)

        party_list = []
        for function in speaker_graph.objects(speaker, LPV.politicalFunction):
            function_type = speaker_graph.value(function, LPV.institution)
            if speaker_graph.value(function_type, RDFNS.type) != LPV.EUParty:
                continue
            party_labels = list(speaker_graph.objects(function_type, RDFS.label))
            if not party_labels:
                # min() / max() would raise a ValueError on an empty list
                logger.info("Cannot find labels of party %s", function_type)
                continue
            # NOTE(review): assumes every party function has both a beginning
            # and an end date in the graph - confirm against the data
            date_start = speaker_graph.value(function, LPV.beginning)
            date_end = speaker_graph.value(function, LPV.end)
            party_list.append({
                'party_acronym': min(party_labels, key=len),
                'party_name': max(party_labels, key=len),
                'date_start': date_start.value,
                'date_end': date_end.value,
            })

        speaker_dict[speaker] = {
            'name': name,
            'country': country_name,
            'parties': party_list,
        }
    return speaker_dict

def get_identifier(input: str) -> str:
    '''Return the part of `input` that follows its last slash.'''
    segments = input.rsplit('/', 1)
    return segments[-1]


def language_name(lang_code: str) -> str:
    '''Return the display name of the language identified by `lang_code`.'''
    normalized_tag = standardize_tag(lang_code)
    language = Language.make(language=normalized_tag)
    return language.display_name()


def get_speaker(input: Tuple[URIRef, dict]) -> Union[str, None]:
    '''Look up the name of a speaker in the speaker metadata.

    Parameters:
        input: tuple of the speaker node and the speaker metadata, a dict
            keyed by speaker node.

    Returns:
        The `name` stored for the speaker, or None if the speaker does not
        occur in the metadata (speakers without a name are left out of it).
    '''
    (speaker, speaker_dict) = input
    # fall back on an empty dict, so unknown speakers yield None instead of
    # raising an AttributeError
    speaker_data = speaker_dict.get(speaker) or {}
    return speaker_data.get('name')

def get_speaker_country(input: Tuple[URIRef, dict]) -> Union[str, None]:
    '''Look up the country a speaker represents in the speaker metadata.

    Parameters:
        input: tuple of the speaker node and the speaker metadata, a dict
            keyed by speaker node.

    Returns:
        The `country` stored for the speaker, or None if the speaker does not
        occur in the metadata (speakers without a name are left out of it).
    '''
    (speaker, speaker_dict) = input
    # fall back on an empty dict, so unknown speakers yield None instead of
    # raising an AttributeError
    speaker_data = speaker_dict.get(speaker) or {}
    return speaker_data.get('country')

def get_speaker_party(input: Tuple[str, datetime, dict]) -> Union[str, None]:
    '''Look up which EU party the speaker was part of at the date of their speech.

    Parameters:
        input: tuple of the speaker, the date of the speech and the speaker
            metadata. The metadata is a dict keyed by speaker; the party
            memberships of a speaker are listed under `parties`.

    Returns:
        A string of the format `party name (acronym)` for the first party
        membership which includes the date, or None if the speaker is unknown
        or none of their memberships covers the date.
    '''
    (speaker, date, party_data) = input
    # unknown speakers, or speakers without party data, have no memberships
    party_list = (party_data.get(speaker) or {}).get('parties') or []
    return next(
        (
            f"{p['party_name'].value} ({p['party_acronym'].value})"
            for p in party_list
            if (date >= p["date_start"] and date <= p["date_end"])
        ),
        # without a default, `next` raises StopIteration if nothing matches
        None,
    )

def get_speech_index(input: Tuple[str, list]) -> Union[int, None]:
    '''Find the index of a speech in the array of debate parts.

    Parameters:
        input: tuple of the speech and the list of parts of its debate.

    Returns:
        The 1-based position of the speech in the list, or None if there is
        no speech, no list of parts, or the speech is not in the list.
    '''
    speech, speeches = input
    if not speech or not speeches:
        return None
    try:
        return speeches.index(speech) + 1
    except ValueError:
        # the speech is not listed among the parts of the debate
        return None

def get_speech_text(input: str) -> str:
    '''Remove leading language information, e.g., `(IT)`, from a speech.

    Only a marker at the very start of the text is removed: two or three
    uppercase letters in parentheses, followed by a space. Parentheses
    further on in the text, e.g. `(CEBS) has`, are left alone.

    Parameters:
        input: the text of the speech.

    Returns:
        The text without the leading language marker; the unchanged text if
        it does not start with such a marker.
    '''
    marker, separator, remainder = input.partition(') ')
    language_code = marker[1:]
    starts_with_language = (
        bool(separator)
        and marker.startswith('(')
        and len(language_code) in (2, 3)
        and language_code.isalpha()
        and language_code.isupper()
    )
    return remainder if starts_with_language else input

def get_uri(input: Union[URIRef, str]) -> str:
    '''Convert input from URIRef to string.

    Parameters:
        input: a node with an `n3` method, or a plain value.

    Returns:
        The N3 serialisation of the input without the enclosing angle
        brackets. If the input cannot be serialised (e.g., it is a plain
        string without an `n3` method), it is returned unchanged.
    '''
    try:
        return input.n3().strip('<>')
    except Exception:
        # Best effort: fall back on the input as is. `Exception` rather than
        # a bare `except`, so KeyboardInterrupt / SystemExit still propagate.
        return input

class ParliamentEurope(Parliament, RDFCorpusDefinition):
    """
    Speeches of the European parliament, (originally in or translated to English),
    provided as Linked Open Data by the "Talk of Europe" project
    """
    title = 'People & Parliament (European Parliament)'
    description = "Speeches from the European Parliament (EP)"
    min_date = datetime(year=1999, month=7, day=20)
    max_date = datetime(year=2017, month=7, day=6)
    data_directory = settings.PP_EUPARL_DATA
    # the index name can be overridden in the settings
    es_index = getattr(settings, 'PP_EUPARL_INDEX', 'parliament-euparl')
    languages = ['en']
    description_page = 'euparl.md'
    image = 'euparl.jpeg'

    def sources(self, **kwargs):
        """
        Yield the speeches file, together with the speaker metadata.

        The single source is a tuple of the path of the speeches file and a
        metadata dict, of which the key `speakers` holds the result of
        `add_speaker_metadata` for the members of parliament file.
        """
        metadata = {
            "speakers": add_speaker_metadata(
                os.path.join(self.data_directory, MP_METADATA)
            )
        }
        yield os.path.join(self.data_directory, SPEECHES), metadata

    def document_subjects(self, graph: Graph):
        """return all subjects which have either translated or spoken text"""
        return chain(
            graph.subjects(predicate=LPV.translatedText),
            graph.subjects(predicate=LPV.spokenText),
        )

    def parse_graph_from_filename(self, filename: str) -> Graph:
        '''
        Parse `filename` and the events metadata file into one graph.

        We combine the graphs in place, to keep memory load low.
        '''
        graph = Graph()
        graph.parse(filename)
        graph.parse(os.path.join(self.data_directory, EVENTS_METADATA))
        return graph

    # identifier of the debate: last segment of the node the speech is part of
    debate_id = field_defaults.debate_id()
    debate_id.extractor = RDF(
        DCTERMS.isPartOf,
        transform=get_identifier
    )

    # title of the node the speech is part of
    debate_title = field_defaults.debate_title()
    debate_title.extractor = RDF(
        DCTERMS.isPartOf,
        DCTERMS.title
    )

    date = field_defaults.date(min_date, max_date)
    date.extractor = RDF(
        DCTERMS.date,
        transform=lambda x: x.strftime('%Y-%m-%d')
    )

    # the party is looked up in the speaker metadata, based on the speaker
    # and the date of the speech
    party = field_defaults.party()
    party.extractor = Combined(
        RDF(LPV.speaker),
        RDF(DCTERMS.date),
        Metadata('speakers'),
        transform=get_speaker_party
    )

    # position of the speech among the parts of its debate
    # NOTE(review): presumably `RDF()` without predicates extracts the subject
    # itself - confirm against the ianalyzer_readers documentation
    sequence = field_defaults.sequence()
    sequence.extractor = Combined(
        RDF(),
        RDF(DCTERMS.isPartOf, DCTERMS.hasPart, multiple=True),
        transform=get_speech_index,
    )

    source_language = field_defaults.language()
    source_language.name = 'source_language'
    source_language.display_name = 'Source language'
    source_language.description = 'Original language of the speech'
    source_language.search_filter.description = 'Search only in speeches in the selected source languages'
    source_language.extractor = RDF(DCTERMS.language, transform=language_name)

    speaker = field_defaults.speaker()
    speaker.extractor = Combined(
        RDF(LPV.speaker),
        Metadata('speakers'),
        transform=get_speaker
    )

    speaker_country = FieldDefinition(
        name='speaker_country',
        display_name='Represented country',
        description='The EU country the speaker represents',
        es_mapping=keyword_mapping(),
        extractor=Combined(
            RDF(LPV.speaker),
            Metadata('speakers'),
            transform=get_speaker_country
        )
    )

    # the spoken text is tried first, with the translated text as backup
    speech = field_defaults.speech(language='en')
    speech.extractor = Backup(
        RDF(
            LPV.spokenText,
        ),
        RDF(
            LPV.translatedText,
        ),
        transform=get_speech_text
    )

    speech_id = field_defaults.speech_id()
    speech_id.extractor = RDF(transform=get_identifier)

    # the video URI is tried first, with a second extractor without
    # predicates as backup
    url = field_defaults.url()
    url.extractor = Backup(RDF(LPV.videoURI, transform=get_uri), RDF(transform=get_uri))

    def __init__(self):
        self.fields = [
            self.date,
            self.debate_id,
            self.debate_title,
            self.party,
            self.sequence,
            self.source_language,
            self.speaker,
            self.speaker_country,
            self.speech, self.speech_id,
            self.url
        ]
Binary file added backend/corpora/parliament/images/euparl.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions backend/corpora/parliament/tests/data/euparl/English.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix lpv_eu: <http://purl.org/linkedpolitics/vocabulary/eu/plenary/> .
@prefix lpv: <http://purl.org/linkedpolitics/vocabulary/> .
@prefix lp_eu: <http://purl.org/linkedpolitics/eu/plenary/> .
@prefix lp: <http://purl.org/linkedpolitics/> .
@prefix dcterms: <http://purl.org/dc/terms/> .

lp_eu:1999-07-21-Speech-3-063 lpv:translatedText "(IT) Mr President, as a Member of the Italian national Parliament for the\n(The Northern League for the Independence of Padania), I did not vote for Professor Prodi in Rome as I considered he would be completely useless as head of government. I was then proved right as he lost the vote of confidence of the Italian Parliament. Reckoning also that a Roman idiot would still be that stupid wherever he was, which, incidently, is reflected in the symbol on the list which bears his name for the election of this Parliament, I cannot for consistency\"s sake express my faith in the President of the Commission. As a native of the Po valley who is Italian only by passport, I am fortunately immune from the national Christian Democrat type of opportunism which brings Berlusconi together with Mastella and De Mita and sees in Prodi not the impartial President of the Commissioners uninfluenced by the States, but the lavish dispenser of favours to a wide and varied assortment of Southern Italian profiteers. Although I hold some of the Commissioners in high esteem, I recall the old mafioso Neapolitan saying: ‘A fish rots from the head downwards’ and I therefore have to express my negative opinion of the Prodi Presidency."@en .
lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Lega Nord per l'indipendenza della Padania" .
lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Speroni (NI)" .

lp_eu:2009-03-24-Speech-2-371 lpv:translatedText "Mr President, ladies and gentlemen, allow me first of all to thank you for once again giving us the opportunity to pursue the constructive dialogue that has been established between the European Investment Bank and Parliament for some years now.\nMight we go further? I would remind you that the Court of Auditors already monitors all the EIB’s activities whenever these involve the use of funds from the European budget. Should we go further towards a formal system of banking supervision? That is what Mrs Stauner was hoping for. Mr Bullmann pointed out that things were perhaps not that simple. In any event, it is worth discussing. All I can do today is confirm that the EIB is fully open to being subjected to formal banking supervision, if it is considered worthwhile.\nFor the moment, we have organised, alongside the Financial Sector Supervisory Commission in Luxembourg, a form of informal supervision.\nIn answer to Mr Audy, I would say that the action that he requested last year from the Committee of European Banking Supervisors (CEBS) has indeed been carried out. We have therefore questioned the CEBS, but it informed us that it itself did not have any authority in the area and that it could not even act in an advisory role. We are therefore still in the hands of those who would like to take an initiative in this regard. I say again that we are open to such initiatives.\nA word in conclusion on cooperation between our two institutions. Mr Mirow has already indicated that it was developing well, particularly in the Western Balkans, and with our neighbours in the East, most recently in Turkey. All I want to say, in order to keep to my speaking time, is that we are in full agreement with the recommendations featured in Mr Mitchell’s report. 
We think that it would be in the common interest of both our institutions, and of our borrowers too, for us to move towards a more rational and functional division of labour.\nA word in conclusion on Mr Seppänen’s report. I would like to say how much we have appreciated Mr Seppänen’s constructive approach. He proposes a temporary solution, which allows the EIB to continue with its activities, but which fixes a date for an in-depth discussion of the role that the EIB should play outside the European Union. I am in no doubt that this is a debate on which we will spend some time and that, I believe, has come at just the right moment.\nI am particularly happy to have the opportunity to discuss the two reports being presented to us today, because they are two reports – that of Mr Mitchell and that of Mr Seppänen – that are interesting and that raise entirely relevant issues. I hope that we will have the opportunity to return to these issues later.\nToday, of course, we are facing a crisis on an exceptional scale – probably the most serious crisis since the end of the Second World War – and it is therefore quite normal in this context for Member States to call on our two institutions to try to make a contribution to the European Union’s response to this crisis. You know that in this context the Member States, which are our shareholders, have asked the EIB to substantially increase the volume of its lending in 2009, an increase of some 30% compared to the initial forecasts, and to channel this additional effort essentially into three areas: firstly, loans to banks for small and medium-sized enterprises; secondly, energy, and in particular the fight against climate change; and finally, a special effort for those countries that are hardest hit by the crisis.\nWhat point have we reached today? 
I will give you the statistics covering the last three months of 2008 – in other words, starting from the time at which the first appeals were made to the EIB – and the first two months of 2009. During those five months we lent more than EUR 31 billion, which represents a 38% increase compared with the same period of late-2007/early-2008. In the first area, as regards loans for small and medium-sized enterprises, EUR 5.6 billion in loans were issued in this short period. Several of you have stressed the importance of aiding small and medium-sized enterprises in the current climate. In fact, we are making a very special effort in this area, and I can already tell you that the objective that we were set of releasing EUR 15 billion of these loans during the years 2008 and 2009 will be exceeded.\nAs regards the second objective, energy and the fight against climate change, here too we have made a particular effort, and it is in this context that financing for the automotive industry must be placed. We must be clear: in this sector our funding is going towards projects involving research, development and production of eco-friendly cars, that is, cars that will meet the Union’s new standards regarding the reduction of CO\nemissions.\nFinally, regarding the third area: aid for countries that have been hardest hit by the crisis: during this same five-month period we issued EUR 910 million in loans in Hungary, EUR 600 million in Latvia, EUR 1 billion in Romania and EUR 1.1 billion in Lithuania.\nI therefore think that I can say that we have been doing our best to respond to the Member States’ appeal and to implement the agreed measures without delay. 
Mr Mirow himself has already alluded to the joint International Finance Corporation-European Bank for Reconstruction and Development action plan regarding aid for the banking sector in Central and Eastern Europe.\nNaturally, this increase in the volume of our loans is only possible thanks to the increase in capital on which our shareholders have decided – it will not cost the Member States anything. However, it was decided that we needed our shareholders’ authorisation to turn our reserves into capital.\nSeveral of you have asked questions about monitoring and supervision of the EIB, and I personally think that the question is totally legitimate. When a financial institution grows in such a way, it is normal for there to be concerns about how it is monitored. There is what is already in place, which is not insignificant: there is a certain amount of internal monitoring and, above all, there is external monitoring by an independent audit committee that reports directly to our governors. Moreover, the Treaty of Lisbon makes provision for strengthening this audit committee with the addition of people who have proven experience of banking supervision."@en .
lp_eu:2009-03-24-Speech-2-371 lpv:unclassifiedMetadata "2" .

lp_eu:2017-07-06-Speech-4-146-000 lpv:spokenText "Mr President, yesterday afternoon we had a lively debate, under Rule 153, on the subject of a single seat for this Parliament. Unfortunately, under that rule, it was not possible to have a resolution, but it was the clear will of this House that we bring forward a report to propose a treaty change. So, as Mr Weber and Mr Pittella are in their seats, could they please take note of the view of this House and, when the matter comes to the Conference of Presidents, could they please authorise that report?"@en .
lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "(Applause)" .
lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "Ashley Fox (ECR )." .

lp_eu:1999-07-21_AgendaItem_5 dcterms:title "Statement by Mr Prodi, President-elect of the Commission"@en .

lp_eu:2009-03-24_AgendaItem_30 dcterms:title "EIB and EBRD annual reports for 2007 - Community guarantee to the European Investment Bank (debate)"@en .

lp_eu:2017-07-06_AgendaItem_13 dcterms:title "Composition of committees and delegations"@en .
Loading

0 comments on commit 1e89cd4

Please sign in to comment.