Merge pull request #1631 from UUDigitalHumanitieslab/feature/euparl

Feature/euparl
CentreForDigitalHumanities · Sep 25, 2024 · 1e89cd4 · 1e89cd4
2 parents a512140 + 78bdc5a
commit 1e89cd4
Show file tree

Hide file tree

Showing 12 changed files with 1,873 additions and 640 deletions.
diff --git a/backend/.gitignore b/backend/.gitignore
@@ -43,10 +43,6 @@ ianalyzer/config.py
 # csv downloads
 download/csv_files/
 
-# word models
-corpora/*/wm/*
-!corpora/*/wm/documentation.md
-
 # file storage
-test_data/
-data/
+/test_data/
+/data/
diff --git a/backend/addcorpus/python_corpora/corpus.py b/backend/addcorpus/python_corpora/corpus.py
@@ -14,6 +14,7 @@
 from ianalyzer_readers.readers.xml import XMLReader
 from ianalyzer_readers.readers.csv import CSVReader
 from ianalyzer_readers.readers.html import HTMLReader
+from ianalyzer_readers.readers.rdf import RDFReader
 from ianalyzer_readers.readers.xlsx import XLSXReader
 
 from addcorpus.python_corpora.filters import Filter
@@ -340,6 +341,11 @@ def source2dicts(self, source, *nargs, **kwargs):
 
         yield field_dict
 
+class RDFCorpusDefinition(CorpusDefinition, RDFReader):
+    '''
+    An RDFCorpus is any corpus that extracts its data from Linked Data files.
+
+    It combines `CorpusDefinition` with the `RDFReader` from ianalyzer_readers.
+    '''
+
 # Fields ######################################################################
 
 

diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py
diff --git a/backend/corpora/parliament/description/euparl.md b/backend/corpora/parliament/description/euparl.md
@@ -0,0 +1 @@
+The debates of the European Parliament in English (either spoken in English or translated into it), as provided by the [Talk of Europe](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x62-ew3m&version=1.0) dataset. The dataset covers debates from July 1999 to July 2017.
diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py
@@ -0,0 +1,238 @@
+from datetime import datetime
+from itertools import chain
+import logging
+import os
+from typing import Tuple, Union
+
+from django.conf import settings
+from langcodes import standardize_tag, Language
+from rdflib import Graph, Namespace, URIRef
+from rdflib.namespace import DCTERMS, FOAF, RDFS, RDF as RDFNS
+from ianalyzer_readers.extract import Backup, Combined, Metadata, RDF
+
+from addcorpus.es_mappings import keyword_mapping
+from addcorpus.python_corpora.corpus import FieldDefinition, RDFCorpusDefinition
+from corpora.parliament.parliament import Parliament
+import corpora.parliament.utils.field_defaults as field_defaults
+
+# Names of the Turtle files, looked up inside the corpus' data directory
+EVENTS_METADATA = 'Events_and_structure.ttl'  # merged into the speech graph when parsing
+MP_METADATA = 'MembersOfParliament_background.ttl'  # parsed by add_speaker_metadata
+SPEECHES = 'English.ttl'  # the single source file yielded by the corpus
+
+# Namespaces of Linked Politics (NB: the purl links resolve to dead sites)
+LP_EU = Namespace('http://purl.org/linkedpolitics/eu/plenary/')
+LPV_EU = Namespace('http://purl.org/linkedpolitics/vocabulary/eu/plenary/')
+LP = Namespace('http://purl.org/linkedpolitics/')
+LPV = Namespace('http://purl.org/linkedpolitics/vocabulary/')
+
+def add_speaker_metadata(filename: str) -> dict:
+    ''' Parse all relevant metadata out of MembersOfParliament ttl to dict'''
+    logger = logging.getLogger("indexing")
+    speaker_dict = {}
+    speaker_graph = Graph()
+    speaker_graph.parse(filename)
+    speaker_subjects = speaker_graph.subjects(object=LPV.MemberOfParliament)
+    for speaker in speaker_subjects:
+        try:
+            name = speaker_graph.value(speaker, FOAF.name).value
+        except AttributeError:
+            logger.info(f"Cannot find name of speaker subject {speaker}")
+            continue
+        country_node = speaker_graph.value(speaker, LPV.countryOfRepresentation)
+        country_name = speaker_graph.value(country_node, RDFS.label).value
+        party_list = []
+        speaker_functions = speaker_graph.objects(speaker, LPV.politicalFunction)
+        for function in speaker_functions:
+            function_type = speaker_graph.value(function, LPV.institution)
+            if speaker_graph.value(function_type, RDFNS.type) == LPV.EUParty:
+                party_labels = list(speaker_graph.objects(function_type, RDFS.label))
+                party_acronym = min(party_labels, key=len)
+                party_name = max(party_labels, key=len)
+                date_start = speaker_graph.value(function, LPV.beginning)
+                date_end = speaker_graph.value(function, LPV.end)
+                party_list.append({
+                    'party_acronym': party_acronym,
+                    'party_name': party_name,
+                    'date_start': date_start.value,
+                    'date_end': date_end.value
+                })
+        speaker_dict.update({speaker: {
+            'name': name,
+            'country': country_name,
+            'parties': party_list
+            }
+        })
+    return speaker_dict
+
+def get_identifier(input: str) -> str:
+    return input.split('/')[-1]
+
+
+def language_name(lang_code: str) -> str:
+    return Language.make(language=standardize_tag(lang_code)).display_name()
+
+
+def get_speaker(input: Tuple[URIRef, dict]) -> str:
+    (speaker, speaker_dict) = input
+    return speaker_dict.get(speaker).get('name')
+
+def get_speaker_country(input: Tuple[URIRef, dict]) -> str:
+    (speaker, speaker_dict) = input
+    return speaker_dict.get(speaker).get('country')
+
+def get_speaker_party(input: Tuple[str, datetime, dict]) -> str:
+    ''' look up the which EU party the speaker was part of at the date of their speech '''
+    (speaker, date, party_data) = input
+    party_list = party_data.get(speaker).get('parties')
+    return next(
+        (
+            f"{p['party_name'].value} ({p['party_acronym'].value})"
+            for p in party_list
+            if (date >= p["date_start"] and date <= p["date_end"])
+        )
+    )
+
+def get_speech_index(input: Tuple[str, list]) -> int:
+    ''' find index of speech in array of debate parts '''
+    speech, speeches = input
+    if not speech:
+        return None
+    return speeches.index(speech) + 1
+
+def get_speech_text(input: str) -> str:
+    ''' remove leading language information, e.g., `(IT)`'''
+    return input.split(') ')[-1]
+
+def get_uri(input: Union[URIRef, str]) -> str:
+    ''' convert input from URIRef to string '''
+    try:
+        return input.n3().strip('<>')
+    except:
+        return input
+
+class ParliamentEurope(Parliament, RDFCorpusDefinition):
+    """
+    Speeches of the European parliament, (originally in or translated to English),
+    provided as Linked Open Data by the "Talk of Europe" project
+    """
+    title = 'People & Parliament (European Parliament)'
+    description = "Speeches from the European Parliament (EP)"
+    min_date = datetime(year=1999, month=7, day=20)
+    max_date = datetime(year=2017, month=7, day=6)
+    data_directory = settings.PP_EUPARL_DATA
+    es_index = getattr(settings, 'PP_EUPARL_INDEX', 'parliament-euparl')
+    languages = ['en']
+    description_page = 'euparl.md'
+    image = 'euparl.jpeg'
+
+    def sources(self, **kwargs):
+        metadata = {
+            "speakers": add_speaker_metadata(
+                os.path.join(self.data_directory, MP_METADATA)
+            )
+        }
+        yield os.path.join(self.data_directory, SPEECHES), metadata
+
+    def document_subjects(self, graph: Graph):
+        """return all subjects which have either translated or spoken text"""
+        return chain(
+            graph.subjects(predicate=LPV.translatedText),
+            graph.subjects(predicate=LPV.spokenText),
+        )
+
+    def parse_graph_from_filename(self, filename: str) -> Graph:
+        ''' we combine the graphs in place, to keep memory load low '''
+        graph = Graph()
+        graph.parse(filename)
+        graph.parse(os.path.join(self.data_directory, EVENTS_METADATA))
+        return graph
+
+    debate_id = field_defaults.debate_id()
+    debate_id.extractor = RDF(
+        DCTERMS.isPartOf,
+        transform=get_identifier
+    )
+
+    debate_title = field_defaults.debate_title()
+    debate_title.extractor = RDF(
+        DCTERMS.isPartOf,
+        DCTERMS.title
+    )
+
+    date = field_defaults.date(min_date, max_date)
+    date.extractor = RDF(
+        DCTERMS.date,
+        transform=lambda x: x.strftime('%Y-%m-%d')
+    )
+
+    party = field_defaults.party()
+    party.extractor = Combined(
+        RDF(LPV.speaker),
+        RDF(DCTERMS.date),
+        Metadata('speakers'),
+        transform=get_speaker_party
+    )
+
+    sequence = field_defaults.sequence()
+    sequence.extractor = Combined(
+        RDF(),
+        RDF(DCTERMS.isPartOf, DCTERMS.hasPart, multiple=True),
+        transform=get_speech_index,
+    )
+
+    source_language = field_defaults.language()
+    source_language.name = 'source_language'
+    source_language.display_name = 'Source language'
+    source_language.description = 'Original language of the speech'
+    source_language.search_filter.description = 'Search only in speeches in the selected source languages',
+    source_language.extractor = RDF(DCTERMS.language, transform=language_name)
+
+    speaker = field_defaults.speaker()
+    speaker.extractor = Combined(
+        RDF(LPV.speaker),
+        Metadata('speakers'),
+        transform=get_speaker
+    )
+
+    speaker_country = FieldDefinition(
+        name='speaker_country',
+        display_name='Represented country',
+        description='The EU country the speaker represents',
+        es_mapping=keyword_mapping(),
+        extractor=Combined(
+            RDF(LPV.speaker),
+            Metadata('speakers'),
+            transform=get_speaker_country
+        )
+    )
+
+    speech = field_defaults.speech(language='en')
+    speech.extractor = Backup(
+        RDF(
+            LPV.spokenText,
+        ),
+        RDF(
+            LPV.translatedText,
+        ),
+        transform=get_speech_text
+    )
+
+    speech_id = field_defaults.speech_id()
+    speech_id.extractor = RDF(transform=get_identifier)
+
+    url = field_defaults.url()
+    url.extractor = Backup(RDF(LPV.videoURI, transform=get_uri), RDF(transform=get_uri))
+
+    def __init__(self):
+        self.fields = [
+            self.date,
+            self.debate_id,
+            self.debate_title,
+            self.party,
+            self.sequence,
+            self.source_language,
+            self.speaker,
+            self.speaker_country,
+            self.speech, self.speech_id,
+            self.url
+        ]
diff --git a/backend/corpora/parliament/images/euparl.jpeg b/backend/corpora/parliament/images/euparl.jpeg
diff --git a/backend/corpora/parliament/tests/data/euparl/English.ttl b/backend/corpora/parliament/tests/data/euparl/English.ttl
@@ -0,0 +1,25 @@
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix lpv_eu: <http://purl.org/linkedpolitics/vocabulary/eu/plenary/> .
+@prefix lpv: <http://purl.org/linkedpolitics/vocabulary/> .
+@prefix lp_eu: <http://purl.org/linkedpolitics/eu/plenary/> .
+@prefix lp: <http://purl.org/linkedpolitics/> .
+@prefix dcterms: <http://purl.org/dc/terms/> .
+
+lp_eu:1999-07-21-Speech-3-063 lpv:translatedText "(IT) Mr President, as a Member of the Italian national Parliament for the\n(The Northern League for the Independence of Padania), I did not vote for Professor Prodi in Rome as I considered he would be completely useless as head of government. I was then proved right as he lost the vote of confidence of the Italian Parliament. Reckoning also that a Roman idiot would still be that stupid wherever he was, which, incidently, is reflected in the symbol on the list which bears his name for the election of this Parliament, I cannot for consistency\"s sake express my faith in the President of the Commission. As a native of the Po valley who is Italian only by passport, I am fortunately immune from the national Christian Democrat type of opportunism which brings Berlusconi together with Mastella and De Mita and sees in Prodi not the impartial President of the Commissioners uninfluenced by the States, but the lavish dispenser of favours to a wide and varied assortment of Southern Italian profiteers. Although I hold some of the Commissioners in high esteem, I recall the old mafioso Neapolitan saying: ‘A fish rots from the head downwards’ and I therefore have to express my negative opinion of the Prodi Presidency."@en .
+lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Lega Nord per l'indipendenza della Padania" .
+lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Speroni (NI)" .
+
+lp_eu:2009-03-24-Speech-2-371 lpv:translatedText "Mr President, ladies and gentlemen, allow me first of all to thank you for once again giving us the opportunity to pursue the constructive dialogue that has been established between the European Investment Bank and Parliament for some years now.\nMight we go further? I would remind you that the Court of Auditors already monitors all the EIB’s activities whenever these involve the use of funds from the European budget. Should we go further towards a formal system of banking supervision? That is what Mrs Stauner was hoping for. Mr Bullmann pointed out that things were perhaps not that simple. In any event, it is worth discussing. All I can do today is confirm that the EIB is fully open to being subjected to formal banking supervision, if it is considered worthwhile.\nFor the moment, we have organised, alongside the Financial Sector Supervisory Commission in Luxembourg, a form of informal supervision.\nIn answer to Mr Audy, I would say that the action that he requested last year from the Committee of European Banking Supervisors (CEBS) has indeed been carried out. We have therefore questioned the CEBS, but it informed us that it itself did not have any authority in the area and that it could not even act in an advisory role. We are therefore still in the hands of those who would like to take an initiative in this regard. I say again that we are open to such initiatives.\nA word in conclusion on cooperation between our two institutions. Mr Mirow has already indicated that it was developing well, particularly in the Western Balkans, and with our neighbours in the East, most recently in Turkey. All I want to say, in order to keep to my speaking time, is that we are in full agreement with the recommendations featured in Mr Mitchell’s report. 
We think that it would be in the common interest of both our institutions, and of our borrowers too, for us to move towards a more rational and functional division of labour.\nA word in conclusion on Mr Seppänen’s report. I would like to say how much we have appreciated Mr Seppänen’s constructive approach. He proposes a temporary solution, which allows the EIB to continue with its activities, but which fixes a date for an in-depth discussion of the role that the EIB should play outside the European Union. I am in no doubt that this is a debate on which we will spend some time and that, I believe, has come at just the right moment.\nI am particularly happy to have the opportunity to discuss the two reports being presented to us today, because they are two reports – that of Mr Mitchell and that of Mr Seppänen – that are interesting and that raise entirely relevant issues. I hope that we will have the opportunity to return to these issues later.\nToday, of course, we are facing a crisis on an exceptional scale – probably the most serious crisis since the end of the Second World War – and it is therefore quite normal in this context for Member States to call on our two institutions to try to make a contribution to the European Union’s response to this crisis. You know that in this context the Member States, which are our shareholders, have asked the EIB to substantially increase the volume of its lending in 2009, an increase of some 30% compared to the initial forecasts, and to channel this additional effort essentially into three areas: firstly, loans to banks for small and medium-sized enterprises; secondly, energy, and in particular the fight against climate change; and finally, a special effort for those countries that are hardest hit by the crisis.\nWhat point have we reached today? 
I will give you the statistics covering the last three months of 2008 – in other words, starting from the time at which the first appeals were made to the EIB – and the first two months of 2009. During those five months we lent more than EUR 31 billion, which represents a 38% increase compared with the same period of late-2007/early-2008. In the first area, as regards loans for small and medium-sized enterprises, EUR 5.6 billion in loans were issued in this short period. Several of you have stressed the importance of aiding small and medium-sized enterprises in the current climate. In fact, we are making a very special effort in this area, and I can already tell you that the objective that we were set of releasing EUR 15 billion of these loans during the years 2008 and 2009 will be exceeded.\nAs regards the second objective, energy and the fight against climate change, here too we have made a particular effort, and it is in this context that financing for the automotive industry must be placed. We must be clear: in this sector our funding is going towards projects involving research, development and production of eco-friendly cars, that is, cars that will meet the Union’s new standards regarding the reduction of CO\nemissions.\nFinally, regarding the third area: aid for countries that have been hardest hit by the crisis: during this same five-month period we issued EUR 910 million in loans in Hungary, EUR 600 million in Latvia, EUR 1 billion in Romania and EUR 1.1 billion in Lithuania.\nI therefore think that I can say that we have been doing our best to respond to the Member States’ appeal and to implement the agreed measures without delay. 
Mr Mirow himself has already alluded to the joint International Finance Corporation-European Bank for Reconstruction and Development action plan regarding aid for the banking sector in Central and Eastern Europe.\nNaturally, this increase in the volume of our loans is only possible thanks to the increase in capital on which our shareholders have decided – it will not cost the Member States anything. However, it was decided that we needed our shareholders’ authorisation to turn our reserves into capital.\nSeveral of you have asked questions about monitoring and supervision of the EIB, and I personally think that the question is totally legitimate. When a financial institution grows in such a way, it is normal for there to be concerns about how it is monitored. There is what is already in place, which is not insignificant: there is a certain amount of internal monitoring and, above all, there is external monitoring by an independent audit committee that reports directly to our governors. Moreover, the Treaty of Lisbon makes provision for strengthening this audit committee with the addition of people who have proven experience of banking supervision."@en .
+lp_eu:2009-03-24-Speech-2-371 lpv:unclassifiedMetadata "2" .
+
+lp_eu:2017-07-06-Speech-4-146-000 lpv:spokenText "Mr President, yesterday afternoon we had a lively debate, under Rule 153, on the subject of a single seat for this Parliament. Unfortunately, under that rule, it was not possible to have a resolution, but it was the clear will of this House that we bring forward a report to propose a treaty change. So, as Mr Weber and Mr Pittella are in their seats, could they please take note of the view of this House and, when the matter comes to the Conference of Presidents, could they please authorise that report?"@en .
+lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "(Applause)" .
+lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "Ashley Fox (ECR )." .
+
+lp_eu:1999-07-21_AgendaItem_5 dcterms:title "Statement by Mr Prodi, President-elect of the Commission"@en .
+
+lp_eu:2009-03-24_AgendaItem_30 dcterms:title "EIB and EBRD annual reports for 2007 - Community guarantee to the European Investment Bank (debate)"@en .
+
+lp_eu:2017-07-06_AgendaItem_13 dcterms:title "Composition of committees and delegations"@en .
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The debates of the European Parliament in English (either spoken in English or translated into it), as provided by the [Talk of Europe](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x62-ew3m&version=1.0) dataset. The dataset covers debates from July 1999 to July 2017.