added spoken corpora

clarin-eric · Oct 3, 2024 · dc047f5 · dc047f5
1 parent 1960adc
commit dc047f5
Show file tree

Hide file tree

Showing 168 changed files with 2,703 additions and 0 deletions.
diff --git a/corpora/spoken-corpora/2nd-gen-israel-migrants.json b/corpora/spoken-corpora/2nd-gen-israel-migrants.json
@@ -0,0 +1,17 @@
+{
+      "Name": "Zweite Generation deutschsprachiger Migranten in Israel",
+      "URL": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+      "Languages": ["deu"],
+      "License": "CLARIN RES",
+      "Size": ["125 hours"],
+      "Annotation": ["orthographically transcribed", "code switching"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Concordancer": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2",
+	"Download": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/aalto-dsp.json b/corpora/spoken-corpora/aalto-dsp.json
@@ -0,0 +1,16 @@
+{
+      "Name": "Aalto University DSP Course Conversation Corpus 2013-2016, Downloadable Version",
+      "URL": "http://urn.fi/urn:nbn:fi:lb-2017092133",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains spontaneous conversations.\nThe corpus is available for download from FIN-CLARIN.",
+      "Languages": ["fin"],
+      "License": "CLARIN ACA",
+      "Size": ["5200 utterances"],
+      "Annotation": [],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://urn.fi/urn:nbn:fi:lb-201708251"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/absolventinnen.json b/corpora/spoken-corpora/absolventinnen.json
@@ -0,0 +1,16 @@
+{
+      "Name": "AbsolventInnen",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0007-EC5D-8",
+      "Family": "Spoken corpora",
+      "Description": "This corpus provides data for examining the pronunciation of gender-neutral forms in German. The recordings took place at the IPS in the Munich region. 56 texts were recorded from 40 speakers. The texts came from newspapers, websites, administration offices, social services, etc., and were modified to contain either one of the three gender-neutral forms or the extended form. Each of the speakers read the 56 sentences, with target words, 25 % each, asterisk, underscore, uppercase-I or the feminine plural-form in a counterbalancing measures design. Filler sentences for this study are not a part of the corpus but will be part of further investigations. That means, that there are 56 recordings per session.",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["2 hours"],
+      "Annotation": ["orthographically transcribed", "phonetic", "phonemic transcription"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0007-EC5D-8"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/acwme.json b/corpora/spoken-corpora/acwme.json
@@ -0,0 +1,15 @@
+{
+      "Name": "The Aston Corpus of West Midlands English (ACWME)",
+      "URL": "https://researchdata.aston.ac.uk/id/eprint/162/",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains recordings of performances - comedy, drama, poetry, song and story-telling - and related interviews with performers, members of the audience and local and national celebrities.\nThe corpus is available for download from a dedicated webpage.",
+      "Languages": ["eng"],
+      "License": "",
+      "Size": [],
+      "Annotation": ["orthographically transcribed"],
+      "Infrastructure": "Other",
+      "Access": {
+	"Download": "http://www.aston.ac.uk/lss/research/lss-research/ccisc/discourse-and-culture/acwme/"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/agender.json b/corpora/spoken-corpora/agender.json
@@ -0,0 +1,16 @@
+{
+      "Name": "aGender",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0001-1500-7",
+      "Family": "Spoken corpora",
+      "Description": "The speech corpus aGender contains speech sample recordings over public telephone lines with read and (semi-)spontaneous speech. Native German speakers called a voice portal from their private phone, and read text + answered some open questions. The purpose of the corpus is the automatic detection of gender and/or age (7 mixed classes ranging from 7 - 80 years). The corpus contains the voices of 945 German speakers (approx. minimum of 100 speakers per class), each delivering 18 speech items in up to six different sessions.",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["47 hours"],
+      "Annotation": ["orthographically transcribed"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0001-1500-7"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/air-traffic-ctrl.json b/corpora/spoken-corpora/air-traffic-ctrl.json
@@ -0,0 +1,17 @@
+{
+      "Name": "Air Traffic Control Communication",
+      "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-CCA1-0",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains recordings of communication between air traffic controllers and pilots.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+      "Languages": ["eng"],
+      "License": "CC BY-NC-ND 3.0",
+      "Size": ["20 hours"],
+      "Annotation": ["speaker information"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Concordancer": "https://lindat.mff.cuni.cz/services/kontext/first_form?corpname=airtraffic_en_w",
+	"Download": "http://hdl.handle.net/11858/00-097C-0000-0001-CCA1-0"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/alcebla.json b/corpora/spoken-corpora/alcebla.json
@@ -0,0 +1,15 @@
+{
+      "Name": "ALCEBLA",
+      "URL": "http://hdl.handle.net/11022/0000-0000-50DD-D",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains speech tasks performed by bilingual children.",
+      "Languages": ["deu", "spa"],
+      "License": "HZSK-RES (restricted, non-commercial only)",
+      "Size": ["72 hours"],
+      "Annotation": ["orthographic and phonetic transcription"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions only",
+      "Access": {
+	},
+      "Publication": "Ulloa Saceda et al. (2012)"
+}
diff --git a/corpora/spoken-corpora/ananas-mt.json b/corpora/spoken-corpora/ananas-mt.json
@@ -0,0 +1,14 @@
+{
+      "Name": "AN.ANA.S._MT",
+      "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/716-corpus-ananas-multilingue-ananasmt",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains TV-broadcasts and elicited dialogues.",
+      "Languages": ["eng", "ita", "spa"],
+      "License": "",
+      "Size": [],
+      "Annotation": [],
+      "Infrastructure": "Other",
+      "Access": {
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/arabic-speech.json b/corpora/spoken-corpora/arabic-speech.json
@@ -0,0 +1,16 @@
+{
+      "Name": "Arabic Speech Corpus",
+      "URL": "http://hdl.handle.net/20.500.14106/2561",
+      "Family": "Spoken corpora",
+      "Description": "This corpus is available for download from the Oxford Text Archive.",
+      "Languages": ["ara"],
+      "License": "CC BY 4.0",
+      "Size": [],
+      "Annotation": [],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/20.500.14106/2561"
+	},
+      "Publication": "Halabi (2016)"
+}
diff --git a/corpora/spoken-corpora/asr-artur.json b/corpora/spoken-corpora/asr-artur.json
@@ -0,0 +1,17 @@
+{
+      "Name": "ASR database ARTUR 1.0",
+      "URL": "http://hdl.handle.net/11356/1772",
+      "Family": "Spoken corpora",
+      "Description": "This corpus was designed for the needs of developing automatic speech recognition for the Slovenian language. The complete database includes 1,067 hours of speech, of which 884 hours are transcribed, while the remaining 183 hours are recordings only.\nThe audio files are available in <a href=\"http://hdl.handle.net/11356/1776\">a separate repository entry</a>. Transcriptions are available in the original TRS format of the Transcriber 1.5.1 tool which was used for making the transcriptions. All transcriptions were made manually or manually corrected.\nThe data are structured as follows: <ol> <li>Artur-B, read speech, 573 hours in total.\nIt includes: (1a) Artur-B-Brani, 485 hours: Readings of sentences which were pre-selected from a 10% increment in the Gigafida 2.0 corpus. The sentences were chosen in such a way that they reflect the natural or the actual distribution of triphones in the words. They were distributed between 1,000 speakers, so that we recorded approx. 30 min in read form from each speaker. The speakers were balanced according to gender, age, region, and a small proportion of speakers were non-native speakers of Slovene. Each sentence is its own audio file and has a corresponding transcription file. (1b) Artur-B-Crkovani, 10 hours: Spellings. Speakers were asked to spell abbreviations and personal names and surnames, all chosen so that all Slovene letters were covered, plus the most common foreign letters. (1c) Artur-B-Studio, 51 hours: Designed for the development of speech synthesis. The sentences were read in a studio by a single speaker. Each sentence is its own audio file and has a corresponding transcription file. (1d) Artur-B-Izloceno, 27 hours: The recordings include different types of errors, typically, incorrect reading of sentences or a noisy environment.</li> <li>(2) Artur-J, public speech, 62 hours in total.\nIt includes: (2a) Artur-J-Splosni, 62 hours: media recordings, online recordings of conferences, workshops, education videos, etc.</li> <li>(3) Artur-N, private speech, 74 hours in total.\nIt includes: (3a) Artur-N-Obrazi, 6 hours: Speakers were asked to describe faces on pictures. Designed for a face-description domain-specific speech recognition. (3b) Artur-N-PDom, 7 hours: Speakers were asked to read pre-written sentences, as well as to express instructions for a potential smart-home system freely. Designed for a smart-home domain-specific speech recognition. (3c) Artur-N-Prosti, 61 hours: Monologues and dialogues between two persons, recorded for the purposes of the Artur database creation. Speakers were asked to converse or explain freely on casual topics.</li> <li>(4) Artur-P, parliamentary speech, 201 hours in total.\nIt includes: (4a) Artur-P-SejeDZ, 201 hours: Speech from the Slovene National Assembly.</li> </ol>\nThe corpus is available for download from the CLARIN.SI repository.",
+      "Languages": ["slv"],
+      "License": "CC BY-SA 4.0",
+      "Size": ["884 hours"],
+      "Annotation": ["orthographically transcribed"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download (transcriptions)": "http://hdl.handle.net/11356/1772",
+	"Download (audio files)": "http://hdl.handle.net/11356/1776"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/asr-parlaspeech-hr.json b/corpora/spoken-corpora/asr-parlaspeech-hr.json
@@ -0,0 +1,16 @@
+{
+      "Name": "ASR training dataset for Croatian ParlaSpeech-HR",
+      "URL": "http://hdl.handle.net/11356/1494",
+      "Family": "Spoken corpora",
+      "Description": "This corpus is built from parliamentary proceedings available in the Croatian part of the ParlaMint corpus and the parliamentary recordings available from the Croatian Parliament's YouTube channel. The corpus consists of segments 8-20 seconds in length. There are two transcripts available: the original one, and the one normalised via a simple rule-based normaliser. Each of the transcripts contains word-level alignments to the recordings. Each segment has a reference to the <a href=\"http://hdl.handle.net/11356/1432\">ParlaMint 2.1 corpus</a> via utterance IDs.\nThere is speaker information available for 381,849 segments, i.e., 95% of all segments. Speaker information consists of all the speaker information available from the ParlaMint 2.1 corpus (name, party, gender, age, status, role). There are all together 309 speakers in the dataset.\nThe dataset is divided into a training, a development, and a testing subset. Development data consist of 500 segments coming from the 5 most frequent speakers, with the goal of not losing speaker variety on dev data. Test data consist of 513 segments that come from 3 male (258 segments) and 3 female speakers (255 segments). There are no segments coming from the 6 test speakers in the two remaining subsets. The 22,076 instances not having speaker information are not assigned to any of the three subsets. The remaining 380,836 instances form the training set.\nThis corpus is available for download from the CLARIN.SI repository.",
+      "Languages": ["hrv"],
+      "License": "CC BY-SA 4.0",
+      "Size": ["1816 hours", "403925 entries"],
+      "Annotation": ["normalised transcriptions", "speaker metadata", "word-level alignment to the recordings"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11356/1494"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/australiendeutsch.json b/corpora/spoken-corpora/australiendeutsch.json
@@ -0,0 +1,17 @@
+{
+      "Name": "Australiendeutsch",
+      "URL": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+      "Languages": ["deu"],
+      "License": "CLARIN RES",
+      "Size": ["330,000 words", "65 hours"],
+      "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Concordancer": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E",
+	"Download": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/babel.json b/corpora/spoken-corpora/babel.json
@@ -0,0 +1,14 @@
+{
+      "Name": "Babel - A Multi Language Database",
+      "URL": "http://metashare.ilsp.gr:8080/repository/browse/hungarian-babel/9c27b9d481b611e2892a000c29bfc0d46a94c6ce19b843b3a452b382e2e64832/",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains various elicited speech tasks.",
+      "Languages": ["hun"],
+      "License": "",
+      "Size": [],
+      "Annotation": ["orthographically transcribed"],
+      "Infrastructure": "Other",
+      "Access": {
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-alcohol.json b/corpora/spoken-corpora/bas-alcohol.json
@@ -0,0 +1,16 @@
+{
+      "Name": "BAS Alcohol Language Corpus",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0001-88E5-3",
+      "Family": "Spoken corpora",
+      "Description": "This corpus contains recordings of 162 speakers while being sober and intoxicated. Beginning with version 3, this corpus edition also contains an emuR compatible database version of the corpus (with a minor bugfix in the database in version 3.1).",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["94 hours"],
+      "Annotation": ["orthographically transcribed", "phonemic", "user state"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0001-88E5-3"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-regional-juves.json b/corpora/spoken-corpora/bas-regional-juves.json
@@ -0,0 +1,16 @@
+{
+      "Name": "BAS Regional Variants of German - Juveniles",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0004-AE1D-9",
+      "Family": "Spoken corpora",
+      "Description": "The corpus contains both read and non-scripted German utterances. It comprises the original RVG prompts (telephone numbers, sentences, commands, digits, etc.) plus spellings, date and time expressions, and free form responses to questions, e.g. \"What are you wearing?\", \"How did you get here?\", etc. The speakers were adolescents between 13 and 20 years of age, recruited in public schools in Munich and the suburbs. More than 95% of the speakers have German as their mother language, and almost all of them attended school in Bavaria; 89 of them were male and 93 female.",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["100 hours"],
+      "Annotation": ["orthographically transcribed"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0004-AE1D-9"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-siemens.json b/corpora/spoken-corpora/bas-siemens.json
@@ -0,0 +1,16 @@
+{
+      "Name": "BAS Siemens Hoergeraete Corpus",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0002-1303-5",
+      "Family": "Spoken corpora",
+      "Description": "This is a corpus of spontaneous, relatively casual dialogues in German. Each pair of dialogue partners is recorded conversing under real-noise conditions (in a noisy cafeteria and in a car going at different velocities), as well as in a studio at various levels of lombard noise played directly into the subjects' ears.",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["24 hours"],
+      "Annotation": ["Turn segmentation"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0002-1303-5"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-sl-recog.json b/corpora/spoken-corpora/bas-sl-recog.json
@@ -0,0 +1,16 @@
+{
+      "Name": "BAS Database for Signer-Independent Continuous Sign Language Recognition",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0000-D8A5-2",
+      "Family": "Spoken corpora",
+      "Description": "The corpus contains both isolated and continuous utterances of various signers. Since we use a vision-based approach for sign language recognition the corpus was recorded on video. For quick random access to individual frames, each video clip is stored as a sequence of images. The vocabulary comprises 450 basic signs in German Sign Language (DGS) representing different word types. Based on this vocabulary, overall 780 sentences were constructed.",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["55 hours"],
+      "Annotation": ["Sign language"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0000-D8A5-2"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-smartweb-video.json b/corpora/spoken-corpora/bas-smartweb-video.json
@@ -0,0 +1,16 @@
+{
+      "Name": "BAS SmartWeb Video",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0007-C059-C",
+      "Family": "Spoken corpora",
+      "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. The recordings include 156 field recordings using a hand-held UMTS device (one person, SmartWeb Handheld Corpus SHC), 99 field recordings with video capture of the primary speaker and a secondary speaker (SmartWeb Video Corpus SVC) as well as 36 mobile recordings performed on a BMW motorbike (one speaker, SmartWeb Motorbike Corpus SMC).",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["16.2 hours"],
+      "Annotation": ["orthographically transcribed", "user state"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0007-C059-C"
+	},
+      "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-verbmobil-emo.json b/corpora/spoken-corpora/bas-verbmobil-emo.json
@@ -0,0 +1,16 @@
+{
+      "Name": "BAS Verbmobil Emotion",
+      "URL": "http://hdl.handle.net/11022/1009-0000-0004-2BCC-7",
+      "Family": "Spoken corpora",
+      "Description": "This database contains speech signals of dialogues in which a subject was recorded during a conversation via a spontaneous speech translation system. The response of the system was designed to invoke emotions (e.g. anger) in the subjects. It is part of the larger Verbmobil 2 speech data collection. Starting from BAS Clarin Repository version 2, the database is also distributed as an emuR compatible emu database.",
+      "Languages": ["deu"],
+      "License": "CLARIN ACA",
+      "Size": ["17 hours"],
+      "Annotation": ["orthographically transcribed", "emotions"],
+      "Infrastructure": "CLARIN",
+      "Group": "Corpora with transcriptions and audio recordings",
+      "Access": {
+	"Download": "http://hdl.handle.net/11022/1009-0000-0004-2BCC-7"
+	},
+      "Publication": ""
+}