diff --git a/corpora/spoken-corpora/2nd-gen-israel-migrants.json b/corpora/spoken-corpora/2nd-gen-israel-migrants.json
new file mode 100644
index 0000000..8f3ba3a
--- /dev/null
+++ b/corpora/spoken-corpora/2nd-gen-israel-migrants.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Zweite Generation deutschsprachiger Migranten in Israel",
+ "URL": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["125 hours"],
+ "Annotation": ["orthographically transcribed", "code switching"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2",
+ "Download": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/aalto-dsp.json b/corpora/spoken-corpora/aalto-dsp.json
new file mode 100644
index 0000000..8cab847
--- /dev/null
+++ b/corpora/spoken-corpora/aalto-dsp.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Aalto University DSP Course Conversation Corpus 2013-2016, Downloadable Version",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2017092133",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous conversations.\nThe corpus is available for download from FIN-CLARIN.",
+ "Languages": ["fin"],
+ "License": "CLARIN ACA",
+ "Size": ["5200 utterances"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://urn.fi/urn:nbn:fi:lb-201708251"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/absolventinnen.json b/corpora/spoken-corpora/absolventinnen.json
new file mode 100644
index 0000000..490f1b6
--- /dev/null
+++ b/corpora/spoken-corpora/absolventinnen.json
@@ -0,0 +1,16 @@
+{
+ "Name": "AbsolventInnen",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EC5D-8",
+ "Family": "Spoken corpora",
+ "Description": "This corpus provides data for examining the pronunciation of gender-neutral forms in German. The recordings took place at the IPS in the Munich region. 56 texts were recorded from 40 speakers. The texts came from newspapers, websites, administration offices, social services, etc., and were modified to contain either one of the three gender-neutral forms or the extended form. Each of the speakers read the 56 sentences, with target words, 25 % each, asterisk, underscore, uppercase-I or the feminine plural-form in a counterbalancing measures design. Filler sentences for this study are not a part of the corpus but will be part of further investigations. This means that there are 56 recordings per session.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["2 hours"],
+ "Annotation": ["orthographically transcribed", "phonetic", "phonemic transcription"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-EC5D-8"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/acwme.json b/corpora/spoken-corpora/acwme.json
new file mode 100644
index 0000000..0239909
--- /dev/null
+++ b/corpora/spoken-corpora/acwme.json
@@ -0,0 +1,15 @@
+{
+ "Name": "The Aston Corpus of West Midlands English (ACWME)",
+ "URL": "https://researchdata.aston.ac.uk/id/eprint/162/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of performances - comedy, drama, poetry, song and story-telling - and related interviews with performers, members of the audience and local and national celebrities.\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["eng"],
+ "License": "",
+ "Size": [],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://www.aston.ac.uk/lss/research/lss-research/ccisc/discourse-and-culture/acwme/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/agender.json b/corpora/spoken-corpora/agender.json
new file mode 100644
index 0000000..51ba5ae
--- /dev/null
+++ b/corpora/spoken-corpora/agender.json
@@ -0,0 +1,16 @@
+{
+ "Name": "aGender",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-1500-7",
+ "Family": "Spoken corpora",
+ "Description": "The speech corpus aGender contains speech sample recordings over public telephone lines with read and (semi-)spontaneous speech. Native German speakers called a voice portal from their private phone, and read text + answered some open questions. The purpose of the corpus is the automatic detection of gender and/or age (7 mixed classes ranging from 7 - 80 years). The corpus contains the voices of 945 German speakers (approx. minimum of 100 speakers per class), each delivering 18 speech items in up to six different sessions.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["47 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-1500-7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/air-traffic-ctrl.json b/corpora/spoken-corpora/air-traffic-ctrl.json
new file mode 100644
index 0000000..5b36d0d
--- /dev/null
+++ b/corpora/spoken-corpora/air-traffic-ctrl.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Air Traffic Control Communication",
+ "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-CCA1-0",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of communication between air traffic controllers and pilots.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+ "Languages": ["eng"],
+ "License": "CC BY-NC-ND 3.0",
+ "Size": ["20 hours"],
+ "Annotation": ["speaker information"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://lindat.mff.cuni.cz/services/kontext/first_form?corpname=airtraffic_en_w",
+ "Download": "http://hdl.handle.net/11858/00-097C-0000-0001-CCA1-0"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/alcebla.json b/corpora/spoken-corpora/alcebla.json
new file mode 100644
index 0000000..d0e8b71
--- /dev/null
+++ b/corpora/spoken-corpora/alcebla.json
@@ -0,0 +1,15 @@
+{
+ "Name": "ALCEBLA",
+ "URL": "http://hdl.handle.net/11022/0000-0000-50DD-D",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains speech tasks performed by bilingual children.",
+ "Languages": ["deu", "spa"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["72 hours"],
+ "Annotation": ["orthographic and phonetic transcription"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ },
+ "Publication": "Ulloa Saceda et al. (2012)"
+}
diff --git a/corpora/spoken-corpora/ananas-mt.json b/corpora/spoken-corpora/ananas-mt.json
new file mode 100644
index 0000000..a7c14ba
--- /dev/null
+++ b/corpora/spoken-corpora/ananas-mt.json
@@ -0,0 +1,14 @@
+{
+ "Name": "AN.ANA.S._MT",
+ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/716-corpus-ananas-multilingue-ananasmt",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains TV-broadcasts and elicited dialogues.",
+ "Languages": ["eng", "ita", "spa"],
+ "License": "",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "Other",
+ "Access": {
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/arabic-speech.json b/corpora/spoken-corpora/arabic-speech.json
new file mode 100644
index 0000000..3bc766f
--- /dev/null
+++ b/corpora/spoken-corpora/arabic-speech.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Arabic Speech Corpus",
+ "URL": "http://hdl.handle.net/20.500.14106/2561",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available for download from the Oxford Text Archive.",
+ "Languages": ["ara"],
+ "License": "CC BY 4.0",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/20.500.14106/2561"
+ },
+ "Publication": "Halabi (2016)"
+}
diff --git a/corpora/spoken-corpora/asr-artur.json b/corpora/spoken-corpora/asr-artur.json
new file mode 100644
index 0000000..95ef444
--- /dev/null
+++ b/corpora/spoken-corpora/asr-artur.json
@@ -0,0 +1,17 @@
+{
+ "Name": "ASR database ARTUR 1.0",
+ "URL": "http://hdl.handle.net/11356/1772",
+ "Family": "Spoken corpora",
+ "Description": "This corpus was designed for the needs of developing automatic speech recognition for the Slovenian language. The complete database includes 1,067 hours of speech, of which 884 hours are transcribed, while the remaining 183 hours are recordings only.\nThe audio files are available in a separate repository entry. Transcriptions are available in the original TRS format of the Transcriber 1.5.1 tool which was used for making the transcriptions. All transcriptions were made manually or manually corrected.\nThe data are structured as follows:\n- (1) Artur-B, read speech, 573 hours in total.\nIt includes: (1a) Artur-B-Brani, 485 hours: Readings of sentences which were pre-selected from a 10% increment in the Gigafida 2.0 corpus. The sentences were chosen in such a way that they reflect the natural or the actual distribution of triphones in the words. They were distributed between 1,000 speakers, so that we recorded approx. 30 min in read form from each speaker. The speakers were balanced according to gender, age, region, and a small proportion of speakers were non-native speakers of Slovene. Each sentence is its own audio file and has a corresponding transcription file. (1b) Artur-B-Crkovani, 10 hours: Spellings. Speakers were asked to spell abbreviations and personal names and surnames, all chosen so that all Slovene letters were covered, plus the most common foreign letters. (1c) Artur-B-Studio, 51 hours: Designed for the development of speech synthesis. The sentences were read in a studio by a single speaker. Each sentence is its own audio file and has a corresponding transcription file. (1d) Artur-B-Izloceno, 27 hours: The recordings include different types of errors, typically, incorrect reading of sentences or a noisy environment.\n- (2) Artur-J, public speech, 62 hours in total.\nIt includes: (2a) Artur-J-Splosni, 62 hours: media recordings, online recordings of conferences, workshops, education videos, etc.\n- (3) Artur-N, private speech, 74 hours in total.\nIt includes: (3a) Artur-N-Obrazi, 6 hours: Speakers were asked to describe faces on pictures. Designed for a face-description domain-specific speech recognition. (3b) Artur-N-PDom, 7 hours: Speakers were asked to read pre-written sentences, as well as to express instructions for a potential smart-home system freely. Designed for a smart-home domain-specific speech recognition. (3c) Artur-N-Prosti, 61 hours: Monologues and dialogues between two persons, recorded for the purposes of the Artur database creation. Speakers were asked to converse or explain freely on casual topics.\n- (4) Artur-P, parliamentary speech, 201 hours in total.\nIt includes: (4a) Artur-P-SejeDZ, 201 hours: Speech from the Slovene National Assembly.\nThe corpus is available for download from the CLARIN.SI repository.",
+ "Languages": ["slv"],
+ "License": "CC BY-SA 4.0",
+ "Size": ["884 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download (transcriptions)": "http://hdl.handle.net/11356/1772",
+ "Download (audio files)": "http://hdl.handle.net/11356/1776"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/asr-parlaspeech-hr.json b/corpora/spoken-corpora/asr-parlaspeech-hr.json
new file mode 100644
index 0000000..f0fd231
--- /dev/null
+++ b/corpora/spoken-corpora/asr-parlaspeech-hr.json
@@ -0,0 +1,16 @@
+{
+ "Name": "ASR training dataset for Croatian ParlaSpeech-HR",
+ "URL": "http://hdl.handle.net/11356/1494",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is built from parliamentary proceedings available in the Croatian part of the ParlaMint corpus and the parliamentary recordings available from the Croatian Parliament's YouTube channel. The corpus consists of segments 8-20 seconds in length. There are two transcripts available: the original one, and the one normalised via a simple rule-based normaliser. Each of the transcripts contains word-level alignments to the recordings. Each segment has a reference to the ParlaMint 2.1 corpus via utterance IDs.\nThere is speaker information available for 381,849 segments, i.e., 95% of all segments. Speaker information consists of all the speaker information available from the ParlaMint 2.1 corpus (name, party, gender, age, status, role). There are all together 309 speakers in the dataset.\nThe dataset is divided into a training, a development, and a testing subset. Development data consist of 500 segments coming from the 5 most frequent speakers, with the goal of not losing speaker variety on dev data. Test data consist of 513 segments that come from 3 male (258 segments) and 3 female speakers (255 segments). There are no segments coming from the 6 test speakers in the two remaining subsets. The 22,076 instances not having speaker information are not assigned to any of the three subsets. The remaining 380,836 instances form the training set.\nThis corpus is available for download from the CLARIN.SI repository.",
+ "Languages": ["hrv"],
+ "License": "CC BY-SA 4.0",
+ "Size": ["1816 hours", "403925 entries"],
+ "Annotation": ["normalised transcriptions", "speaker metadata", "word-level alignment to the recordings"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11356/1494"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/australiendeutsch.json b/corpora/spoken-corpora/australiendeutsch.json
new file mode 100644
index 0000000..714a20d
--- /dev/null
+++ b/corpora/spoken-corpora/australiendeutsch.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Australiendeutsch",
+ "URL": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["330,000 words", "65 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E",
+ "Download": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/babel.json b/corpora/spoken-corpora/babel.json
new file mode 100644
index 0000000..c3afc0b
--- /dev/null
+++ b/corpora/spoken-corpora/babel.json
@@ -0,0 +1,14 @@
+{
+ "Name": "Babel - A Multi Language Database",
+ "URL": "http://metashare.ilsp.gr:8080/repository/browse/hungarian-babel/9c27b9d481b611e2892a000c29bfc0d46a94c6ce19b843b3a452b382e2e64832/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains various elicited speech tasks.",
+ "Languages": ["hun"],
+ "License": "",
+ "Size": [],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "Other",
+ "Access": {
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-alcohol.json b/corpora/spoken-corpora/bas-alcohol.json
new file mode 100644
index 0000000..099b413
--- /dev/null
+++ b/corpora/spoken-corpora/bas-alcohol.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS Alcohol Language Corpus",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-88E5-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of 162 speakers while being sober and intoxicated. Beginning with version 3, this corpus edition also contains an emuR compatible database version of the corpus (with a minor bugfix in the database in version 3.1).",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["94 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic", "user state"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-88E5-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-regional-juves.json b/corpora/spoken-corpora/bas-regional-juves.json
new file mode 100644
index 0000000..4ad488f
--- /dev/null
+++ b/corpora/spoken-corpora/bas-regional-juves.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS Regional Variants of German - Juveniles",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0004-AE1D-9",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains both read and non-scripted German utterances. It comprises the original RVG prompts (telephone numbers, sentences, commands, digits, etc.) plus spellings, date and time expressions, and free form responses to questions, e.g. \"What are you wearing?\", \"How did you get here?\", etc. The speakers were adolescents between 13 and 20 years of age, recruited in public schools in Munich and the suburbs. More than 95% of the speakers have German as their mother language, and almost all of them attended school in Bavaria; 89 of them were male and 93 female.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["100 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0004-AE1D-9"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-siemens.json b/corpora/spoken-corpora/bas-siemens.json
new file mode 100644
index 0000000..101be8f
--- /dev/null
+++ b/corpora/spoken-corpora/bas-siemens.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS Siemens Hoergeraete Corpus",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0002-1303-5",
+ "Family": "Spoken corpora",
+ "Description": "This is a corpus of spontaneous, relatively casual dialogues in German. Each pair of dialogue partners is recorded conversing under real-noise conditions (in a noisy cafeteria and in a car going at different velocities), as well as in a studio at various levels of lombard noise played directly into the subjects' ears.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["24 hours"],
+ "Annotation": ["Turn segmentation"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0002-1303-5"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-sl-recog.json b/corpora/spoken-corpora/bas-sl-recog.json
new file mode 100644
index 0000000..437ccab
--- /dev/null
+++ b/corpora/spoken-corpora/bas-sl-recog.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS Database for Signer-Independent Continuous Sign Language Recognition",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-D8A5-2",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains both isolated and continuous utterances of various signers. Since we use a vision-based approach for sign language recognition the corpus was recorded on video. For quick random access to individual frames, each video clip is stored as a sequence of images. The vocabulary comprises 450 basic signs in German Sign Language (DGS) representing different word types. Based on this vocabulary, overall 780 sentences were constructed.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["55 hours"],
+ "Annotation": ["Sign language"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-D8A5-2"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-smartweb-video.json b/corpora/spoken-corpora/bas-smartweb-video.json
new file mode 100644
index 0000000..6357e81
--- /dev/null
+++ b/corpora/spoken-corpora/bas-smartweb-video.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS SmartWeb Video",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C059-C",
+ "Family": "Spoken corpora",
+ "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. The recordings include 156 field recordings using a hand-held UMTS device (one person, SmartWeb Handheld Corpus SHC), 99 field recordings with video capture of the primary speaker and a secondary speaker (SmartWeb Video Corpus SVC) as well as 36 mobile recordings performed on a BMW motorbike (one speaker, SmartWeb Motorbike Corpus SMC).",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["16.2 hours"],
+ "Annotation": ["orthographically transcribed", "user state"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-C059-C"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-verbmobil-emo.json b/corpora/spoken-corpora/bas-verbmobil-emo.json
new file mode 100644
index 0000000..45c73e2
--- /dev/null
+++ b/corpora/spoken-corpora/bas-verbmobil-emo.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS Verbmobil Emotion",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0004-2BCC-7",
+ "Family": "Spoken corpora",
+ "Description": "This database contains speech signals of dialogues in which a subject was recorded during a conversation via a spontaneous speech translation system. The response of the system was designed to invoke emotions (e.g. anger) in the subjects. It is part of the larger Verbmobil 2 speech data collection. Starting from BAS Clarin Repository version 2, the database is also distributed as an emuR compatible emu database.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["17 hours"],
+ "Annotation": ["orthographically transcribed", "emotions"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0004-2BCC-7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bas-ziptel.json b/corpora/spoken-corpora/bas-ziptel.json
new file mode 100644
index 0000000..d68a594
--- /dev/null
+++ b/corpora/spoken-corpora/bas-ziptel.json
@@ -0,0 +1,16 @@
+{
+ "Name": "BAS ZIPTEL",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0003-1E02-A",
+ "Family": "Spoken corpora",
+ "Description": "The ZipTel telephone speech database contains recordings of people applying for a SpeechDat prompt sheet via telephone. For the SpeechDat data collection, calls for participation were published in \"phone\", the customer magazine of the mobile telephone provider \"e-plus\", and in numerous newspapers all over Germany. In these calls, a telephone number was given where callers could order a SpeechDat prompt sheet. The calls were recorded by an automatic telephone server; callers were asked to provide name, address and telephone number. The ZipTel telephone speech database consists of 1957 recording sessions with a total of 7746 signal files. A recording session corresponds to one phone call, each signal file contains a single recorded utterance from the recording session.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["14 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0003-1E02-A"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bcms.json b/corpora/spoken-corpora/bcms.json
new file mode 100644
index 0000000..fea2cbb
--- /dev/null
+++ b/corpora/spoken-corpora/bcms.json
@@ -0,0 +1,18 @@
+{
+ "Name": "Map task corpus of heritage BCMS 1.0",
+ "URL": "http://hdl.handle.net/11356/1750",
+ "Family": "Spoken corpora",
+ "Description": "This corpus of heritage Bosnian/Croatian/Montenegrin/Serbian (BCMS) consists of elicited conversations (map tasks) by 29 second-generation BCMS speakers originating from different regions of former Yugoslavia and living in German-speaking Switzerland. The corpus is suited for researchers of heritage BCMS, as well as students and teachers of BCMS living in diaspora.\nThe corpus contains 30 turn-aligned transcripts with an average length of 6 minutes. The texts are annotated with the CLASSLA pipeline on the levels of lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-speech and morphological features. The corpus is enriched with corpus-specific annotations of truncations, elongations, stutter and code-switches. It is distributed in source TEI and derived vertical formats.\nThe corpus is available for download from CLARIN.SI as well as through the noSketchEngine and KonText concordancers.",
+ "Languages": ["bos", "hrv", "cnr", "srp"],
+ "License": "CC BY-NC-SA 4.0",
+ "Size": ["12,988 tokens"],
+ "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated with corpus-specific annotations"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=maptask_bcms&struct_attr_stats=1",
+ "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=maptask_bcms",
+ "Download": "http://hdl.handle.net/11356/1750"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bea.json b/corpora/spoken-corpora/bea.json
new file mode 100644
index 0000000..bce38c3
--- /dev/null
+++ b/corpora/spoken-corpora/bea.json
@@ -0,0 +1,14 @@
+{
+ "Name": "BEA (Hungarian Spontaneous Speech Database)",
+ "URL": "http://metashare.nytud.hu/repository/browse/bea-hungarian-spontaneous-speech-database/808c4c306ba911e2aa7c68b599c26a062458e40404d44e4087901b5b720d2765/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous speech.",
+ "Languages": ["hun"],
+ "License": "restricted",
+ "Size": ["465 recordings"],
+ "Annotation": ["partial transcription"],
+ "Infrastructure": "Other",
+ "Access": {
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bel-tv-debates.json b/corpora/spoken-corpora/bel-tv-debates.json
new file mode 100644
index 0000000..71dc7ac
--- /dev/null
+++ b/corpora/spoken-corpora/bel-tv-debates.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Belgische TV-Debatten",
+ "URL": "http://hdl.handle.net/10932/00-03FA-9CB0-5E33-8E01-8",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains broadcast TV debates.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["10 hours"],
+ "Annotation": ["orthographically transcribed", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-03FA-9CB0-5E33-8E01-8",
+ "Download": "http://hdl.handle.net/10932/00-03FA-9CB0-5E33-8E01-8"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/berliner-wende.json b/corpora/spoken-corpora/berliner-wende.json
new file mode 100644
index 0000000..1c7dda5
--- /dev/null
+++ b/corpora/spoken-corpora/berliner-wende.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Berliner Wendekorpus",
+ "URL": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains narrative interviews on German reunification.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["260,000 words", "28 hours"],
+ "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4",
+ "Download": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bielefeld-speech-and-gesture.json b/corpora/spoken-corpora/bielefeld-speech-and-gesture.json
new file mode 100644
index 0000000..4f46c2a
--- /dev/null
+++ b/corpora/spoken-corpora/bielefeld-speech-and-gesture.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Bielefeld Speech and Gesture Alignment Corpus",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-DEC1-C",
+ "Family": "Spoken corpora",
+ "Description": "The corpus is made up of 25 dialogs of interlocutors (50), who engage in a spatial communication task combining direction-giving and sight description. Six of those dialogues with data only from the direction giver are available including audio (*.wav) and video (*.mp4) data. There are 1764 isolated gestures in the corpus.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["9881 words"],
+ "Annotation": ["Annotations of gestures and speech-gesture referents"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-DEC1-C"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bigbrother.json b/corpora/spoken-corpora/bigbrother.json
new file mode 100644
index 0000000..601d480
--- /dev/null
+++ b/corpora/spoken-corpora/bigbrother.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The BigBrother Corpus",
+ "URL": "http://www.tekstlab.uio.no/nota/bigbrother/english.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings and transcripts from the Norwegian Big Brother in 2001.\nThe corpus is available through a Tekstlab concordancer.",
+ "Languages": ["nor"],
+ "License": "CLARIN ACA",
+ "Size": ["440,300 tokens"],
+ "Annotation": ["orthographically transcribed", "msd-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://tekstlab.uio.no/glossa2/bb"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bio-reise.json b/corpora/spoken-corpora/bio-reise.json
new file mode 100644
index 0000000..5796e02
--- /dev/null
+++ b/corpora/spoken-corpora/bio-reise.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Biographische und Reiseerzählungen",
+ "URL": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains narrative and biographic interviews.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["50,000 words", "6 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4",
+ "Download": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/bits.json b/corpora/spoken-corpora/bits.json
new file mode 100644
index 0000000..14017e7
--- /dev/null
+++ b/corpora/spoken-corpora/bits.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus BITS",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C2C0-4",
+ "Family": "Spoken corpora",
+ "Description": "This is a corpus for speech synthesis using concatenative technique.",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["16.5 hours"],
+ "Annotation": ["orthographically transcribed", "phonetic", "phonemic", "prosodic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-C2C0-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/border-karelia.json b/corpora/spoken-corpora/border-karelia.json
new file mode 100644
index 0000000..79f7d36
--- /dev/null
+++ b/corpora/spoken-corpora/border-karelia.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The Corpus of Border Karelia",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073033",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThe corpus is available for download from FIN-CLARIN.",
+ "Languages": ["fin", "krl"],
+ "License": "CC-BY",
+ "Size": ["120 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://www.uef.fi/fi/finka/finka"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/boston-u-radio.json b/corpora/spoken-corpora/boston-u-radio.json
new file mode 100644
index 0000000..4fa1ed1
--- /dev/null
+++ b/corpora/spoken-corpora/boston-u-radio.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Boston University Radio Speech Corpus",
+ "URL": "https://catalog.ldc.upenn.edu/LDC96S36",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings and texts from radio news.",
+ "Languages": ["eng"],
+ "License": "CLARIN RES",
+ "Size": ["7 hours"],
+ "Annotation": ["PoS-tagged", "phonetic alignment", "prosodic markers"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://catalog.ldc.upenn.edu/LDC96S36"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/brothers.json b/corpora/spoken-corpora/brothers.json
new file mode 100644
index 0000000..1a1b033
--- /dev/null
+++ b/corpora/spoken-corpora/brothers.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus BROTHERS",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-55C3-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of pairs of brothers between the ages of 19 and 31. The native and recorded language is German. Recordings consist of minimal pairs in carrier sentences, a different set of sentences aimed at eliciting the full range of German vowels ('Berliner Sätze'), and a spontaneous dialogue about a TV-series. Recordings were made via a table microphone (studio quality) and via telephone (telephone quality).",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["1.5 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-55C3-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/buckeye.json b/corpora/spoken-corpora/buckeye.json
new file mode 100644
index 0000000..6f6fff9
--- /dev/null
+++ b/corpora/spoken-corpora/buckeye.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Buckeye Corpus of Conversational Speech",
+ "URL": "http://hdl.handle.net/11041/sldr000776",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains an interview.\nThe corpus is available for download from ORTOLANG.",
+ "Languages": ["eng"],
+ "License": "CLARIN RES",
+ "Size": [],
+ "Annotation": ["phonetic labels"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11041/sldr000776"
+ },
+ "Publication": "Pitt et al. (2005)"
+}
diff --git a/corpora/spoken-corpora/budapest-socioling.json b/corpora/spoken-corpora/budapest-socioling.json
new file mode 100644
index 0000000..8f10718
--- /dev/null
+++ b/corpora/spoken-corpora/budapest-socioling.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Budapest Sociolinguistic Interview - version 2",
+ "URL": "http://buszi.nytud.hu/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains sociolinguistic interviews conducted with 50 individuals.\nThe corpus is available for download and through a dedicated concordancer.",
+ "Languages": ["hun"],
+ "License": "CLARIN RES",
+ "Size": ["270,000 words"],
+ "Annotation": ["MSD-tagged", "spoken language phenomena (hesitation, consonant drops)"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://buszi.nytud.hu/keresoprogramok",
+ "Download": "http://buszi.nytud.hu/buszi-2-iranyitott-beszelgetesek"
+ },
+ "Publication": "Kontra and Váradi (1997)"
+}
diff --git a/corpora/spoken-corpora/cans.json b/corpora/spoken-corpora/cans.json
new file mode 100644
index 0000000..4b9afb2
--- /dev/null
+++ b/corpora/spoken-corpora/cans.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus of American Nordic Speech (CANS)",
+ "URL": "http://tekstlab.uio.no/norskiamerika/english/corpus.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and conversations in Norwegian and Swedish dialects spoken in America.\nThe corpus is available through a Tekstlab concordancer.",
+ "Languages": ["nor", "swe"],
+ "License": "CLARIN ACA",
+ "Size": ["251,000 tokens"],
+ "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://tekstlab.uio.no/glossa2/cans"
+ },
+ "Publication": "Johannessen (2015)"
+}
diff --git a/corpora/spoken-corpora/ci-articulation.json b/corpora/spoken-corpora/ci-articulation.json
new file mode 100644
index 0000000..2199144
--- /dev/null
+++ b/corpora/spoken-corpora/ci-articulation.json
@@ -0,0 +1,16 @@
+{
+ "Name": "CI Articulation",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-8B63-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains speech recordings of normal hearing speakers and speakers equipped with Cochlear Implants (CI). Speech data were collected with the software SpeechRecorder, for each recording a BPF file was generated (*.par).",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["5 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-8B63-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/clapi.json b/corpora/spoken-corpora/clapi.json
new file mode 100644
index 0000000..518f746
--- /dev/null
+++ b/corpora/spoken-corpora/clapi.json
@@ -0,0 +1,16 @@
+{
+ "Name": "CLAPI",
+ "URL": "http://hdl.handle.net/11403/CLAPI",
+ "Family": "Spoken corpora",
+ "Description": "This is a collection containing around 40 corpora which contain social interactions in different contexts: professional, private, institutional, commercial, medical, and educational situations.\nMost of the corpora can be downloaded and queried through a dedicated concordancer.",
+ "Languages": ["fra"],
+ "License": "CC BY-NC-SA 4.0",
+ "Size": ["323,595 words"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://clapi.icar.cnrs.fr/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/clips-mt-manual.json b/corpora/spoken-corpora/clips-mt-manual.json
new file mode 100644
index 0000000..dad8afd
--- /dev/null
+++ b/corpora/spoken-corpora/clips-mt-manual.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus CLIPS_MT_MANUAL",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A9EE-6",
+ "Family": "Spoken corpora",
+ "Description": "This is a sub-corpus of the original Italian CLIPS corpus (Corpora e Lessici dell'Italiano Parlato e Scritto) that is manually annotated and covers only 15 maptask dialogues recorded in 15 locations by local speaker pairs. This corpus contains 3228 inspected and partially repaired WAV signal files, each containing one dialogue turn (*.wav), 3228 corrected original CLIPS annotation files (*.acs, *.phn, *.std, *.wrd), 3228 BAS Partitur files containing the annotation tiers ORT, KAN and SAP (*.par), 3228 EMU database annotation files (*.vot, *.hlb) covering 30 maptask dialogues performed by 30 speakers (each speaker pair performing two different map tasks) recorded in 15 different locations in Italy in 2000-2004.",
+ "Languages": ["ita"],
+ "License": "CLARIN ACA",
+ "Size": ["3 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic", "phonetic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-A9EE-6"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/clips.json b/corpora/spoken-corpora/clips.json
new file mode 100644
index 0000000..594d7ff
--- /dev/null
+++ b/corpora/spoken-corpora/clips.json
@@ -0,0 +1,16 @@
+{
+ "Name": "CLIPS : corpora e lessici di italiano parlato e scritto",
+ "URL": "http://hdl.handle.net/11372/LRT-865",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains speech from 15 different cities in Italy.",
+ "Languages": ["ita"],
+ "License": "",
+ "Size": ["100 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://www.clips.unina.it/it/corpus.jsp"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json b/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json
new file mode 100644
index 0000000..68bb4f9
--- /dev/null
+++ b/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Cluster Production in Cochlear Implant Patients (diachronic data)",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A99C-2",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains diachronic speech recordings from three cochlear implant (CI) users. For data used in the corresponding synchronic study, please refer to the CI_2 corpora. This corpus contains recordings used for the analysis of the temporal dynamics of the consonant cluster /ʃtr/.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["14 min"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-A99C-2"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/consonant-cochlear-patients.json b/corpora/spoken-corpora/consonant-cochlear-patients.json
new file mode 100644
index 0000000..2cb5c28
--- /dev/null
+++ b/corpora/spoken-corpora/consonant-cochlear-patients.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Consonant Cluster Production in Cochlear Implant Patients",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AF40-2",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG).",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["2 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-AF40-2"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/contemporary-french.json b/corpora/spoken-corpora/contemporary-french.json
new file mode 100644
index 0000000..a3bf850
--- /dev/null
+++ b/corpora/spoken-corpora/contemporary-french.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus for the Study of Contemporary French",
+ "URL": "https://hdl.handle.net/11403/cefc-orfeo/v1",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains debates, classroom interactions, literary and scientific texts, regional and national press, etc.\nThe corpus is available through a dedicated concordancer.",
+ "Languages": ["fra"],
+ "License": "CC-BY 4.0",
+ "Size": ["10 million words", "350 hours"],
+ "Annotation": ["orthographically aligned", "PoS-tagged"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://repository.ortolang.fr/api/content/cefc-orfeo/4/documentation/site-orfeo/index.html"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/corpus-avip-api.json b/corpora/spoken-corpora/corpus-avip-api.json
new file mode 100644
index 0000000..c096fbb
--- /dev/null
+++ b/corpora/spoken-corpora/corpus-avip-api.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Corpus AVIP-API",
+ "URL": "http://www.parlaritaliano.it/api/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains quasi-spontaneous dialogues (a map task).\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["ita"],
+ "License": "",
+ "Size": [],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://www.parlaritaliano.it/api/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/corpus-lip.json b/corpora/spoken-corpora/corpus-lip.json
new file mode 100644
index 0000000..7847eb6
--- /dev/null
+++ b/corpora/spoken-corpora/corpus-lip.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Corpus LIP",
+ "URL": "http://badip.uni-graz.at/it/corpus-lip/descrizione",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available through a dedicated concordancer.",
+ "Languages": ["ita"],
+ "License": "",
+ "Size": ["490,000 words"],
+ "Annotation": [],
+ "Infrastructure": "Other",
+ "Access": {
+ "Concordancer": "http://badip.uni-graz.at/it/corpus-lip/cerca"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/corpus-lips.json b/corpora/spoken-corpora/corpus-lips.json
new file mode 100644
index 0000000..5061f0f
--- /dev/null
+++ b/corpora/spoken-corpora/corpus-lips.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Corpus Lips",
+ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/653-corpus-lips",
+ "Family": "Spoken corpora",
+ "Description": "This is an L2-learner corpus.\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["ita"],
+ "License": "",
+ "Size": ["700,000 words", "100 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/653-corpus-lips"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/cosi.json b/corpora/spoken-corpora/cosi.json
new file mode 100644
index 0000000..ae845d4
--- /dev/null
+++ b/corpora/spoken-corpora/cosi.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Consecutive and Simultaneous Interpreting (CoSi)",
+ "URL": "http://hdl.handle.net/11022/0000-0000-5225-A",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains lectures in Portuguese with simultaneous interpretation in English.",
+ "Languages": ["por", "eng"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["6 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/czech-malach.json b/corpora/spoken-corpora/czech-malach.json
new file mode 100644
index 0000000..d32d2c7
--- /dev/null
+++ b/corpora/spoken-corpora/czech-malach.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Czech Malach Cross-lingual Speech Retrieval Test Collection",
+ "URL": "http://hdl.handle.net/11234/1-1912",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews with survivors of the Holocaust.\nThe corpus is available for download from LINDAT.",
+ "Languages": ["ces", "eng", "fra", "deu", "spa"],
+ "License": "CC BY-NC-ND 4.0",
+ "Size": ["592 hours"],
+ "Annotation": ["manual annotations of selected topics and interviews' metadata"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11234/1-1912"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/de-hochlautung.json b/corpora/spoken-corpora/de-hochlautung.json
new file mode 100644
index 0000000..102fd38
--- /dev/null
+++ b/corpora/spoken-corpora/de-hochlautung.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Deutsche Hochlautung",
+ "URL": "http://hdl.handle.net/10932/00-0332-C35C-4849-7B01-7",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains broadcasts in standard German.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["10,000 words", "2 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C35C-4849-7B01-7",
+ "Download": "http://hdl.handle.net/10932/00-0332-C35C-4849-7B01-7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/de-koenig.json b/corpora/spoken-corpora/de-koenig.json
new file mode 100644
index 0000000..5a73c96
--- /dev/null
+++ b/corpora/spoken-corpora/de-koenig.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Deutsche Standardsprache: König-Korpus",
+ "URL": "http://hdl.handle.net/10932/00-0332-C489-C64D-6D01-9",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and elicited speech in standard German.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["50,000 words", "6 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C489-C64D-6D01-9",
+ "Download": "http://hdl.handle.net/10932/00-0332-C489-C64D-6D01-9"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/de-mundarten-ddr.json b/corpora/spoken-corpora/de-mundarten-ddr.json
new file mode 100644
index 0000000..6c6d547
--- /dev/null
+++ b/corpora/spoken-corpora/de-mundarten-ddr.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Deutsche Mundarten: DDR",
+ "URL": "http://hdl.handle.net/10932/00-0332-BE28-4317-5D01-B",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and elicited speech in German dialects.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["German, (some Sorbian)"],
+ "License": "CLARIN RES",
+ "Size": ["212,000 words", "385 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-BE28-4317-5D01-B",
+ "Download": "http://hdl.handle.net/10932/00-0332-BE28-4317-5D01-B"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/de-mundarten-ost.json b/corpora/spoken-corpora/de-mundarten-ost.json
new file mode 100644
index 0000000..7acede5
--- /dev/null
+++ b/corpora/spoken-corpora/de-mundarten-ost.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Deutsche Mundarten: ehemalige deutsche Ostgebiete",
+ "URL": "http://hdl.handle.net/10932/00-0332-C68C-5D03-EB01-7",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and elicited speech in German dialects.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["838,000 words", "461 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C68C-5D03-EB01-7",
+ "Download": "http://hdl.handle.net/10932/00-0332-C68C-5D03-EB01-7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/de-mundarten-zwirner.json b/corpora/spoken-corpora/de-mundarten-zwirner.json
new file mode 100644
index 0000000..5478662
--- /dev/null
+++ b/corpora/spoken-corpora/de-mundarten-zwirner.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Deutsche Mundarten: Zwirner-Korpus",
+ "URL": "http://hdl.handle.net/10932/00-0332-D40A-3CEE-B901-4",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and elicited speech in German dialects.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["German, (some Frisian and Dutch)"],
+ "License": "CLARIN RES",
+ "Size": ["4 million words", "1076 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-D40A-3CEE-B901-4",
+ "Download": "http://hdl.handle.net/10932/00-0332-D40A-3CEE-B901-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/de-pfeffer.json b/corpora/spoken-corpora/de-pfeffer.json
new file mode 100644
index 0000000..e620009
--- /dev/null
+++ b/corpora/spoken-corpora/de-pfeffer.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Deutsche Umgangssprachen: Pfeffer-Korpus",
+ "URL": "http://hdl.handle.net/10932/00-0332-C9D0-78FE-3C01-2",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews in regional varieties of German.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["646,000 words", "80 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C9D0-78FE-3C01-2",
+ "Download": "http://hdl.handle.net/10932/00-0332-C9D0-78FE-3C01-2"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/dialekt.json b/corpora/spoken-corpora/dialekt.json
new file mode 100644
index 0000000..053f806
--- /dev/null
+++ b/corpora/spoken-corpora/dialekt.json
@@ -0,0 +1,17 @@
+{
+ "Name": "DIALEKT v1: dialectal corpus with multi-tier transcription",
+ "URL": "https://wiki.korpus.cz/doku.php/en:cnk:dialekt",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains traditional dialectological material, mostly unprepared monologue-type speech.\nThe corpus is available for download (upon request) and through the concordancer KonText.",
+ "Languages": ["ces"],
+ "License": "Academic Licence Agreement for Czech National Corpus Data",
+ "Size": ["100,000 words"],
+ "Annotation": ["orthographically and phonetically (dialect features) transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://kontext.korpus.cz/first_form?corpname=dialekt_v1_dial",
+ "Download": "http://wiki.korpus.cz/doku.php/en:cnk:dialekt"
+ },
+ "Publication": "Komrsková et al. (2018)"
+}
diff --git a/corpora/spoken-corpora/dialogstrukturen.json b/corpora/spoken-corpora/dialogstrukturen.json
new file mode 100644
index 0000000..1bf4ba0
--- /dev/null
+++ b/corpora/spoken-corpora/dialogstrukturen.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Dialogstrukturen",
+ "URL": "http://hdl.handle.net/10932/00-0332-C0BE-562F-C101-E",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains authentic interaction from various domains.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["140,000 words", "15 hours"],
+ "Annotation": ["orthographically transcribed", "intonation", "lemmatised", "PoS-tagged", "time alignment"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C0BE-562F-C101-E",
+ "Download": "http://hdl.handle.net/10932/00-0332-C0BE-562F-C101-E"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/doc-patient-ahus.json b/corpora/spoken-corpora/doc-patient-ahus.json
new file mode 100644
index 0000000..b80ca30
--- /dev/null
+++ b/corpora/spoken-corpora/doc-patient-ahus.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus of Doctor-Patient Conversations from Ahus",
+ "URL": "https://www.hf.uio.no/iln/english/about/organization/text-laboratory/projects/doctor-patient/index.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains doctor-patient conversations.\nThe corpus is available through a Tekstlab concordancer (account needed).",
+ "Languages": ["nor"],
+ "License": "CLARIN ACA",
+ "Size": ["958,830 tokens"],
+ "Annotation": ["orthographically transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "http://tekstlab.uio.no/glossa/html/?corpus=legepasient"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/elfa.json b/corpora/spoken-corpora/elfa.json
new file mode 100644
index 0000000..e8c5e00
--- /dev/null
+++ b/corpora/spoken-corpora/elfa.json
@@ -0,0 +1,16 @@
+{
+ "Name": "ELFA Corpus",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-201403262",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recorded lectures and seminars.\nThe corpus is available for download from FIN-CLARIN.",
+ "Languages": ["eng"],
+ "License": "CLARIN RES, MS-C-No ReD-ND-FF",
+ "Size": ["13 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://urn.fi/urn:nbn:fi:lb-2014052721"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/emigranten-israel-wiener.json b/corpora/spoken-corpora/emigranten-israel-wiener.json
new file mode 100644
index 0000000..029d05f
--- /dev/null
+++ b/corpora/spoken-corpora/emigranten-israel-wiener.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Emigrantendeutsch in Israel: Wiener in Jerusalem",
+ "URL": "http://hdl.handle.net/10932/00-0332-C42A-423C-2401-D",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["225,000 words", "51 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C42A-423C-2401-D",
+ "Download": "http://hdl.handle.net/10932/00-0332-C42A-423C-2401-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/emigranten-israel.json b/corpora/spoken-corpora/emigranten-israel.json
new file mode 100644
index 0000000..19725dd
--- /dev/null
+++ b/corpora/spoken-corpora/emigranten-israel.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Emigrantendeutsch in Israel",
+ "URL": "http://hdl.handle.net/10932/00-0332-C3A7-393A-8A01-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["232,000 words", "285 hours"],
+ "Annotation": ["orthographically transcribed", "lemma", "PoS-tagged", "time alignment"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C3A7-393A-8A01-3",
+ "Download": "http://hdl.handle.net/10932/00-0332-C3A7-393A-8A01-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/eslora.json b/corpora/spoken-corpora/eslora.json
new file mode 100644
index 0000000..b73fc8b
--- /dev/null
+++ b/corpora/spoken-corpora/eslora.json
@@ -0,0 +1,15 @@
+{
+ "Name": "ESLORA 2.0",
+ "URL": "http://eslora.usc.es/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus consists of spontaneous conversations and semi-structured interviews recorded in Galicia between 2007 and 2015, which were orthographically transcribed and manually aligned to the audio files. The transcripts have been morphologically tagged and lemmatized with the statistical PoS-tagger XIADA.\nThe corpus can be browsed via a dedicated search engine. The multiple functions of the search engine are fully described in the User Guide.",
+ "Languages": ["spa"],
+ "License": "academic, non-commercial",
+ "Size": ["83 documents", "768,005 words", "898,914 tokens"],
+ "Annotation": ["manual alignment", "orthographic transcription", "PoS-tagging", "lemmatisation"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Concordancer": "http://eslora.usc.es/search/"
+ },
+ "Publication": ["Barcala et al. (2018)", "Vázquez Rozas and Barcala (2020)"]
+}
diff --git a/corpora/spoken-corpora/est-dialect.json b/corpora/spoken-corpora/est-dialect.json
new file mode 100644
index 0000000..0cc22fe
--- /dev/null
+++ b/corpora/spoken-corpora/est-dialect.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Estonian Dialect Corpus",
+ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00076L",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThe corpus is available for download from META-SHARE (CELR distribution).",
+ "Languages": ["est"],
+ "License": "CLARIN ACA",
+ "Size": ["1.3 million words"],
+ "Annotation": ["phonetically transcribed", "MSD-tagged", "partly syntactically parsed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00076L"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/est-emotional-speech.json b/corpora/spoken-corpora/est-emotional-speech.json
new file mode 100644
index 0000000..61304b7
--- /dev/null
+++ b/corpora/spoken-corpora/est-emotional-speech.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Estonian Emotional Speech Corpus",
+ "URL": "http://hdl.handle.net/10.15155/3-00-0000-0000-0000-0001AL",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains read sentences that express anger, joy and sadness, or are neutral.\nThe corpus is available for download from META-SHARE (CELR distribution).",
+ "Languages": ["est"],
+ "License": "CC-BY",
+ "Size": ["1234 sentences"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/3-00-0000-0000-0000-0001AL"
+ },
+ "Publication": "Altrov and Pajupuu (2012)"
+}
diff --git a/corpora/spoken-corpora/est-spontaneous-speech.json b/corpora/spoken-corpora/est-spontaneous-speech.json
new file mode 100644
index 0000000..3fa0d23
--- /dev/null
+++ b/corpora/spoken-corpora/est-spontaneous-speech.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Phonetic Corpus of Estonian Spontaneous Speech v.1.0.4",
+ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00154L",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous speech by speakers with different dialectological and social backgrounds.\nThe corpus is available for download from META-SHARE (CELR distribution).",
+ "Languages": ["est"],
+ "License": "CLARIN RES",
+ "Size": ["635,000 words", "90 hours"],
+ "Annotation": ["orthographically and phonetically transcribed", "syllables", "prosodic feet", "intonation phrases", "changes in voice quality"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00154L"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/exmeralda-demo.json b/corpora/spoken-corpora/exmeralda-demo.json
new file mode 100644
index 0000000..76258cf
--- /dev/null
+++ b/corpora/spoken-corpora/exmeralda-demo.json
@@ -0,0 +1,16 @@
+{
+ "Name": "EXMARaLDA Demo Corpus 1.0",
+ "URL": "http://hdl.handle.net/11022/0000-0000-4F70-A",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is a demo of the EXMARaLDA system.\nThe corpus is available for download from a CLARIN-D repository.",
+ "Languages": ["deu", "eng", "fra", "spa", "tur", "pol", "vie", "swe", "nor", "ita", "rus", "afr", "por"],
+ "License": "HZSK-PUB (public, non-commercial only)",
+ "Size": ["2 hours"],
+ "Annotation": ["suprasegmental information", "accentuation/stress marking"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/0000-0000-4F70-A"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/fadac.json b/corpora/spoken-corpora/fadac.json
new file mode 100644
index 0000000..9871719
--- /dev/null
+++ b/corpora/spoken-corpora/fadac.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Faroese Danish Corpus Hamburg 0.2.dan (FADAC-0.2.dan Hamburg)",
+ "URL": "http://hdl.handle.net/11022/0000-0000-A0D3-C",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains informal interviews.",
+ "Languages": ["fao", "dan"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": [],
+ "Annotation": ["EXMARaLDA"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/fin-broadcast.json b/corpora/spoken-corpora/fin-broadcast.json
new file mode 100644
index 0000000..a85bfde
--- /dev/null
+++ b/corpora/spoken-corpora/fin-broadcast.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Finnish Broadcast Corpus",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-201403265",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains radio and TV broadcasts.\nThe corpus is available for download from FIN-CLARIN and for online querying through the LAT-platform.",
+ "Languages": ["fin"],
+ "License": "CLARIN RES",
+ "Size": ["18 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://urn.fi/urn:nbn:fi:lb-1001100133",
+ "Download": "http://urn.fi/urn:nbn:fi:lb-2016021201"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/fin-dialect-syntax.json b/corpora/spoken-corpora/fin-dialect-syntax.json
new file mode 100644
index 0000000..151db5d
--- /dev/null
+++ b/corpora/spoken-corpora/fin-dialect-syntax.json
@@ -0,0 +1,17 @@
+{
+ "Name": "The Finnish Dialect Syntax Archive",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052716",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThe corpus is available for online querying through the LAT platform and through the concordancer Korp.",
+ "Languages": ["fin"],
+ "License": "CC-BY-NC-ND",
+ "Size": ["1.2 million words"],
+ "Annotation": ["MSD-tagged"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2014052715",
+ "LAT Platform": "http://urn.fi/urn:nbn:fi:lb-1001100111532"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/fin-parliament.json b/corpora/spoken-corpora/fin-parliament.json
new file mode 100644
index 0000000..05a24c2
--- /dev/null
+++ b/corpora/spoken-corpora/fin-parliament.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Plenary Sessions of the Parliament of Finland",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-201407305",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains the proceedings of the Finnish Parliament.\nThe corpus is available through a dedicated webpage and through the concordancer Korp.",
+ "Languages": ["fin", "swe"],
+ "License": "CC-BY-NC-ND",
+ "Size": ["22.5 million words"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2017020201",
+ "Download": "https://www.eduskunta.fi/FI/lakiensaataminen/taysistunnon_verkkolahetykset/tallenteet/Sivut/default.aspx"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/followup-fin-dialects.json b/corpora/spoken-corpora/followup-fin-dialects.json
new file mode 100644
index 0000000..af73aee
--- /dev/null
+++ b/corpora/spoken-corpora/followup-fin-dialects.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Follow-up Study of Dialects of Finnish",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073043",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThis corpus is available for online querying through the LAT-platform.",
+ "Languages": ["fin"],
+ "License": "CLARIN RES",
+ "Size": ["12,200 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "LAT Platform": "http://urn.fi/urn:nbn:fi:lb-100110017700"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/formtask.json b/corpora/spoken-corpora/formtask.json
new file mode 100644
index 0000000..79b66eb
--- /dev/null
+++ b/corpora/spoken-corpora/formtask.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus FORMTASK",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0005-8535-9",
+ "Family": "Spoken corpora",
+ "Description": "This is a corpus of telephone conversations including prompted descriptions of typical forms (Berlin public transport ticket, invoices, Austrian parking tickets, newsstand receipts, money transfer forms) found in everyday life.",
+ "Languages": ["deu"],
+ "License": "CLARIN PUB",
+ "Size": ["24.5 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0005-8535-9"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/forschung-gespr-de.json b/corpora/spoken-corpora/forschung-gespr-de.json
new file mode 100644
index 0000000..09c5efc
--- /dev/null
+++ b/corpora/spoken-corpora/forschung-gespr-de.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Forschungs- und Lehrkorpus gesprochenes Deutsch",
+ "URL": "http://hdl.handle.net/10932/00-0332-C1B2-A5E3-2A01-D",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains authentic interactions from various domains.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["2.3 million words", "230 hours"],
+ "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C1B2-A5E3-2A01-D",
+ "Download": "http://hdl.handle.net/10932/00-0332-C1B2-A5E3-2A01-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/fra-parisien-2000.json b/corpora/spoken-corpora/fra-parisien-2000.json
new file mode 100644
index 0000000..d7aeb50
--- /dev/null
+++ b/corpora/spoken-corpora/fra-parisien-2000.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus de Français Parlé Parisien des années 2000",
+ "URL": "https://doi.org/10.34847/cocoon.8bc96a4e-9899-30e4-99be-c72d216eb38b",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["fra"],
+ "License": "CC-BY",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "https://doi.org/10.34847/cocoon.8bc96a4e-9899-30e4-99be-c72d216eb38b"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/gamli.json b/corpora/spoken-corpora/gamli.json
new file mode 100644
index 0000000..5278557
--- /dev/null
+++ b/corpora/spoken-corpora/gamli.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Gamli: Icelandic Oral History Corpus",
+ "URL": "http://hdl.handle.net/20.500.12537/315",
+ "Family": "Spoken corpora",
+ "Description": "This is an ASR corpus for Icelandic oral histories.\nThe corpus contains 210 unique speakers, 90 women and 120 men (plus the interviewers: 14 men and 1 woman), but the total audio length with each individual speaker varies quite a lot with three men accounting for one third of the entire data. The age ranges from 38 to 99, but most of the speakers are 60+ (94.8%) and the average age of the speakers is 77 years. This ratio is unprecedented in all existing corpora for Icelandic speech (cf. 4.8% of speakers in Samrómur are 60+) and makes Gamli an important addition to that collection.\nThe corpus is available for download from the CLARIN-IS repository.",
+ "Languages": ["isl"],
+ "License": "CC BY 4.0",
+ "Size": ["146 hours of transcribed audio"],
+ "Annotation": ["Subset is manually annotated with speaker ID and time alignment"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/20.500.12537/315"
+ },
+ "Publication": "O’Brien et al. (2023)"
+}
diff --git a/corpora/spoken-corpora/gender-neutral-de.json b/corpora/spoken-corpora/gender-neutral-de.json
new file mode 100644
index 0000000..ce6997b
--- /dev/null
+++ b/corpora/spoken-corpora/gender-neutral-de.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Spoken production of gender-neutral nouns in German",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0003-FF39-F",
+ "Family": "Spoken corpora",
+ "Description": "This corpus examines the pronunciation of different gender-neutral forms in German. Various source texts were used, like newspaper articles, websites, etc.",
+ "Languages": ["deu"],
+ "License": "CLARIN PUB",
+ "Size": ["2 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0003-FF39-F"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/gesprochenes-wortkorpus.json b/corpora/spoken-corpora/gesprochenes-wortkorpus.json
new file mode 100644
index 0000000..caeb49f
--- /dev/null
+++ b/corpora/spoken-corpora/gesprochenes-wortkorpus.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Gesprochenes Wortkorpus für Untersuchungen zur auditiven Verarbeitung von Sprache und emotionaler Prosodie",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-3D30-F",
+ "Family": "Spoken corpora",
+ "Description": "WaSeP contains recordings of one female and one male speaker, both professional actors, uttering single German nouns and pseudowords in multiple emotional prosodies. This edition improves the segmentation of the phonetic annotation, adds Praat TextGrid files and removes a few irregular items.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["3 hours"],
+ "Annotation": ["phonetic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-3D30-F"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/gewiss.json b/corpora/spoken-corpora/gewiss.json
new file mode 100644
index 0000000..a1daae4
--- /dev/null
+++ b/corpora/spoken-corpora/gewiss.json
@@ -0,0 +1,16 @@
+{
+ "Name": "GeWiss",
+ "URL": "https://gewiss.uni-leipzig.de/index.php?id=home&L=1",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains transcripts and audio recordings of spoken academic discourse, primarily talks including discussions and oral exams.",
+ "Languages": ["German (L2 and L1)", "eng", "pol", "Italian (L1)"],
+ "License": "",
+ "Size": ["1.4 million tokens", "123 hours"],
+ "Annotation": ["code switching"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://gewiss.uni-leipzig.de/index.php?id=full_texts&L=1"
+ },
+ "Publication": "Fandrych et al. (2014)"
+}
diff --git a/corpora/spoken-corpora/gos-video.json b/corpora/spoken-corpora/gos-video.json
new file mode 100644
index 0000000..f3163ae
--- /dev/null
+++ b/corpora/spoken-corpora/gos-video.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Spoken corpus Gos VideoLectures 3.0 (transcription)",
+ "URL": "http://hdl.handle.net/11356/1190",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains public academic speech.\nThe corpus is available for download from CLARIN.SI and through the concordancer KonText.\nFor the version with audio recordings, click here.",
+ "Languages": ["slv"],
+ "License": "CC BY 4.0",
+ "Size": ["126,000 words"],
+ "Annotation": ["PoS-tagged", "lemmatised", "orthographically and phonetically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=gos_vl",
+ "Download": "http://hdl.handle.net/11356/1190"
+ },
+ "Publication": "Verdonik (2018)"
+}
diff --git a/corpora/spoken-corpora/gos.json b/corpora/spoken-corpora/gos.json
new file mode 100644
index 0000000..d9f6199
--- /dev/null
+++ b/corpora/spoken-corpora/gos.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Spoken corpus Gos 2.0",
+ "URL": "http://hdl.handle.net/11356/1771",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains transcripts from radio and TV shows, school lessons, private conversations, business meetings. It is composed of three different sources: Spoken corpus Gos 1.1 (112 hours, 1 million words), Spoken corpus Gos VideoLectures 4.2 (22 hours, 179,000 words), a selection from the ASR database ARTUR 1.0 (185 hours, 1.2 million words).\nThe corpus is available for download from CLARIN.SI as well as through a dedicated webconcordancer.",
+ "Languages": ["slv"],
+ "License": "CC BY-SA 4.0",
+ "Size": ["1534 texts", "127,604 utterances", "2,462,368 words"],
+ "Annotation": ["phonetic and orthographic transcription", "PoS tagging", "lemmatisation"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "http://www.korpus-gos.net/",
+ "Download": "http://hdl.handle.net/11356/1771"
+ },
+ "Publication": "Verdonik and Zwitter-Vitez (2011)"
+}
diff --git a/corpora/spoken-corpora/gothenburg-dialogue.json b/corpora/spoken-corpora/gothenburg-dialogue.json
new file mode 100644
index 0000000..f4e610b
--- /dev/null
+++ b/corpora/spoken-corpora/gothenburg-dialogue.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Gothenburg Dialogue Corpus",
+ "URL": "https://spraakbanken.gu.se/swe/resurs/gdc#tabs=information",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available through the concordancer Korp (account needed).",
+ "Languages": ["swe"],
+ "License": "CC-BY",
+ "Size": ["1,470,000 tokens"],
+ "Annotation": ["MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "http://spraakbanken.gu.se/korp/#corpus=gdc"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/griffith-australian.json b/corpora/spoken-corpora/griffith-australian.json
new file mode 100644
index 0000000..56c659e
--- /dev/null
+++ b/corpora/spoken-corpora/griffith-australian.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Griffith Corpus of Spoken Australian English",
+ "URL": "https://www.ausnc.org.au/corpora/gcsause",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available for download and through the concordancer of the Australian National Corpus.",
+ "Languages": ["eng"],
+ "License": "",
+ "Size": ["32,134 words"],
+ "Annotation": [],
+ "Infrastructure": "Other",
+ "Access": {
+ "Concordancer": "https://www.ausnc.org.au/corpora/gcsause",
+ "Download": "https://www.ausnc.org.au/corpora/gcsause"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/grundstrukturen-freiburg.json b/corpora/spoken-corpora/grundstrukturen-freiburg.json
new file mode 100644
index 0000000..44b0a25
--- /dev/null
+++ b/corpora/spoken-corpora/grundstrukturen-freiburg.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Grundstrukturen: Freiburger Korpus",
+ "URL": "http://hdl.handle.net/10932/00-0332-C29F-AE56-C501-7",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains authentic interaction from various domains.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["600,000 words", "70 hours"],
+ "Annotation": ["orthographically transcribed", "intonation", "lemmatised", "PoS-tagged", "time alignment"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C29F-AE56-C501-7",
+ "Download": "http://hdl.handle.net/10932/00-0332-C29F-AE56-C501-7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/habla.json b/corpora/spoken-corpora/habla.json
new file mode 100644
index 0000000..2ab8541
--- /dev/null
+++ b/corpora/spoken-corpora/habla.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hamburg Adult Bilingual LAnguage (HABLA)",
+ "URL": "http://hdl.handle.net/11022/0000-0000-5C64-9",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.",
+ "Languages": ["deu", "fra", "ita"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["79 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ },
+ "Publication": "Kupisch et al. (2012)"
+}
diff --git a/corpora/spoken-corpora/hacaspa.json b/corpora/spoken-corpora/hacaspa.json
new file mode 100644
index 0000000..4ed4861
--- /dev/null
+++ b/corpora/spoken-corpora/hacaspa.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hamburg Corpus of Argentinean Spanish (HaCASpa)",
+ "URL": "http://hdl.handle.net/11022/0000-0000-5F0B-B",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous speech and reading tasks.",
+ "Languages": ["Spanish (Argentinian)"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["19 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ },
+ "Publication": "Gabriel et al. (2010)"
+}
diff --git a/corpora/spoken-corpora/hamburg-modern.json b/corpora/spoken-corpora/hamburg-modern.json
new file mode 100644
index 0000000..6a2a8c3
--- /dev/null
+++ b/corpora/spoken-corpora/hamburg-modern.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Hamburg Modern Times Corpus",
+ "URL": "http://hdl.handle.net/11022/0000-0000-6973-9",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains task-oriented communication (e.g., a film retelling) in the context of studying adult L2 acquisition.",
+ "Languages": ["deu"],
+ "License": "HZSK-ACA (academic, non-commercial only)",
+ "Size": ["3 hours"],
+ "Annotation": ["manual annotation of phonetic phenomena", "accent/stress marking"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "https://hdl.handle.net/11022/0000-0000-6973-9"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/hamcopolig.json b/corpora/spoken-corpora/hamcopolig.json
new file mode 100644
index 0000000..ef226f0
--- /dev/null
+++ b/corpora/spoken-corpora/hamcopolig.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hamburg Corpus of Polish in Germany (HamCoPoliG)",
+ "URL": "http://hdl.handle.net/11022/0000-0000-63CE-9",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous speech and reading tasks.",
+ "Languages": ["pol"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["38 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ },
+ "Publication": "Czachór (2012)"
+}
diff --git a/corpora/spoken-corpora/hempel.json b/corpora/spoken-corpora/hempel.json
new file mode 100644
index 0000000..d5bc808
--- /dev/null
+++ b/corpora/spoken-corpora/hempel.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus HEMPEL",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0002-F80E-8",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is a collection of more than 3900 spontaneous speech items recorded as extra material during the German SpeechDat-II project. Speakers were asked to report what they had been doing during the last hour: \"Was haben Sie in der letzten Stunde gemacht?\". This item was recorded as the last item of the recording session. Speakers had become acquainted with the recording procedure and they were quite relaxed because they knew that this item was the last to be recorded. This resulted in quite natural, colloquial speech, sometimes with marked regional accent.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["25.5 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0002-F80E-8"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/hral.json b/corpora/spoken-corpora/hral.json
new file mode 100644
index 0000000..75f6fed
--- /dev/null
+++ b/corpora/spoken-corpora/hral.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Croatian Adult Spoken Language Corpus (HrAL)",
+ "URL": "http://doi.org/10.21415/T5131S",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous conversations among 617 speakers from all Croatian counties, and it comprises more than 250 000 tokens and more than 100 000 types. Data for the corpus were collected from 2010 to 2012, from 2014 to 2015 and during 2016. Participants were adults who spoke Croatian as their mother tongue and first language. Transcripts were annotated with the ages and genders of the speakers, as well as the location of the conversation. A separate spreadsheet lists the speakers' origin, where they have spent most of their life and their level of education. The coverage of metadata for individual samples varies, and is in general more complete for samples collected from 2014 onwards.\nThe corpus is available for download and browsing from a dedicated website.",
+ "Languages": ["hrv"],
+ "License": "author attribution required",
+ "Size": ["250,000 tokens"],
+ "Annotation": ["speaker metadata"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Browse": "https://sla.talkbank.org/TBB/ca/Croatian",
+ "Download": "http://doi.org/10.21415/T5131S"
+ },
+ "Publication": "Kuvač Kraljević and Hržica (2016)"
+}
diff --git a/corpora/spoken-corpora/hun-broadcast-news.json b/corpora/spoken-corpora/hun-broadcast-news.json
new file mode 100644
index 0000000..b8c0e12
--- /dev/null
+++ b/corpora/spoken-corpora/hun-broadcast-news.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hungarian Broadcast News Database",
+ "URL": "http://metashare.elda.org/repository/browse/hungarian-broadcast-news-database/99bc21d081b611e2892a000c29bfc0d4d3d173ede2e64475b596aa1857a64541/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available for download (upon request) from META-SHARE.",
+ "Languages": ["hun"],
+ "License": "META-SHARE NC-NoReD",
+ "Size": ["25,000 words", "3.5 hours"],
+ "Annotation": ["audio-level annotations"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://metashare.elda.org/repository/browse/hungarian-broadcast-news-database/99bc21d081b611e2892a000c29bfc0d4d3d173ede2e64475b596aa1857a64541/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/hun-gigaword-spoken.json b/corpora/spoken-corpora/hun-gigaword-spoken.json
new file mode 100644
index 0000000..57c434e
--- /dev/null
+++ b/corpora/spoken-corpora/hun-gigaword-spoken.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hungarian Gigaword Corpus / \"spoken language\" subcorpus",
+ "URL": "http://hnc.nytud.hu/index_eng.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains radio broadcasts (reading aloud and spontaneous conversation).\nThe corpus is available through the Hungarian Gigaword Corpus concordancer.",
+ "Languages": ["hun"],
+ "License": "",
+ "Size": ["76 million words"],
+ "Annotation": ["PoS-tagged", "MSD-tagged"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Concordancer": "http://clara.nytud.hu/mnsz2-dev/bonito/run.cgi/first_form?uilang=en"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/hun-kindergarten.json b/corpora/spoken-corpora/hun-kindergarten.json
new file mode 100644
index 0000000..f1e5322
--- /dev/null
+++ b/corpora/spoken-corpora/hun-kindergarten.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hungarian Kindergarten Language Corpus",
+ "URL": "http://metashare.nytud.hu/repository/browse/hungarian-kindergarten-language-corpus/b572a8106ba711e2aa7c68b599c26a06a4db2e695cf94a1cad6bf6793d747d2a/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains elicited speech tasks (picture descriptions) and guided conversation with children.\nThe corpus is available for download through META-SHARE.",
+ "Languages": ["hun"],
+ "License": "restricted",
+ "Size": ["192,000 words"],
+ "Annotation": ["PoS-tagged", "MSD-tagged"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://metashare.nytud.hu/repository/browse/hungarian-kindergarten-language-corpus/b572a8106ba711e2aa7c68b599c26a06a4db2e695cf94a1cad6bf6793d747d2a/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/hun-reference-speech-db.json b/corpora/spoken-corpora/hun-reference-speech-db.json
new file mode 100644
index 0000000..fe7d0d2
--- /dev/null
+++ b/corpora/spoken-corpora/hun-reference-speech-db.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Hungarian Reference Speech Database",
+ "URL": "http://metashare.ilsp.gr:8080/repository/browse/hungarian-mrba/92067ce281b611e2892a000c29bfc0d48e6c8e9c745d446a9a64e48ba4c6462d/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains reading tasks.\nThe corpus is available for download (upon request) from META-SHARE.",
+ "Languages": ["hun"],
+ "License": "META-SHARE No-Redistribution Commercial FF",
+ "Size": ["6 hours"],
+ "Annotation": ["partial phonemic-level annotation"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://metashare.ilsp.gr:8080/repository/browse/hungarian-mrba/92067ce281b611e2892a000c29bfc0d48e6c8e9c745d446a9a64e48ba4c6462d/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/ifa-spoken.json b/corpora/spoken-corpora/ifa-spoken.json
new file mode 100644
index 0000000..1a6cad1
--- /dev/null
+++ b/corpora/spoken-corpora/ifa-spoken.json
@@ -0,0 +1,16 @@
+{
+ "Name": "IFA Spoken Language Corpus",
+ "URL": "http://hdl.handle.net/11372/LRT-734",
+ "Family": "Spoken corpora",
+ "Description": "The corpus is available for download from an informal webpage.",
+ "Languages": ["nld"],
+ "License": "CLARIN PUB",
+ "Size": ["50,000 words (41 minutes/speaker)"],
+ "Annotation": ["Hand-segmented speech"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11372/LRT-734"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/jasmin.json b/corpora/spoken-corpora/jasmin.json
new file mode 100644
index 0000000..6d4675a
--- /dev/null
+++ b/corpora/spoken-corpora/jasmin.json
@@ -0,0 +1,16 @@
+{
+ "Name": "JASMIN Speech Corpus",
+ "URL": "http://hdl.handle.net/10032/tm-a2-j7",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains recordings of human-machine interaction and read speech performed by children, non-native speakers and senior people.\nThe corpus is available for download from the Dutch Language Institute.",
+ "Languages": ["nld"],
+ "License": "CLARIN RES",
+ "Size": ["115 hours"],
+ "Annotation": ["PoS-tagged", "lemmatised", "phonetically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10032/tm-a2-j7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/juznevesti-sr.json b/corpora/spoken-corpora/juznevesti-sr.json
new file mode 100644
index 0000000..2628578
--- /dev/null
+++ b/corpora/spoken-corpora/juznevesti-sr.json
@@ -0,0 +1,16 @@
+{
+ "Name": "ASR training dataset for Serbian JuzneVesti-SR",
+ "URL": "http://hdl.handle.net/11356/1679",
+ "Family": "Spoken corpora",
+ "Description": "This corpus consists of audio recordings and manual transcripts from the Južne Vesti website and its host show called the 15 minuta. The processing of the audio and its alignment to the manual transcripts followed the pipeline of the ParlaSpeech-HR dataset as closely as possible. Segments in this dataset range from 2 to 30 seconds. Train-dev-test split has been performed with 80:10:10 ratio.\nAs with the ParlaSpeech-HR dataset, two transcriptions are provided; one with transcripts in their raw form (with punctuation, capital letters, numerals) and another normalised with the same rule-based normaliser as was used in ParlaSpeech-HR dataset creation, which is lowercased, punctuation is removed and numerals are replaced with words. The speaker_info attribute is less abundant due to the fact that compared to parliamentary corpora less data is available in this domain, so it covers only the guest name, guest description, host name, and speaker breakdown (when the host or the guest are speaking).\nThis corpus is available for download from the CLARIN.SI repository.",
+ "Languages": ["srp"],
+ "License": "CC BY-SA 4.0",
+ "Size": ["50.55 hours", "10811 entries"],
+ "Annotation": ["normalised transcriptions (lowercased, punctuation removed, numerals spelled out)", "speaker metadata", "word-level alignment to the recordings"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11356/1679"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/karel-makon.json b/corpora/spoken-corpora/karel-makon.json
new file mode 100644
index 0000000..bf8b2ed
--- /dev/null
+++ b/corpora/spoken-corpora/karel-makon.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Spoken corpus of Karel Makoň",
+ "URL": "http://hdl.handle.net/11234/1-3422",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains talks on Christian mysticism given by Karel Makoň.\nThe corpus is available for download from LINDAT.",
+ "Languages": ["ces"],
+ "License": "CC BY-SA 3.0",
+ "Size": ["1000 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11234/1-3422"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/karl-eberhard.json b/corpora/spoken-corpora/karl-eberhard.json
new file mode 100644
index 0000000..5d932b1
--- /dev/null
+++ b/corpora/spoken-corpora/karl-eberhard.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The Karl-Eberhard-Corpus of spontaneously spoken conversations in Southern German",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-DADB-D",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains 79 speakers of Southern German. Two speakers, usually acquainted with each other, had a one-hour-long conversation in separate booths.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["40 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-DADB-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/kennsluromur.json b/corpora/spoken-corpora/kennsluromur.json
new file mode 100644
index 0000000..9e397be
--- /dev/null
+++ b/corpora/spoken-corpora/kennsluromur.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Kennslurómur",
+ "URL": "http://hdl.handle.net/20.500.12537/171",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of lectures at Reykjavik University and the University of Iceland. The lectures were donated by the lecturers (172 lectures by 14 lecturers), transcribed with an Icelandic speech recognizer and then manually corrected by human transcribers and finally verified by a proofreader.",
+ "Languages": ["isl"],
+ "License": "CC BY 4.0",
+ "Size": ["51 hours"],
+ "Annotation": ["sentence-segmented orthographic transcriptions"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/20.500.12537/171"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/konfliktgespraeche.json b/corpora/spoken-corpora/konfliktgespraeche.json
new file mode 100644
index 0000000..79cc523
--- /dev/null
+++ b/corpora/spoken-corpora/konfliktgespraeche.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Elizitierte Konfliktgespräche",
+ "URL": "http://hdl.handle.net/10932/00-0332-C11A-46E1-0001-A",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains elicited conflict interaction.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["160,000 words", "12 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0332-C11A-46E1-0001-A",
+ "Download": "http://hdl.handle.net/10932/00-0332-C11A-46E1-0001-A"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/kontrastiv.json b/corpora/spoken-corpora/kontrastiv.json
new file mode 100644
index 0000000..9542907
--- /dev/null
+++ b/corpora/spoken-corpora/kontrastiv.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Gesprochene Wissenschaftssprache Kontrastiv",
+ "URL": "http://hdl.handle.net/10932/00-03BC-7412-E7EA-4101-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains academic interaction.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu", "eng", "pol", "bul"],
+ "License": "CLARIN RES",
+ "Size": ["760,000 words", "92 hours"],
+ "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed", "annotation of discourse phenomena and language mixing"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-03BC-7412-E7EA-4101-3",
+ "Download": "http://hdl.handle.net/10932/00-03BC-7412-E7EA-4101-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/lang-in-migration.json b/corpora/spoken-corpora/lang-in-migration.json
new file mode 100644
index 0000000..fa00f71
--- /dev/null
+++ b/corpora/spoken-corpora/lang-in-migration.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Languages in Migration",
+ "URL": "http://hdl.handle.net/11372/LRT-4777",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is a representation of authentic spoken Czech and German.\nIt contains transcriptions of informal speech (private environment, spontaneity, unpreparedness etc.) by Czech-German bilingual speakers born in Czechoslovakia around 1955 and who departed for Germany after becoming 12 years old. The corpus is composed of interviews conducted from 2018–2020 with 20 speakers on language biographies and narrated in Czech and German respectively.\nThe corpus is available for download from LINDAT and for online browsing through the KonText concordancer.",
+ "Languages": ["ces", "deu"],
+ "License": "Czech National Corpus (Shuffled Corpus Data)",
+ "Size": [],
+ "Annotation": ["syntactic dependencies"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "https://www.korpus.cz/kontext/query?corpname=jazyky_v_migraci",
+ "Download": "http://hdl.handle.net/11372/LRT-4777"
+ },
+ "Publication": "this list of publications"
+}
diff --git a/corpora/spoken-corpora/lecture-speech.json b/corpora/spoken-corpora/lecture-speech.json
new file mode 100644
index 0000000..b346831
--- /dev/null
+++ b/corpora/spoken-corpora/lecture-speech.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus of Lecture Speech",
+ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00023L",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of academic lectures and oral conference presentations.\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["est"],
+ "License": "CC-BY-SA",
+ "Size": ["41 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00023L"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/lia.json b/corpora/spoken-corpora/lia.json
new file mode 100644
index 0000000..7de9c2d
--- /dev/null
+++ b/corpora/spoken-corpora/lia.json
@@ -0,0 +1,16 @@
+{
+ "Name": "LIA",
+ "URL": "https://www.hf.uio.no/iln/english/research/projects/language-infrastructure-made-accessible/index.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and conversation in Norwegian dialects.\nThe corpus is available through a Tekstlab concordancer (account needed).",
+ "Languages": ["nor"],
+ "License": "CLARIN ACA",
+ "Size": ["1.5 million tokens"],
+ "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://tekstlab.uio.no/glossa2/lia"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/lmu-asica.json b/corpora/spoken-corpora/lmu-asica.json
new file mode 100644
index 0000000..924b2ba
--- /dev/null
+++ b/corpora/spoken-corpora/lmu-asica.json
@@ -0,0 +1,16 @@
+{
+ "Name": "LMU AsiCa",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A531-E",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is a documentation of the South Italian dialect 'Calabrese'. The main objectives when building this corpus were the analysis of syntactical structures and their geolinguistic mapping in the form of interactive, web-based cartography. The corpus consists of several audio files containing recordings of some sixty speakers of Calabrese, one half of whom have migration experience in Germany, the other half having almost always stayed in Calabria. Furthermore, the informants were selected to be equally balanced regarding gender, age and geographical origin. For most of the informants there exists at least one recording with spontaneous speech and one recording based on stimuli.",
+ "Languages": ["ita"],
+ "License": "CLARIN RES",
+ "Size": ["47 hours"],
+ "Annotation": ["phonetic transcription"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-A531-E"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/long-spoken-fin.json b/corpora/spoken-corpora/long-spoken-fin.json
new file mode 100644
index 0000000..da75887
--- /dev/null
+++ b/corpora/spoken-corpora/long-spoken-fin.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The Longitudinal Corpus of Finnish Spoken in Helsinki (1970s, 1990s and 2010s)",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073041",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThe corpus is available for online querying through the LAT platform and through the concordancer Korp.",
+ "Languages": ["fin"],
+ "License": "restricted",
+ "Size": ["210 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "LAT Platform": "http://urn.fi/urn:nbn:fi:lb-100110016072"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/medical-speech.json b/corpora/spoken-corpora/medical-speech.json
new file mode 100644
index 0000000..635c795
--- /dev/null
+++ b/corpora/spoken-corpora/medical-speech.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Medical Speech Database",
+ "URL": "http://metashare.elda.org/repository/browse/hungarian-medical-speech-database/76a0c9f881b611e2892a000c29bfc0d4ed0651f675914bb2805e26819a60167d/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available for download (upon request) from META-SHARE.",
+ "Languages": ["hun"],
+ "License": "META-SHARE C-NoReD-FF",
+ "Size": [],
+ "Annotation": ["phonetic transcription"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://metashare.elda.org/repository/browse/hungarian-medical-speech-database/76a0c9f881b611e2892a000c29bfc0d4ed0651f675914bb2805e26819a60167d/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/mehrsprachige-kinder.json b/corpora/spoken-corpora/mehrsprachige-kinder.json
new file mode 100644
index 0000000..11c73c7
--- /dev/null
+++ b/corpora/spoken-corpora/mehrsprachige-kinder.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Mehrsprachige Kinder im Vorschulalter",
+ "URL": "http://hdl.handle.net/10932/00-0372-30C6-B67F-ED01-5",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains elicitation tasks with pre-school children.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["17,000 words", "13 hours"],
+ "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-0372-30C6-B67F-ED01-5",
+ "Download": "http://hdl.handle.net/10932/00-0372-30C6-B67F-ED01-5"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/multichannel-articulatory.json b/corpora/spoken-corpora/multichannel-articulatory.json
new file mode 100644
index 0000000..1017049
--- /dev/null
+++ b/corpora/spoken-corpora/multichannel-articulatory.json
@@ -0,0 +1,16 @@
+{
+ "Name": "MultiCHannel Articulatory database: English",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C2B1-5",
+ "Family": "Spoken corpora",
+ "Description": "This corpus features a set of 460 short sentences designed to include the main connected speech processes in English (e.g. assimilations, weak forms ...). All recordings were made in the same sound damped studio at the Edinburgh Speech Production Facility based in the department of Speech and Language Sciences, Queen Margaret University College, UK. The database contains audio files, laryngograph waveforms, electromagnetic articulograph (EMA) tracks and electropalatograph (EPG) tracks.",
+ "Languages": ["eng"],
+ "License": "CLARIN PUB",
+ "Size": ["5 hours"],
+ "Annotation": ["orthographically transcribed", "Electromagnetic Articulography"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-C2B1-5"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/natural-media-motion-capture.json b/corpora/spoken-corpora/natural-media-motion-capture.json
new file mode 100644
index 0000000..e28adbc
--- /dev/null
+++ b/corpora/spoken-corpora/natural-media-motion-capture.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Natural Media Motion-Capture Corpus",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C34C-8",
+ "Family": "Spoken corpora",
+ "Description": "The corpus consists of data from 18 participants, whose task was to describe nine objects each to an experimenter, without using everyday vocabulary about forms, sizes or objects. The participants were recorded on audio and several video cameras, and their hand movements were recorded using an optical VICON motion capture system.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["3 hours"],
+ "Annotation": ["orthographically transcribed", "gestures", "motion capture of hands"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-C34C-8"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/nautilus.json b/corpora/spoken-corpora/nautilus.json
new file mode 100644
index 0000000..3567afb
--- /dev/null
+++ b/corpora/spoken-corpora/nautilus.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Nautilus Speaker Characterization",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C05F-6",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains scripted, semi-spontaneous, and spontaneous human-human dialogs. In total, 300 speakers of German without noticeable accent participated and were recorded in an acoustically-isolated room. Interactions between speakers and their interlocutor are provided in separate mono files, accompanied by timestamps and tags that define the speaker's turns. The speech corresponding to one of the semi-spontaneous dialogs was labeled with respect to perceived interpersonal speaker characteristics and naive voice descriptions. These labels are found alongside the documentation.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["155 hours"],
+ "Annotation": ["orthographically transcribed", "Turn taking", "perceived inter-personal speaker characteristics", "voice descriptions"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-C05F-6"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/nordic-dialect.json b/corpora/spoken-corpora/nordic-dialect.json
new file mode 100644
index 0000000..08ad9c6
--- /dev/null
+++ b/corpora/spoken-corpora/nordic-dialect.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Nordic Dialect Corpus v. 4.0",
+ "URL": "http://www.tekstlab.uio.no/nota/scandiasyn/nsd.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus consists of spontaneous speech data from dialects of the North Germanic languages across all of the Nordic countries. The linguistic data in the corpus comes from a variety of sources (see homepage - Data Collection), recorded in 1998 - 2015. The corpus is transcribed and linked to audio and video, has a map function, and can be searched in a large variety of ways.\nThe corpus can be accessed online via a concordancer provided by the TekstLab (a CLARINO node).",
+ "Languages": ["nor", "swe", "dan", "fao", "isl", "Övdalian"],
+ "License": "CLARIN ACA",
+ "Size": ["2,754,289 tokens"],
+ "Annotation": ["MSD-tagged", "phonetically transcribed", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://www.tekstlab.uio.no/nota/scandiasyn/nsd.html"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/north-wind-sun.json b/corpora/spoken-corpora/north-wind-sun.json
new file mode 100644
index 0000000..52494e8
--- /dev/null
+++ b/corpora/spoken-corpora/north-wind-sun.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Estonian North Wind and the Sun Corpus v.1.0.3",
+ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00129L",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of the tale \"Põhjatuul ja päike\" (North Wind and the Sun) read by the same speakers who participated in the Phonetic Corpus of Estonian Spontaneous Speech.\nThe corpus is available for download from META-SHARE (CELR distribution).",
+ "Languages": ["est"],
+ "License": "",
+ "Size": [],
+ "Annotation": ["word segmentation and phonemes in SAMPA"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00129L"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/nota-oslo.json b/corpora/spoken-corpora/nota-oslo.json
new file mode 100644
index 0000000..45a4414
--- /dev/null
+++ b/corpora/spoken-corpora/nota-oslo.json
@@ -0,0 +1,16 @@
+{
+ "Name": "NoTa-Oslo",
+ "URL": "http://www.tekstlab.uio.no/nota/oslo/english.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews and conversations in Oslo sociolects.\nThe corpus is available through a Tekstlab concordancer (account needed).",
+ "Languages": ["nor"],
+ "License": "CLARIN ACA",
+ "Size": ["1 million tokens"],
+ "Annotation": ["orthographically transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://tekstlab.uio.no/glossa2/nota_oslo"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/nslc.json b/corpora/spoken-corpora/nslc.json
new file mode 100644
index 0000000..a021903
--- /dev/null
+++ b/corpora/spoken-corpora/nslc.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Nganasan Spoken Language Corpus (NSLC)",
+ "URL": "http://hdl.handle.net/11022/0000-0007-C6F2-8",
+ "Family": "Spoken corpora",
+ "Description": "This second version 0.2 of the corpus is a subcorpus that comprises 177 communications, 136 of which contain an aligned audio recording, with glossed (Toolbox/FLEx) and annotated (EXMARaLDA) transcripts from 57 speakers. All texts have been translated into Russian and English, some also into German. The corpus also contains rich metadata on the communications and speakers.",
+ "Languages": ["nio", "rus"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["32 hours"],
+ "Annotation": ["alignment of transcriptions and audio recordings"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "",
+ "Download": ""
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json b/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json
new file mode 100644
index 0000000..6ebfded
--- /dev/null
+++ b/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Voice Onset Time in Cochlear Implant Patients (diachronic data)",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A9CB-D",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains diachronic speech recordings from three cochlear implant (CI) users. For data used in the corresponding synchronic study, please refer to the CI_2 corpora. CI_3_Sibilants contains recordings used for the analysis of /s/ and /ʃ/ in the following words: 'Tasse', 'Tasche'. CI_3_VOT contains recordings used for the analysis of voice onset time in /t/ in the word 'teilen'.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["unknown"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-A9CB-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/onset-cochlear-patients.json b/corpora/spoken-corpora/onset-cochlear-patients.json
new file mode 100644
index 0000000..01baa85
--- /dev/null
+++ b/corpora/spoken-corpora/onset-cochlear-patients.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Voice Onset Time in Cochlear Implant Patients",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AE7E-F",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG). It contains recordings used for the analysis of voice onset time in /t/ in the word 'teilen'.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["35 min"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-AE7E-F"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/oral2008.json b/corpora/spoken-corpora/oral2008.json
new file mode 100644
index 0000000..e5943ba
--- /dev/null
+++ b/corpora/spoken-corpora/oral2008.json
@@ -0,0 +1,17 @@
+{
+ "Name": "ORAL2008: Balanced corpus of informal spoken Czech",
+ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-119D-A",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+ "Languages": ["ces"],
+ "License": "CC BY-NC-SA 3.0",
+ "Size": ["1 million tokens"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "https://kontext.korpus.cz/first_form?corpname=oral2008",
+ "Download": "http://hdl.handle.net/11858/00-097C-0000-0023-119D-A"
+ },
+ "Publication": "Benešová et al. (2015)"
+}
diff --git a/corpora/spoken-corpora/oral2013.json b/corpora/spoken-corpora/oral2013.json
new file mode 100644
index 0000000..7f0cd07
--- /dev/null
+++ b/corpora/spoken-corpora/oral2013.json
@@ -0,0 +1,17 @@
+{
+ "Name": "ORAL2013: balanced corpus of informal spoken Czech (transcriptions & audio)",
+ "URL": "http://hdl.handle.net/11234/1-1848",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+ "Languages": ["ces"],
+ "License": "Academic Licence Agreement for Czech National Corpus Data",
+ "Size": ["2.8 million words"],
+ "Annotation": ["recordings and transcripts anonymised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://kontext.korpus.cz/first_form?corpname=oral2013",
+ "Download": "http://hdl.handle.net/11234/1-1848"
+ },
+ "Publication": "Benešová et al. (2015)"
+}
diff --git a/corpora/spoken-corpora/orleans.json b/corpora/spoken-corpora/orleans.json
new file mode 100644
index 0000000..3008b59
--- /dev/null
+++ b/corpora/spoken-corpora/orleans.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus of Orleans",
+ "URL": "http://purl.org/poi/crdo.vjf.cnrs.fr/cocoon-5569b8dc-b40f-3ccd-95d1-86d20a1a836c",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of the everyday speech of Orléans residents between 1969 and 1974.\nThe corpus is available for download from the Huma-num repository.",
+ "Languages": ["fra"],
+ "License": "CC BY-NC-SA 3.0",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "https://doi.org/10.34847/cocoon.5569b8dc-b40f-3ccd-95d1-86d20a1a836c"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/ortofon-audio.json b/corpora/spoken-corpora/ortofon-audio.json
new file mode 100644
index 0000000..d7d9452
--- /dev/null
+++ b/corpora/spoken-corpora/ortofon-audio.json
@@ -0,0 +1,17 @@
+{
+ "Name": "ORTOFON v1: balanced corpus of informal spoken Czech with multi-tier transcription (transcriptions & audio)",
+ "URL": "http://hdl.handle.net/11234/1-2579",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+ "Languages": ["ces"],
+ "License": "Academic Licence Agreement for Czech National Corpus Data",
+ "Size": ["1 million words"],
+ "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://kontext.korpus.cz/first_form?corpname=ortofon_v1",
+ "Download": "http://hdl.handle.net/11234/1-2579"
+ },
+ "Publication": "Komrsková et al. (2018)"
+}
diff --git a/corpora/spoken-corpora/ortofon.json b/corpora/spoken-corpora/ortofon.json
new file mode 100644
index 0000000..9db2940
--- /dev/null
+++ b/corpora/spoken-corpora/ortofon.json
@@ -0,0 +1,17 @@
+{
+ "Name": "ORTOFON v1: balanced corpus of informal spoken Czech with multi-tier transcription (transcriptions)",
+ "URL": "http://hdl.handle.net/11234/1-2580",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+ "Languages": ["ces"],
+ "License": "CC BY-NC-SA 4.0",
+ "Size": ["1 million tokens"],
+ "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer": "https://kontext.korpus.cz/first_form?corpname=ortofon_v1",
+ "Download": "http://hdl.handle.net/11234/1-2580"
+ },
+ "Publication": "Komrsková et al. (2018)"
+}
diff --git a/corpora/spoken-corpora/ovm.json b/corpora/spoken-corpora/ovm.json
new file mode 100644
index 0000000..bcd0db4
--- /dev/null
+++ b/corpora/spoken-corpora/ovm.json
@@ -0,0 +1,17 @@
+{
+ "Name": "OVM – Otázky Václava Moravce",
+ "URL": "http://hdl.handle.net/11858/00-097C-0000-000D-EC98-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains transcribed recordings from the Czech political discussion broadcast “Otázky Václava Moravce”.\nThe corpus is available for download from LINDAT and through the concordancer KonText.",
+ "Languages": ["ces"],
+ "License": "CC BY-NC 3.0",
+ "Size": ["35 hours"],
+ "Annotation": ["word-by-word transcriptions, including the transcription of some non-speech events"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://lindat.mff.cuni.cz/services/kontext/first_form?corpname=ovm_cs_w",
+ "Download": "http://hdl.handle.net/11858/00-097C-0000-000D-EC98-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/parcorfull.json b/corpora/spoken-corpora/parcorfull.json
new file mode 100644
index 0000000..c2f8a62
--- /dev/null
+++ b/corpora/spoken-corpora/parcorfull.json
@@ -0,0 +1,16 @@
+{
+ "Name": "ParCorFull: A Parallel Corpus Annotated with Full Coreference",
+ "URL": "http://hdl.handle.net/11372/LRT-2614",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains planned speech and newswire.\nThe corpus is available for download from LINDAT.",
+ "Languages": ["eng", "deu"],
+ "License": "CC BY-NC-ND 4.0",
+ "Size": ["160,000 tokens"],
+ "Annotation": ["coreference (nominal and clausal)"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Download": "http://hdl.handle.net/11372/LRT-2614"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/parlato-telegiornalistico.json b/corpora/spoken-corpora/parlato-telegiornalistico.json
new file mode 100644
index 0000000..a61bd9e
--- /dev/null
+++ b/corpora/spoken-corpora/parlato-telegiornalistico.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Selezione dal \"Corpus di parlato telegiornalistico\". Anni Sessanta vs. 2005",
+ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/647-selezione-dal-qcorpus-di-parlato-telegiornalistico-anni-sessanta-vs-2005q",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains news broadcast.\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["ita"],
+ "License": "",
+ "Size": [],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/647-selezione-dal-qcorpus-di-parlato-telegiornalistico-anni-sessanta-vs-2005q"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/pdtsl.json b/corpora/spoken-corpora/pdtsl.json
new file mode 100644
index 0000000..49de944
--- /dev/null
+++ b/corpora/spoken-corpora/pdtsl.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Prague Dependency Treebank of Spoken Language (PDTSL) 0.5",
+ "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-4914-D",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available for download from LINDAT.",
+ "Languages": ["ces"],
+ "License": "ACADEMIC (PDTSL)",
+ "Size": ["120,000 words"],
+ "Annotation": ["syntactic dependencies"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Download": "http://hdl.handle.net/11858/00-097C-0000-0001-4914-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/phattsessionz.json b/corpora/spoken-corpora/phattsessionz.json
new file mode 100644
index 0000000..80c2aea
--- /dev/null
+++ b/corpora/spoken-corpora/phattsessionz.json
@@ -0,0 +1,16 @@
+{
+ "Name": "PhattSessionz Adolescents Speech Corpus",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-CC6A-4",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of 1019 adolescent speakers of German (age range 12-20). The recordings were performed via the WWW in public schools (Gymnasium) in 45 locations in Germany. The speech material recorded is a superset of the German SpeechDat-II and RVG-I corpora.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["208 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-CC6A-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/phon-contemp-fra.json b/corpora/spoken-corpora/phon-contemp-fra.json
new file mode 100644
index 0000000..c8120f8
--- /dev/null
+++ b/corpora/spoken-corpora/phon-contemp-fra.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Phonologie du Français Contemporain",
+ "URL": "http://cocoon.huma-num.fr/exist/crdo/ark:/87895/1.17-794340",
+ "Family": "Spoken corpora",
+ "Description": "This corpus is available for download from a dedicated webpage.",
+ "Languages": ["fra"],
+ "License": "CC-BY",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "https://cocoon.huma-num.fr/exist/crdo/ark:/87895/1.17-794340"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/phoncat.json b/corpora/spoken-corpora/phoncat.json
new file mode 100644
index 0000000..138f4a8
--- /dev/null
+++ b/corpora/spoken-corpora/phoncat.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Catalan in a bilingual context (PhonCAT)",
+ "URL": "http://hdl.handle.net/11022/0000-0000-772F-7",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains read, elicited and spontaneous speech.",
+ "Languages": ["Spanish (Catalan)"],
+ "License": "HZSK-RES (restricted, non-commercial only)",
+ "Size": ["144 hours"],
+ "Annotation": ["orthographically and phonetically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ },
+ "Publication": "Benet et al. (2012)"
+}
diff --git a/corpora/spoken-corpora/phondat1.json b/corpora/spoken-corpora/phondat1.json
new file mode 100644
index 0000000..d3b6d2b
--- /dev/null
+++ b/corpora/spoken-corpora/phondat1.json
@@ -0,0 +1,16 @@
+{
+ "Name": "PhonDat 1",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-D20B-6",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains read speech of 201 different speakers. Each speaker read a subcorpus of 450 different sentence equivalents (including alphanumericals and two shorter passages of prose text); 8 speakers read the whole sentence corpus; 40 speakers read the subcorpora BR and MR; 112 speakers read 70 utterances of the rest corpus, including alphabet, numbers 0 to 12 and stories. The corpus contains a total of 21587 recorded utterances.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["21.4 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-D20B-6"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/phondat2.json b/corpora/spoken-corpora/phondat2.json
new file mode 100644
index 0000000..90da3ae
--- /dev/null
+++ b/corpora/spoken-corpora/phondat2.json
@@ -0,0 +1,16 @@
+{
+ "Name": "PhonDat 2",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-D288-8",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains read speech of 16 different speakers, 6 women and 10 men. Each speaker reads a corpus of 200 different sentences from a train query task. They were recorded at three different sites in Germany (University of Kiel, University of Bonn, University of Munich). The language is German. The corpus contains a total of 3200 recorded utterances.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["4.3 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic", "phonetic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-D288-8"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/prague-db.json b/corpora/spoken-corpora/prague-db.json
new file mode 100644
index 0000000..d9a6129
--- /dev/null
+++ b/corpora/spoken-corpora/prague-db.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Prague DaTabase of Spoken Czech 1.0",
+ "URL": "http://hdl.handle.net/11234/1-2375",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous dialogue.\nThe corpus is available for download from LINDAT.",
+ "Languages": ["ces"],
+ "License": "CC BY-NC SA 4.0",
+ "Size": ["770,000 tokens", "7324 minutes"],
+ "Annotation": ["MSD-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11234/1-2375"
+ },
+ "Publication": "Hajič et al. (2008)"
+}
diff --git a/corpora/spoken-corpora/radio-interviews.json b/corpora/spoken-corpora/radio-interviews.json
new file mode 100644
index 0000000..c814da2
--- /dev/null
+++ b/corpora/spoken-corpora/radio-interviews.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus of Radio Interviews",
+ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00022L",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains telephone interviews from different radio programmes.\nThe corpus is available for download from META-SHARE (CELR distribution).",
+ "Languages": ["est"],
+ "License": "CC-BY",
+ "Size": ["36 hours"],
+ "Annotation": ["speech annotation to orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00022L"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/radio-news.json b/corpora/spoken-corpora/radio-news.json
new file mode 100644
index 0000000..0499135
--- /dev/null
+++ b/corpora/spoken-corpora/radio-news.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus of Radio News",
+ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00021L",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains public broadcast news.\nThe corpus is available for download from META-SHARE (CELR distribution).",
+ "Languages": ["est"],
+ "License": "",
+ "Size": ["19 hours"],
+ "Annotation": ["speech annotation to orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00021L"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/route-to-a-wing.json b/corpora/spoken-corpora/route-to-a-wing.json
new file mode 100644
index 0000000..722d713
--- /dev/null
+++ b/corpora/spoken-corpora/route-to-a-wing.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Route to A wing",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2014101401",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous conversations.\nThis corpus is available for online querying through the concordancer Korp.",
+ "Languages": ["fin"],
+ "License": "CC-0",
+ "Size": ["218 tokens"],
+ "Annotation": ["PoS-tagged"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2015050502"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/russlanddeutsch.json b/corpora/spoken-corpora/russlanddeutsch.json
new file mode 100644
index 0000000..ca1951a
--- /dev/null
+++ b/corpora/spoken-corpora/russlanddeutsch.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Russlanddeutsche Dialekte",
+ "URL": "http://hdl.handle.net/10932/00-03FA-9D9C-4EEA-BB01-7",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).",
+ "Languages": ["deu"],
+ "License": "CLARIN RES",
+ "Size": ["100,000 words", "10 hours"],
+ "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://hdl.handle.net/10932/00-03FA-9D9C-4EEA-BB01-7",
+ "Download": "http://hdl.handle.net/10932/00-03FA-9D9C-4EEA-BB01-7"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/rvg1_clarin.json b/corpora/spoken-corpora/rvg1_clarin.json
new file mode 100644
index 0000000..10891b9
--- /dev/null
+++ b/corpora/spoken-corpora/rvg1_clarin.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus RVG1_CLARIN",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0004-3FF4-3",
+ "Family": "Spoken corpora",
+ "Description": "The corpus is a collection of more than 500 speakers of different dialect regions of Germany. The recordings were made using four different microphones (two in low and two in high quality) and consist of single digits, connected digits, phone numbers, phonetically balanced sentences, computer command phrases prompted on a screen, and 1 min spontaneous speech (monologue). The speakers were recorded in normal office environments. The background noise was limited to the usual noise in office environment, e.g. door slam, background crosstalk, phone ringing, paper rustle, PC noise, etc.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["32 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0004-3FF4-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/samples-spoken-fin.json b/corpora/spoken-corpora/samples-spoken-fin.json
new file mode 100644
index 0000000..9bbfa07
--- /dev/null
+++ b/corpora/spoken-corpora/samples-spoken-fin.json
@@ -0,0 +1,17 @@
+{
+ "Name": "Samples of Spoken Finnish",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-201407141",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThis corpus is available for online querying through the LAT platform and through the concordancer Korp.",
+ "Languages": ["fin"],
+ "License": "CC-BY",
+ "Size": ["100 hours"],
+ "Annotation": ["syntactically parsed (TDT alpha)", "named entities (FiNER)", "PoS-tagged", "lemmatized", "orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2015040101",
+ "LAT Platform": "http://urn.fi/urn:nbn:fi:lb-1001100134"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/samromur.json b/corpora/spoken-corpora/samromur.json
new file mode 100644
index 0000000..d4c83f0
--- /dev/null
+++ b/corpora/spoken-corpora/samromur.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Samrómur",
+ "URL": "http://hdl.handle.net/20.500.12537/189",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains validated speech-recordings and is a result of a crowd-sourcing effort run by the Language and Voice Lab at Reykjavik University in cooperation with Almannarómur, Center for Language Technology.\nThe corpus contains recordings by 8,392 different speakers, with the average recording length being 5.2 seconds. Transcriptions of the read texts are also available.\nThe corpus is available for download from the CLARIN.IS repository.",
+ "Languages": ["isl"],
+ "License": "CC BY 4.0",
+ "Size": ["145 hours", "100,000 utterances"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/20.500.12537/189"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/sc1.json b/corpora/spoken-corpora/sc1.json
new file mode 100644
index 0000000..bc15204
--- /dev/null
+++ b/corpora/spoken-corpora/sc1.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus SC1",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0002-0B76-E",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains speech of 88 different speakers, reading the German story 'Der Nordwind und die Sonne'. Subcorpus T contains the recordings of 16 native Germans (L1). The other 72 speakers which were born and educated in other countries (L2) are pooled in subcorpus C. Every speaker has a distinct accent.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["1.5 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0002-0B76-E"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/sc10.json b/corpora/spoken-corpora/sc10.json
new file mode 100644
index 0000000..a6235b2
--- /dev/null
+++ b/corpora/spoken-corpora/sc10.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus SC10",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0002-1129-D",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains read and non-prompted German and mother tongue speech of 70 different speakers from 17 mother tongues (L1) in a variety of speaking styles e.g. reading, retelling, free talk etc.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["10 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0002-1129-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/sc2.json b/corpora/spoken-corpora/sc2.json
new file mode 100644
index 0000000..549c4f8
--- /dev/null
+++ b/corpora/spoken-corpora/sc2.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus SC2",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0005-0E95-4",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains read speech of 10 different speakers with screen prompted 'automobil diagnosis phrases' recorded under real conditions in two different car maintenance halls. The language is German. All speakers are male native Germans and have never participated in such a task before. They are all experts in the field of car diagnosis. Each speaker has spoken 800 3-7 word utterances derived from 100 different sentences (see sc2_ort.txt) resulting in a total of 8000 utterances.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["9 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0005-0E95-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/schweizer-jugend.json b/corpora/spoken-corpora/schweizer-jugend.json
new file mode 100644
index 0000000..e8e9d17
--- /dev/null
+++ b/corpora/spoken-corpora/schweizer-jugend.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Schweizer Jugendsprache",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A68A-9",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of adolescent pupils in Switzerland.",
+ "Languages": ["Swiss German"],
+ "License": "CLARIN RES",
+ "Size": ["92 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-A68A-9"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/serbian-forms-of-address.json b/corpora/spoken-corpora/serbian-forms-of-address.json
new file mode 100644
index 0000000..a00b624
--- /dev/null
+++ b/corpora/spoken-corpora/serbian-forms-of-address.json
@@ -0,0 +1,18 @@
+{
+ "Name": "Corpus of Serbian Forms of Address 1.1",
+ "URL": "http://hdl.handle.net/11356/1779",
+ "Family": "Spoken corpora",
+ "Description": "This corpus consists of transcripts of audio-recorded biographical interviews with 19 participants. The interviews are about forms of address that speakers use in colloquial and in formal settings, and about their attitudes and evaluations concerning particular forms of address.\nWe provide original transcripts (written according to GAT conventions), as well as transcripts in CoNLL-U and TEI-XML format. The corpus has been normalised, tagged with morphosyntactic and lemma information using the CLASSLA-StanfordNLP tagger, and aligned with the respective turns in the audio files. Time alignments as well as partial annotation corrections are stored in TEI-XML.\nThe corpus is available for download from CLARIN.SI as well as through the noSketchEngine and KonText concordancers.",
+ "Languages": ["srp"],
+ "License": "CC BY-NC-SA 4.0",
+ "Size": ["171,546 words"],
+ "Annotation": ["MSD-tagged", "lemmatised", "normalised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=sfa_sr&struct_attr_stats=1",
+ "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=sfa_sr",
+ "Download": "http://hdl.handle.net/11356/1779"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/shc.json b/corpora/spoken-corpora/shc.json
new file mode 100644
index 0000000..d11cb0a
--- /dev/null
+++ b/corpora/spoken-corpora/shc.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus SHC",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-0700-1",
+ "Family": "Spoken corpora",
+ "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. The recordings include field recordings using a hand-held UMTS device (one person, SmartWeb Handheld Corpus SHC), field recordings with video capture of the primary speaker and a secondary speaker (SmartWeb Video Corpus SVC) as well as mobile recordings performed on a BMW motorbike (one speaker, SmartWeb Motorbike Corpus SMC).",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["30.6 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-0700-1"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/si100.json b/corpora/spoken-corpora/si100.json
new file mode 100644
index 0000000..a146594
--- /dev/null
+++ b/corpora/spoken-corpora/si100.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus SI100",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-E9CF-A",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains read speech of 101 different speakers (50 female, 50 male, 1 unknown). Each speaker has read approx. 100 sentences from either the SZ subcorpus or the CeBit subcorpus. The language is German. The subcorpus SZ contains 544 sentences from newspaper articles (\"Sueddeutsche Zeitung\"). The subcorpus CeBit contains 483 sentences from newspaper articles about the CeBit 1995. Each subcorpus is divided into 5 parts of approx. 100 utterances each. Every speaker read only one part of one subcorpus (with some exceptions), thus resulting in a total of 10,387 recorded utterances.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["31.5 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-E9CF-A"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/si1000.json b/corpora/spoken-corpora/si1000.json
new file mode 100644
index 0000000..587067c
--- /dev/null
+++ b/corpora/spoken-corpora/si1000.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus SI1000",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EBFB-6",
+ "Family": "Spoken corpora",
+ "Description": "The corpus contains read speech of 10 different speakers. Each speaker has read approx. 1000 sentences from a German newspaper corpus, thus resulting in a total of approx. 10000 recorded utterances. The recording took place at the Institut fuer Phonetik, University of Munich, Germany in 1994.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["32.8 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-EBFB-6"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json b/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json
new file mode 100644
index 0000000..cd23030
--- /dev/null
+++ b/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Sibilant Production in Cochlear Implant Patients (diachronic data)",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A9BB-F",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains diachronic speech recordings from three cochlear implant (CI) users. For data used in the corresponding synchronic study, please refer to the CI_2 corpora. CI_3_Sibilants contains recordings used for the analysis of /s/ and /ʃ/ in the following words: 'Tasse', 'Tasche'.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["unknown"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-A9BB-F"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/sibilant-cochlear-patients.json b/corpora/spoken-corpora/sibilant-cochlear-patients.json
new file mode 100644
index 0000000..e55c115
--- /dev/null
+++ b/corpora/spoken-corpora/sibilant-cochlear-patients.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Sibilant Production in Cochlear Implant Patients",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AEDF-1",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG). CI_2_Sibilants contains recordings used for the analysis of /s/ and /ʃ/ in the following words: 'Tasse', 'Tasche'.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["1 hour"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-AEDF-1"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/siebenbuergisch.json b/corpora/spoken-corpora/siebenbuergisch.json
new file mode 100644
index 0000000..d21584b
--- /dev/null
+++ b/corpora/spoken-corpora/siebenbuergisch.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Audioatlas Siebenbuergisch-Saechsischer Dialekte",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-27B9-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains 2274 recordings (approx. 360h) of spoken dialectal German (Saxonian) recorded in Transilvania (Romania) in approx. 250 different locations. This up-to-now unpublished material has been collected on analog tape in the 1960s and 70s by different linguists based at the universities of Bukarest, Hermannstadt and Klausenburg.",
+ "Languages": ["Bavarian", "deu", "ron"],
+ "License": "CLARIN RES",
+ "Size": ["450,000 words"],
+ "Annotation": ["Geomapping", "orthographic/partial phonetic transcription", "semantic labelling"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-27B9-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/skolt-saami.json b/corpora/spoken-corpora/skolt-saami.json
new file mode 100644
index 0000000..ed1db92
--- /dev/null
+++ b/corpora/spoken-corpora/skolt-saami.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Skolt Saami Documentation Corpus (2016)",
+ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073037",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains interviews.\nThis corpus is available for online querying through the LAT platform.",
+ "Languages": ["Skolt Saami"],
+ "License": "CLARIN RES",
+ "Size": ["19 hours"],
+ "Annotation": ["MSD-tagged"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "LAT Platform": "http://hdl.handle.net/11113/00-0000-0000-0000-32A7-7@view"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/smartkom-home.json b/corpora/spoken-corpora/smartkom-home.json
new file mode 100644
index 0000000..8cd0a01
--- /dev/null
+++ b/corpora/spoken-corpora/smartkom-home.json
@@ -0,0 +1,16 @@
+{
+ "Name": "SmartKom Home",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-ED38-0",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains multi modal recordings of 65 actors who use the SmartKom system. SmartKom Home should be an intelligent communication assistant for the private environment. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. The instruction was kept to a minimum; in fact the user only knew that the system is able to understand speech, gestures and even mimical expressions and should more or less communicate like a human.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["11 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic", "gestures", "mimic", "emotions"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-ED38-0"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/smartkom-mobil.json b/corpora/spoken-corpora/smartkom-mobil.json
new file mode 100644
index 0000000..67c65e6
--- /dev/null
+++ b/corpora/spoken-corpora/smartkom-mobil.json
@@ -0,0 +1,16 @@
+{
+ "Name": "SmartKom Mobil",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EDBB-C",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains multi modal recordings of 73 actors who use the SmartKom system. SmartKom Mobil is a portable PDA equipped with a net link and additional intelligent communication devices. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. The instruction was kept to a minimum; in fact the user only knew that the system is able to understand speech, gestures and should more or less communicate like a human. Experiments were not performed in the field but rather in a studio-like environment.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["11 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic", "gestures", "mimic", "emotions"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-EDBB-C"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/smartkom-public.json b/corpora/spoken-corpora/smartkom-public.json
new file mode 100644
index 0000000..f5591b5
--- /dev/null
+++ b/corpora/spoken-corpora/smartkom-public.json
@@ -0,0 +1,16 @@
+{
+ "Name": "SmartKom Public",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EC8B-3",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains multi modal recordings of 86 actors who use the SmartKom system. SmartKom Public is comparable to a traditional public phone booth but equipped with additional intelligent communication devices. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. The instruction was kept to a minimum; in fact the user only knew that the system is able to understand speech, gestures and even mimical expressions and should more or less communicate like a human.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["11 hours"],
+ "Annotation": ["orthographically transcribed", "phonemic", "gestures", "mimic", "emotions"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-EC8B-3"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/smartweb-motorbike.json b/corpora/spoken-corpora/smartweb-motorbike.json
new file mode 100644
index 0000000..54c2317
--- /dev/null
+++ b/corpora/spoken-corpora/smartweb-motorbike.json
@@ -0,0 +1,16 @@
+{
+ "Name": "SmartWeb Motorbike Corpus SMC",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0005-C50F-D",
+ "Family": "Spoken corpora",
+ "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. The SMC corpus itself contains 36 mobile recordings performed on a BMW motorbike.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["6.3 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0005-C50F-D"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/spit-mdb.json b/corpora/spoken-corpora/spit-mdb.json
new file mode 100644
index 0000000..07a510f
--- /dev/null
+++ b/corpora/spoken-corpora/spit-mdb.json
@@ -0,0 +1,15 @@
+{
+ "Name": "SpIt-MDb (Spoken Italian - Multilevel Database)",
+ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/644-spit-mdb-spoken-italian-multilevel-database",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains spontaneous speech.\nThe corpus is available for download from a dedicated webpage.",
+ "Languages": ["ita"],
+ "License": "",
+ "Size": [],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/644-spit-mdb-spoken-italian-multilevel-database"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/spjallromur.json b/corpora/spoken-corpora/spjallromur.json
new file mode 100644
index 0000000..7a65f64
--- /dev/null
+++ b/corpora/spoken-corpora/spjallromur.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Spjallromur - Icelandic Conversational Speech",
+ "URL": "http://hdl.handle.net/20.500.12537/187",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains recordings of 54 conversations by 102 speakers, recorded between September 2020 and September 2021.\nThe corpus is available for download from the CLARIN.IS repository.",
+ "Languages": ["isl"],
+ "License": "CC BY 4.0",
+ "Size": ["21 hours"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/20.500.12537/187"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/spoken-bnc2014.json b/corpora/spoken-corpora/spoken-bnc2014.json
new file mode 100644
index 0000000..863b963
--- /dev/null
+++ b/corpora/spoken-corpora/spoken-bnc2014.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Spoken BNC2014",
+ "URL": "http://cass.lancs.ac.uk/cass-projects/spoken-bnc2014/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains face-to-face conversations between people who speak British English as their first language.\nThe corpus is available through the CQP concordancer.",
+ "Languages": ["eng"],
+ "License": "",
+ "Size": ["10 million words"],
+ "Annotation": [],
+ "Infrastructure": "Other",
+ "Access": {
+ "Concordancer": "http://corpora.lancs.ac.uk/bnc2014"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/spoken-estonian.json b/corpora/spoken-corpora/spoken-estonian.json
new file mode 100644
index 0000000..d25e218
--- /dev/null
+++ b/corpora/spoken-corpora/spoken-estonian.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Corpus of Spoken Estonian",
+ "URL": "http://hdl.handle.net/11372/LRT-253",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains transcripts of recordings from various domains.",
+ "Languages": ["est"],
+ "License": "",
+ "Size": ["1 million words"],
+ "Annotation": ["unspecified tagging"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/spoken-icelandic.json b/corpora/spoken-corpora/spoken-icelandic.json
new file mode 100644
index 0000000..c41a89d
--- /dev/null
+++ b/corpora/spoken-corpora/spoken-icelandic.json
@@ -0,0 +1,17 @@
+{
+ "Name": "The Icelandic Spoken Language Corpus",
+ "URL": "https://clarin.is/en/resources/spoken/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains four different subcorpora: (1) Spontaneous conversations, from the project ÍSTAL (An Icelandic Spoken Language Bank), (2) Group conversations, from the project MIN (Modern loanwords in the Nordic languages), (3) Parliamentary debates, (4) Conversations of teenagers with other teenagers and adults.\nThe corpus is available for download from CLARIN-IS (as a part of the Icelandic Gigaword Corpus) and for search through the concordancer Korp.",
+ "Languages": ["isl"],
+ "License": "CC BY 4.0",
+ "Size": ["536,000 tokens"],
+ "Annotation": ["tokenised", "PoS-tagged", "lemmatised"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://malheildir.arnastofnun.is/?mode=mim#?lang=en&stats_reduce=word&isCaseInsensitive&searchBy=word&cqp=%5B%5D&corpus=mim_talmal",
+ "Download": "http://www.malfong.is/index.php?lang=en&pg=&dlid=95"
+ },
+ "Publication": "Steingrímsson et al. (2018)"
+}
diff --git a/corpora/spoken-corpora/spoken-wikipedia.json b/corpora/spoken-corpora/spoken-wikipedia.json
new file mode 100644
index 0000000..0885a2c
--- /dev/null
+++ b/corpora/spoken-corpora/spoken-wikipedia.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The Spoken Wikipedia Corpora",
+ "URL": "http://hdl.handle.net/11022/0000-0007-C641-0",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains transcripts of read Wikipedia articles.\nThe corpus is available for download from a CLARIN-D repository.",
+ "Languages": ["eng", "deu", "nld"],
+ "License": "CC BY-SA 4.0",
+ "Size": ["1005 hours"],
+ "Annotation": ["text segmentation", "normalization", "time-alignment"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions only",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/0000-0007-C641-0"
+ },
+ "Publication": "Köhn et al. (2016)"
+}
diff --git a/corpora/spoken-corpora/talromur-2.json b/corpora/spoken-corpora/talromur-2.json
new file mode 100644
index 0000000..2f7595b
--- /dev/null
+++ b/corpora/spoken-corpora/talromur-2.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Talrómur 2",
+ "URL": "http://hdl.handle.net/20.500.12537/167",
+ "Family": "Spoken corpora",
+ "Description": "This corpus consists of recordings of forty different speakers reading short sentences and is intended for modelling prosody.\nThe corpus is available for download from the CLARIN.IS repository.",
+ "Languages": ["isl"],
+ "License": "CC BY 4.0",
+ "Size": ["56,225 utterances"],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/20.500.12537/167"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/taus.json b/corpora/spoken-corpora/taus.json
new file mode 100644
index 0000000..b099073
--- /dev/null
+++ b/corpora/spoken-corpora/taus.json
@@ -0,0 +1,16 @@
+{
+ "Name": "TAUS",
+ "URL": "http://www.tekstlab.uio.no/nota/taus/english.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains informal interviews in Oslo sociolects.\nThe corpus is available through a Tekstlab concordancer (account needed).",
+ "Languages": ["nor"],
+ "License": "CLARIN ACA",
+ "Size": ["270 000 tokens"],
+ "Annotation": ["MSD-tagged", "lemmatised", "orthographically and partially phonetically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "https://tekstlab.uio.no/glossa2/taus2"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/uraluid.json b/corpora/spoken-corpora/uraluid.json
new file mode 100644
index 0000000..bfb19b3
--- /dev/null
+++ b/corpora/spoken-corpora/uraluid.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Uralic Languages under the Influence (UraLUID) database",
+ "URL": "http://www.nytud.hu/depts/tlp/uralic/dbases.html",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains narratives (e.g., folk stories).\nThe corpus is available for download from a dedicated website.",
+ "Languages": ["Udmurt", "Tundra Nenets", "Synya Khanty", "Surgut Khanty"],
+ "License": "",
+ "Size": ["108,000 tokens", "4 hours"],
+ "Annotation": ["MSD-tagged", "time-alignment", "phonetic and orthographic transcription"],
+ "Infrastructure": "Other",
+ "Access": {
+ "Download": "http://www.nytud.hu/depts/tlp/uralic/dbases.html"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/verbmobil-1.json b/corpora/spoken-corpora/verbmobil-1.json
new file mode 100644
index 0000000..7d5faf9
--- /dev/null
+++ b/corpora/spoken-corpora/verbmobil-1.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus Verbmobil I",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-EB31-0",
+ "Family": "Spoken corpora",
+ "Description": "The Verbmobil (VM) dialog database is a collection of German, American and Japanese dialog recordings in the appointment scheduling task. The data were collected during the first phase (1993 - 1996) of the German VM project funded by the German Ministry of Science and Technology (BMBF). Starting with version 3, the corpus is also provided as an emuR compatible database.",
+ "Languages": ["deu", "eng", "jpn"],
+ "License": "CLARIN ACA",
+ "Size": ["77 hours"],
+ "Annotation": ["orthographically transcribed", "phonetic", "phonemic", "prosodic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-EB31-0"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/verbmobil-2.json b/corpora/spoken-corpora/verbmobil-2.json
new file mode 100644
index 0000000..ccc1906
--- /dev/null
+++ b/corpora/spoken-corpora/verbmobil-2.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Corpus Verbmobil II",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0000-FC55-5",
+ "Family": "Spoken corpora",
+ "Description": "Verbmobil 2 contains the speech of 401 speakers participating in 810 recordings. The emotionally tagged recordings are not part of this edition but are collected in the corpus 'BAS VMEmo'. The total VM2 corpus amounts to 17.6GB of data containing 58961 conversational turns distributed on 39 CD-R. VM2 contains dialogs in German, English, Japanese and mixed language pairs (partly with interpreter). The domain is appointment scheduling, travel planning, leisure time planning. Starting from version 3, the corpus is also available in emuR compatible emuDB format (see annotation files ending in *_annot.json).",
+ "Languages": ["deu", "eng", "jpn"],
+ "License": "CLARIN ACA",
+ "Size": ["65.8 hours"],
+ "Annotation": ["orthographically transcribed", "phonetic", "phonemic", "prosodic"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0000-FC55-5"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/vienna-oxford.json b/corpora/spoken-corpora/vienna-oxford.json
new file mode 100644
index 0000000..16461d5
--- /dev/null
+++ b/corpora/spoken-corpora/vienna-oxford.json
@@ -0,0 +1,15 @@
+{
+ "Name": "Vienna-Oxford International Corpus of English",
+ "URL": "http://voice.univie.ac.at/",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains naturally occurring, non-scripted face-to-face interactions in English as a lingua franca (ELF).\nThe corpus is available through a dedicated concordancer.",
+ "Languages": ["eng"],
+ "License": "",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "Other",
+ "Access": {
+ "Concordancer": "http://voice.univie.ac.at/"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/vowel-cochlear-patients.json b/corpora/spoken-corpora/vowel-cochlear-patients.json
new file mode 100644
index 0000000..ea653e7
--- /dev/null
+++ b/corpora/spoken-corpora/vowel-cochlear-patients.json
@@ -0,0 +1,16 @@
+{
+ "Name": "Vowel Production in Cochlear Implant Patients",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AFA1-4",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG). It contains recordings used for the analysis of seven long, lexically stressed vowels in the words 'Taten', 'stetig', 'Toter', 'Stute', 'töten', 'Tüte' and 'kriegen'.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["2 hours"],
+ "Annotation": ["orthographically transcribed"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0001-AFA1-4"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/wissenschaftssprache.json b/corpora/spoken-corpora/wissenschaftssprache.json
new file mode 100644
index 0000000..85184c0
--- /dev/null
+++ b/corpora/spoken-corpora/wissenschaftssprache.json
@@ -0,0 +1,17 @@
+{
+ "Name": "",
+ "URL": "",
+ "Family": "Spoken corpora",
+ "Description": "",
+ "Languages": [],
+ "License": "",
+ "Size": [],
+ "Annotation": [],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Concordancer": "",
+ "Download": ""
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/zurich-tangram-bas.json b/corpora/spoken-corpora/zurich-tangram-bas.json
new file mode 100644
index 0000000..d5486b4
--- /dev/null
+++ b/corpora/spoken-corpora/zurich-tangram-bas.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The Zurich Tangram Corpus - BAS Edition",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-D89D-5",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains tasks, where one subject (the instructor) describes different Tangram figures to another subject (the receiver) so that the receiver can recreate the same order of figures that the instructor has in front of them. The subjects initially don't know each other and work together to solve these tasks in three consecutive sessions. This edition only features the transcribed segments, not those in between, and uses separate files for the subject.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["48 hours"],
+ "Annotation": ["orthographically transcribed", "word and phonemic segmentation"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-D89D-5"
+ },
+ "Publication": ""
+}
diff --git a/corpora/spoken-corpora/zurich-tangram-uzh.json b/corpora/spoken-corpora/zurich-tangram-uzh.json
new file mode 100644
index 0000000..19a2cd1
--- /dev/null
+++ b/corpora/spoken-corpora/zurich-tangram-uzh.json
@@ -0,0 +1,16 @@
+{
+ "Name": "The Zurich Tangram Corpus - UZH Edition",
+ "URL": "http://hdl.handle.net/11022/1009-0000-0007-D838-7",
+ "Family": "Spoken corpora",
+ "Description": "This corpus contains tasks, where one subject (the instructor) describes different Tangram figures to another subject (the receiver) so that the receiver can recreate the same order of figures that the instructor has in front of them. The subjects initially don't know each other and work together to solve these tasks in three consecutive sessions. This edition features the complete recordings, but lacks phone and word segmentation. Subjects' audio tracks are combined into stereo files. If you would like just the transcribed segments with separate files for the subjects or want the word and phone segmentation see corpus ZTC_BAS.",
+ "Languages": ["deu"],
+ "License": "CLARIN ACA",
+ "Size": ["48 hours"],
+ "Annotation": ["orthographically transcribed", "turn segmentation"],
+ "Infrastructure": "CLARIN",
+ "Group": "Corpora with transcriptions and audio recordings",
+ "Access": {
+ "Download": "http://hdl.handle.net/11022/1009-0000-0007-D838-7"
+ },
+ "Publication": ""
+}