diff --git a/corpora/academic-corpora/ac-lit.json b/corpora/academic-corpora/ac-lit.json index cbe3bb5..2463143 100644 --- a/corpora/academic-corpora/ac-lit.json +++ b/corpora/academic-corpora/ac-lit.json @@ -3,7 +3,7 @@ "URL": "http://coralit.lt/en/node/18", "Family": "Academic corpora", "Description": "This corpus contains textbooks, scientific monographs, journal articles, abstracts, forewords, research reports, and master’s and PhD theses from the following disciplines:\nThe materials were published between 1999 and 2009. The corpus is encoded in TEI 5.\nThe corpus is available for online querying through a dedicated website.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "", "Size": ["9 million words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/academic-corpora/aca-hum.json b/corpora/academic-corpora/aca-hum.json index cdaaaef..eefdeac 100644 --- a/corpora/academic-corpora/aca-hum.json +++ b/corpora/academic-corpora/aca-hum.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/49", "Family": "Academic corpora", "Description": "This corpus contains academic texts from humanities disciplines published between 1997 and 2012. The corpus data are in the XML format and plain text.\nThe corpus is available for download from the SWECLARIN repository and for online querying through the concordancer Korp (SWECLARIN distribution).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY", "Size": ["14.5 million tokens"], "Annotation": [], diff --git a/corpora/academic-corpora/aca-soc.json b/corpora/academic-corpora/aca-soc.json index e606956..a0c7a2b 100644 --- a/corpora/academic-corpora/aca-soc.json +++ b/corpora/academic-corpora/aca-soc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/50", "Family": "Academic corpora", "Description": "This corpus contains academic texts from social sciences disciplines published between 1997 and 2012. 
The corpus data are in the XML format and plain text.\nThe corpus is available for download from the SWECLARIN repository and for online querying through the concordancer Korp (SWECLARIN distribution).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY", "Size": ["10.8 million tokens"], "Annotation": ["sentence segmentation"], diff --git a/corpora/academic-corpora/acl-anth.json b/corpora/academic-corpora/acl-anth.json index 93630aa..e9667fa 100644 --- a/corpora/academic-corpora/acl-anth.json +++ b/corpora/academic-corpora/acl-anth.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.35111/rfeg-z495", "Family": "Academic corpora", "Description": "This corpus contains research papers in computational linguistics published between 1979 and 2015. The corpus data are in the XML format.\nThe corpus is available for online querying through the Sketch Engine (log-in required) and for download from a dedicated website.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY SA", "Size": ["75 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "author/text metadata"], diff --git a/corpora/academic-corpora/acnz.json b/corpora/academic-corpora/acnz.json index 51b3058..279f526 100644 --- a/corpora/academic-corpora/acnz.json +++ b/corpora/academic-corpora/acnz.json @@ -3,7 +3,7 @@ "URL": "https://www.wgtn.ac.nz/lals/resources/academicwordlist/information/corpus", "Family": "Academic corpora", "Description": "This corpus contains journal articles, book chapters, course workbooks, laboratory manuals, and course notes from the following disciplines: arts, commerce, law, and biology.\nThis corpus is not available.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["3.5 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/chambers-lb.json b/corpora/academic-corpora/chambers-lb.json index 82c65ad..35b5c02 100644 --- a/corpora/academic-corpora/chambers-lb.json +++ b/corpora/academic-corpora/chambers-lb.json 
@@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2527", "Family": "Academic corpora", "Description": "This corpus contains research papers in the following disciplines:\n\nThe research papers were published between 1998 and 2006. This is a plain text corpus.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "Oxford Text Archive licence (academic use)", "Size": ["1 million words"], "Annotation": ["No annotation"], diff --git a/corpora/academic-corpora/czec-soc.json b/corpora/academic-corpora/czec-soc.json index de27949..18d39e0 100644 --- a/corpora/academic-corpora/czec-soc.json +++ b/corpora/academic-corpora/czec-soc.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-2703", "Family": "Academic corpora", "Description": "This corpus contains research papers in sociology published between 1993 and 2016. The corpus data are in the TSV format.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "MIT", "Size": ["3 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/eng-sci.json b/corpora/academic-corpora/eng-sci.json index 95b25e2..2b6bbf0 100644 --- a/corpora/academic-corpora/eng-sci.json +++ b/corpora/academic-corpora/eng-sci.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CF9-6", "Family": "Academic corpora", "Description": "This corpus contains journal articles in the following disciplines:\n\nThe articles were published in the 1970s, 1980s and the 200s.\nThe corpus is available for online querying through CQPWeb (CLARIN-D distribution).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "restricted", "Size": ["35 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "author/text metadata", "document structure"], diff --git a/corpora/academic-corpora/est-sci.json b/corpora/academic-corpora/est-sci.json index 4291b55..4b2ac95 100644 
--- a/corpora/academic-corpora/est-sci.json +++ b/corpora/academic-corpora/est-sci.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11297/1-00-0000-0000-0000-0002-4", "Family": "Academic corpora", "Description": "This corpus contains scientific articles and PhD theses. The corpus data are in the P5 format.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA-NC", "Size": ["5 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/genia.json b/corpora/academic-corpora/genia.json index 2cb3389..2013694 100644 --- a/corpora/academic-corpora/genia.json +++ b/corpora/academic-corpora/genia.json @@ -3,7 +3,7 @@ "URL": "http://www.geniaproject.org/genia-corpus", "Family": "Academic corpora", "Description": "This corpus contains journal paper abstracts in biomedicine. The corpus data are in various formats, e.g., PTB.\nThe corpus is available for download from PORTULAN.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "free but unspecified", "Size": ["437,000 words"], "Annotation": ["PoS-tagged", "syntactically parsed", "annotated for terms, events, semantic relations and coreference", "text metadata"], diff --git a/corpora/academic-corpora/jezkor.json b/corpora/academic-corpora/jezkor.json index cf68fd8..330abca 100644 --- a/corpora/academic-corpora/jezkor.json +++ b/corpora/academic-corpora/jezkor.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1755", "Family": "Academic corpora", "Description": "This corpus contains a collection of linguistic scientific writing in the Slovenian language. It consists of 43 monographs published between 2009 and 2022 by Fran Ramovš institute of Slovenian language and Založba ZRC, 267 papers published in the journal \"Jezikoslovni zapiski\" and 28 papers published in the journal \"Slovenski jezik\". 
Note that the texts were obtained directly from PDFs, so they contain various types of noise.\nThe corpus is linguistically annotated with the CLASSLA pipeline (https://github.com/clarinsi/classla) on the levels lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-spech and morphological features, and named entities. It is distributed in CoNLL-U and vertical file format, one file for each text. Text metadata consists of the author(s), title and year of publication.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY", "Size": ["9.3 million tokens"], "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated for named entities and author/text metadata"], diff --git a/corpora/academic-corpora/kas.json b/corpora/academic-corpora/kas.json index c47f829..0d182b2 100644 --- a/corpora/academic-corpora/kas.json +++ b/corpora/academic-corpora/kas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1448", "Family": "Academic corpora", "Description": "This corpus contains BA, MA, and PhD theses in humanities, social sciences, and natural sciences published between 2000 and 2018. The corpus data are in the TEI format.\nThe corpus is available for download from CLARIN.SI. 
Version 1.0 is also available for online querying through noSketch Engine and KonText (CLARIN.SI distribution).", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CLARIN.SI Licence ACA ID-BY-NC-INF-NORED 1.0", "Size": ["1.5 billion tokens"], "Annotation": ["MSD-tagged", "lemmatised", "marked for bilingual and monolingual term candidates"], diff --git a/corpora/academic-corpora/kiap.json b/corpora/academic-corpora/kiap.json index 7d31370..52ab3dc 100644 --- a/corpora/academic-corpora/kiap.json +++ b/corpora/academic-corpora/kiap.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D989-605B-8F10-5", "Family": "Academic corpora", "Description": "This comparable corpus contains research articles in economics, linguistics, and medicine published between 1992 and 2003.\nThe corpus is available for online browsing through the concordancer Corpuscle (CLARINO distribution).", - "Languages": ["eng","fra","nor"], + "Language": ["eng","fra","nor"], "Licence": "CC-BY 4.0", "Size": ["3.9 million tokens"], "Annotation": ["PoS-tagged"], diff --git a/corpora/academic-corpora/lit-trans.json b/corpora/academic-corpora/lit-trans.json index 3da38c2..682aa21 100644 --- a/corpora/academic-corpora/lit-trans.json +++ b/corpora/academic-corpora/lit-trans.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-24F2-6", "Family": "Academic corpora", "Description": "This corpus contains journal articles in literary and translation studies. 
This is a plain text corpus.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-SA", "Size": ["48,300 words"], "Annotation": [], diff --git a/corpora/academic-corpora/modern-greek.json b/corpora/academic-corpora/modern-greek.json index 4409310..b2a5813 100644 --- a/corpora/academic-corpora/modern-greek.json +++ b/corpora/academic-corpora/modern-greek.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-2502-4", "Family": "Academic corpora", "Description": "This corpus contains scientific texts in linguistics and dialectology. This is a plain text corpus.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-SA", "Size": ["113,000 words"], "Annotation": [], diff --git a/corpora/academic-corpora/muchmore.json b/corpora/academic-corpora/muchmore.json index 53fcb35..e27c18c 100644 --- a/corpora/academic-corpora/muchmore.json +++ b/corpora/academic-corpora/muchmore.json @@ -3,7 +3,7 @@ "URL": "http://muchmore.dfki.de/resources1.htm", "Family": "Academic corpora", "Description": "This paper contains journal paper abstracts from medical disciplines. 
The corpus is encoded in MuchMore XML.\nThe corpus is available for download from a dedicated website.", - "Languages": ["eng","deu"], + "Language": ["eng","deu"], "Licence": "free but unspecified", "Size": ["1 million tokens"], "Annotation": ["PoS/MSD-tagged", "phrase chunking", "semantic class and relations", "document structure"], diff --git a/corpora/academic-corpora/open-slo.json b/corpora/academic-corpora/open-slo.json index 751992d..ff6f12f 100644 --- a/corpora/academic-corpora/open-slo.json +++ b/corpora/academic-corpora/open-slo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1774", "Family": "Academic corpora", "Description": "This corpus contains a large collection of scientific writing in the Slovenian language gathered from the Open Science Slovenia portal. It consists of over 150 thousand monographs, articles, diploma, master's and doctoral theses, advanced textbooks, reviews etc. mostly published between 2000 and 2022 by Slovenian universities, research institutions, etc. Texts are accompanied by metadata, i.e. author, supervisor (for theses), year of publication, publisher (mostly faculties of the various universities), type of publication (according to SICRIS classification), keywords, and CERIF and UDC codes. The texts were obtained directly from PDFs, so it should be noted that they can contain various types of character noise. The texts are linguistically annotated with the CLASSLA pipeline on the levels lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-spech and morphological features, and named entities. The corpus is distributed in CoNLL-U and vertical file formats, one file for each text. The text metadata is given as a TSV file.\nNote that there exist similar, but older and smaller corpora KAS 2.0 and KAS 1.0. These contain only theses and only up to 2018, but are cleaner and with more metadata. The repository also archives a number of KAS-derived datasets; pls. 
search for \"KAS\" to find them.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA", "Size": ["326 million tokens"], "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated for named entities and author/text metadata"], diff --git a/corpora/academic-corpora/orossimo.json b/corpora/academic-corpora/orossimo.json index 1f11fe7..203eef1 100644 --- a/corpora/academic-corpora/orossimo.json +++ b/corpora/academic-corpora/orossimo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2410-5", "Family": "Academic corpora", "Description": "This corpus contains academic texts in the following disciplines:\nsocial sciences,
  • computer science,
  • economics,
  • linguistics,
  • photography,
  • law,
  • engineering,
  • history,
  • astronomy,
  • earth sciences and geology,
  • medicine and health, and
  • biology.
  • \nThe corpus is encoded in XML (XCES).\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY", "Size": ["2.5 million tokens"], "Annotation": ["marked for term candidates", "mixed structural annotation"], diff --git a/corpora/academic-corpora/reading.json b/corpora/academic-corpora/reading.json index bca7485..102cfb5 100644 --- a/corpora/academic-corpora/reading.json +++ b/corpora/academic-corpora/reading.json @@ -3,7 +3,7 @@ "URL": "http://www.reading.ac.uk/internal/appling/corpus.htm", "Family": "Academic corpora", "Description": "This corpus contains PhD theses from the following disciplines: agriculture, psychology, food science, technology, meteorology, and history. The data are encoded in ASCII and HTML.\nThe corpus is not available because it is restricted at present to staff and researchers at the University of Reading, and it is only available 'on-site'. However, it is possible for people outside the University to make use of the corpus on a Research Attachment arrangement.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "restricted", "Size": [], "Annotation": [], diff --git a/corpora/academic-corpora/roger.json b/corpora/academic-corpora/roger.json index 6ef2b26..16d0db9 100644 --- a/corpora/academic-corpora/roger.json +++ b/corpora/academic-corpora/roger.json @@ -3,7 +3,7 @@ "URL": "https://roger-corpus.org/", "Family": "Academic corpora", "Description": "The corpus contains academic papers from eight disciplines, written by the Romanian students in native Romanian and English L2.\nThe corpus was collected over a three-year period (2018–2021) with the help of 27 collaborators from nine Romanian universities.\nThe corpus is available for online querying through a dedicated platform developed at the CODHUS research centre from the West University of Timisoara.", - "Languages": ["eng","ron"], + "Language": ["eng","ron"], "Licence": "CC BY-NC-ND", "Size": ["3.3 
million words"], "Annotation": [], diff --git a/corpora/academic-corpora/roysoc.json b/corpora/academic-corpora/roysoc.json index 7f28ea0..577f5ea 100644 --- a/corpora/academic-corpora/roysoc.json +++ b/corpora/academic-corpora/roysoc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0001-7E8B-6", "Family": "Academic corpora", "Description": "This corpus contains journal articles published in Philosophical Transactions of the Royal Society of London between 1665 and 1869.\nThe corpus is available for online querying through CQPweb and for download from the CLARIN-D repository of the University of Saarland.", - "Languages": ["English (late and early modern)"], + "Language": ["English (late and early modern)"], "Licence": "CC BY", "Size": ["32 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "normalised", "author and document metadata"], diff --git a/corpora/academic-corpora/scientext.json b/corpora/academic-corpora/scientext.json index aad4055..1a4cace 100644 --- a/corpora/academic-corpora/scientext.json +++ b/corpora/academic-corpora/scientext.json @@ -3,7 +3,7 @@ "URL": "https://scientext.hypotheses.org/corpus", "Family": "Academic corpora", "Description": "This corpus contains scientific texts and argumentative essays in humanities, experimental sciences, and applied/technical sciences.\nThe corpus is available for online querying through a dedicated webpage.", - "Languages": ["fra","eng"], + "Language": ["fra","eng"], "Licence": "CC BY", "Size": ["20 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/span-eng.json b/corpora/academic-corpora/span-eng.json index 297e36e..80704de 100644 --- a/corpora/academic-corpora/span-eng.json +++ b/corpora/academic-corpora/span-eng.json @@ -3,7 +3,7 @@ "URL": 
"https://books.google.si/books?id=NZbWCgAAQBAJ&pg=PA178&lpg=PA178&dq=serac+corpus&source=bl&ots=A7F-vUMJsr&sig=ACfU3U1b8W_r944Bs8OviL9xauHtUoeqVg&hl=sl&sa=X&ved=2ahUKEwiRuq_5nczmAhXT5KYKHWUtBlcQ6AEwAHoECAUQAQ#v=onepage&q=serac%20corpus&f=false", "Family": "Academic corpora", "Description": "This corpus contains journal articles published between 2000 and 2010.\nThe corpus is unavailable.", - "Languages": ["spa","eng"], + "Language": ["spa","eng"], "Licence": "", "Size": ["5.7 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/ufal-papers.json b/corpora/academic-corpora/ufal-papers.json index 34ce095..ec43f12 100644 --- a/corpora/academic-corpora/ufal-papers.json +++ b/corpora/academic-corpora/ufal-papers.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11234/1-1731", "Family": "Academic corpora", "Description": "This parallel corpus contains research paper abstracts in formal and applied linguistics. For each publication, the authors were obliged to provide both the original abstract in Czech or English, and its translation into English or Czech, respectively. 
The corpus data are in the TSV format.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces","eng"], + "Language": ["ces","eng"], "Licence": "CC BY", "Size": ["2 million words"], "Annotation": ["document aligned"], diff --git a/corpora/academic-corpora/uh-eng.json b/corpora/academic-corpora/uh-eng.json index 809bba0..3bc8fd7 100644 --- a/corpora/academic-corpora/uh-eng.json +++ b/corpora/academic-corpora/uh-eng.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102401", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY", "Size": ["200 million tokens"], "Annotation": ["PoS-tagged", "syntactically parsed"], diff --git a/corpora/academic-corpora/uh-fin.json b/corpora/academic-corpora/uh-fin.json index a39e030..eb1df4a 100644 --- a/corpora/academic-corpora/uh-fin.json +++ b/corpora/academic-corpora/uh-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016090601", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY", "Size": ["12.5 million tokens"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/academic-corpora/uh-fra.json b/corpora/academic-corpora/uh-fra.json index e1d5cae..4ed6bbc 100644 --- a/corpora/academic-corpora/uh-fra.json +++ b/corpora/academic-corpora/uh-fra.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102806", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp 
(FIN-CLARIN distribution).", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC BY", "Size": ["580,000 tokens"], "Annotation": [], diff --git a/corpora/academic-corpora/uh-ger.json b/corpora/academic-corpora/uh-ger.json index 9621143..008bac3 100644 --- a/corpora/academic-corpora/uh-ger.json +++ b/corpora/academic-corpora/uh-ger.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102807", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC BY", "Size": ["560,000 tokens"], "Annotation": ["No annotation"], diff --git a/corpora/academic-corpora/uh-rus.json b/corpora/academic-corpora/uh-rus.json index c483fd3..832db9a 100644 --- a/corpora/academic-corpora/uh-rus.json +++ b/corpora/academic-corpora/uh-rus.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102808", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["rus"], + "Language": ["rus"], "Licence": "CC BY", "Size": ["1.1 million words"], "Annotation": ["No annotation"], diff --git a/corpora/academic-corpora/uh-spa.json b/corpora/academic-corpora/uh-spa.json index 0d53738..b0a161f 100644 --- a/corpora/academic-corpora/uh-spa.json +++ b/corpora/academic-corpora/uh-spa.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102809", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "CC BY", "Size": ["2.3 million tokens"], "Annotation": ["No 
annotation"], diff --git a/corpora/academic-corpora/uh-swe.json b/corpora/academic-corpora/uh-swe.json index 0e2b296..79a0625 100644 --- a/corpora/academic-corpora/uh-swe.json +++ b/corpora/academic-corpora/uh-swe.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102810", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY", "Size": ["105 million tokens"], "Annotation": [], diff --git a/corpora/cmc-corpora/comere.json b/corpora/cmc-corpora/comere.json index 789fb81..d427b4e 100644 --- a/corpora/cmc-corpora/comere.json +++ b/corpora/cmc-corpora/comere.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/comere", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains e-mails, forum posts, online chats, tweets and SMS.\nThe corpus is available for download from Ortolang.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": ["80 million tokens"], "Annotation": ["tokenised", "mostly untagged"], diff --git a/corpora/cmc-corpora/contemp-blogs.json b/corpora/cmc-corpora/contemp-blogs.json index e6de082..ccb0128 100644 --- a/corpora/cmc-corpora/contemp-blogs.json +++ b/corpora/cmc-corpora/contemp-blogs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-000E-011B-8", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY", "Size": ["1 million tokens"], "Annotation": ["tokenised", "sentence tagged"], diff --git a/corpora/cmc-corpora/dereko-news-wiki.json b/corpora/cmc-corpora/dereko-news-wiki.json index 84b52ca..2b9df32 100644 --- a/corpora/cmc-corpora/dereko-news-wiki.json +++ 
b/corpora/cmc-corpora/dereko-news-wiki.json @@ -3,7 +3,7 @@ "URL": "https://cosmas2.ids-mannheim.de/cosmas2-web/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains content from newsgroup posts and Wikipedia.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["670 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/didi.json b/corpora/cmc-corpora/didi.json index d49495d..eb9d9e2 100644 --- a/corpora/cmc-corpora/didi.json +++ b/corpora/cmc-corpora/didi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12124/7", "Family": "Computer-mediated communication corpora", "Description": "This corpus consists of Facebook posts gathered from 136 Facebook users from South Tyrol. All texts are anonymised.\nThe corpus is available for download from the EURAC Research CLARIN repository.", - "Languages": ["deu","ita","eng","lad"], + "Language": ["deu","ita","eng","lad"], "Licence": "ACA-BY-NC-NORED 1.0", "Size": ["600,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/do-chat.json b/corpora/cmc-corpora/do-chat.json index 2d3ff6d..e75a760 100644 --- a/corpora/cmc-corpora/do-chat.json +++ b/corpora/cmc-corpora/do-chat.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-203Z-0000-002D-ECC7-2", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains online chats from 2000 to 2006\nThe corpus is available for download from the repository of CLARIN-D", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY", "Size": ["1 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/dwds-blogs.json b/corpora/cmc-corpora/dwds-blogs.json index 07089cd..395abba 100644 --- a/corpora/cmc-corpora/dwds-blogs.json +++ b/corpora/cmc-corpora/dwds-blogs.json @@ -3,7 +3,7 @@ "URL": 
"https://www.dwds.de/r#group-Spezialkorpora", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["102 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/ebay-petit.json b/corpora/cmc-corpora/ebay-petit.json index 2ee61cc..4b55dcb 100644 --- a/corpora/cmc-corpora/ebay-petit.json +++ b/corpora/cmc-corpora/ebay-petit.json @@ -3,7 +3,7 @@ "URL": "https://www.uni-potsdam.de/langage/la-bank/ebay.php", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains eBay listings from 2005, 2017, and 2018. The corpus is manually annotated.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["100,000 tokens"], "Annotation": ["see here"], diff --git a/corpora/cmc-corpora/flemish-teen-talk.json b/corpora/cmc-corpora/flemish-teen-talk.json index 012bc3c..93a7d6f 100644 --- a/corpora/cmc-corpora/flemish-teen-talk.json +++ b/corpora/cmc-corpora/flemish-teen-talk.json @@ -3,7 +3,7 @@ "URL": "https://repository.uantwerpen.be/docman/irua/948a9a/159941.pdf", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains Facebook posts and WhatsApp messages from 2015 and 2016.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": ["2.9 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/global-web-en.json b/corpora/cmc-corpora/global-web-en.json index 6d69c1e..13ff928 100644 --- a/corpora/cmc-corpora/global-web-en.json +++ b/corpora/cmc-corpora/global-web-en.json @@ -3,7 +3,7 @@ "URL": "https://www.kielipankki.fi/corpora/glowbe/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains texts from web-pages in United States, Great Britain, 
Australia, India, and 16 other countries. About 60% of the texts come from blogs.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN RES (download); CLARIN ACA (online)", "Size": ["1.8 billion words", "1.8 million texts"], "Annotation": "", diff --git a/corpora/cmc-corpora/heid.json b/corpora/cmc-corpora/heid.json index 04909a0..d26c525 100644 --- a/corpora/cmc-corpora/heid.json +++ b/corpora/cmc-corpora/heid.json @@ -3,7 +3,7 @@ "URL": "https://www.researchgate.net/publication/311674809_Political_Discourse_in_Polish_Internet-Corpus_of_Highly_Emotive_Internet_Discussions", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains tweets.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["160 milllion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/hs-fi-news.json b/corpora/cmc-corpora/hs-fi-news.json index 0781dc5..949caed 100644 --- a/corpora/cmc-corpora/hs-fi-news.json +++ b/corpora/cmc-corpora/hs-fi-news.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052718", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains the domestic news of the Helsingin Sanomat website and their comments from 5 September 2011 to 4 September 2012.\nThe corpus has been syntactically parsed using TDT alpha.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA – NC", "Size": ["8 million tokens", "593,760 sentences", "93,602 texts"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed"], diff --git a/corpora/cmc-corpora/janes-blog.json b/corpora/cmc-corpora/janes-blog.json index 858a401..9d6a3e5 100644 --- a/corpora/cmc-corpora/janes-blog.json +++ 
b/corpora/cmc-corpora/janes-blog.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1138", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts from RTV Slovenija and Publishwall.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["34 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-forum.json b/corpora/cmc-corpora/janes-forum.json index 168a924..5e6c4fb 100644 --- a/corpora/cmc-corpora/janes-forum.json +++ b/corpora/cmc-corpora/janes-forum.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1139", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from Avtomobilizem.com, MedOver.net and RTV Slovenija.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["47 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-news.json b/corpora/cmc-corpora/janes-news.json index 32c92d7..658ab52 100644 --- a/corpora/cmc-corpora/janes-news.json +++ b/corpora/cmc-corpora/janes-news.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1140", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains news comments from RTV Slovenija, Mladina and Reporter.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["14 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-tweet.json 
b/corpora/cmc-corpora/janes-tweet.json index d0b0cff..a5a351d 100644 --- a/corpora/cmc-corpora/janes-tweet.json +++ b/corpora/cmc-corpora/janes-tweet.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1142", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains tweets written by Slovenian Twitter users from 2013 to 2017.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["139 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-wiki.json b/corpora/cmc-corpora/janes-wiki.json index 4030cc7..988da9e 100644 --- a/corpora/cmc-corpora/janes-wiki.json +++ b/corpora/cmc-corpora/janes-wiki.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1137", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains Slovenian Wikipedia user and talk pages.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["5 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/litis.json b/corpora/cmc-corpora/litis.json index 3365491..3517181 100644 --- a/corpora/cmc-corpora/litis.json +++ b/corpora/cmc-corpora/litis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/11", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from portals delfi.lt and lrytas.lt from 2010 to 2014.\nThe corpus is available for download from the CLARIN-LT repository.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN_ACA", "Size": ["190,000 comments"], "Annotation": "", diff --git a/corpora/cmc-corpora/macocu.json 
b/corpora/cmc-corpora/macocu.json index 7795459..ccab5ee 100644 --- a/corpora/cmc-corpora/macocu.json +++ b/corpora/cmc-corpora/macocu.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1804", "Family": "Computer-mediated communication corpora", "Description": "These corpora are a collection containing web texts and were built by crawling national internet top-level domains (specified below) and by extending the crawl dynamically to other domains as well. The crawler is available at MaCoCu GitHub channel. Considerable effort was devoted into cleaning the extracted text to provide a high-quality web corpus. This was achieved by removing boilerplate and near-duplicated paragraphs, discarding very short texts as well as texts that are not in the target language. Furthermore, samples from the largest 1,500 domains were manually checked and bad domains, such as machine-translated domains, were removed.\nThe dataset is characterized by extensive metadata which allows filtering the dataset based on text quality and other criteria, making the corpus highly useful for corpus linguistics studies, as well as for training language models and other language technologies. In XML format, each document is accompanied by the following metadata: title, crawl date, url, domain, file type of the original document, distribution of languages inside the document, and a fluency score based on a language model. The text of each document is divided into paragraphs that are accompanied by metadata on the information whether a paragraph is a heading or not, metadata on the paragraph quality (labels, such as \"short\" or \"good\", assigned based on paragraph length, URL and stopword density via the jusText tool) and fluency (score between 0 and 1, assigned with the Monocleaner tool), the automatically identified language of the text in the paragraph, and information whether the paragraph contains sensitive information (identified via the Biroamer tool). 
As opposed to the previous version in the case of corpora in version 2.0, this version has more accurate metadata on languages of the texts, which was achieved by using Google's Compact Language Detector 2 (CLD2), a high-performance language detector supporting many languages. Other tools, used for web corpora creation and curation, have been updated as well, resulting in an even cleaner, as well as larger corpus.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be easily read with the prevert parser.", - "Languages": ["sqi","bos","bul","cat","hrv","ell","isl","mkd","mlt","cnr","srp","tur","ukr","slv"], + "Language": ["sqi","bos","bul","cat","hrv","ell","isl","mkd","mlt","cnr","srp","tur","ukr","slv"], "Licence": "CC0 No Rights Reserved", "Size": "", "Annotation": ["annotated with extensive metadata"], diff --git a/corpora/cmc-corpora/mixed-newmedia.json b/corpora/cmc-corpora/mixed-newmedia.json index f7f4300..9146432 100644 --- a/corpora/cmc-corpora/mixed-newmedia.json +++ b/corpora/cmc-corpora/mixed-newmedia.json @@ -3,7 +3,7 @@ "URL": "http://www.cl.ut.ee/korpused/segakorpus/uusmeedia/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains chat room messages, forum posts and news comments from 2000 to 2008\nThe corpus is available for download from a dedicated webpage associated with CLARIN Estonia and through a dedicated concordancer.", - "Languages": ["est"], + "Language": ["est"], "Licence": "", "Size": ["25 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/monitor-at-tweets.json b/corpora/cmc-corpora/monitor-at-tweets.json index 1583cf0..79a5b3b 100644 --- a/corpora/cmc-corpora/monitor-at-tweets.json +++ b/corpora/cmc-corpora/monitor-at-tweets.json @@ -3,7 +3,7 @@ "URL": "https://hal.archives-ouvertes.fr/hal-01323274/document", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains tweets from 2007 to 2017.", - 
"Languages": ["deu","eng"], + "Language": ["deu","eng"], "Licence": "", "Size": ["40 million tweets"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/cmc-corpora/monitor-slo-trendi.json b/corpora/cmc-corpora/monitor-slo-trendi.json index 2dc433b..2723e41 100644 --- a/corpora/cmc-corpora/monitor-slo-trendi.json +++ b/corpora/cmc-corpora/monitor-slo-trendi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1782", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains news from 107 different media websites, published by 72 different publishers, and is a monitor corpus of Slovene. Trendi 2023-02 covers the period from January 2019 to February 2023, complementing the Gigafida 2.0 reference corpus of written Slovene. All the contents of the Trendi corpus are at the moment obtained using the Jožef Stefan Institute Newsfeed service. The texts have been annotated using the CLASSLA-Stanza pipeline, including syntactic parsing according to the Universal Dependencies and Named Entities.\nAn important addition are topics or thematical categories, which have been automatically assigned to each text. There are 13 categories altogether: Arts and culture, Crime and accidents, Economy, Environment, Health, Leisure, Politics and Law, Science and Technology, Society, Sports, Weather, Entertainment, and Education. Text classification models are available at Text classification model SloBERTa-Trendi-Topics 1.0, Text classification model fastText-Trendi-Topics 1.0, and SloBERTa model. 
At the moment, the corpus is not available as a dataset due to copyright restrictions but we hope to make at least some of it available in the near future.\nThe corpus can be queried through noSketchEngine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "", "Size": ["700 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed", "annotated for named entities and topics"], diff --git a/corpora/cmc-corpora/ntap-en.json b/corpora/cmc-corpora/ntap-en.json index 956f59c..864e2b2 100644 --- a/corpora/cmc-corpora/ntap-en.json +++ b/corpora/cmc-corpora/ntap-en.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/DAB8-BE65-64FD-4", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts that are related to climate change issues across science, politics, and the environment. The vast majority of the posts are from 2005 onwards.\nThe corpus is available for searching online through the Corpuscle concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["660,798,199 tokens"], "Annotation": "", diff --git a/corpora/cmc-corpora/ntap-fr.json b/corpora/cmc-corpora/ntap-fr.json index 0d60169..4cb2b62 100644 --- a/corpora/cmc-corpora/ntap-fr.json +++ b/corpora/cmc-corpora/ntap-fr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/DE48-00A5-6536-1", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts that are related to climate change issues across science, politics, and the environment. 
The vast majority of the posts are from 2005 onwards.\nThe corpus is available for searching online through the Corpuscle concordancer.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "", "Size": ["1,506,064,082 words"], "Annotation": "", diff --git a/corpora/cmc-corpora/paisa.json b/corpora/cmc-corpora/paisa.json index 04e206b..1029260 100644 --- a/corpora/cmc-corpora/paisa.json +++ b/corpora/cmc-corpora/paisa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12124/3", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains approximately 380,000 documents coming from about 1,000 different websites, for a total of about 250 million words. Approximately 260,000 documents are from Wikipedia, approx. 5,600 from other Wikimedia Foundation projects. About 9,300 documents come from Indymedia, and we estimate that about 65,000 documents come from blog services.\nThe corpus is available for download from the EURAC Research CLARIN repository.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["380,000 pages", "250 million words"], "Annotation": "", diff --git a/corpora/cmc-corpora/pdrs.json b/corpora/cmc-corpora/pdrs.json index b2e491e..a9c3a61 100644 --- a/corpora/cmc-corpora/pdrs.json +++ b/corpora/cmc-corpora/pdrs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1752", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains texts from the web obtained by crawling the .rs domain. Crawling has been done in September and October 2022 with BootCat. As search terms, appr. 2,800 word forms with a frequency between 5,000 and 500,000 in srWaC have been used. The texts are deduplicated, cyrillic texts have been transliterated into the Latin alphabet. 
The linguistic processing was done with the CLASSLA package for tokenization, lemmatization and morpho-syntactic tagging (both MULTEXT-East and Universal Dependencies).\nIn addition, some 80% of the URLs are manually tagged for 10 different types of sources (\"area\"): media (media outlets with several posts daily), inform (topic-centered sites with infrequent posts - maximum 3 per day), company (presentations of companies), state (websites of government bodies on nationa, regional and local level), forum (forum posts), portal (topic-centered portals without daily coverage), science (scientific publications), shop (with descriptions of products), database (knowledge bases, dictionaries, databases and similar) and community (NGOs, fan clubs, associations and other). The corpus is distributed in the CoNLL-U format in batches of appr. 2x50 mio. tokens.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through noSketchEngine and KonText concordancers.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC-BY", "Size": ["715 million tokens"], "Annotation": ["tokenised", "MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated for text source"], diff --git a/corpora/cmc-corpora/sfnet.json b/corpora/cmc-corpora/sfnet.json index 33413be..a1086dc 100644 --- a/corpora/cmc-corpora/sfnet.json +++ b/corpora/cmc-corpora/sfnet.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20150126", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains written posts from the SFNET forum in Finnish from 2002 to 2003.\nThe PoS-tagging has been done with the FI-FDG Parser, which uses a computational implementation of Functional Dependency Grammar.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank)", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA – NC", "Size": ["100 million words"], "Annotation": ["PoS-tagged", "sentence and word 
segmentation"], diff --git a/corpora/cmc-corpora/sms4science.json b/corpora/cmc-corpora/sms4science.json index 568bbbc..3255557 100644 --- a/corpora/cmc-corpora/sms4science.json +++ b/corpora/cmc-corpora/sms4science.json @@ -3,7 +3,7 @@ "URL": "http://sms4science.ch/Main/WebHome", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains around 25000 SMS from 2009.\nThe corpus comes in two different versions which are available through separate concordancers - SMS Navigator and ANNIS. The version accessible through ANNIS is more richly annotated and includes PoS-tagging, normalization, annotation of nonce borrowings, etc. Access through the concordancers requires free registration.", - "Languages": ["gsw","deu","fra","ita","roh"], + "Language": ["gsw","deu","fra","ita","roh"], "Licence": "", "Size": ["0.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/sonar-newmedia.json b/corpora/cmc-corpora/sonar-newmedia.json index b22fb92..0f26b00 100644 --- a/corpora/cmc-corpora/sonar-newmedia.json +++ b/corpora/cmc-corpora/sonar-newmedia.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/157d6fee6134f5beab09b159dd7c710a", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains tweets, chats and SMS from 2005 to 2012.\nThe corpus is available for searching online through the OpenSONAR environment.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN ACA", "Size": ["35 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/suomi24.json b/corpora/cmc-corpora/suomi24.json index 0875be4..e2511e9 100644 --- a/corpora/cmc-corpora/suomi24.json +++ b/corpora/cmc-corpora/suomi24.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017021506", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from the Suomi24 website from 2001 to 
2016.\nThe corpus is available for download from the FIN-CLARIN repository and through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA", "Size": ["2.6 billion tokens"], "Annotation": ["tokenised", "MSD-tagged"], diff --git a/corpora/cmc-corpora/welsh-tweets.json b/corpora/cmc-corpora/welsh-tweets.json index 6024fd3..a368818 100644 --- a/corpora/cmc-corpora/welsh-tweets.json +++ b/corpora/cmc-corpora/welsh-tweets.json @@ -3,7 +3,7 @@ "URL": "http://techiaith.cymru/corpora/twitter/?lang=en", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains tweets.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["cym"], + "Language": ["cym"], "Licence": "unclear", "Size": ["7 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/whatsup-ch.json b/corpora/cmc-corpora/whatsup-ch.json index 3d54503..7a45939 100644 --- a/corpora/cmc-corpora/whatsup-ch.json +++ b/corpora/cmc-corpora/whatsup-ch.json @@ -3,7 +3,7 @@ "URL": "http://cmc-corpora.ch/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains 216 WhatsApp chats from 2014.\nThe corpus is accessible online through the ANNIS system.", - "Languages": ["gsw","deu","fra","ita","roh"], + "Language": ["gsw","deu","fra","ita","roh"], "Licence": "", "Size": ["5 million tokens"], "Annotation": "", diff --git a/corpora/cmc-corpora/ylilauta.json b/corpora/cmc-corpora/ylilauta.json index 9cc3018..1081342 100644 --- a/corpora/cmc-corpora/ylilauta.json +++ b/corpora/cmc-corpora/ylilauta.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015031802", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains text from discussions of the Ylilauta online discussion board from 2012 to 2014.\nThe corpus has been syntactically annotated with the TDT alpha parser, while the named entities have been assigned using the FiNER tool.\nThe corpus 
is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-NC", "Size": ["26.9 million words"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed", "named entities"], diff --git a/corpora/corpora-of-disordered-speech/adhd-uva.json b/corpora/corpora-of-disordered-speech/adhd-uva.json index 32f29c2..4856f03 100644 --- a/corpora/corpora-of-disordered-speech/adhd-uva.json +++ b/corpora/corpora-of-disordered-speech/adhd-uva.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-2766F32F-4305-4F13-A02C-F4A8F5216425", "Family": "Corpora of Disordered Speech", "Description": "This corpus aims to compare the language and executive functioning profiles of children with ADHD to children with Specific Language Impairment and children with Tourette’s Disorder.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["4 GB (67 recordings) of 26 Dutch children with ADHD, 19 Dutch children with SLI, 22 children Dutch controls"], "Annotation": ["Transcriptions (CHAT-format)"], diff --git a/corpora/corpora-of-disordered-speech/adresso-challenge.json b/corpora/corpora-of-disordered-speech/adresso-challenge.json index 561435c..02077dd 100644 --- a/corpora/corpora-of-disordered-speech/adresso-challenge.json +++ b/corpora/corpora-of-disordered-speech/adresso-challenge.json @@ -3,7 +3,7 @@ "URL": "https://sla.talkbank.org/TBB/dementia", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in dementia.\nAccess to the data in DementiaBank is password protected and restricted to members of the DementiaBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng", "deu", "cmn", "spa", "Taiwanese"], + "Language": ["eng", "deu", "cmn", "spa", "Taiwanese"], "Licence": "email request for access", "Size": [], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json b/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json index d5ba406..21e92fc 100644 --- a/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json +++ b/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json @@ -3,7 +3,7 @@ "URL": "https://catalog.elra.info/en-us/repository/browse/ELRA-S0413/", "Family": "Corpora of Disordered Speech", "Description": "This corpus primarily consists of recordings of 31 laryngectomees (27 males and 4 females) pronouncing 100 phonetically balanced sentences.\nEsophageal voices were recorded in a soundproof recording cubicle with a Neuman microphone.\nThe corpus also includes parallel recordings of the sentences by 9 healthy speakers (6 males and 3 females) to facilitate speech processing tasks that require small parallel corpora, such as voice conversion or synthetic speech adaptation. Apart from the sentences, the database also contains 4 sustained vowels and a small set of isolated words (14) which can be very valuable for research on esophageal speech analysis, diagnosis and evaluation. 
", - "Languages": ["Spanish, Castilian"], + "Language": ["Spanish, Castilian"], "Licence": "Non Commercial Use - ELRA END USER", "Size": ["10.8 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/aphasiabank.json b/corpora/corpora-of-disordered-speech/aphasiabank.json index 39a403f..e5dff53 100644 --- a/corpora/corpora-of-disordered-speech/aphasiabank.json +++ b/corpora/corpora-of-disordered-speech/aphasiabank.json @@ -3,7 +3,7 @@ "URL": "https://aphasia.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in aphasia.\n Access to the data in AphasiaBank is password protected and restricted to members of the AphasiaBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["yue", "hrv", "eng", "fra", "deu", "ell", "hun", "ita", "jpn", "cmn", "ron", "spa"], + "Language": ["yue", "hrv", "eng", "fra", "deu", "ell", "hun", "ita", "jpn", "cmn", "ron", "spa"], "Licence": "email request for access", "Size": ["380 MB transcripts", "827 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/asdbank.json b/corpora/corpora-of-disordered-speech/asdbank.json index 20fd36c..98c683d 100644 --- a/corpora/corpora-of-disordered-speech/asdbank.json +++ b/corpora/corpora-of-disordered-speech/asdbank.json @@ -3,7 +3,7 @@ "URL": "https://asd.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in autism-spectrum disorder.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["nld", "eng", "fra", "ell", "cmn", "spa"], + "Language": ["nld", "eng", "fra", "ell", "cmn", "spa"], "Licence": "open access", "Size": ["42 MB transcripts", "401 MB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json b/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json index cf109c2..e2e62de 100644 --- a/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json +++ b/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-F6BC06C4-B2AD-4ED8-8527-AB81F4EF4E8F", "Family": "Corpora of Disordered Speech", "Description": "The corpus is used for investigating the bilingual language and communication development of young deaf children in Sign Language of the Netherlands (SLN) and Dutch.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["4 GB complete video recordings. 1 GB selected parts video recordings. 0,1 GB selected parts transcripts. 
0,5 GB test and background data of 11 deaf children, longitudinal, 104 recordings"], "Annotation": [" CHAT-like format for 104 recordings"], diff --git a/corpora/corpora-of-disordered-speech/cleft-dataset.json b/corpora/corpora-of-disordered-speech/cleft-dataset.json index a5c6df6..4ff0601 100644 --- a/corpora/corpora-of-disordered-speech/cleft-dataset.json +++ b/corpora/corpora-of-disordered-speech/cleft-dataset.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/cleft/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound and audio recorded with children with cleft lip and palate.", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["11 speakers"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/copas.json b/corpora/corpora-of-disordered-speech/copas.json index e93ec3f..98dda2a 100644 --- a/corpora/corpora-of-disordered-speech/copas.json +++ b/corpora/corpora-of-disordered-speech/copas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-n3", "Family": "Corpora of Disordered Speech", "Description": "This corpus has been constructed within the framework of the project Speech Algorithms for Clinical and Educational applications (SPACE).", - "Languages": ["Dutch (Flemish)"], + "Language": ["Dutch (Flemish)"], "Licence": "Academic, bespoke", "Size": ["319 speakers of which 122 normal controls and 197 with a speech disorder. 
Corpus size: 1.3 GB"], "Annotation": ["Orthographic transcription"], diff --git a/corpora/corpora-of-disordered-speech/deaf-adults-ru.json b/corpora/corpora-of-disordered-speech/deaf-adults-ru.json index 75ab8b0..99aacfd 100644 --- a/corpora/corpora-of-disordered-speech/deaf-adults-ru.json +++ b/corpora/corpora-of-disordered-speech/deaf-adults-ru.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-97AF29EA-877D-422A-BAF7-25FA269351A6", "Family": "Corpora of Disordered Speech", "Description": "This corpus aims at the investigation of the acquisition of Dutch by deaf Dutch adults (late L1/early L2) and comparison to hearing Turkish and Moroccan-Arabic.", - "Languages": ["nld", "tur", "ary"], + "Language": ["nld", "tur", "ary"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["2GB of 46 deaf Dutch adults, 38 hearing Turkish adults, 24 hearing Moroccan adults, 10 Dutch controls"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json b/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json index 77feca7..f6eab6e 100644 --- a/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json +++ b/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/OPEN-989", "Family": "Corpora of Disordered Speech", "Description": "This corpus consists of semi-spontaneous speech data produced by elderly residents of the Basilicata region in Italy.\nIn total, 40 individuals participated: the patient group consists of 20 participants with a diagnosis of dementia (9 cases of Alzheimer’s disease, 2 patients with mixed dementia, 5 patients with not-further-specified dementia, 3 patients with vascular dementia, and 1 patient with frontotemporal dementia).\nthe control group consists of 20 healthy individuals matched for age, gender, and geographical origin. 
Three linguistic tasks were administered to all participants: two narrative tasks (the first one was about an excursion or a trip, and the second was about Christmas festivities), and an image description task. This resulted in 8 hours and 50 minutes of recorded semi-spontaneous speech, which was then transcribed, segmented, and annotated using ELAN. ", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "Processed data available by request", "Size": ["08:50 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/ewa-db.json b/corpora/corpora-of-disordered-speech/ewa-db.json index 72d3248..99b9b73 100644 --- a/corpora/corpora-of-disordered-speech/ewa-db.json +++ b/corpora/corpora-of-disordered-speech/ewa-db.json @@ -3,7 +3,7 @@ "URL": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0489/", "Family": "Corpora of Disordered Speech", "Description": "This corpus contains data from 3 clinical groups: Alzheimer's disease, Parkinson's disease, mild cognitive impairment, and a control group of healthy subjects.\nSpeech samples of each clinical group were obtained using the EWA smartphone application, which contains 4 different language tasks: sustained vowel phonation, diadochokinesis, object and action naming (30 objects and 30 actions), and picture description (two single pictures and three complex pictures).", - "Languages": ["slk"], + "Language": ["slk"], "Licence": "Non-commercial and commercial options", "Size": ["150 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/fluencybank.json b/corpora/corpora-of-disordered-speech/fluencybank.json index 8efcc14..5f500e0 100644 --- a/corpora/corpora-of-disordered-speech/fluencybank.json +++ b/corpora/corpora-of-disordered-speech/fluencybank.json @@ -3,7 +3,7 @@ "URL": "https://fluency.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This corpus is intended for the study of fluency development.\nParticipants include typically-developing 
monolingual and bilingual children, children and adults who stutter (C/AWS) or who clutter (C/AWC), and second language learners.\nAccess to the research data in FluencyBank is password protected and restricted to members of the FluencyBank consortium group, although a subset of the corpus is publicly available.\nData in TalkBank use a consistent XML-compatible representation called CHAT. All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["nld", "eng", "fra", "deu"], + "Language": ["nld", "eng", "fra", "deu"], "Licence": "email request for access", "Size": ["481 MB transcripts", "207 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/itaasd.json b/corpora/corpora-of-disordered-speech/itaasd.json index 056737f..2c20b40 100644 --- a/corpora/corpora-of-disordered-speech/itaasd.json +++ b/corpora/corpora-of-disordered-speech/itaasd.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/OPEN-990", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of semi-spontaneous speech produced by 34 children between 6 and 13 years of age, residents in the Campania region of Italy.#sepHalf of the participating children were diagnosed with high-functioning Autism Spectrum Disorder, and the other half were neurotypical children matched for age, gender, and geographical origin.#sepAll participants were administered three tasks: a complex image description task, a story-telling task, and a story-retelling task. This resulted in 4 hours and 19 minutes of recorded speech, which were then transcribed and annotated using ELAN. 
", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["04.19 hours"], "Annotation": ["Orthographic"], diff --git a/corpora/corpora-of-disordered-speech/oplon.json b/corpora/corpora-of-disordered-speech/oplon.json index ae39864..9266b73 100644 --- a/corpora/corpora-of-disordered-speech/oplon.json +++ b/corpora/corpora-of-disordered-speech/oplon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/ILC-992", "Family": "Corpora of Disordered Speech", "Description": "This corpus consists of semi-spontaneous speech data collected from 96 elderly participants who were divided into two groups: the pathological and the control group.\nThe pathological group refers to three categories: (i) 16 participants with amnestic Mild Cognitive Impairment (MCI), (ii) 16 participants with multiple-domain MCI, and (iii) 16 participants with Early Dementia (probable Alzheimer Dementia, Fronto-Temporal Dementia, Mixed Dementia, and Lewy Body Dementia).\nThe control group includes 48 healthy individuals matched for gender, age, educational level, and geographical origin. The corpus was subjected to PoS Tagging and Dependency Parsing (CoNLL format). ", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["06:50 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/perceptual-voice-q.json b/corpora/corpora-of-disordered-speech/perceptual-voice-q.json index 093bf4c..71365a7 100644 --- a/corpora/corpora-of-disordered-speech/perceptual-voice-q.json +++ b/corpora/corpora-of-disordered-speech/perceptual-voice-q.json @@ -3,7 +3,7 @@ "URL": "https://data.mendeley.com/datasets/9dz247gnyb/4", "Family": "Corpora of Disordered Speech", "Description": "This corpus contains voice samples which have been rated by experienced voice professionals (at least 3 different raters with a minimum of 2 years’ clinical experience) in order to provide educators with standardized materials to better train pre-service clinical voice professionals. 
", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC 4.0", "Size": ["296 audio files of varying sizes"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/phonologyt-project.json b/corpora/corpora-of-disordered-speech/phonologyt-project.json index 638938a..c1b17db 100644 --- a/corpora/corpora-of-disordered-speech/phonologyt-project.json +++ b/corpora/corpora-of-disordered-speech/phonologyt-project.json @@ -3,7 +3,7 @@ "URL": "https://phonodevelopment.sites.olt.ubc.ca/", "Family": "Corpora of Disordered Speech", "Description": "This corpus is used for investigating the phonological development across languages, and to evaluate intervention outcomes given a nonlinear phonological approach and ultrasound intervention outcomes across speech disorders.", - "Languages": ["eng", "fra", "spa", "cmn", "yue", "slv"], + "Language": ["eng", "fra", "spa", "cmn", "yue", "slv"], "Licence": "CC 4.0 Non-commercial", "Size": ["4 speakers for transcription resource"], "Annotation": ["Phonemic and phonetic transcription"], diff --git a/corpora/corpora-of-disordered-speech/plan-v-aphasia.json b/corpora/corpora-of-disordered-speech/plan-v-aphasia.json index be13c97..dcdc58d 100644 --- a/corpora/corpora-of-disordered-speech/plan-v-aphasia.json +++ b/corpora/corpora-of-disordered-speech/plan-v-aphasia.json @@ -3,7 +3,7 @@ "URL": "https://planv-project.gr/", "Family": "Corpora of Disordered Speech", "Description": "This corpus contains spoken discourse data collected from Greek-speaking People with Aphasia (PWA) and from neurotypical adults.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY 4.0", "Size": ["1.84 MB"], "Annotation": ["Sentence", "utterance", "clause", "POS"], diff --git a/corpora/corpora-of-disordered-speech/polish-cued.json b/corpora/corpora-of-disordered-speech/polish-cued.json index 49405ba..fc1770e 100644 --- a/corpora/corpora-of-disordered-speech/polish-cued.json +++ b/corpora/corpora-of-disordered-speech/polish-cued.json 
@@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/dbcd8568-d17d-4861-94bb-aa553e943399", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of recordings of the DIA (Dutch Intelligibility Assessment).\nThe corpus also contains a variety of other samples like reading passages, isolated sentences and recordings of spontaneous speech.\nThe corpus contains samples of 187 speakers with a speech disorder and samples of 122 speakers without a speech disorder. ", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "open access or through email request for access", "Size": ["20 children (11 girls and 9 boys)"], "Annotation": ["CHAT format"], diff --git a/corpora/corpora-of-disordered-speech/psychosisbank.json b/corpora/corpora-of-disordered-speech/psychosisbank.json index de0993b..8e69c45 100644 --- a/corpora/corpora-of-disordered-speech/psychosisbank.json +++ b/corpora/corpora-of-disordered-speech/psychosisbank.json @@ -3,7 +3,7 @@ "URL": "https://psychosis.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus intended for the study of language in psychosis.\nThe site is noted as under construction.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["English (various dialects)", "spa"], + "Language": ["English (various dialects)", "spa"], "Licence": "email request for access", "Size": ["Not available"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/raput.json b/corpora/corpora-of-disordered-speech/raput.json index abd23e1..af3a0cc 100644 --- a/corpora/corpora-of-disordered-speech/raput.json +++ b/corpora/corpora-of-disordered-speech/raput.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1435", "Family": "Corpora of Disordered Speech", "Description": "The corpus consists of texts produced by nonprofessional typical speakers and speakers with different language disorders (developmental language disorder, dyslexia, traumatic brain injury, aphasia, other).\nRoughly half of the corpus consists of texts of typical speakers, and the other half of speakers with language disorders.\nLanguage samples were elicited by six groups of tasks representing different writing styles (descriptive, expository, narrative, and letter) and different levels of formality.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC-BY-SA 4.0", "Size": ["6760 texts", "34469 sentences", "426187 tokens"], "Annotation": ["MULTEXT-East tagset"], diff --git a/corpora/corpora-of-disordered-speech/rhdbank.json b/corpora/corpora-of-disordered-speech/rhdbank.json index 1b93e77..3c6de75 100644 --- a/corpora/corpora-of-disordered-speech/rhdbank.json +++ b/corpora/corpora-of-disordered-speech/rhdbank.json @@ -3,7 +3,7 @@ "URL": "https://rhd.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in people with Right Hemisphere Damage (RHD).\nAccess to the data in RHDBank is password protected and restricted to members of the RHDBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng", "spa"], + "Language": ["eng", "spa"], "Licence": "email request for access", "Size": ["30 MB transcripts", "28 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/seed.json b/corpora/corpora-of-disordered-speech/seed.json index 8c16353..4c293e0 100644 --- a/corpora/corpora-of-disordered-speech/seed.json +++ b/corpora/corpora-of-disordered-speech/seed.json @@ -3,7 +3,7 @@ "URL": "https://osf.io/ygc8n/", "Family": "Corpora of Disordered Speech", "Description": "This corpus includes recordings of single words and continuous speech samples that provide examples of speakers with and without speech disorders.", - "Languages": ["English (American)"], + "Language": ["English (American)"], "Licence": "Access by registration", "Size": [], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json b/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json index d72e3cd..cb695b1 100644 --- a/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json +++ b/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-97AF29EA-877D-422A-BAF7-25FA269351A6", "Family": "Corpora of Disordered Speech", "Description": "The corpus has been collected to investigate the expression of spatial relations by children with SLI and normally developing children in their spoken language production. 
", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["2 GB"], "Annotation": ["Praat transcripts"], diff --git a/corpora/corpora-of-disordered-speech/ssnce-tamil.json b/corpora/corpora-of-disordered-speech/ssnce-tamil.json index 78049c8..103dc99 100644 --- a/corpora/corpora-of-disordered-speech/ssnce-tamil.json +++ b/corpora/corpora-of-disordered-speech/ssnce-tamil.json @@ -3,7 +3,7 @@ "URL": "https://catalog.ldc.upenn.edu/LDC2021S04", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of Tamil Dysarthric Speech.\nThe corpus contains approximately eight hours of Tamil speech data, time-aligned transcripts and metadata collected from 30 speakers (20 dysarthric speakers and 10 non-dysarthric speakers).\nThe non-dysarthric speakers consisted of five female and five male subjects. The dysarthric speakers (7 female, 13 male) reported a diagnosis of cerebral palsy and ranged in age from 12 years old to 37 years old.\nIn total, each speaker recorded 365 utterances consisting of single words and of sentences that included a combination of common and uncommon Tamil phrases.\nThe corpus includes time-aligned phonetic transcripts for all collected speech data. Additional documentation includes phoneme mappings and speaker metadata. Audio data is presented as 16-bit 16kHz FLAC compressed linear pcm wav. 
Transcripts are presented as UTF-8 encoded plain text.", - "Languages": ["tam"], + "Language": ["tam"], "Licence": "LDC", "Size": ["30 speakers"], "Annotation": ["phonetic"], diff --git a/corpora/corpora-of-disordered-speech/star-sentences.json b/corpora/corpora-of-disordered-speech/star-sentences.json index 9b70322..b08d0b3 100644 --- a/corpora/corpora-of-disordered-speech/star-sentences.json +++ b/corpora/corpora-of-disordered-speech/star-sentences.json @@ -3,7 +3,7 @@ "URL": "https://www.seeingspeech.ac.uk/speechstar/disordered-child-speech-sentences-database/", "Family": "Corpora of Disordered Speech", "Description": "This is a collection of multiple audio-articulatory speech-disorder corpora.\nDatabase items are composite videos containing (i) midsagittal tongue movement, imaged with ultrasound tongue imaging (UTI), (ii) optional profile lip movement, recorded with a headset-mounted camera, and (iii) synchronised audio.\nRecordings in this database are of sentences produced by child speakers (aged 6,1-13,4) who were either reading orthographic stimuli from a screen, or repeating sentences produced by a researcher. 
Diagnoses are based on clinicians' reports.", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "CC BY-NC-ND", "Size": ["18 speakers"], "Annotation": ["orthographic", "phonemic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/star-speech-error.json b/corpora/corpora-of-disordered-speech/star-speech-error.json index 85a649b..9b4b3ec 100644 --- a/corpora/corpora-of-disordered-speech/star-speech-error.json +++ b/corpora/corpora-of-disordered-speech/star-speech-error.json @@ -3,7 +3,7 @@ "URL": "https://www.seeingspeech.ac.uk/speechstar/child-speech-error-database/", "Family": "Corpora of Disordered Speech", "Description": "This is a collection of multiple audio-articulatory speech disorder corpora.\nThe corpus is constituted of composite videos containing (i) midsagittal tongue movement, imaged with ultrasound tongue imaging (UTI), (ii) optional profile lip movement, recorded with a headset-mounted camera, and (iii) synchronised audio.\nRecordings in this database are of single words, or short phrases, produced by child speakers who were either reading orthographic stimuli from a screen, naming pictures, or repeating words produced by a researcher. 
Phonemic transcriptions are provided in order that those who are not familiar with the (rhotic) central Scottish accent can be aware of the speech sound targets.", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "CC BY-NC-ND", "Size": ["162 audio files"], "Annotation": ["orthographic", "phonemic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/tbibank.json b/corpora/corpora-of-disordered-speech/tbibank.json index 70a3873..ece2c80 100644 --- a/corpora/corpora-of-disordered-speech/tbibank.json +++ b/corpora/corpora-of-disordered-speech/tbibank.json @@ -3,7 +3,7 @@ "URL": "https://tbi.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in people with traumatic brain injury.\nAccess to the data in TBIBank is password protected and restricted to members of the TBIBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "email request for access", "Size": ["63 MB transcripts", "98 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/torgo.json b/corpora/corpora-of-disordered-speech/torgo.json index addd044..725894a 100644 --- a/corpora/corpora-of-disordered-speech/torgo.json +++ b/corpora/corpora-of-disordered-speech/torgo.json @@ -3,7 +3,7 @@ "URL": "http://www.cs.toronto.edu/~complingweb/data/TORGO/torgo.html", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of dysarthric articulation and consists of aligned acoustics and measured 3D articulatory features from speakers with either cerebral palsy (CP) or amyotrophic lateral sclerosis (ALS), which are two of the most prevalent causes of speech disability, and matched controls.\nThis dataset contains 2000 samples for dysarthric males, dysarthric females, non-dysarthric males, and non-dysarthric females.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY", "Size": ["Originally TORGO database contains 18GB of data"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/uclass.json b/corpora/corpora-of-disordered-speech/uclass.json index d40a012..7f303df 100644 --- a/corpora/corpora-of-disordered-speech/uclass.json +++ b/corpora/corpora-of-disordered-speech/uclass.json @@ -3,7 +3,7 @@ "URL": "https://www.uclass.psychol.ucl.ac.uk/", "Family": "Corpora of Disordered Speech", "Description": "This corpus consists of data from a study by Howell, Davis, Bartrip, and Wormald (2004).\nThe study looked at the fluency-enhancing effects of speaking at the same time as a frequency shifted version of the voice.\nThere were 14 speakers and four recordings per speaker making 56 files in all. 
Recordings are in SFS format.\nThe four recordings for a speaker were for two texts and two readings of each text.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "open access", "Size": ["56 files"], "Annotation": ["None"], diff --git a/corpora/corpora-of-disordered-speech/ultraphonix.json b/corpora/corpora-of-disordered-speech/ultraphonix.json index 90b704f..aefc934 100644 --- a/corpora/corpora-of-disordered-speech/ultraphonix.json +++ b/corpora/corpora-of-disordered-speech/ultraphonix.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/uxssd/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound and audio recordings from children with speech sound disorders. It contains data from 20 speakers (16 male, 4 female), aged 6-13 years. ", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["19 hours"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/ultrax-2020.json b/corpora/corpora-of-disordered-speech/ultrax-2020.json index 681f67a..c829802 100644 --- a/corpora/corpora-of-disordered-speech/ultrax-2020.json +++ b/corpora/corpora-of-disordered-speech/ultrax-2020.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/ux2020/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound tongue imaging and audio data, gathered from children with speech sound disorders by speech and language therapists in hospital environments.\n11 female speakers and 26 male, aged 5-12 years. There is one recording per child.\nThe following metadata are available for each recording: speech waveform, raw ultrasound data, ultrasound parameters, and prompt text with date/time of utterance recording. 
", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["37 speakers"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/ultrax-disorders.json b/corpora/corpora-of-disordered-speech/ultrax-disorders.json index 2c9c286..00f81ed 100644 --- a/corpora/corpora-of-disordered-speech/ultrax-disorders.json +++ b/corpora/corpora-of-disordered-speech/ultrax-disorders.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/uxssd/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound and audio recordings from children with speech sound disorders.\nIt contains data from 8 speakers (2 female and 6 male), aged 5-10 years. ", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["11 hours"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/historical-corpora/15th-nt-trans.json b/corpora/historical-corpora/15th-nt-trans.json index a337130..86e486a 100644 --- a/corpora/historical-corpora/15th-nt-trans.json +++ b/corpora/historical-corpora/15th-nt-trans.json @@ -3,7 +3,7 @@ "URL": "http://stnt.ijp.pan.pl/", "Family": "Historical corpora", "Description": "This corpus contains Biblical texts from 1380 to 1500.\nThis corpus is available through a dedicated concordancer.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": ["400,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/17th-18th-polish.json b/corpora/historical-corpora/17th-18th-polish.json index dd6949d..7a5276a 100644 --- a/corpora/historical-corpora/17th-18th-polish.json +++ b/corpora/historical-corpora/17th-18th-polish.json @@ -3,7 +3,7 @@ "URL": "https://www.korba.edu.pl/query_corpus/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1601 to 1772.\nThe corpus is available through a dedicated concordancer.\nA 
manually annotated subset is available here.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["13.5 million tokens"], "Annotation": ["tokenised", "partially PoS-tagged", "structural annotation"], diff --git a/corpora/historical-corpora/19th-polish.json b/corpora/historical-corpora/19th-polish.json index bf27cd3..440f37c 100644 --- a/corpora/historical-corpora/19th-polish.json +++ b/corpora/historical-corpora/19th-polish.json @@ -3,7 +3,7 @@ "URL": "http://korpus19.nlp.ipipan.waw.pl/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1830 to 1918.\nThe corpus is available for download through a dedicated webpage.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["625,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "transliteration", "transcription"], diff --git a/corpora/historical-corpora/agricola-db.json b/corpora/historical-corpora/agricola-db.json index d87d1b9..71bacd1 100644 --- a/corpora/historical-corpora/agricola-db.json +++ b/corpora/historical-corpora/agricola-db.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730170", "Family": "Historical corpora", "Description": "This corpus contains texts from 1544 to 1551 written by the clergyman Mikael Agricola.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-ND", "Size": ["428,300 tokens"], "Annotation": ["tokenised", "PoS-tagged", "morphological components and syntactic function"], diff --git a/corpora/historical-corpora/aleksis-kivi.json b/corpora/historical-corpora/aleksis-kivi.json index dff8c1c..16c8a4b 100644 --- a/corpora/historical-corpora/aleksis-kivi.json +++ b/corpora/historical-corpora/aleksis-kivi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405274", "Family": "Historical corpora", "Description": "This corpus contains the works by Finnish author Aleksis Kivi from 1855 to 1871.\nThe corpus is available 
through the concordancer Korp.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY-NC", "Size": ["413,700 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/anno-cuneiform.json b/corpora/historical-corpora/anno-cuneiform.json index 94b5109..403e102 100644 --- a/corpora/historical-corpora/anno-cuneiform.json +++ b/corpora/historical-corpora/anno-cuneiform.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2018071121", "Family": "Historical corpora", "Description": "This corpus contains cuneiform texts from Ancient history.\nThe texts come from the Oracc project and include collections such as the Corpus of Ancient Mesopotamian Scholarship, The Digital Corpus of Cuneiform Lexical Texts, and Royal Inscriptions of Babylonia online.\nThe corpus is available through the concordancer Korp and for download from the repository of FIN-CLARIN.", - "Languages": ["akk"], + "Language": ["akk"], "Licence": "CC-BY-SA", "Size": ["1,600,563 tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged", "semantically annotated"], diff --git a/corpora/historical-corpora/anth-mid-eng.json b/corpora/historical-corpora/anth-mid-eng.json index 0a3c5ac..9c7c1b6 100644 --- a/corpora/historical-corpora/anth-mid-eng.json +++ b/corpora/historical-corpora/anth-mid-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1398", "Family": "Historical corpora", "Description": "This corpus contains literary texts from 1100 to 1400.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["enm","heb"], + "Language": ["enm","heb"], "Licence": "Oxford Text Archive licence", "Size": ["4000 words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/archer.json b/corpora/historical-corpora/archer.json index 9ff21a4..c3686de 100644 --- a/corpora/historical-corpora/archer.json +++ b/corpora/historical-corpora/archer.json @@ -3,7 +3,7 @@ "URL": 
"http://www.projects.alc.manchester.ac.uk/archer/", "Family": "Historical corpora", "Description": "The corpus contains texts from 1600 to 1999.\nThe corpus is available through the CQPConcordancer. ", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/austrian-baroque.json b/corpora/historical-corpora/austrian-baroque.json index 3566d2f..15d6a0c 100644 --- a/corpora/historical-corpora/austrian-baroque.json +++ b/corpora/historical-corpora/austrian-baroque.json @@ -3,7 +3,7 @@ "URL": "https://acdh.oeaw.ac.at/abacus/", "Family": "Historical corpora", "Description": "This corpus contains sermons from 1650 to 1750.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["200,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "named entities"], diff --git a/corpora/historical-corpora/b4-hist-preach.json b/corpora/historical-corpora/b4-hist-preach.json index 932950c..c009591 100644 --- a/corpora/historical-corpora/b4-hist-preach.json +++ b/corpora/historical-corpora/b4-hist-preach.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B23-A", "Family": "Historical corpora", "Description": "This corpus contains sermons from an Upper German (Bavarian-Alemannic) dialect area.\nThe corpus is available for download from the repository of the University of Hamburg and through the ANNIS environment.", - "Languages": ["gmh"], + "Language": ["gmh"], "Licence": "CLARIN ACA", "Size": ["92,500 tokens"], "Annotation": ["tokenised", "syntactic and discursive annotation"], diff --git a/corpora/historical-corpora/b4-ludolf.json b/corpora/historical-corpora/b4-ludolf.json index 2830e14..7afcac1 100644 --- a/corpora/historical-corpora/b4-ludolf.json +++ b/corpora/historical-corpora/b4-ludolf.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B22-B", "Family": "Historical corpora", 
"Description": "This corpus contains texts from a journey diary from 1350.\nThe corpus is available for download from the repository of the University of Hamburg and through the ANNIS environment.", - "Languages": ["gmh"], + "Language": ["gmh"], "Licence": "CLARIN ACA", "Size": ["6,690 tokens"], "Annotation": ["tokenised", "tagged for clause type and grammatical function"], diff --git a/corpora/historical-corpora/b4-tatian.json b/corpora/historical-corpora/b4-tatian.json index b801bda..4c1b068 100644 --- a/corpora/historical-corpora/b4-tatian.json +++ b/corpora/historical-corpora/b4-tatian.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B1E-1", "Family": "Historical corpora", "Description": "This corpus contains the OHG Tatian, which is one of the largest prose texts from the Old High German period.\nThe corpus is available for download and through a concordancer from the repository of the University of Hamburg.", - "Languages": ["lat", "goh"], + "Language": ["lat", "goh"], "Licence": "CC-BY", "Size": ["11,300 tokens"], "Annotation": ["tokenised", "MSD-tagged"], diff --git a/corpora/historical-corpora/bib-text-scots.json b/corpora/historical-corpora/bib-text-scots.json index 554b8d0..dd73bb9 100644 --- a/corpora/historical-corpora/bib-text-scots.json +++ b/corpora/historical-corpora/bib-text-scots.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1713", "Family": "Historical corpora", "Description": "This corpus contains Biblical texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["sco"], + "Language": ["sco"], "Licence": "Oxford Text Archive licence", "Size": ["35,506 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/brieven-buit.json b/corpora/historical-corpora/brieven-buit.json index 7d0a980..fb84cd2 100644 --- a/corpora/historical-corpora/brieven-buit.json +++ b/corpora/historical-corpora/brieven-buit.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/10032/f6d68fed217ef7364a32c431396ac465", "Family": "Historical corpora", "Description": "This corpus contains 40,000 letters from the 17th to the 19th century.\nThese letters were sent home by sailors and others from abroad but also vice versa by those staying behind who needed to keep in touch with their loved ones. Many letters did not reach their destinations: they were taken as loot by privateers and confiscated by the High Court of Admiralty during the wars fought between The Netherlands and England\nThe corpus is available through a dedicated concordancer.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB", "Size": ["460,000 words"], "Annotation": ["lemmatised", "PoS-tagged", "grammatically tagged"], diff --git a/corpora/historical-corpora/bundesblatt.json b/corpora/historical-corpora/bundesblatt.json index a473d63..5078ccf 100644 --- a/corpora/historical-corpora/bundesblatt.json +++ b/corpora/historical-corpora/bundesblatt.json @@ -3,7 +3,7 @@ "URL": "https://feuille-federale.unige.ch/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1849 to 2014.\nThe corpus is available through the CQPWeb concordancer.", - "Languages": ["deu","fra","ita"], + "Language": ["deu","fra","ita"], "Licence": "", "Size": ["203,585,806 tokens (German)", "239,125,036 tokens (French)", "85,223,085 tokens (Italian)"], "Annotation": ["tokenised", "syntactically-parsed"], diff --git a/corpora/historical-corpora/carniolan-pa.json b/corpora/historical-corpora/carniolan-pa.json index 8eda22e..d385795 100644 --- a/corpora/historical-corpora/carniolan-pa.json +++ b/corpora/historical-corpora/carniolan-pa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1824", "Family": "Historical corpora", "Description": "The corpus contains meeting proceedings of 694 sessions of the Carniolan Provincial Assembly from 1861 to 1913.\nThe source data (scanned and OCR processed pdf documents) originally come from The Digital 
Library of Slovenia dLib.si and History of Slovenia - SIstory portals. The documents are bilingual, in Slovenian and German, depending on the speaker. German was first typeset in the Gothic script and later on in Latin.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Language was detected on the sentence level, roughly 58% sentences are in Slovenian and 42% in German. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using Trankit for Slovenian and German, while Lingua is used for language detection.\nThe documents are in the Parla-CLARIN compliant TEI XML format. Each session in one file.", - "Languages": ["deu", "slv"], + "Language": ["deu", "slv"], "Licence": "CC-BY 4.0", "Size": ["10.9 million words"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/ced.json b/corpora/historical-corpora/ced.json index 30c27a2..96f9847 100644 --- a/corpora/historical-corpora/ced.json +++ b/corpora/historical-corpora/ced.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2507", "Family": "Historical corpora", "Description": "This corpus contains dialogues from literary and didactic works from 1560 to 1760.\n There are five text-types in the CED. The text-types representative of constructed dialogue are drama comedy, didactic works (language manuals and other handbooks) and fiction; the text-types representative of authentic dialogue are trial proceedings and witness depositions. 
In addition, a small group of miscellaneous dialogic texts is included in the collection.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["1.2 million words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/ceecs.json b/corpora/historical-corpora/ceecs.json index d38413b..3a6673f 100644 --- a/corpora/historical-corpora/ceecs.json +++ b/corpora/historical-corpora/ceecs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2461", "Family": "Historical corpora", "Description": "This corpus contains 1147 letters from 1418 to 1680.\nThe corpus was created from the larger Corpus of Early English Correspondence.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["450,000 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/chroniclItaly.json b/corpora/historical-corpora/chroniclItaly.json index 3aa3b28..57a8d3c 100644 --- a/corpora/historical-corpora/chroniclItaly.json +++ b/corpora/historical-corpora/chroniclItaly.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.24416/uu01-t4ymow", "Family": "Historical corpora", "Description": "This corpus contains Italian language newspapers published in the United States between 1898 and 1920. The corpus includes seven Italian language newspapers published in California, Massachusetts, Pennsylvania, Vermont, and West Virginia. 
The collection includes the following titles: L’Italia, Cronaca sovversiva, La libera parola, The patriot, La ragione, La rassegna, and La sentinella del West Virginia.\nThe corpus is available for download from the repository of the University of Utrecht.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "ODC Attribution License (ODC-By)", "Size": ["16.6 million words"], "Annotation": ["unannotated"], diff --git a/corpora/historical-corpora/chronopress.json b/corpora/historical-corpora/chronopress.json index 7d90141..47423d4 100644 --- a/corpora/historical-corpora/chronopress.json +++ b/corpora/historical-corpora/chronopress.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/260", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1945 to 1954.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY-SA", "Size": ["16 million tokens"], "Annotation": [], diff --git a/corpora/historical-corpora/cipm.json b/corpora/historical-corpora/cipm.json index 0fa6795..9c4f8ea 100644 --- a/corpora/historical-corpora/cipm.json +++ b/corpora/historical-corpora/cipm.json @@ -3,7 +3,7 @@ "URL": "http://cipm.fcsh.unl.pt/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 9th to the 16th century.\nThe corpus is available through a dedicated concordancer (restricted access).", - "Languages": ["por"], + "Language": ["por"], "Licence": "", "Size": ["2 million tokens"], "Annotation": ["tokenised", "PoS-tagged"], diff --git a/corpora/historical-corpora/class-lib-nat-lib-fi.json b/corpora/historical-corpora/class-lib-nat-lib-fi.json index 7f412bd..fc42b30 100644 --- a/corpora/historical-corpora/class-lib-nat-lib-fi.json +++ b/corpora/historical-corpora/class-lib-nat-lib-fi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2018051701", "Family": "Historical corpora", "Description": "This corpus will contain literary texts 
from 1549 to 1944.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/ddr-press.json b/corpora/historical-corpora/ddr-press.json index b1ecd73..e481055 100644 --- a/corpora/historical-corpora/ddr-press.json +++ b/corpora/historical-corpora/ddr-press.json @@ -3,7 +3,7 @@ "URL": "https://clarin.bbaw.de/en/corpus/", "Family": "Historical corpora", "Description": "This corpus contains newspaper texts from 1945 to 1994.\nThe corpus is available through a concordancer provided by CLARIN-D.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/diacoris.json b/corpora/historical-corpora/diacoris.json index 02649f9..d3aa78c 100644 --- a/corpora/historical-corpora/diacoris.json +++ b/corpora/historical-corpora/diacoris.json @@ -3,7 +3,7 @@ "URL": "http://corpora.dslo.unibo.it/coris_ita.html", "Family": "Historical corpora", "Description": "This corpus contains texts from 1861 to 1945.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/diakorp.json b/corpora/historical-corpora/diakorp.json index 7ee47fb..a74b6e7 100644 --- a/corpora/historical-corpora/diakorp.json +++ b/corpora/historical-corpora/diakorp.json @@ -3,7 +3,7 @@ "URL": "http://wiki.korpus.cz/doku.php/en:cnk:diakorp", "Family": "Historical corpora", "Description": "This corpus contains texts from the 14th to the 20th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY-NC-SA", "Size": ["4 million tokens"], "Annotation": ["basic structural markup"], diff --git a/corpora/historical-corpora/dig-hist-slovene.json b/corpora/historical-corpora/dig-hist-slovene.json index dfb3bd2..e81471b 100644 --- 
a/corpora/historical-corpora/dig-hist-slovene.json +++ b/corpora/historical-corpora/dig-hist-slovene.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1031", "Family": "Historical corpora", "Description": "This corpus contains 658 unique texts from 1584 to 1919.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancer KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY-SA 4.0", "Size": ["17.7 million tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged"], diff --git a/corpora/historical-corpora/diorisis-ancient-greek.json b/corpora/historical-corpora/diorisis-ancient-greek.json index ae32eb0..7394b7d 100644 --- a/corpora/historical-corpora/diorisis-ancient-greek.json +++ b/corpora/historical-corpora/diorisis-ancient-greek.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-4769", "Family": "Historical corpora", "Description": "This corpus consists of 820 texts spanning between the beginnings of the Ancient Greek literary tradition (Homer) to the fifth century AD.\nThe texts are sourced from the Perseus Canonical Greek Lit Repository, \"The Little Sailing\" digital library, and the Bibliotheca Augustana digital library.\nThe corpus is available for download from Figshare.", - "Languages": ["grc"], + "Language": ["grc"], "Licence": "CC BY 4.0", "Size": ["10.2 million words"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/doec.json b/corpora/historical-corpora/doec.json index 2f7a906..c3d11ce 100644 --- a/corpora/historical-corpora/doec.json +++ b/corpora/historical-corpora/doec.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2488", "Family": "Historical corpora", "Description": "This corpus contains 3037 texts from 600 to 1150.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang","lat"], + "Language": ["ang","lat"], "Licence": "Oxford Text Archive licence", "Size": [], "Annotation": 
["no linguistic annotation"], diff --git a/corpora/historical-corpora/dta.json b/corpora/historical-corpora/dta.json index b7ddbdc..6576c96 100644 --- a/corpora/historical-corpora/dta.json +++ b/corpora/historical-corpora/dta.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11120/0000-0005-0ABA-F", "Family": "Historical corpora", "Description": "This corpus contains texts from the 17th to the 20th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN PUB", "Size": ["215,168,761 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/early-modern-fi.json b/corpora/historical-corpora/early-modern-fi.json index ad47d1c..00ee32e 100644 --- a/corpora/historical-corpora/early-modern-fi.json +++ b/corpora/historical-corpora/early-modern-fi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730147", "Family": "Historical corpora", "Description": "This corpus contains texts from 1809 to 1899.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin", "rus", "deu", "lat"], + "Language": ["fin", "rus", "deu", "lat"], "Licence": "EUPL v.1.1 SA", "Size": ["8.6 million words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/ecco-tcp.json b/corpora/historical-corpora/ecco-tcp.json index 73c80e6..f123611 100644 --- a/corpora/historical-corpora/ecco-tcp.json +++ b/corpora/historical-corpora/ecco-tcp.json @@ -3,7 +3,7 @@ "URL": "https://textcreationpartnership.org/tcp-texts/ecco-tcp-eighteenth-century-collections-online/", "Family": "Historical corpora", "Description": "This corpus contains texts (literature, philosophy, politics, religion, geography, science and all other areas of human endeavour) from 1700 to 1800.\nThe corpus is available for download from a dedicated webpage and through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": 
"CC-0", "Size": ["74 million tokens"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/edinburgh-dost.json b/corpora/historical-corpora/edinburgh-dost.json index 304a1dc..0a40140 100644 --- a/corpora/historical-corpora/edinburgh-dost.json +++ b/corpora/historical-corpora/edinburgh-dost.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/0701", "Family": "Historical corpora", "Description": "This corpus contains texts from 1450 to 1600.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["877,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/eebo-tcp.json b/corpora/historical-corpora/eebo-tcp.json index 380eff6..dba9d9b 100644 --- a/corpora/historical-corpora/eebo-tcp.json +++ b/corpora/historical-corpora/eebo-tcp.json @@ -3,7 +3,7 @@ "URL": "https://textcreationpartnership.org/tcp-texts/eebo-tcp-early-english-books-online/", "Family": "Historical corpora", "Description": "This corpus contains texts (literature, philosophy, politics, religion, geography, science and all other areas of human endeavour) from 1450 to 1750.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-0", "Size": ["766 million tokens"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/efontes.json b/corpora/historical-corpora/efontes.json index 2ca177b..2c07f5e 100644 --- a/corpora/historical-corpora/efontes.json +++ b/corpora/historical-corpora/efontes.json @@ -3,7 +3,7 @@ "URL": "http://scriptores.pl/efontes/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 11th to the middle of the 16th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": ["5 million tokens"], "Annotation": 
["tokenised", "lemmatised"], diff --git a/corpora/historical-corpora/en-nw-late-modern.json b/corpora/historical-corpora/en-nw-late-modern.json index 565222b..94e8727 100644 --- a/corpora/historical-corpora/en-nw-late-modern.json +++ b/corpora/historical-corpora/en-nw-late-modern.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2468", "Family": "Historical corpora", "Description": "This corpus contains texts from 1761 to 1790.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["300,000 words"], "Annotation": ["COCOA-style"], diff --git a/corpora/historical-corpora/evans-tcp.json b/corpora/historical-corpora/evans-tcp.json index 889dcd1..596a87e 100644 --- a/corpora/historical-corpora/evans-tcp.json +++ b/corpora/historical-corpora/evans-tcp.json @@ -3,7 +3,7 @@ "URL": "https://textcreationpartnership.org/tcp-texts/evans-tcp-evans-early-american-imprints/", "Family": "Historical corpora", "Description": "This corpus contains American texts from 1640 to 1821.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-0", "Size": ["766 million tokens"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/fin-classics.json b/corpora/historical-corpora/fin-classics.json index 7a4ade7..fe4692b 100644 --- a/corpora/historical-corpora/fin-classics.json +++ b/corpora/historical-corpora/fin-classics.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730186", "Family": "Historical corpora", "Description": "This corpus contains literary texts from 1880 to 1949.\nIn terms of genre, the texts correspond to prose fiction, plays, poetry and aphorisms.\nThe corpus is available through the concordancer Korp (FIN-CLARIN).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "EUPL v.1.1 SA", "Size": ["1.5 million words"], "Annotation": [], 
diff --git a/corpora/historical-corpora/fin-folk.json b/corpora/historical-corpora/fin-folk.json index 1769380..467c143 100644 --- a/corpora/historical-corpora/fin-folk.json +++ b/corpora/historical-corpora/fin-folk.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052712", "Family": "Historical corpora", "Description": "This corpus contains poems from 1564 to 1939.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], + "Language": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], "Licence": "CC-BY-NC", "Size": ["7.1 million words"], "Annotation": ["normalised (added diacritics)"], diff --git a/corpora/historical-corpora/fin-gutenberg.json b/corpora/historical-corpora/fin-gutenberg.json index 22a4bb2..8d7cdd2 100644 --- a/corpora/historical-corpora/fin-gutenberg.json +++ b/corpora/historical-corpora/fin-gutenberg.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014100301", "Family": "Historical corpora", "Description": "This corpus contains books published up to 1925 that are made available through the Gutenberg project.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY", "Size": ["34.5 million words"], "Annotation": [], diff --git a/corpora/historical-corpora/fin-news-periodicals.json b/corpora/historical-corpora/fin-news-periodicals.json index a60e249..f61f0d8 100644 --- a/corpora/historical-corpora/fin-news-periodicals.json +++ b/corpora/historical-corpora/fin-news-periodicals.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016050302", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1840 to 2011.\nFor a comprehensive list of newspapers included in the corpus, see here.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC_BY-SA", "Size": ["5.2 billion tokens"], 
"Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/frantext.json b/corpora/historical-corpora/frantext.json index cb48215..eaa105b 100644 --- a/corpora/historical-corpora/frantext.json +++ b/corpora/historical-corpora/frantext.json @@ -3,7 +3,7 @@ "URL": "https://www.frantext.fr/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 10th to the 21st century.\nThe corpus is available through a dedicated concordancer (restricted access).", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "", "Size": ["300 million words"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/germanc.json b/corpora/historical-corpora/germanc.json index 3006ecb..4882cda 100644 --- a/corpora/historical-corpora/germanc.json +++ b/corpora/historical-corpora/germanc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2544", "Family": "Historical corpora", "Description": "This corpus contains personal letters, sermons and fictional, scholarly (i.e., humanities), scientific and legal texts from 1650 to 1800.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["700,000 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/grek-medieval.json b/corpora/historical-corpora/grek-medieval.json index 077dc76..c33f456 100644 --- a/corpora/historical-corpora/grek-medieval.json +++ b/corpora/historical-corpora/grek-medieval.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-251D-7", "Family": "Historical corpora", "Description": "This corpus contains texts from the 4th to the 16th century.\nThe texts belong to the following categories: religious, poetical-literary, political, and historical texts, as well as hymns and epigrams.\nThe corpus is available for download from the clarin:el repository. 
", - "Languages": ["grc"], + "Language": ["grc"], "Licence": "CC-BY", "Size": ["3.4 million words"], "Annotation": [], diff --git a/corpora/historical-corpora/gysseling.json b/corpora/historical-corpora/gysseling.json index 6bf9354..b8acfe8 100644 --- a/corpora/historical-corpora/gysseling.json +++ b/corpora/historical-corpora/gysseling.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-j4", "Family": "Historical corpora", "Description": "This corpus contains texts from the 13th century.\nThe texts were prepared and originally published in the 1970s and 1980s by the Ghent linguist Maurits Gysseling.\nThe corpus is available for download from the Instituut voor de Nederlandse Taal and through a dedicated concordancer.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "INT Licence for researchers", "Size": ["1.5 million words"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/hacossa.json b/corpora/historical-corpora/hacossa.json index a2638b7..eb535b1 100644 --- a/corpora/historical-corpora/hacossa.json +++ b/corpora/historical-corpora/hacossa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9D16-7", "Family": "Historical corpora", "Description": "This corpus contains texts written in the Late Old Swedish period (from 1375 to 1550).\nThe corpus is available for download from the repository of the University of Hamburg.", - "Languages": ["eng", "deu", "lat", "non", "swe"], + "Language": ["eng", "deu", "lat", "non", "swe"], "Licence": "CLARIN RES", "Size": ["128,000 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/hansard.json b/corpora/historical-corpora/hansard.json index 75b3d16..f855320 100644 --- a/corpora/historical-corpora/hansard.json +++ b/corpora/historical-corpora/hansard.json @@ -3,7 +3,7 @@ "URL": "https://www.clarin.ac.uk/hansard-corpus", "Family": "Historical corpora", "Description": "This corpus contains parliamentary 
debates from 1803 to 2005.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["1.6 billion tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "semantic tags"], diff --git a/corpora/historical-corpora/helsinki-eng.json b/corpora/historical-corpora/helsinki-eng.json index 4dbfc1a..50e25a8 100644 --- a/corpora/historical-corpora/helsinki-eng.json +++ b/corpora/historical-corpora/helsinki-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1477", "Family": "Historical corpora", "Description": "This corpus contains religious and fictional texts from 730 to 1710.\nSee the project page for a list of all the texts included in the corpus.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["English (Old)", "English (Middle)"], + "Language": ["English (Old)", "English (Middle)"], "Licence": "Oxford Text Archive licence", "Size": ["240,000 words"], "Annotation": [], diff --git a/corpora/historical-corpora/helsinki-old-scot.json b/corpora/historical-corpora/helsinki-old-scot.json index 765da34..53a85a5 100644 --- a/corpora/historical-corpora/helsinki-old-scot.json +++ b/corpora/historical-corpora/helsinki-old-scot.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2081", "Family": "Historical corpora", "Description": "This corpus contains texts of different domains and genres (e.g., burgh records, diaries, pamphlets, scientific treatises, sermons) from 1450 to 1700.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["sco"], + "Language": ["sco"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["1,940,706 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/helsinki-scot.json b/corpora/historical-corpora/helsinki-scot.json index 471c1a4..afc35ed 100644 --- a/corpora/historical-corpora/helsinki-scot.json +++ b/corpora/historical-corpora/helsinki-scot.json @@ -3,7 +3,7 
@@ "URL": "http://urn.fi/urn:nbn:fi:lb-201411071", "Family": "Historical corpora", "Description": "This corpus contains personal correspondence from 1540 to 1750.\nThe corpus consists of transcripts of original letter manuscripts. The texts are reproduced without any modernisation or normalisation. Language-external variables such as date, region, gender, addressee, hand and script type have been coded.\nThe writers originate from fifteen different regions of Scotland. A fifth of the correspondents in the corpus are women.\nThe corpus is available through the concordancer Korp.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN ACA", "Size": ["500,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/hist-am-eng.json b/corpora/historical-corpora/hist-am-eng.json index ccfb90e..1449b1f 100644 --- a/corpora/historical-corpora/hist-am-eng.json +++ b/corpora/historical-corpora/hist-am-eng.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017061925", "Family": "Historical corpora", "Description": "This corpus contains texts from 1810 to 2009.\nEach decade has roughly the same balance of fiction, popular magazine, newspaper, and non-fiction books.\nThe corpus is available through the concordancer Korp.", - "Languages": ["English (American)"], + "Language": ["English (American)"], "Licence": "CLARIN ACA", "Size": ["385 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/hist-lancaster.json b/corpora/historical-corpora/hist-lancaster.json index 328635c..902fe28 100644 --- a/corpora/historical-corpora/hist-lancaster.json +++ b/corpora/historical-corpora/hist-lancaster.json @@ -3,7 +3,7 @@ "URL": "https://cqpweb.lancs.ac.uk/", "Family": "Historical corpora", "Description": "The corpus contains texts in various domains (e.g., fiction, newspaper texts, religious texts) from 1500 on.\nThe corpus is available through the CQPConcordancer.", - "Languages": ["eng"], + "Language": ["eng"],
"Licence": "", "Size": [], "Annotation": ["tokenised", "PoS-tagged", "partial semantic tagging (USAS system)"], diff --git a/corpora/historical-corpora/hist-welsh.json b/corpora/historical-corpora/hist-welsh.json index 18dd50d..5e4d924 100644 --- a/corpora/historical-corpora/hist-welsh.json +++ b/corpora/historical-corpora/hist-welsh.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-883", "Family": "Historical corpora", "Description": "This corpus contains 30 texts from 1500 to 1850.\nThe corpus is available for download from a dedicated website and through a dedicated concordancer.", - "Languages": ["cym"], + "Language": ["cym"], "Licence": "", "Size": ["420,000 words"], "Annotation": [], diff --git a/corpora/historical-corpora/hun-courts.json b/corpora/historical-corpora/hun-courts.json index a6c2bec..3accae9 100644 --- a/corpora/historical-corpora/hun-courts.json +++ b/corpora/historical-corpora/hun-courts.json @@ -3,7 +3,7 @@ "URL": "http://tmk.nytud.hu/about.php", "Family": "Historical corpora", "Description": "This corpus contains private letters and testimonies from the 16th to the 18th  century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["850,000 words"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "sociolinguistic metadata"], diff --git a/corpora/historical-corpora/hun-hist.json b/corpora/historical-corpora/hun-hist.json index 4812be1..a93e8ce 100644 --- a/corpora/historical-corpora/hun-hist.json +++ b/corpora/historical-corpora/hun-hist.json @@ -3,7 +3,7 @@ "URL": "http://clara.nytud.hu/mtsz/run.cgi/first_form", "Family": "Historical corpora", "Description": "This corpus contains historical texts from the 18th century to the 2000s.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["30 million words"], "Annotation": [], diff --git 
a/corpora/historical-corpora/impact-gt.json b/corpora/historical-corpora/impact-gt.json index 6347dc3..2dc71fb 100644 --- a/corpora/historical-corpora/impact-gt.json +++ b/corpora/historical-corpora/impact-gt.json @@ -3,7 +3,7 @@ "URL": "https://szukajwslownikach.uw.edu.pl/IMPACT_GT_1/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1570 to 1756.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["1.5 million tokens"], "Annotation": ["transcription"], diff --git a/corpora/historical-corpora/lampeter-tracts.json b/corpora/historical-corpora/lampeter-tracts.json index 91933aa..fc9ada1 100644 --- a/corpora/historical-corpora/lampeter-tracts.json +++ b/corpora/historical-corpora/lampeter-tracts.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/3193", "Family": "Historical corpora", "Description": "This corpus contains tracts from 1640 to 1740.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["50,797,916 words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/lancaster-newsbooks.json b/corpora/historical-corpora/lancaster-newsbooks.json index 482e28d..162058d 100644 --- a/corpora/historical-corpora/lancaster-newsbooks.json +++ b/corpora/historical-corpora/lancaster-newsbooks.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2531", "Family": "Historical corpora", "Description": "This corpus contains two collections of English printed pamphlets, books, and newspapers from 1654 to 1655.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["3,001,604 words"], "Annotation": [], diff --git a/corpora/historical-corpora/late-modern-en-prose.json 
b/corpora/historical-corpora/late-modern-en-prose.json index a46dfe1..5d67dd3 100644 --- a/corpora/historical-corpora/late-modern-en-prose.json +++ b/corpora/historical-corpora/late-modern-en-prose.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2077", "Family": "Historical corpora", "Description": "This corpus contains fictional texts from 1837 to 1926.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["580,056 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/late-modern-en-texts.json b/corpora/historical-corpora/late-modern-en-texts.json index 393b8b1..754cb71 100644 --- a/corpora/historical-corpora/late-modern-en-texts.json +++ b/corpora/historical-corpora/late-modern-en-texts.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0002-43F3-0", "Family": "Historical corpora", "Description": "This corpus contains texts written by British and Irish authors from 1710 to 1920.\nIn terms of genre, the texts correspond to narrative fiction and non-fiction, drama, letters, treatises, and miscellaneous written works.\nThe corpus is available for download from a CLARIN-D repository. ", - "Languages": ["English (Late Modern)"], + "Language": ["English (Late Modern)"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["34 million words"], "Annotation": ["PoS-tagged"], diff --git a/corpora/historical-corpora/latinise.json b/corpora/historical-corpora/latinise.json index 1a3eaf0..3a83668 100644 --- a/corpora/historical-corpora/latinise.json +++ b/corpora/historical-corpora/latinise.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-3170", "Family": "Historical corpora", "Description": "This corpus consists of Latin texts from the 2nd century B.C. to the 21st century. 
Non-linguistic metadata include information on genre, title, century and specific date.\nThe corpus is available for download from LINDAT and for search online through Sketch Engine.", - "Languages": ["lat"], + "Language": ["lat"], "Licence": "CC BY-NC-SA 4.0", "Size": ["13.3 million tokens"], "Annotation": ["sentence segmented", "PoS-tagged", "lemmatized"], diff --git a/corpora/historical-corpora/letter-sinebrychoff.json b/corpora/historical-corpora/letter-sinebrychoff.json index ac34e9b..08576c6 100644 --- a/corpora/historical-corpora/letter-sinebrychoff.json +++ b/corpora/historical-corpora/letter-sinebrychoff.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407303", "Family": "Historical corpora", "Description": "This corpus contains letters from 1895 to 1909.\nThe corpus is available through a dedicated online search environment.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": ["8.6 million words"], "Annotation": ["Finnish subset: MSD-tagged, syntactically parsed; Swedish subset: no linguistic annotation"], diff --git a/corpora/historical-corpora/mannheim-hist.json b/corpora/historical-corpora/mannheim-hist.json index 9dcecff..eb8d0ca 100644 --- a/corpora/historical-corpora/mannheim-hist.json +++ b/corpora/historical-corpora/mannheim-hist.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-01B8-AE41-41A4-DC01-5", "Family": "Newspaper corpora", "Description": "This corpus contains articles from 21 German newspapers from the 18th and 19th century.\nThe corpus is available for download from the CLARIN-D repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["3532 pages", "4.1 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/medi-charter.json b/corpora/historical-corpora/medi-charter.json index 2217bf6..7b543ca 100644 --- a/corpora/historical-corpora/medi-charter.json +++ b/corpora/historical-corpora/medi-charter.json @@ -3,7 +3,7 @@
"URL": "http://hdl.handle.net/11234/1-1952", "Family": "Historical corpora", "Description": "This corpus contains Latin charters created in the era of John the Blind, King of Bohemia.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces","lat"], + "Language": ["ces","lat"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["57 chapters"], "Annotation": ["manually-tagged", "named entities"], diff --git a/corpora/historical-corpora/menota.json b/corpora/historical-corpora/menota.json index a3e69fc..6ca0ef3 100644 --- a/corpora/historical-corpora/menota.json +++ b/corpora/historical-corpora/menota.json @@ -3,7 +3,7 @@ "URL": "http://clarino.uib.no/menota/page", "Family": "Historical corpora", "Description": "This corpus contains Medieval Nordic texts.\nThe corpus is available for download and through the concordancer Corpuscle.", - "Languages": ["Old Norse"], + "Language": ["Old Norse"], "Licence": "CC-BY", "Size": ["1.6 million tokens"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/midia.json b/corpora/historical-corpora/midia.json index bb294d0..ab228f8 100644 --- a/corpora/historical-corpora/midia.json +++ b/corpora/historical-corpora/midia.json @@ -3,7 +3,7 @@ "URL": "http://www.corpusmidia.unito.it/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 13th to the 20th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC-BY-NC 4.0", "Size": ["7.5 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/news-fin-17-18.json b/corpora/historical-corpora/news-fin-17-18.json index 7c64504..632eb32 100644 --- a/corpora/historical-corpora/news-fin-17-18.json +++ b/corpora/historical-corpora/news-fin-17-18.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015051201", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1771
to 1874.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/news-fin-18-19.json b/corpora/historical-corpora/news-fin-18-19.json index 1e3b039..362ff6a 100644 --- a/corpora/historical-corpora/news-fin-18-19.json +++ b/corpora/historical-corpora/news-fin-18-19.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201801192", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1875 to 1920.\nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CLARIN ACA", "Size": ["8.7 billion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/news-fin.json b/corpora/historical-corpora/news-fin.json index 089f6c4..f214427 100644 --- a/corpora/historical-corpora/news-fin.json +++ b/corpora/historical-corpora/news-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405276", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1770 to 2011.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": ["8.7 billion words"], "Annotation": [], diff --git a/corpora/historical-corpora/notthingham-de-medicine.json b/corpora/historical-corpora/notthingham-de-medicine.json index fed772d..f0058c9 100644 --- a/corpora/historical-corpora/notthingham-de-medicine.json +++ b/corpora/historical-corpora/notthingham-de-medicine.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2562", "Family": "Historical corpora", "Description": "This corpus contains medical writing from 1500 to 1700.\nThe texts are taken primarily from digital facsimile copies available online via the University of Würzburg’s library interface, particularly from the subcategory pertaining to gynaecology.\nThe corpus is
available for download from the Oxford Text Archive.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["120,000 tokens"], "Annotation": ["TEI Lite markup", "no linguistic annotation"], diff --git a/corpora/historical-corpora/old-bailey.json b/corpora/historical-corpora/old-bailey.json index 9dd9482..b51a42c 100644 --- a/corpora/historical-corpora/old-bailey.json +++ b/corpora/historical-corpora/old-bailey.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CFB-2", "Family": "Historical corpora", "Description": "This corpus contains proceedings of the Old Bailey (i.e., legal documents) from 1674 to 1913.\nThe corpus is available for download from the CLARIN-D repository and through the CQPConcordancer.\nFor the corpus manual, see Huber et al. (2016).", - "Languages": ["English (Late Modern)"], + "Language": ["English (Late Modern)"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["134 million words"], "Annotation": ["detailed sociobiographical, pragmatic and textual annotation"], diff --git a/corpora/historical-corpora/old-hungarian.json b/corpora/historical-corpora/old-hungarian.json index 860409a..be42d32 100644 --- a/corpora/historical-corpora/old-hungarian.json +++ b/corpora/historical-corpora/old-hungarian.json @@ -3,7 +3,7 @@ "URL": "http://oldhungariancorpus.nytud.hu/en-descr.html", "Family": "Historical corpora", "Description": "This corpus contains texts (codices, letters) from the 12th to the 17th century.\nThe corpus is available for download from a dedicated webpage and through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["3 million tokens"], "Annotation": ["tokenised", "partially normalized", "partially MSD-tagged"], diff --git a/corpora/historical-corpora/old-lit-fin.json b/corpora/historical-corpora/old-lit-fin.json index d86e208..b08d513 100644 --- a/corpora/historical-corpora/old-lit-fin.json +++ b/corpora/historical-corpora/old-lit-fin.json 
@@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407165", "Family": "Historical corpora", "Description": "This corpus contains both literary and non-literary texts from 1543 to 1810.\nIn terms of genre, the texts correspond to bible translations and religious texts (for instance, all of the clergyman Mikael Agricola's Finnish works), legal texts, poems, and texts concerning agriculture, nature, health, and so on.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "EUPL v.1.1 SA", "Size": ["4.1 million words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/orossimo.json b/corpora/historical-corpora/orossimo.json index f2993a8..a332429 100644 --- a/corpora/historical-corpora/orossimo.json +++ b/corpora/historical-corpora/orossimo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-240F-8", "Family": "Historical corpora", "Description": "This corpus contains historic academic texts.\nThe corpus is available for download from the clarin:el repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY", "Size": ["553,000 tokens"], "Annotation": ["structural annotation (paragraph)"], diff --git a/corpora/historical-corpora/pamphlets-am.json b/corpora/historical-corpora/pamphlets-am.json index 9ec5ea7..da284a5 100644 --- a/corpora/historical-corpora/pamphlets-am.json +++ b/corpora/historical-corpora/pamphlets-am.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2021", "Family": "Historical corpora", "Description": "This corpus contains pamphlets of the American Revolution from 1750 to 1776.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["431,013 words"], "Annotation": [], diff --git a/corpora/historical-corpora/parsed-hist-pt.json b/corpora/historical-corpora/parsed-hist-pt.json index 
ecfde89..4ebbb26 100644 --- a/corpora/historical-corpora/parsed-hist-pt.json +++ b/corpora/historical-corpora/parsed-hist-pt.json @@ -3,7 +3,7 @@ "URL": "http://www.tycho.iel.unicamp.br/", "Family": "Historical corpora", "Description": "This corpus contains 76 texts written by authors born between 1380 and 1881.\nThe corpus is available for download and through a dedicated concordancer.", - "Languages": ["por"], + "Language": ["por"], "Licence": "", "Size": ["3.3 million"], "Annotation": ["tokenised", "PoS-tagged (2 million)", "treebanked (1.2 million)"], diff --git a/corpora/historical-corpora/partonopeus-de-blois.json b/corpora/historical-corpora/partonopeus-de-blois.json index 288e88b..6fc874b 100644 --- a/corpora/historical-corpora/partonopeus-de-blois.json +++ b/corpora/historical-corpora/partonopeus-de-blois.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2499", "Family": "Historical corpora", "Description": "This corpus contains transcriptions of the manuscripts and fragments of the romance Partonopeus de Blois.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["fro"], + "Language": ["fro"], "Licence": "CC BY-NC-SA 3.0", "Size": ["21,736,766 words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/pceec.json b/corpora/historical-corpora/pceec.json index 3003726..b82abf0 100644 --- a/corpora/historical-corpora/pceec.json +++ b/corpora/historical-corpora/pceec.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2510", "Family": "Historical corpora", "Description": "This corpus contains correspondence from around 1410 to 1681.\nThere are 4970 personal letters by 666 writers. 
The letters have been selected to be as socially representative of the literate social ranks of the time as possible.\nThis corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["2.2 million words"], "Annotation": ["tokenised", "PoS-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/pol-16th.json b/corpora/historical-corpora/pol-16th.json index 7e1e260..e268a42 100644 --- a/corpora/historical-corpora/pol-16th.json +++ b/corpora/historical-corpora/pol-16th.json @@ -3,7 +3,7 @@ "URL": "https://spxvi.edu.pl/korpus/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 16th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": [], "Annotation": ["lemmatised", "transliteration"], diff --git a/corpora/historical-corpora/pol-bf-1500.json b/corpora/historical-corpora/pol-bf-1500.json index 1f09c3e..4ed07c8 100644 --- a/corpora/historical-corpora/pol-bf-1500.json +++ b/corpora/historical-corpora/pol-bf-1500.json @@ -3,7 +3,7 @@ "URL": "https://ijp.pan.pl/publikacje-elektroniczne/korpus-tekstow-staropolskich", "Family": "Historical corpora", "Description": "This corpus contains texts until 1500.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": ["620,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/pol-lang-1960s.json b/corpora/historical-corpora/pol-lang-1960s.json index 75dab1a..3ec99ce 100644 --- a/corpora/historical-corpora/pol-lang-1960s.json +++ b/corpora/historical-corpora/pol-lang-1960s.json @@ -3,7 +3,7 @@ "URL": "ihttp://hdl.handle.net/20.500.14106/2482", "Family": "Historical corpora", "Description": "This corpus contains essays, news articles, and scientific and literary texts 
from 1963 to 1967.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["500,000 words"], "Annotation": ["MSD-tagged"], diff --git a/corpora/historical-corpora/poldilemma.json b/corpora/historical-corpora/poldilemma.json index df6bef6..c60c766 100644 --- a/corpora/historical-corpora/poldilemma.json +++ b/corpora/historical-corpora/poldilemma.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8C44-B", "Family": "Historical corpora", "Description": "This corpus contains political, religious and scientific texts from the 16th to the 18th century.\nThe corpus is available for download from the CLARIN-D repository.", - "Languages": ["ces","lat","deu","pol"], + "Language": ["ces","lat","deu","pol"], "Licence": "CC BY-NC-SA 4.0", "Size": ["7 million tokens"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/historical-corpora/ref-hist-slovene.json b/corpora/historical-corpora/ref-hist-slovene.json index 4b076af..83e86f1 100644 --- a/corpora/historical-corpora/ref-hist-slovene.json +++ b/corpora/historical-corpora/ref-hist-slovene.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1025", "Family": "Historical corpora", "Description": "This corpus contains 89 unique texts from 1584 to 1899.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancer KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY 4.0", "Size": ["300,000 tokens"], "Annotation": ["manually tokenised", "lemmatised", "PoS-tagged", "modern synonyms for archaic words"], diff --git a/corpora/historical-corpora/ref-mhd.json b/corpora/historical-corpora/ref-mhd.json index 1ebb8e0..24ce7fb 100644 --- a/corpora/historical-corpora/ref-mhd.json +++ b/corpora/historical-corpora/ref-mhd.json @@ -3,7 +3,7 @@ "URL": "http://deutschestextarchiv.de/rem/", "Family": "Historical corpora", "Description": "This corpus 
contains texts from 1050 to 1350.\nThe corpus is available for download from the Deutsches Text Archiv and through a concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-SA 4.0", "Size": ["2.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "normalised", "morphosyntactic description"], diff --git a/corpora/historical-corpora/ref-mid-low-de.json b/corpora/historical-corpora/ref-mid-low-de.json index ce458fb..79bedbc 100644 --- a/corpora/historical-corpora/ref-mid-low-de.json +++ b/corpora/historical-corpora/ref-mid-low-de.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0007-C64C-5", "Family": "Historical corpora", "Description": "This corpus contains texts from the 13th century to the middle of the 17th century.\nThe corpus is available for download from the repository of the University of Hamburg through the ANNIS environment.", - "Languages": ["gml"], + "Language": ["gml"], "Licence": "CC-BY", "Size": ["200,700 tokens"], "Annotation": ["tokenised", "MSD-tagged"], diff --git a/corpora/historical-corpora/roysoc-corp.json b/corpora/historical-corpora/roysoc-corp.json index 327de17..a9889c3 100644 --- a/corpora/historical-corpora/roysoc-corp.json +++ b/corpora/historical-corpora/roysoc-corp.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0001-7E8B-6", "Family": "Historical corpora", "Description": "This corpus contains articles from the  Philosophical Transactions of the Royal Society of London journal from 1665 to 1869.\nThe corpus is available for download from the CLARIN-D repository as well as through a concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA-4.0", "Size": ["35 million tokens"], "Annotation": ["PoS-tagged using PennTreebank tagset", "lemmatised", "normalised"], diff --git a/corpora/historical-corpora/sacoco.json b/corpora/historical-corpora/sacoco.json index 0a25e56..f8ca803 100644 --- a/corpora/historical-corpora/sacoco.json +++ 
b/corpora/historical-corpora/sacoco.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-001F-7C43-1", "Family": "Historical corpora", "Description": "This corpus contains historical cookbook recipes from  1569 to 1800, as well as contemporary ones from 2012.\nThe corpus is available through the CQPweb concordancer provided by CLARIN-D.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-NC-SA-3.0", "Size": ["436,000 tokens"], "Annotation": ["PoS-tagged using the STTS tagset", "lemmatised", "normalised"], diff --git a/corpora/historical-corpora/saga.json b/corpora/historical-corpora/saga.json index fd98201..583e977 100644 --- a/corpora/historical-corpora/saga.json +++ b/corpora/historical-corpora/saga.json @@ -3,7 +3,7 @@ "URL": "https://clarin.is/en/resources/sagacorpus/", "Family": "Historical corpora", "Description": "This corpus contains Old Icelandic (Old Norse) Narrative texts from the 13th to the 15th century.\nThe corpus is available for download from CLARIN-IS and for search through the concordancer Korp.", - "Languages": ["Icelandic (Old)"], + "Language": ["Icelandic (Old)"], "Licence": "CC-BY 4.0", "Size": ["1.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "normalized orthography"], diff --git a/corpora/historical-corpora/sheffield-chin.json b/corpora/historical-corpora/sheffield-chin.json index 392f104..7d8fae9 100644 --- a/corpora/historical-corpora/sheffield-chin.json +++ b/corpora/historical-corpora/sheffield-chin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2481", "Family": "Historical corpora", "Description": "This corpus contains three texts (two non-fictional and one fictional) from the Medieval and Modern Chinese periods.\nThe text \"Zhuzi Yulei\" is genre-wise similar to sermons and vernacular dialogues, and is representative of Medieval Chinese. 
The two other texts are the novel \"Shuihu Zhuan\", which is from the Ming Dynasty (1368–1644), and the novel \"Rulin Waishi\", which is from the Quing Dynasty (1644–1911).\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["zho"], + "Language": ["zho"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["148,876 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/sprakbanken-hist.json b/corpora/historical-corpora/sprakbanken-hist.json index 084d588..4f3e5f1 100644 --- a/corpora/historical-corpora/sprakbanken-hist.json +++ b/corpora/historical-corpora/sprakbanken-hist.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/korp/?mode=all_hist#?lang=en&stats_reduce=word&cqp=%5B%5D", "Family": "Historical corpora", "Description": "This collection of corpora contains – among others – diachronic legal texts, Bible translations, medieval letters, digitized newspapers from the Swedish National Library and 19th century fiction from the Swedish Literature Bank.\nThe corpora are available through the concordancer Korp.", - "Languages": ["swe", "deu", "fra", "and others"], + "Language": ["swe", "deu", "fra", "and others"], "Licence": "CC-BY", "Size": ["1.34 billion tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "syntactically parsed", "word sense (for materials more recent than 1800)"], diff --git a/corpora/historical-corpora/sumerian-rev.json b/corpora/historical-corpora/sumerian-rev.json index bae57f6..c435846 100644 --- a/corpora/historical-corpora/sumerian-rev.json +++ b/corpora/historical-corpora/sumerian-rev.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2518", "Family": "Historical corpora", "Description": "This corpus contains transliterations and English translations of 394 Sumerian compositions from approximately 2100 to 1700 BCE.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng", "sux"], + "Language": ["eng", "sux"], "Licence": 
"CC-BY-NC-SA 3.0", "Size": ["5,151,373 words"], "Annotation": ["Each word form in the composite transliterations has been assigned to a lexeme which is specified by a citation form, word class information and basic English translation."], diff --git a/corpora/historical-corpora/swe-news-periodicals.json b/corpora/historical-corpora/swe-news-periodicals.json index 7c76e36..48fe10a 100644 --- a/corpora/historical-corpora/swe-news-periodicals.json +++ b/corpora/historical-corpora/swe-news-periodicals.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016050301", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1770 to 1950.\nThe corpus is available through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY-SA.", "Size": ["3.5 billion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/syn-ref-fra.json b/corpora/historical-corpora/syn-ref-fra.json index 5f227cf..5ca658a 100644 --- a/corpora/historical-corpora/syn-ref-fra.json +++ b/corpora/historical-corpora/syn-ref-fra.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1007-0000-0000-9D2B-0", "Family": "Historical corpora", "Description": "This corpus contains texts from the 9th to the 13th century.\nThe syntactic categories of the SRCMF annotation and the grammatical principles of the annotation are explained in detail in the documentation.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["fro"], + "Language": ["fro"], "Licence": "CLARIN ACA", "Size": ["245,000 tokens"], "Annotation": ["tokenised", "syntactically-parsed"], diff --git a/corpora/historical-corpora/tlio.json b/corpora/historical-corpora/tlio.json index 9689101..6e166ee 100644 --- a/corpora/historical-corpora/tlio.json +++ b/corpora/historical-corpora/tlio.json @@ -3,7 +3,7 @@ "URL": " http://tlio.ovi.cnr.it/TLIO/", "Family": "Historical corpora", "Description": "This corpus contains early Italian 
texts before 1375.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["23 million tokens"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/historical-corpora/vvks.json b/corpora/historical-corpora/vvks.json index ea536e6..2c61dcc 100644 --- a/corpora/historical-corpora/vvks.json +++ b/corpora/historical-corpora/vvks.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017082101", "Family": "Historical corpora", "Description": "This corpus contains literary texts from 1543 to 1791.\nThis corpus complements the Corpus of Old Literary Finnish available through FIN-CLARIN.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-NC-ND", "Size": ["48 texts"], "Annotation": [], diff --git a/corpora/historical-corpora/written-est.json b/corpora/historical-corpora/written-est.json index 508ca70..47cb65e 100644 --- a/corpora/historical-corpora/written-est.json +++ b/corpora/historical-corpora/written-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11297/1-00-0000-0000-0000-0002-6", "Family": "Historical corpora", "Description": "This corpus covers secular and religious texts from the 16th to the 18th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY", "Size": ["2 million tokens"], "Annotation": ["tokenised, 16.-18. century texts have been tagged with contemporary Estonian, morphological and language information. 19. 
century texts are unannotated."], diff --git a/corpora/historical-corpora/ycoe.json b/corpora/historical-corpora/ycoe.json index 61065ea..c925fac 100644 --- a/corpora/historical-corpora/ycoe.json +++ b/corpora/historical-corpora/ycoe.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2462", "Family": "Historical corpora", "Description": "This corpus contains fictional texts from 600 to 1150.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang","lat"], + "Language": ["ang","lat"], "Licence": "Oxford Text Archive licence", "Size": ["1.5 million words"], "Annotation": ["syntactically-parsed"], diff --git a/corpora/historical-corpora/ycoep.json b/corpora/historical-corpora/ycoep.json index 16a0040..6732cc3 100644 --- a/corpora/historical-corpora/ycoep.json +++ b/corpora/historical-corpora/ycoep.json @@ -3,7 +3,7 @@ "URL": "ihttp://hdl.handle.net/20.500.14106/2425", "Family": "Historical corpora", "Description": "This corpus contains poems from 730 to 1710.\nThe corpus contains a selection of poems taken from the Old English subpart of the Helsinki Corpus of English Texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang"], + "Language": ["ang"], "Licence": "Oxford Text Archive licence", "Size": ["71,500 words"], "Annotation": ["syntactically-parsed"], diff --git a/corpora/historical-corpora/yu1parl.json b/corpora/historical-corpora/yu1parl.json index 2ae3137..a297b25 100644 --- a/corpora/historical-corpora/yu1parl.json +++ b/corpora/historical-corpora/yu1parl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1845", "Family": "Historical corpora", "Description": "This historical parliamentary corpus contains meeting proceedings of the National Representation of the Kingdom of Yugoslavia from 191 to 1939. The corpus comprises 714 sessions.\nThe source data (scanned images of printed Stenographic Minutes) come from the History of Slovenia - SIstory portal. 
The images were OCR processed and the results saved as pdf, docx and txt. The documents are multilingual, in Serbo-Croatian and Slovenian, depending on the speaker. Serbo-Croatian is typeset in the Cyrillic (Serbian) or in the Latin (Croatian) alphabet.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Lingua was used for language detection on the sentence level. Roughly 59% of sentences are in Serbian (Cyrillic script), 38% in Croatian (Latin script) and 3% in Slovenian. Some sentences in German and French were also detected. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using CLASSLA for Serbian, Croatian and Slovenian. Words in Serbian (Cyrillic script) have lemmas in Latin script.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["hrv", "srp", "slv"], + "Language": ["hrv", "srp", "slv"], "Licence": "CC BY 4.0", "Size": ["34,542 utterances", "578,958 sentences", "13,271,885 words", "15,403 pages"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/legal-corpora/ann-czech-case-law.json b/corpora/legal-corpora/ann-czech-case-law.json index 78ede92..1e7e28c 100644 --- a/corpora/legal-corpora/ann-czech-case-law.json +++ b/corpora/legal-corpora/ann-czech-case-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-3008", "Family": "Legal corpora", "Description": "This corpus consists of 350 manually annotated decisions at Czech top-tier courts (Supreme Court, Supreme Administrative Court, Constitutional Court). Each decision has been manually annotated by two trained annotators; the corpus is primarily developed as training and testing materials for reference recognition tasks. 
See also the variant of this corpus annotated for segmentation tasks.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY 4.0", "Size": [], "Annotation": ["legal references (identifier of court decision; author of law book or article, etc.)"], diff --git a/corpora/legal-corpora/cabank-eng-scotus.json b/corpora/legal-corpora/cabank-eng-scotus.json index 80ec727..47661a0 100644 --- a/corpora/legal-corpora/cabank-eng-scotus.json +++ b/corpora/legal-corpora/cabank-eng-scotus.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.21415/T5Z315", "Family": "Legal corpora", "Description": "This corpus consists of transcripts and recordings of oral arguments at the Supreme Court of the United States.\nThe transcripts and audio recordings are aligned at the utterance level; the utterances are annotated based on speaker role (the primary one being Justice) and name, as well as gender.\nThe corpus is part of the CABank collection and available for download from and online browsing through TalkBank.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-NC-SA 3.0", "Size": [], "Annotation": ["speaker segmentation", "sociolinguistic annotation"], diff --git a/corpora/legal-corpora/covid-19-eur-lex-cef.json b/corpora/legal-corpora/covid-19-eur-lex-cef.json index 6a4574e..8fb2297 100644 --- a/corpora/legal-corpora/covid-19-eur-lex-cef.json +++ b/corpora/legal-corpora/covid-19-eur-lex-cef.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000D-FE69-0", "Family": "Legal corpora", "Description": "This is a multilingual corpus of the European Union Law pertaining to COVID-19 period.\nThe corpus is available for download from the PORTULAN repository.", - "Languages": ["mlt", "hun", "lit", "lav", "pol", "por", "eng", "slv", "ell", "Spanish (Castilian)", "ron", "slk", "Moldavian", "swe", "bul", "ita", "deu", "hrv", "fra", "Dutch (Flemish)", "ces", "fin", "dan", "Irish", "est"], + "Language": 
["mlt", "hun", "lit", "lav", "pol", "por", "eng", "slv", "ell", "Spanish (Castilian)", "ron", "slk", "Moldavian", "swe", "bul", "ita", "deu", "hrv", "fra", "Dutch (Flemish)", "ces", "fin", "dan", "Irish", "est"], "Licence": "CC BY", "Size": ["475,931 translation pairs"], "Annotation": [], diff --git a/corpora/legal-corpora/covid-19-eur-lex-en-pt.json b/corpora/legal-corpora/covid-19-eur-lex-en-pt.json index d537908..5564d3b 100644 --- a/corpora/legal-corpora/covid-19-eur-lex-en-pt.json +++ b/corpora/legal-corpora/covid-19-eur-lex-en-pt.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000D-FE66-3", "Family": "Legal corpora", "Description": "This is a parallel corpus of the European Union Law pertaining to COVID-19 period.\nThe corpus is available for download from the PORTULAN repository.", - "Languages": ["eng", "por"], + "Language": ["eng", "por"], "Licence": "CC BY", "Size": ["21,000 units"], "Annotation": [], diff --git a/corpora/legal-corpora/czcdc.json b/corpora/legal-corpora/czcdc.json index f707f8a..904fcfe 100644 --- a/corpora/legal-corpora/czcdc.json +++ b/corpora/legal-corpora/czcdc.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-3052", "Family": "Legal corpora", "Description": "This corpus consists of around 237,000 court decisions from three top-tier courts (Supreme, Supreme Administrative, and Constitutional) in Czechia, published between 1993 and 2018.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC 4.0", "Size": ["460 million words"], "Annotation": ["unannotated"], diff --git a/corpora/legal-corpora/czech-legal-tree.json b/corpora/legal-corpora/czech-legal-tree.json index cfd5114..b0ce6a8 100644 --- a/corpora/legal-corpora/czech-legal-tree.json +++ b/corpora/legal-corpora/czech-legal-tree.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2498", "Family": "Legal corpora", "Description": "This corpus consists of two legal documents: 
Accounting Act (563/1991 Coll., as amended) and Decree on Double-entry Accounting for undertakers (500/2002 Coll., as amended).\nThe corpus is available for download from LINDAT and online browsing through the treebank viewer PML-TQ and the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["1128 sentences"], "Annotation": ["manual syntactic annotation; manual annotation of entities from the accouting domain and relations definition, obligation, right"], diff --git a/corpora/legal-corpora/deu-sub-mulcold.json b/corpora/legal-corpora/deu-sub-mulcold.json index ae3ca79..674e004 100644 --- a/corpora/legal-corpora/deu-sub-mulcold.json +++ b/corpora/legal-corpora/deu-sub-mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042606", "Family": "Legal corpora", "Description": "This corpus, which is a subcorpus of MULCOLD (see also the Parallel corpora resource family) contains international conventions and treaties.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN Distribution).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC BY-ND", "Size": ["198,035 tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/eng-sub-mulcold.json b/corpora/legal-corpora/eng-sub-mulcold.json index 82fc515..14e0a30 100644 --- a/corpora/legal-corpora/eng-sub-mulcold.json +++ b/corpora/legal-corpora/eng-sub-mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042605", "Family": "Legal corpora", "Description": "This corpus, which is a subcorpus of MULCOLD (see also the Parallel corpora resource family) contains international conventions and treaties.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN Distribution).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-ND", "Size": ["359,874 tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/english-acquis.json 
b/corpora/legal-corpora/english-acquis.json index 34db725..76d1132 100644 --- a/corpora/legal-corpora/english-acquis.json +++ b/corpora/legal-corpora/english-acquis.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D50A-A", "Family": "Legal corpora", "Description": "This corpus contains selected texts from the Acquis Communautaire between the 1950s and today, translated to English.\nThe corpus is available for download from PORTULAN.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "MIT (academic)", "Size": ["34.6 million tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/est-law.json b/corpora/legal-corpora/est-law.json index 2285726..34d2a86 100644 --- a/corpora/legal-corpora/est-law.json +++ b/corpora/legal-corpora/est-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11297/1-00-0000-0000-0000-0002-2", "Family": "Legal corpora", "Description": "This corpus contains Estonian laws (1.8 million tokens) as well as European legislation (9.6 million tokens) translated into Estonian.\nThe corpus is available for download from a dedicated webpage hosted by CLARIN Estonia.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN PUB", "Size": ["11 million tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/fin-sub-firulex.json b/corpora/legal-corpora/fin-sub-firulex.json index d0dfdf5..b5c2890 100644 --- a/corpora/legal-corpora/fin-sub-firulex.json +++ b/corpora/legal-corpora/fin-sub-firulex.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042604", "Family": "Legal corpora", "Description": "This is the Finnish subcorpus of FiRuLex, which contains juridical texts in Russian and Finnish.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN distribution)", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY-ND", "Size": ["1.5 million tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/fin-sub-jrc.json 
b/corpora/legal-corpora/fin-sub-jrc.json index cde2273..0faafd3 100644 --- a/corpora/legal-corpora/fin-sub-jrc.json +++ b/corpora/legal-corpora/fin-sub-jrc.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042710", "Family": "Legal corpora", "Description": "This is the legal subcorpus of the Helsinki Korp Version of the Finnish TreeBank 3.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN distribution) and for download from the Finnish Language Bank.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY", "Size": ["44.1 million tokens"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/igc-laws.json b/corpora/legal-corpora/igc-laws.json index efe6c58..f7397f3 100644 --- a/corpora/legal-corpora/igc-laws.json +++ b/corpora/legal-corpora/igc-laws.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/116", "Family": "Legal corpora", "Description": "IGC-Laws is a subcorpus of the The Icelandic Gigaword Corpus (see also CLARIN reference corpora). IGC-Laws contains 1) the Icelandic laws, 2) explanatory reports and observations extracted from bills submitted to Althingi, and 3) parliamentary proposals and resolutions. The corpus comes in two formats. 
One contains the texts untokenized and untagged while the other has been tokenized, PoS-tagged and lemmatized.\nThe corpus is available for download from the CLARIN-IS repository.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["2,2 million sentences", "40,6 million words"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/jrc-acquis.json b/corpora/legal-corpora/jrc-acquis.json index 23a6509..493e5ec 100644 --- a/corpora/legal-corpora/jrc-acquis.json +++ b/corpora/legal-corpora/jrc-acquis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/ATHENA-0000-0000-25C9-4", "Family": "Legal corpora", "Description": "This is a parallel corpus of Acquis Communautaire, which is the total body of European Union law applicable in European member states.\nMost texts have been manually classified according to the EUROVOC subject domains so that the collection can also be used to train and test multi-label classification algorithms and keyword-assignment software. The corpus is encoded in XML, according to the Text Encoding Initiative Guidelines. Due to the large number of parallel texts in many languages, the JRC-Acquis is particularly suitable to carry out all types of cross-language research, as well as to test and benchmark text analysis software across different languages (for instance for alignment, sentence splitting and term extraction). 
The sentence-level alignment was done using the hunalign tool.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["bul", "ces", "dan", "deu", "eng", "spa", "est", "fin", "fra", "hun", "ita", "lit", "lav", "mlt", "nld", "pol", "por", "ron", "slk", "slv", "swe"], + "Language": ["bul", "ces", "dan", "deu", "eng", "spa", "est", "fin", "fra", "hun", "ita", "lit", "lav", "mlt", "nld", "pol", "por", "ron", "slk", "slv", "swe"], "Licence": "CC BY 4.0", "Size": ["1 billion words"], "Annotation": ["paragraph and sentence alignment"], diff --git a/corpora/legal-corpora/jrc-eu-dgt.json b/corpora/legal-corpora/jrc-eu-dgt.json index 6c203a6..4f7fa3c 100644 --- a/corpora/legal-corpora/jrc-eu-dgt.json +++ b/corpora/legal-corpora/jrc-eu-dgt.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1197", "Family": "Legal corpora", "Description": "", - "Languages": ["bul", "hrv", "ces", "dan", "nld", "eng", "est", "fin", "fra", "deu", "hun", "gle", "ita", "lav", "lit", "ell", "pol", "por", "ron", "slk", "slv", "spa", "swe"], + "Language": ["bul", "hrv", "ces", "dan", "nld", "eng", "est", "fin", "fra", "deu", "hun", "gle", "ita", "lav", "lit", "ell", "pol", "por", "ron", "slk", "slv", "spa", "swe"], "Licence": "CC BY 4.0", "Size": ["2.1 billion tokens"], "Annotation": ["syntactically parsed (Universal Dependencies)"], diff --git a/corpora/legal-corpora/judicial-rhetoric.json b/corpora/legal-corpora/judicial-rhetoric.json index 0624b51..29ad413 100644 --- a/corpora/legal-corpora/judicial-rhetoric.json +++ b/corpora/legal-corpora/judicial-rhetoric.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/CLARIN-EL-0000-0000-6114-C", "Family": "Legal corpora", "Description": "This corpus consists of transcriptions of defendants’ and witnesses’ speeches in criminal cases of rape, attempted rape, murder, and attempted murder.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": 
"CC BY-NC-ND 4.0", "Size": [], "Annotation": [], diff --git a/corpora/legal-corpora/juridisch-nl.json b/corpora/legal-corpora/juridisch-nl.json index 18ef41c..86a37e2 100644 --- a/corpora/legal-corpora/juridisch-nl.json +++ b/corpora/legal-corpora/juridisch-nl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-u2", "Family": "Legal corpora", "Description": "This corpus contains legal texts from 1814 to 1989, compiled year by year.\nThe corpus is available for online browsing on a dedicated webpage", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB", "Size": ["5,856 texts"], "Annotation": ["lemmatised", "PoS-tagged"], diff --git a/corpora/legal-corpora/legal-est-min-just.json b/corpora/legal-corpora/legal-est-min-just.json index 224353d..c34d915 100644 --- a/corpora/legal-corpora/legal-est-min-just.json +++ b/corpora/legal-corpora/legal-est-min-just.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000D-FAD1-D ", "Family": "Legal corpora", "Description": "This corpus contains Estonian-English translations of the Acts of Estonian law.\nThe corpus is available for download from PORTULAN.", - "Languages": ["Estonian-English"], + "Language": ["Estonian-English"], "Licence": "CC BY", "Size": ["47,000 units"], "Annotation": [], diff --git a/corpora/legal-corpora/legal-nynorsk-munic.json b/corpora/legal-corpora/legal-nynorsk-munic.json index 3163c9d..2f1f523 100644 --- a/corpora/legal-corpora/legal-nynorsk-munic.json +++ b/corpora/legal-corpora/legal-nynorsk-munic.json @@ -3,7 +3,7 @@ "URL": "https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-60/", "Family": "Legal corpora", "Description": "This corpus contains 50,000 legal documents and meeting minutes collected with the web crawler Veidemann. 
Around 88.5 million words are in Nynorsk, while the rest are in Bokmal (Bokmål).\nThe corpus is available for download from the Norwegian Language Bank.", - "Languages": ["Norwegian (Nynorsk and Bokmål)"], + "Language": ["Norwegian (Nynorsk and Bokmål)"], "Licence": "CC0 1.0 Universal", "Size": ["127 million words"], "Annotation": [], diff --git a/corpora/legal-corpora/lifr-law.json b/corpora/legal-corpora/lifr-law.json index bfededf..2b55314 100644 --- a/corpora/legal-corpora/lifr-law.json +++ b/corpora/legal-corpora/lifr-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-5020", "Family": "Legal corpora", "Description": "This is a corpus of Czech legal and administrative texts with measured reading comprehension and a subjective expert annotation of diverse textual properties based on the Hamburg Comprehensibility Concept.\nThe corpus is comprised of 18 documents in total; that is, six different texts from the legal/administration domain, each in three versions: the original and two paraphrases. Each such document triple shares one reading-comprehension test administered to at least thirty readers of random gender, educational background, and age.
The data set also captures basic demographic information about each reader, their familiarity with the topic, and their subjective assessment of the stylistic properties of the given document, roughly corresponding to the key text properties identified by the Hamburg Comprehensibility Concept.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY 4.0", "Size": ["17601 tokens"], "Annotation": ["textual annotation"], diff --git a/corpora/legal-corpora/likumi.json b/corpora/legal-corpora/likumi.json index 2fe0940..d9f4ac0 100644 --- a/corpora/legal-corpora/likumi.json +++ b/corpora/legal-corpora/likumi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12574/65", "Family": "Legal corpora", "Description": "The corpus contains all legal acts of the Republic of Latvia published on the website likumi.lv (until February 2022).\nThe corpus is available for download from the CLARIN.LV repository.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "CC BY 4.0", "Size": ["116 million tokens", "73 million words"], "Annotation": [], diff --git a/corpora/legal-corpora/lit-eu-law.json b/corpora/legal-corpora/lit-eu-law.json index c853c17..16fbfad 100644 --- a/corpora/legal-corpora/lit-eu-law.json +++ b/corpora/legal-corpora/lit-eu-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/18", "Family": "Legal corpora", "Description": "This corpus contains primary and secondary European law acts (32 texts) translated into Lithuanian.\nThe corpus is available for download from CLARIN-LT.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN PUB", "Size": ["274,460 words"], "Annotation": [], diff --git a/corpora/legal-corpora/maltese-acquis.json b/corpora/legal-corpora/maltese-acquis.json index 40b00fc..ea6df06 100644 --- a/corpora/legal-corpora/maltese-acquis.json +++ b/corpora/legal-corpora/maltese-acquis.json @@ -3,7 +3,7 @@ "URL": 
"https://hdl.handle.net/21.11129/0000-000B-D4FD-9", "Family": "Legal corpora", "Description": "", - "Languages": ["mlt"], + "Language": ["mlt"], "Licence": "MIT (academic)", "Size": ["20.9 million tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/meta-nord-dan.json b/corpora/legal-corpora/meta-nord-dan.json index d2863b6..a87373e 100644 --- a/corpora/legal-corpora/meta-nord-dan.json +++ b/corpora/legal-corpora/meta-nord-dan.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9BE-2D15-4C1C-1", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC BY 4.0", "Size": ["102 sentences", "1799 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-est.json b/corpora/legal-corpora/meta-nord-est.json index 6bc53cb..b2fed71 100644 --- a/corpora/legal-corpora/meta-nord-est.json +++ b/corpora/legal-corpora/meta-nord-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9D1-EE49-223F-3", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY 4.0", "Size": ["78 sentences", "1443 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-fin.json b/corpora/legal-corpora/meta-nord-fin.json index 80e5fe8..35c9f57 100644 --- a/corpora/legal-corpora/meta-nord-fin.json +++ b/corpora/legal-corpora/meta-nord-fin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9D1-FD1D-3174-1", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank. 
The corpus is syntactically parsed using the FinnTreeBank 2 schema and is available for download and online browsing through INESS (CLARINO).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY 4.0", "Size": ["122 sentences", "1464 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-isl.json b/corpora/legal-corpora/meta-nord-isl.json index 5d6084e..86420b8 100644 --- a/corpora/legal-corpora/meta-nord-isl.json +++ b/corpora/legal-corpora/meta-nord-isl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9D2-09F8-20E3-8", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["73 sentences", "1880 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-nor.json b/corpora/legal-corpora/meta-nord-nor.json index bb9f0a1..0247757 100644 --- a/corpora/legal-corpora/meta-nord-nor.json +++ b/corpora/legal-corpora/meta-nord-nor.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D937-A55E-278E-1", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CC BY 4.0", "Size": ["101 sentences", "1862 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-swe.json b/corpora/legal-corpora/meta-nord-swe.json index db3cbfe..75555c3 100644 --- a/corpora/legal-corpora/meta-nord-swe.json +++ b/corpora/legal-corpora/meta-nord-swe.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11495/D9D3-24E5-429B-9", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY 4.0", "Size": ["102 sentences", "1982 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/multieurlex.json b/corpora/legal-corpora/multieurlex.json index 57064bd..9991e5a 100644 --- a/corpora/legal-corpora/multieurlex.json +++ b/corpora/legal-corpora/multieurlex.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/CLARIN-EL-0000-0000-61A7-6", "Family": "Legal corpora", "Description": "This corpus consists of 65,000 European laws in 23 official European languages. Each law has been annotated with the EuroVoc concept labels.\nThe corpus is available for download from the repository of CLARIN:EL.", - "Languages": ["fin", "slk", "lit", "hrv", "slv", "est", "lav", "mlt", "eng", "deu", "fra", "ita", "spa", "Castilian", "pol", "ron", "Moldavian", "Moldovan", "nld", "Flemish", "ell", "hun", "por", "ces", "swe", "bul", "dan"], + "Language": ["fin", "slk", "lit", "hrv", "slv", "est", "lav", "mlt", "eng", "deu", "fra", "ita", "spa", "Castilian", "pol", "ron", "Moldavian", "Moldovan", "nld", "Flemish", "ell", "hun", "por", "ces", "swe", "bul", "dan"], "Licence": "CC BY", "Size": [], "Annotation": ["conceptual annotation"], diff --git a/corpora/legal-corpora/nor-acquis.json b/corpora/legal-corpora/nor-acquis.json index 453abb8..0012ad1 100644 --- a/corpora/legal-corpora/nor-acquis.json +++ b/corpora/legal-corpora/nor-acquis.json @@ -3,7 +3,7 @@ "URL": "https://www.nb.no/sprakbanken/ressurskatalog/oai-nb-no-sbr-2/", "Family": "Legal corpora", "Description": "This corpus contains Norwegian translations of 5414 documents in Acquis Communautaire.\nThe corpus is available for download from the 
Norwegian Language Bank.", - "Languages": ["Norwegian (Bokmål and Nynorsk)"], + "Language": ["Norwegian (Bokmål and Nynorsk)"], "Licence": "CC BY-NC 4.0", "Size": ["14 million words"], "Annotation": [], diff --git a/corpora/legal-corpora/old-bailey.json b/corpora/legal-corpora/old-bailey.json index e1ba8e3..208c55e 100644 --- a/corpora/legal-corpora/old-bailey.json +++ b/corpora/legal-corpora/old-bailey.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CFB-2", "Family": "Legal corpora", "Description": "This historical corpus consists of Proceedings of the Old Bailey; the Old Bailey was London’s central criminal court between 1674 and 1913. The corpus consists of texts from 1720 to 1913, and is annotated for detailed utterance-level sociolinguistic annotation at the following three levels: sociobiographical speaker information (gender, age, occupation, social class), pragmatic information (speaker role in the courtroom such as judge, witness, etc.), and metatextual information (the scribe, printer, and publisher of the individual Proceeding).\nThe corpus is available for download from CLARIN-D (Saarland University) and for online browsing through CQPWeb.", - "Languages": ["English (Late Modern)"], + "Language": ["English (Late Modern)"], "Licence": "CC BY-NC-SA 4.0", "Size": ["24.4 million words"], "Annotation": ["sociolinguistic annotation"], diff --git a/corpora/legal-corpora/rus-sub-firulex.json b/corpora/legal-corpora/rus-sub-firulex.json index c9f70b3..0ea3590 100644 --- a/corpora/legal-corpora/rus-sub-firulex.json +++ b/corpora/legal-corpora/rus-sub-firulex.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042603", "Family": "Legal corpora", "Description": "This is the Russian subcorpus of FiRuLex, which contains juridical texts in Russian and Finnish.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN distribution)", - "Languages": ["rus"], + "Language": ["rus"], "Licence": "CC BY-ND",
"Size": ["1.2 million tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/rus-sub-mulcold.json b/corpora/legal-corpora/rus-sub-mulcold.json index 5fcbfc7..8ffee1f 100644 --- a/corpora/legal-corpora/rus-sub-mulcold.json +++ b/corpora/legal-corpora/rus-sub-mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042607", "Family": "Legal corpora", "Description": "This corpus, which is a subcorpus of MULCOLD (see also the Parallel corpora resource family) contains international conventions and treaties.\nThe corpus can be accessed online through the concordancer Korp (FIN-CLARIN Distribution).", - "Languages": ["rus"], + "Language": ["rus"], "Licence": "CC BY-ND", "Size": ["198,035 tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/literary-corpora/1000-novels.json b/corpora/literary-corpora/1000-novels.json index 6eef07b..fb60b97 100644 --- a/corpora/literary-corpora/1000-novels.json +++ b/corpora/literary-corpora/1000-novels.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/312", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY 4.0", "Size": ["1000 texts"], "Annotation": [], diff --git a/corpora/literary-corpora/1000plus-novels.json b/corpora/literary-corpora/1000plus-novels.json index ebbe8ce..c3b761d 100644 --- a/corpora/literary-corpora/1000plus-novels.json +++ b/corpora/literary-corpora/1000plus-novels.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/699", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY-SA 3.0", "Size": ["1000 texts", "17,352,826 words"], "Annotation": [], diff --git a/corpora/literary-corpora/15c-castilian.json b/corpora/literary-corpora/15c-castilian.json index b938e52..2ca1993 100644 --- 
a/corpora/literary-corpora/15c-castilian.json +++ b/corpora/literary-corpora/15c-castilian.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-873", "Family": "Literary corpora", "Description": "This is a lyric corpus of 15th century cancioneros.\nThe corpus is available for online browsing through an external interface.", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/1920-polish.json b/corpora/literary-corpora/1920-polish.json index a06e32b..18ae818 100644 --- a/corpora/literary-corpora/1920-polish.json +++ b/corpora/literary-corpora/1920-polish.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/57", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY 3.0", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/aformes.json b/corpora/literary-corpora/aformes.json index f3fd03b..3281572 100644 --- a/corpora/literary-corpora/aformes.json +++ b/corpora/literary-corpora/aformes.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-2575-3", "Family": "Literary corpora", "Description": "This corpus contains fiction texts from a journal of undergraduate creative writing at the Faculty of English Language and Literature.\nThe corpus is available for download from clarin:el.", - "Languages": ["ell","eng"], + "Language": ["ell","eng"], "Licence": "CC-BY-NC", "Size": ["376,250 words"], "Annotation": [], diff --git a/corpora/literary-corpora/anglosaxon.json b/corpora/literary-corpora/anglosaxon.json index 1cc5a54..3287ad6 100644 --- a/corpora/literary-corpora/anglosaxon.json +++ b/corpora/literary-corpora/anglosaxon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-867", "Family": "Literary corpora", "Description": "This corpus is available for online browsing through an external interface.", - "Languages": ["ang"], + 
"Language": ["ang"], "Licence": "", "Size": [], "Annotation": ["none"], diff --git a/corpora/literary-corpora/anth-me.json b/corpora/literary-corpora/anth-me.json index 4ce8fce..e5f48e6 100644 --- a/corpora/literary-corpora/anth-me.json +++ b/corpora/literary-corpora/anth-me.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1398", "Family": "Literary corpora", "Description": "This corpus contains literary texts from 1100 to 1400.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["enm", "heb"], + "Language": ["enm", "heb"], "Licence": "Oxford Text Archive Licence", "Size": ["4,000 words"], "Annotation": [], diff --git a/corpora/literary-corpora/bonnier-one.json b/corpora/literary-corpora/bonnier-one.json index dd82862..9545a3a 100644 --- a/corpora/literary-corpora/bonnier-one.json +++ b/corpora/literary-corpora/bonnier-one.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/115", "Family": "Literary corpora", "Description": "This corpus presents 69 Bonnier novels from 1976-77.\nThe corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY 4.0", "Size": ["6,578,675 tokens", "462,625 sentences"], "Annotation": ["sentence scrambling"], diff --git a/corpora/literary-corpora/bonnier-two.json b/corpora/literary-corpora/bonnier-two.json index 543cc3f..37b7fc5 100644 --- a/corpora/literary-corpora/bonnier-two.json +++ b/corpora/literary-corpora/bonnier-two.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/116", "Family": "Literary corpora", "Description": "This corpus presents 60 Bonnier novels from 1980-81.\nThe corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY 4.0", "Size": ["4,304,271 tokens", "298,361 sentences"], "Annotation": ["sentence scrambling"], diff --git a/corpora/literary-corpora/ceal.json 
b/corpora/literary-corpora/ceal.json index 017283e..2f4c80b 100644 --- a/corpora/literary-corpora/ceal.json +++ b/corpora/literary-corpora/ceal.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016110901", "Family": "Literary corpora", "Description": "This corpus contains Finnish translations of the following three texts: Jane Austen: Ylpeys ja ennakkoluulo (Pride and Prejudice), translated by Kersti Juva, Teos 2013; Henry James: Washingtonin aukio (Washington Square), translated by Kersti Juva, Otava 2003; Charles Dickens: Kolea talo (Bleak House), translated by Kersti Juva, Tammi, 2006.\nThe corpus is available for online browsing through Korp in two versions - Version 1 (Sentences and Paragraphs in the Original Order) and Version 2 (Scrambled Paragraphs))", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN RES + NC", "Size": ["3 novels", "484,010 tokens"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/classic-fin-lit.json b/corpora/literary-corpora/classic-fin-lit.json index f6d054c..c4f5bf7 100644 --- a/corpora/literary-corpora/classic-fin-lit.json +++ b/corpora/literary-corpora/classic-fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-773", "Family": "Literary corpora", "Description": "This corpus contains works by established Finnish fiction writers from the 1880s to the 1930s. 
There are different types of prose and plays, as well as lyrics and aphorisms.\nThis corpus is available for online browsing through an external interface.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "", "Size": ["1,456,658 words"], "Annotation": [], diff --git a/corpora/literary-corpora/classic-fin.json b/corpora/literary-corpora/classic-fin.json index 55b4a99..7835873 100644 --- a/corpora/literary-corpora/classic-fin.json +++ b/corpora/literary-corpora/classic-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2018051701", "Family": "Literary corpora", "Description": "This corpus contains literary texts from 1549 to 1944.\nThe corpus is available for online browsing through FIN-CLARIN.", - "Languages": ["fin","swe"], + "Language": ["fin","swe"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/early-fin-lit.json b/corpora/literary-corpora/early-fin-lit.json index b92b9df..7d89b0d 100644 --- a/corpora/literary-corpora/early-fin-lit.json +++ b/corpora/literary-corpora/early-fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-772", "Family": "Literary corpora", "Description": "The corpus of Early Modern Finnish contains Finnish-language works in various fields published during the 19th century, annual issues of the oldest periodicals and newspapers, almanac and decree texts, and some dictionaries. An effort has been made to include the earliest, most important and (based on the number of reprints, for example) most widely distributed works. The selection of publications has also been made with a view to achieving the widest possible thematic coverage, although more works originally written in Finnish have been included than translations. These have been alphabetised by the name of their translator, seasonal publications by their title, and other works by their author. 
The Finnish translations of unknown authors are in the Anonymous folder, the texts of unknown authors in the Other folder. The materials cover the period between Old and Modern Finnish and a little beyond. The earliest book dates from 1809, the latest from 1891, but there are texts of the regulations right up to the end of the century. However, most of the material is from 1810-1880. This later material can also be found in the Classics corpus.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/est-fiction.json b/corpora/literary-corpora/est-fiction.json index 09c3580..12d2210 100644 --- a/corpora/literary-corpora/est-fiction.json +++ b/corpora/literary-corpora/est-fiction.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-0007EL", "Family": "Literary corpora", "Description": "This corpus contains texts from 1990 onwards.\nThe corpus is available for download from CELR.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA - NC", "Size": ["5,768,504 words"], "Annotation": [], diff --git a/corpora/literary-corpora/est-runic.json b/corpora/literary-corpora/est-runic.json index 89a219a..7417e3e 100644 --- a/corpora/literary-corpora/est-runic.json +++ b/corpora/literary-corpora/est-runic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-0008FL", "Family": "Literary corpora", "Description": "These are the oldest text recordings of Estonian runic songs (the text recordings were created in the 19th century and in the first decades of the 20th century).
In addition to the runic songs, the database also has songs of transitional form and end-rhymed songs (about 6000).\nThe corpus is available for online browsing through an external interface.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA", "Size": ["92,134 texts"], "Annotation": [], diff --git a/corpora/literary-corpora/etcsl.json b/corpora/literary-corpora/etcsl.json index 121680b..ec2e52f 100644 --- a/corpora/literary-corpora/etcsl.json +++ b/corpora/literary-corpora/etcsl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-874", "Family": "Literary corpora", "Description": "This corpus presents a selection of nearly 400 literary compositions recorded on sources which come from ancient Mesopotamia and date to the late third and early second millennia BCE.\nThe corpus is available for online browsing through an external interface.", - "Languages": ["sux"], + "Language": ["sux"], "Licence": "", "Size": ["400 literary compositions"], "Annotation": [], diff --git a/corpora/literary-corpora/fin-folk.json b/corpora/literary-corpora/fin-folk.json index 614b361..993f97e 100644 --- a/corpora/literary-corpora/fin-folk.json +++ b/corpora/literary-corpora/fin-folk.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052712", "Family": "Literary corpora", "Description": "This corpus contains poems from 1564 to 1939.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], + "Language": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], "Licence": "CC-BY-NC", "Size": ["7.1 million words"], "Annotation": ["unannotated"], diff --git a/corpora/literary-corpora/fin-gutenberg.json b/corpora/literary-corpora/fin-gutenberg.json index 2f6dd05..b78b710 100644 --- a/corpora/literary-corpora/fin-gutenberg.json +++ b/corpora/literary-corpora/fin-gutenberg.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014100301", "Family": "Literary corpora", 
"Description": "This corpus contains Finnish books made available by the Gutenberg project. The texts have not been linguistically annotated.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY", "Size": ["34,487,420 words"], "Annotation": [], diff --git a/corpora/literary-corpora/fin-lit.json b/corpora/literary-corpora/fin-lit.json index 6bd08d1..0cd64ad 100644 --- a/corpora/literary-corpora/fin-lit.json +++ b/corpora/literary-corpora/fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730186", "Family": "Literary corpora", "Description": "This corpus contains prose fiction, plays, poetry and aphorisms (some written originally in Swedish) of established Finnish authors published from 1880s to 1949.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "EUPL v.1.1 SA", "Size": ["1,500,000 words"], "Annotation": ["syntactically parsed (TDT alpha)", "named entities (FiNER)", "MSD-tagged", "lemmatized"], diff --git a/corpora/literary-corpora/greek-medieval.json b/corpora/literary-corpora/greek-medieval.json index dd51dcd..d1d4e46 100644 --- a/corpora/literary-corpora/greek-medieval.json +++ b/corpora/literary-corpora/greek-medieval.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-251D-7", "Family": "Literary corpora", "Description": "This corpus contains medieval texts contains written material covering the period from the 4th till the 16th century A.D. 
The texts can be classified into the following categories: religious, poetical-literary, political-historical, hymns, epigrams.\nThe corpus is available for download from clarin:el.", - "Languages": ["ell","grc"], + "Language": ["ell","grc"], "Licence": "CC-BY-NC", "Size": ["3,419,553 words"], "Annotation": [], diff --git a/corpora/literary-corpora/greek-thesaurus.json b/corpora/literary-corpora/greek-thesaurus.json index 91b40d0..3c3946d 100644 --- a/corpora/literary-corpora/greek-thesaurus.json +++ b/corpora/literary-corpora/greek-thesaurus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E3-8", "Family": "Literary corpora", "Description": "This corpus contains prose, poetry, drama, and essays from the 18th century onwards.\nThe corpus is available for online browsing through a dedicated webpage.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "proprietary", "Size": ["1 million tokens"], "Annotation": ["semantic"], diff --git a/corpora/literary-corpora/joh-jen.json b/corpora/literary-corpora/joh-jen.json index bb07d26..0d9df39 100644 --- a/corpora/literary-corpora/joh-jen.json +++ b/corpora/literary-corpora/joh-jen.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12115/20", "Family": "Literary corpora", "Description": "This corpus presents the collected works of the Danish author Johannes Jensen.\nThe corpus is available for download from CLARIN-DK and for online browsing through a dedicated concordancer.", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC BY-SA 4.0", "Size": ["1,760,093 words", "8,489 pages"], "Annotation": ["unannotated"], diff --git a/corpora/literary-corpora/kdsp.json b/corpora/literary-corpora/kdsp.json index b21341e..629bb85 100644 --- a/corpora/literary-corpora/kdsp.json +++ b/corpora/literary-corpora/kdsp.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1823", "Family": "Literary corpora", "Description": "This corpus contains 262 texts of longer older Slovenian narrative 
prose. The texts were published between 1836 and 1918 and are at least 20,000 words long.\nThe texts have bibliographical metadata (author name, title, year of publication, length) and are classified according to the decade of publication, length, text type, text subtype, theme, and level of canonicity (texts by those authors included in school textbooks after 1980 and/or included in the Collected writings of Slovenian poets and writers, are marked with a high degree of canonicity). The metadata about the authors of the texts are provided with their gender, occupation, and years of birth and death. The corpus texts come from three digital sources, and each text is marked for its source. They are Wikisource (145 texts), the ELTeC corpus (96 texts), and the dLib digital library (21 texts). The corpus is provided in two variants, one containing running text and the other with added linguistic analyses. These comprise tokens, sentences, lemmas, MULTEXT-East morphosyntactic descriptions and Universal Dependencies morphological features. The linguistic annotation was performed with the CLASSLA program.
The source format of the corpus is TEI/XML, with two derived formats also available: one is plain text, and the other vertical files, as used by concordancers, like the CWB.\nThe corpus is available for download from CLARIN.SI as well as through the noSketchEngine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY 4.0", "Size": ["262 texts", "11 million words", "14 million tokens"], "Annotation": ["MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated with author and text metadata"], diff --git a/corpora/literary-corpora/kivi.json b/corpora/literary-corpora/kivi.json index 80f6802..d6c3141 100644 --- a/corpora/literary-corpora/kivi.json +++ b/corpora/literary-corpora/kivi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405274", "Family": "Literary corpora", "Description": "This corpus contains all the known letters, manuscripts and published works by Finnish author Aleksis Kivi (1834–1872). Most of the texts were written in Finnish while some of the letters and manuscripts are in Swedish.
The time coverage of the texts: 1855-1871.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin","swe"], + "Language": ["fin","swe"], "Licence": "CC-BY-NC", "Size": ["413,735 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/lat-lit-classic.json b/corpora/literary-corpora/lat-lit-classic.json index 67a052f..759791f 100644 --- a/corpora/literary-corpora/lat-lit-classic.json +++ b/corpora/literary-corpora/lat-lit-classic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-184", "Family": "Literary corpora", "Description": "This corpus presents classics from the end of the 19th century to the beginning of the 20th century.", - "Languages": ["lat"], + "Language": ["lat"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/ltcorpus.json b/corpora/literary-corpora/ltcorpus.json index 600e1ed..a52c650 100644 --- a/corpora/literary-corpora/ltcorpus.json +++ b/corpora/literary-corpora/ltcorpus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net//21.11115/0000-000B-D33D-3", "Family": "Literary corpora", "Description": "This corpus contains 70 copyright-free classics (61 Portugal and 9 Brazil) published before 1940.\nThe corpus is available for download from PORTULAN.", - "Languages": ["por"], + "Language": ["por"], "Licence": "CLARIN RES", "Size": ["1,781,083 words"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/literary-corpora/m-agricola.json b/corpora/literary-corpora/m-agricola.json index 76e4212..b68f1d3 100644 --- a/corpora/literary-corpora/m-agricola.json +++ b/corpora/literary-corpora/m-agricola.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730170", "Family": "Literary corpora", "Description": "This corpus contains the Finnish parts of Mikael Agricola’s works (Abckiria, Rukouskiria, Se Wsi testamenti, Käsikiria, Messu, Piina, Psaltari, Veisut, Profeetat).\nThe corpus is available for online browsing through 
Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-ND", "Size": ["83,678 sentences", "428,314 tokens", "38,308 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/micro-pol.json b/corpora/literary-corpora/micro-pol.json index df1e570..df374ba 100644 --- a/corpora/literary-corpora/micro-pol.json +++ b/corpora/literary-corpora/micro-pol.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/604", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "plWordNet", "Size": [], "Annotation": ["unannotated"], diff --git a/corpora/literary-corpora/multext1984.json b/corpora/literary-corpora/multext1984.json index d6eca95..18db9fb 100644 --- a/corpora/literary-corpora/multext1984.json +++ b/corpora/literary-corpora/multext1984.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1043", "Family": "Literary corpora", "Description": "This is a parallel corpus of George Orwell's 1984 and its translations.\nThe corpus is available for download from CLARIN.SI.", - "Languages": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], + "Language": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], "Licence": "CC BY-NC SA 4.0", "Size": ["12 texts", "79,718 sentences", "1,064,424 words"], "Annotation": ["sentence-alignment", "MSD tagging"], diff --git a/corpora/literary-corpora/norbok-children.json b/corpora/literary-corpora/norbok-children.json index 1dd20d1..5cfbb3a 100644 --- a/corpora/literary-corpora/norbok-children.json +++ b/corpora/literary-corpora/norbok-children.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D988-1F83-B1F5-1", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nob"], 
+ "Language": ["nob"], "Licence": "CLARIN ACA", "Size": ["4,111,213 words", "389,564 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/norbok-fiction.json b/corpora/literary-corpora/norbok-fiction.json index 4c52e97..5fd0fa8 100644 --- a/corpora/literary-corpora/norbok-fiction.json +++ b/corpora/literary-corpora/norbok-fiction.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D988-2078-6447-1", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nob"], + "Language": ["nob"], "Licence": "CLARIN ACA", "Size": ["26,903,637 words", "2,469,916 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/nornyn-children.json b/corpora/literary-corpora/nornyn-children.json index d42831e..6846830 100644 --- a/corpora/literary-corpora/nornyn-children.json +++ b/corpora/literary-corpora/nornyn-children.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D963-33EA-65BD-0", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nno"], + "Language": ["nno"], "Licence": "CLARIN ACA", "Size": ["1,043,260 words", "106,434 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/nornyn-fiction.json b/corpora/literary-corpora/nornyn-fiction.json index 414c2e2..3cc07c1 100644 --- a/corpora/literary-corpora/nornyn-fiction.json +++ b/corpora/literary-corpora/nornyn-fiction.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D985-7B94-F361-1", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nno"], + "Language": ["nno"], "Licence": "CLARIN ACA", "Size": 
["2,884,376 words", "260,285 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/north-saami.json b/corpora/literary-corpora/north-saami.json index f6fe3e0..7137740 100644 --- a/corpora/literary-corpora/north-saami.json +++ b/corpora/literary-corpora/north-saami.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014032620", "Family": "Literary corpora", "Description": "This corpus contains Kerttu Vuolab's novel Cheppari cháráhus.\nThe corpus is available for online browsing through the TAITO shell.", - "Languages": ["sme"], + "Language": ["sme"], "Licence": "CLARIN RES +NC +NORED +PLAN", "Size": ["17,830 words"], "Annotation": [], diff --git a/corpora/literary-corpora/old-fin-lit.json b/corpora/literary-corpora/old-fin-lit.json index 3f11433..8e59b16 100644 --- a/corpora/literary-corpora/old-fin-lit.json +++ b/corpora/literary-corpora/old-fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-776", "Family": "Literary corpora", "Description": "This corpus contains various works published during the Swedish rule (from the 16th century to about 1810), extensive manuscripts from that period (most of which were later printed), as well as individual almanac and decree texts, sermons and poetry.\nThis corpus is available for online browsing through an external interface.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "", "Size": ["3,428,618 words"], "Annotation": [], diff --git a/corpora/literary-corpora/one-mil-cro.json b/corpora/literary-corpora/one-mil-cro.json index 9877216..60549e5 100644 --- a/corpora/literary-corpora/one-mil-cro.json +++ b/corpora/literary-corpora/one-mil-cro.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-234", "Family": "Literary corpora", "Description": "The corpus is listed in the LINDAT repository. 
", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "", "Size": ["1 million tokens"], "Annotation": [], diff --git a/corpora/literary-corpora/orig-est.json b/corpora/literary-corpora/orig-est.json index 70a1265..3d9dd40 100644 --- a/corpora/literary-corpora/orig-est.json +++ b/corpora/literary-corpora/orig-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00088L", "Family": "Literary corpora", "Description": "This corpus collects older Estonian literary texts published on \"Kreutzwald's Century: the Estonian Cultural History Web\". The electronically republished books, included in the collection, are based on the first editions of works by more important Estonian authors, published in 1854-1944.\nThe corpus is available for online browsing through an external interface.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA", "Size": ["173 texts"], "Annotation": [], diff --git a/corpora/literary-corpora/parfin.json b/corpora/literary-corpora/parfin.json index 74db151..b532ec8 100644 --- a/corpora/literary-corpora/parfin.json +++ b/corpora/literary-corpora/parfin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016121610", "Family": "Literary corpora", "Description": "This corpus contains Finnish literary texts from 1990-2010 and their translations into Russian aligned at sentence level.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin","rus"], + "Language": ["fin","rus"], "Licence": "CLARIN RES +NC +INF +ND", "Size": ["2,044,172 tokens"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/parrus.json b/corpora/literary-corpora/parrus.json index 3324d8c..e8e6b7a 100644 --- a/corpora/literary-corpora/parrus.json +++ b/corpora/literary-corpora/parrus.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730173", "Family": "Literary corpora", "Description": "This corpus contains Russian literary texts (classical 
literature & 20th century) and their translations into Finnish aligned at paragraph level.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin","rus"], + "Language": ["fin","rus"], "Licence": "CLARIN RES +NC +INF +ND", "Size": ["5,900,000 tokens"], "Annotation": ["MSD-tagged, syntactically parsed"], diff --git a/corpora/literary-corpora/prilit.json b/corpora/literary-corpora/prilit.json index b5971d2..a95078e 100644 --- a/corpora/literary-corpora/prilit.json +++ b/corpora/literary-corpora/prilit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1319", "Family": "Literary corpora", "Description": "This corpus contains texts of older Slovenian narrative prose by 12 authors.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY 4.0", "Size": ["43 texts", "1,275,209 tokens"], "Annotation": ["word modernisation", "lemmatisation", "syntactic annotation (Universal Dependencies)"], diff --git a/corpora/literary-corpora/rep-bastille.json b/corpora/literary-corpora/rep-bastille.json index 4a230cc..d0877f8 100644 --- a/corpora/literary-corpora/rep-bastille.json +++ b/corpora/literary-corpora/rep-bastille.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AUTH-0000-0000-24DC-0", "Family": "Literary corpora", "Description": "This corpus contains République-Bastille, a novel by Melpo Axioti. This French text is of particular linguistic interest since it is a text written in a language other than the author's mother tongue and is suited for research on bilingualism and self-translation. 
It would be worth measuring the naturalness of the language with computational tools, for example.\nThe corpus is available for download from clarin:el.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": ["37,965 words"], "Annotation": [], diff --git a/corpora/literary-corpora/sol.json b/corpora/literary-corpora/sol.json index 5c1dd2b..14a4f35 100644 --- a/corpora/literary-corpora/sol.json +++ b/corpora/literary-corpora/sol.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/80", "Family": "Literary corpora", "Description": "This corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "CC-BY 4.0", "Size": ["1,267,391 tokens", "69,270 sentences"], "Annotation": ["sentence scrambled"], diff --git a/corpora/literary-corpora/strindberg.json b/corpora/literary-corpora/strindberg.json index 730ba17..37a7205 100644 --- a/corpora/literary-corpora/strindberg.json +++ b/corpora/literary-corpora/strindberg.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/79", "Family": "Literary corpora", "Description": "This corpus presents the collected works of August Strindberg.\nThe corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY 4.0", "Size": ["4,309,037 tokens", "321,759 sentences"], "Annotation": ["sentence scrambling"], diff --git a/corpora/literary-corpora/uhlcs.json b/corpora/literary-corpora/uhlcs.json index 1eb9c24..aa6b49a 100644 --- a/corpora/literary-corpora/uhlcs.json +++ b/corpora/literary-corpora/uhlcs.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014032622", "Family": "Literary corpora", "Description": "This corpus contains samples of Finnish literature published by the WSOY publishing company in the 1990.\nThe corpus is available online through FIN-CLARIN.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN RES", 
"Size": ["68,425 words"], "Annotation": ["tagged"], diff --git a/corpora/literary-corpora/york-poetry.json b/corpora/literary-corpora/york-poetry.json index 8b136f0..7d1d5c2 100644 --- a/corpora/literary-corpora/york-poetry.json +++ b/corpora/literary-corpora/york-poetry.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2425", "Family": "Literary corpora", "Description": "This corpus contains a selection of poetic texts (71,490 words) from the Old English Section of the Helsinki Corpus of English Texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang"], + "Language": ["ang"], "Licence": "Restricted", "Size": ["71,490 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/manually-annotated-corpora/acl-rd-tex.json b/corpora/manually-annotated-corpora/acl-rd-tex.json index f41bb52..4a3b293 100644 --- a/corpora/manually-annotated-corpora/acl-rd-tex.json +++ b/corpora/manually-annotated-corpora/acl-rd-tex.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-1661", "Family": "Manually annotated corpora", "Description": "This corpus contains 6818 terms extracted from abstracts of computational linguistics papers.\nThe corpus is available for download from LINDAT and through KonText.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-NC-SA 4.0", "Size": ["33216 tokens"], "Annotation": ["terminology extraction/classification"], diff --git a/corpora/manually-annotated-corpora/alksnis.json b/corpora/manually-annotated-corpora/alksnis.json index bbe4215..6bbfa73 100644 --- a/corpora/manually-annotated-corpora/alksnis.json +++ b/corpora/manually-annotated-corpora/alksnis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/10", "Family": "Manually annotated corpora", "Description": "Syntactic parsing follows the rules of the Prague Dependency Treebank\nThis corpus is available for download from the CLARIN-LT repository. 
The second version is available upon request.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN PUB", "Size": ["2,355 sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/artificial-treebank.json b/corpora/manually-annotated-corpora/artificial-treebank.json index f556ced..381191e 100644 --- a/corpora/manually-annotated-corpora/artificial-treebank.json +++ b/corpora/manually-annotated-corpora/artificial-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2616", "Family": "Manually annotated corpora", "Description": "This syntactic parsing follows the Universal Dependencies schema.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces", "eng", "fin", "rus", "slk"], + "Language": ["ces", "eng", "fin", "rus", "slk"], "Licence": "Licence Universal dependencies v2.1", "Size": ["106,000 tokens", "10,604 sentences"], "Annotation": ["syntactic parsing", "mark-up of elliptical constructions"], diff --git a/corpora/manually-annotated-corpora/artur.json b/corpora/manually-annotated-corpora/artur.json index fc5ff74..4a6bbf2 100644 --- a/corpora/manually-annotated-corpora/artur.json +++ b/corpora/manually-annotated-corpora/artur.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1772", "Family": "Manually annotated corpora", "Description": "This corpus was designed for the needs of developing automatic speech recognition for the Slovenian language. The complete database includes 1,067 hours of speech, of which 884 hours are transcribed, while the remaining 183 hours are recordings only.\nThe audio files are available in a separate repository entry. Transcriptions are available in the original TRS format of the Transcriber 1.5.1 tool which was used for making the transcriptions. All transcriptions were made manually or manually corrected.\nThe data are structured as follows:
    1. Artur-B, read speech, 573 hours in total.\nIt includes: (1a) Artur-B-Brani, 485 hours: Readings of sentences which were pre-selected from a 10% increment in the Gigafida 2.0 corpus. The sentences were chosen in such a way that they reflect the natural or the actual distribution of triphones in the words. They were distributed between 1,000 speakers, so that we recorded approx. 30 min in read form from each speaker. The speakers were balanced according to gender, age, region, and a small proportion of speakers were non-native speakers of Slovene. Each sentence is its own audio file and has a corresponding transcription file. (1b) Artur-B-Crkovani, 10 hours: Spellings. Speakers were asked to spell abbreviations and personal names and surnames, all chosen so that all Slovene letters were covered, plus the most common foreign letters. (1c) Artur-B-Studio, 51 hours: Designed for the development of speech synthesis. The sentences were read in a studio by a single speaker. Each sentence is its own audio file and has a corresponding transcription file. (1d) Artur-B-Izloceno, 27 hours: The recordings include different types of errors, typically, incorrect reading of sentences or a noisy environment.
    2. (2) Artur-J, public speech, 62 hours in total.\nIt includes: (2a) Artur-J-Splosni, 62 hours: media recordings, online recordings of conferences, workshops, education videos, etc.
    3. (3) Artur-N, private speech, 74 hours in total.\nIt includes: (3a) Artur-N-Obrazi, 6 hours: Speakers were asked to describe faces on pictures. Designed for a face-description domain-specific speech recognition. (3b) Artur-N-PDom, 7 hours: Speakers were asked to read pre-written sentences, as well as to express instructions for a potential smart-home system freely. Designed for a smart-home domain-specific speech recognition. (3c) Artur-N-Prosti, 61 hours: Monologues and dialogues between two persons, recorded for the purposes of the Artur database creation. Speakers were asked to conversate or explain freely on casual topics.
    4. (4) Artur-P, parliamentary speech, 201 hours in total.\nIt includes: (4a) Artur-P-SejeDZ, 201 hours: Speech from the Slovene National Assembly.
    5. \nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["884 hours"], "Annotation": ["orthographically transcribed speech"], diff --git a/corpora/manually-annotated-corpora/aspect-term-czech.json b/corpora/manually-annotated-corpora/aspect-term-czech.json index e63fffa..1894fbe 100644 --- a/corpora/manually-annotated-corpora/aspect-term-czech.json +++ b/corpora/manually-annotated-corpora/aspect-term-czech.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1507", "Family": "Manually annotated corpora", "Description": "This corpus contains online user-product reviews.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 3.0", "Size": ["2200 reviews"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/austrian-baroque.json b/corpora/manually-annotated-corpora/austrian-baroque.json index de281b8..683bd61 100644 --- a/corpora/manually-annotated-corpora/austrian-baroque.json +++ b/corpora/manually-annotated-corpora/austrian-baroque.json @@ -3,7 +3,7 @@ "URL": "https://acdh.oeaw.ac.at/abacus/", "Family": "Manually annotated corpora", "Description": "This historical corpus contains sermons from 1650 to 1750. For linguistic annotation, each individual token was automatically assigned to a morphosyntactic word class using the TreeTagger software. As a classification system, the 54-part Stuttgart-Tübingen TagSet (STTS) was used. For lemmatization , a normalized basic word form was used for each token and the Duden and the German dictionary by Jacob and Wilhelm Grimm were used as reference works. 
The part-of-speech tagging and lemmatization was then manually checked.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["200,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "named entities"], diff --git a/corpora/manually-annotated-corpora/b4-heliand.json b/corpora/manually-annotated-corpora/b4-heliand.json index 8e42cad..2f10df6 100644 --- a/corpora/manually-annotated-corpora/b4-heliand.json +++ b/corpora/manually-annotated-corpora/b4-heliand.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B24-9", "Family": "Manually annotated corpora", "Description": "This corpus contains historical German texts.\nThe corpus is available for download from the HZSK repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY", "Size": ["3495 tokens"], "Annotation": ["PoS tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/bnc-sampler.json b/corpora/manually-annotated-corpora/bnc-sampler.json index 5288cee..846ad5e 100644 --- a/corpora/manually-annotated-corpora/bnc-sampler.json +++ b/corpora/manually-annotated-corpora/bnc-sampler.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2551", "Family": "Manually annotated corpora", "Description": "The corpus was manually post-edited to correct the PoS tags automatically assigned by CLAWS.\nThe corpus is available for online querying via CQPWeb (registration required) for download from the Oxford Text Archive", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "BNC Licence", "Size": ["2 million tokens"], "Annotation": ["PoS tagging"], diff --git a/corpora/manually-annotated-corpora/bultreebank.json b/corpora/manually-annotated-corpora/bultreebank.json index cc6d5e2..30779ab 100644 --- a/corpora/manually-annotated-corpora/bultreebank.json +++ b/corpora/manually-annotated-corpora/bultreebank.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11495/D93F-C6E9-65D9-2", "Family": "Manually annotated corpora", "Description": "This corpus is available for download through the concordancer Corpuscle.", - "Languages": ["bul"], + "Language": ["bul"], "Licence": "MS-NC-NoReD", "Size": ["214,000 tokens"], "Annotation": ["morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/cintil-deepbank.json b/corpora/manually-annotated-corpora/cintil-deepbank.json index c208ff2..7441897 100644 --- a/corpora/manually-annotated-corpora/cintil-deepbank.json +++ b/corpora/manually-annotated-corpora/cintil-deepbank.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D34F-F", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the PORTULAN CLARIN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": ["PoS-tagging", "syntactic parsing", "grammatical functions", "logical forms"], diff --git a/corpora/manually-annotated-corpora/cintil-dependency.json b/corpora/manually-annotated-corpora/cintil-dependency.json index 05f618b..70a3648 100644 --- a/corpora/manually-annotated-corpora/cintil-dependency.json +++ b/corpora/manually-annotated-corpora/cintil-dependency.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D31C-8", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the PORTULAN CLARIN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": ["morphosyntactic tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/cintil-portugues.json b/corpora/manually-annotated-corpora/cintil-portugues.json index 9b203f9..e3c2672 100644 --- 
a/corpora/manually-annotated-corpora/cintil-portugues.json +++ b/corpora/manually-annotated-corpora/cintil-portugues.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D33B-5", "Family": "Manually annotated corpora", "Description": "The corpus contains transcriptions of spoken communication as well as written texts from several genres (news, literature, magazines, etc.).\nThe corpus is available for download from the CLARIN PORTULAN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "CLARIN RES", "Size": ["1 million tokens"], "Annotation": ["morphosyntactic tagging", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/cintil-propbank.json b/corpora/manually-annotated-corpora/cintil-propbank.json index e40d7ef..1075ab1 100644 --- a/corpora/manually-annotated-corpora/cintil-propbank.json +++ b/corpora/manually-annotated-corpora/cintil-propbank.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D300-6", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the ELRA catalogue.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": ["syntactic parsing", "phrase semantic roles"], diff --git a/corpora/manually-annotated-corpora/cintil-treebank.json b/corpora/manually-annotated-corpora/cintil-treebank.json index 7d0cf4c..417c408 100644 --- a/corpora/manually-annotated-corpora/cintil-treebank.json +++ b/corpora/manually-annotated-corpora/cintil-treebank.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D2FE-A", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the PORTULAN CLARIN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": 
["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/cmc-training-janes-norm.json b/corpora/manually-annotated-corpora/cmc-training-janes-norm.json index b1e0619..404dc85 100644 --- a/corpora/manually-annotated-corpora/cmc-training-janes-norm.json +++ b/corpora/manually-annotated-corpora/cmc-training-janes-norm.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1084", "Family": "Manually annotated corpora", "Description": "This corpus is partially also manually annotated with MSD tags and lemmatized.\nThe corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["184,755 tokens"], "Annotation": ["normalization"], diff --git a/corpora/manually-annotated-corpora/cmc-training-janes-tag.json b/corpora/manually-annotated-corpora/cmc-training-janes-tag.json index e0e003e..71b7a80 100644 --- a/corpora/manually-annotated-corpora/cmc-training-janes-tag.json +++ b/corpora/manually-annotated-corpora/cmc-training-janes-tag.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1123", "Family": "Manually annotated corpora", "Description": "This corpus contains computer-mediated communication (CMC). 
The corpus is morphosyntactically tagged following the MULTEXT-East Version 5 tagset.\nThe corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["75,000 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "word normalisation", "morphosyntactic tagging", "lemmatisation", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/czech-legal-treebank.json b/corpora/manually-annotated-corpora/czech-legal-treebank.json index 9a5e6ee..d1449f3 100644 --- a/corpora/manually-annotated-corpora/czech-legal-treebank.json +++ b/corpora/manually-annotated-corpora/czech-legal-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2498", "Family": "Manually annotated corpora", "Description": "This corpus contains legal texts.\nThe corpus is available through the concordance KonText, the PML-TQ tool and for download from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["1121 sentences"], "Annotation": ["syntactic parsing", "labelling of semantic entities"], diff --git a/corpora/manually-annotated-corpora/czech-ne-corpus.json b/corpora/manually-annotated-corpora/czech-ne-corpus.json index 6bd133f..d1ca72c 100644 --- a/corpora/manually-annotated-corpora/czech-ne-corpus.json +++ b/corpora/manually-annotated-corpora/czech-ne-corpus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-1B04-C", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 3.0", "Size": ["5868 sentences", "35220 NEs"], "Annotation": ["Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/dep-anno-creg.json b/corpora/manually-annotated-corpora/dep-anno-creg.json index c5bdf31..46140b4 100644 
--- a/corpora/manually-annotated-corpora/dep-anno-creg.json +++ b/corpora/manually-annotated-corpora/dep-anno-creg.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-2CA4-6", "Family": "Manually annotated corpora", "Description": "This corpus consists of answers to reading comprehension questions written by American college students learning German.\nThe corpus is available for download from the Tübingen CLARIN Repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["109 sentences"], "Annotation": ["PoS tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/est-treebank-coref.json b/corpora/manually-annotated-corpora/est-treebank-coref.json index c6a8773..eca4443 100644 --- a/corpora/manually-annotated-corpora/est-treebank-coref.json +++ b/corpora/manually-annotated-corpora/est-treebank-coref.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-0016AL", "Family": "Manually annotated corpora", "Description": "This corpus contains newspaper texts plus one scientific medical text.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "GPL", "Size": ["107,000 words"], "Annotation": ["anaphora relations"], diff --git a/corpora/manually-annotated-corpora/est-treebank.json b/corpora/manually-annotated-corpora/est-treebank.json index 2cc8a4a..3780a2e 100644 --- a/corpora/manually-annotated-corpora/est-treebank.json +++ b/corpora/manually-annotated-corpora/est-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00080L", "Family": "Manually annotated corpora", "Description": "The corpus contains fictional and newspaper texts.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN_ACA", "Size": ["1,000 sentences"], "Annotation": ["syntactic parsing"], diff --git 
a/corpora/manually-annotated-corpora/facebook-sentiment.json b/corpora/manually-annotated-corpora/facebook-sentiment.json index 76eabd0..7c92a8d 100644 --- a/corpora/manually-annotated-corpora/facebook-sentiment.json +++ b/corpora/manually-annotated-corpora/facebook-sentiment.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0022-FE82-7", "Family": "Manually annotated corpora", "Description": "This corpus contains Facebook posts.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-SA 3.0", "Size": ["10,000 Facebook posts"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/fictree.json b/corpora/manually-annotated-corpora/fictree.json index a3f60a2..cdac007 100644 --- a/corpora/manually-annotated-corpora/fictree.json +++ b/corpora/manually-annotated-corpora/fictree.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2517", "Family": "Manually annotated corpora", "Description": "This corpus contains fictional texts.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["12760 sentences"], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/fin-treebank-1.json b/corpora/manually-annotated-corpora/fin-treebank-1.json index 0c0b5bd..0e09089 100644 --- a/corpora/manually-annotated-corpora/fin-treebank-1.json +++ b/corpora/manually-annotated-corpora/fin-treebank-1.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016011501", "Family": "Manually annotated corpora", "Description": "This corpus contains 19,000 sentences from the Large Grammar of Finnish.\nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY 3.0", "Size": ["160,000 tokens"], 
"Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/fin-treebank-2.json b/corpora/manually-annotated-corpora/fin-treebank-2.json index da5d46c..804a601 100644 --- a/corpora/manually-annotated-corpora/fin-treebank-2.json +++ b/corpora/manually-annotated-corpora/fin-treebank-2.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407163", "Family": "Manually annotated corpora", "Description": "This corpus contains 19,000 sentences from the Large Grammar of Finnish.\nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY 3.0", "Size": ["160,000 tokens"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/finnsentiment.json b/corpora/manually-annotated-corpora/finnsentiment.json index 5d486b9..1e4c19b 100644 --- a/corpora/manually-annotated-corpora/finnsentiment.json +++ b/corpora/manually-annotated-corpora/finnsentiment.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2023012701", "Family": "Manually annotated corpora", "Description": "This corpus contains sentences from Finnish social media that have been manually annotated for sentiment polarity by three native annotators.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY", "Size": ["27,000 sentences"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/frenk-styria.json b/corpora/manually-annotated-corpora/frenk-styria.json index 5d42c35..1060627 100644 --- a/corpora/manually-annotated-corpora/frenk-styria.json +++ b/corpora/manually-annotated-corpora/frenk-styria.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1202", "Family": "Manually annotated corpora", "Description": "This corpus contains news comments from the website 24sata.hr.\nThe corpus is available for download from CLARIN.SI.", - "Languages": 
["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["407.5 million words"], "Annotation": ["sentiment analysis (socially unacceptable discourse)"], diff --git a/corpora/manually-annotated-corpora/greek-coref.json b/corpora/manually-annotated-corpora/greek-coref.json index afb8084..f5f2157 100644 --- a/corpora/manually-annotated-corpora/greek-coref.json +++ b/corpora/manually-annotated-corpora/greek-coref.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25DC-F", "Family": "Manually annotated corpora", "Description": "In addition to coreference, the corpus is annotated for identity and bridging relations.\nIn addition to coreference, the corpus is annotated for identity and bridging relations.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC-SA", "Size": ["62,988 tokens"], "Annotation": ["coreference"], diff --git a/corpora/manually-annotated-corpora/greek-entailment.json b/corpora/manually-annotated-corpora/greek-entailment.json index 3cac88b..7e07ce7 100644 --- a/corpora/manually-annotated-corpora/greek-entailment.json +++ b/corpora/manually-annotated-corpora/greek-entailment.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23DB-2", "Family": "Manually annotated corpora", "Description": "This corpus contains texts from the domains of politics, law and travel.\nThis corpus is available for download from the clarin:el repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY", "Size": ["600 sentence-pairs"], "Annotation": ["logical entailment"], diff --git a/corpora/manually-annotated-corpora/grug-para-tree.json b/corpora/manually-annotated-corpora/grug-para-tree.json index aba4677..b330e4f 100644 --- a/corpora/manually-annotated-corpora/grug-para-tree.json +++ b/corpora/manually-annotated-corpora/grug-para-tree.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0006-C150-9", "Family": "Manually annotated corpora", "Description": "The corpus is 
syntactically parsed following the TIGER guidelines.\nThe corpus is available for download from a dedicated website provided by the CLARIN-D consortium.", - "Languages": ["kat", "ukr", "rus", "deu"], + "Language": ["kat", "ukr", "rus", "deu"], "Licence": "CC-BY", "Size": ["10,400 sentence pairs"], "Annotation": ["syntactic parsing", "PoS tagging"], diff --git a/corpora/manually-annotated-corpora/grundtvig.json b/corpora/manually-annotated-corpora/grundtvig.json index 60e5778..69803bd 100644 --- a/corpora/manually-annotated-corpora/grundtvig.json +++ b/corpora/manually-annotated-corpora/grundtvig.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12115/31", "Family": "Manually annotated corpora", "Description": "This corpus contains the literary works of the Danish bishop N.F.S Grundtvig.\nThe corpus is available for download from the CLARIN-DK repository.", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC BY-NC 4.0", "Size": ["11,417,194 words"], "Annotation": ["linked data (places, persons, bible citations, etc.)"], diff --git a/corpora/manually-annotated-corpora/hamledt.json b/corpora/manually-annotated-corpora/hamledt.json index 36da328..6baa04a 100644 --- a/corpora/manually-annotated-corpora/hamledt.json +++ b/corpora/manually-annotated-corpora/hamledt.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1508", "Family": "Manually annotated corpora", "Description": "This treebank collection is available for download from LINDAT.\nThe treebanks can be individually queried through KonText and the treebank tool PML-TQ. We list them here by language:\n
      1. Arabic (KonText, PML-TQ)
      2. Bengali (KonText)
      3. Catalan (KonText)
      4. Czech (KonText, PML-TQ)
      5. Dutch (KonText, PML-TQ)
      6. English (KonText)
      7. Estonian (KonText, PML-TQ)
      8. German (KonText)
      9. Greek (KonText)
      10. Hindi (KonText)
      11. Latin (KonText, PML-TQ)
      12. Persian (KonText, PML-TQ)
      13. Polish (KonText, PML-TQ)
      14. Portuguese (KonText, PML-TQ)
      15. Romanian (KonText, PML-TQ)
      16. Russian (KonText)
      17. Slovenian (KonText, PML-TQ)
      18. Spanish (KonText)
      19. Tamil (KonText, PML-TQ)
      ", - "Languages": ["19 languages"], + "Language": ["19 languages"], "Licence": "HamleDT 3.0 Licence Terms", "Size": ["19 treebanks"], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/hr500k-1.json b/corpora/manually-annotated-corpora/hr500k-1.json index 8ce7475..fe8e034 100644 --- a/corpora/manually-annotated-corpora/hr500k-1.json +++ b/corpora/manually-annotated-corpora/hr500k-1.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1183", "Family": "Manually annotated corpora", "Description": "This corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["500,000 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "morphosyntactic tagging", "lemmatisation", "Named Entity recognition", "Half of corpus also syntactically parsed"], diff --git a/corpora/manually-annotated-corpora/hr500k-2.json b/corpora/manually-annotated-corpora/hr500k-2.json index e7713ad..2092d36 100644 --- a/corpora/manually-annotated-corpora/hr500k-2.json +++ b/corpora/manually-annotated-corpora/hr500k-2.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1792", "Family": "Manually annotated corpora", "Description": "This training corpus contains about 500,000 tokens manually annotated on the levels of tokenisation, sentence segmentation, morphosyntactic tagging, lemmatisation and named entities. About half of the corpus is also manually annotated with syntactic dependencies. A subset of the syntactically annotated corpus is also annotated for multi-word expressions. 
Furthermore, about a fifth of the corpus is annotated with semantic role labels.\nThe annotation formalisms followed in the hr500k corpus are (1) the MULTEXT-East V6 morphosyntactic specifications for the Serbo-Croatian macro-language, (2) the UDv2 Guidelines, (3) the Janes annotation guidelines for named entities, (4) the PARSEME guidelines for annotating multi-word expressions and (5) the semantic role labelling annotation protocol for Slovenian and Croatian.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["499,635 tokens"], "Annotation": ["fully – tokenisation, sentence segmentation, morphosyntactic tagging, and lemmatisation, named entities. Half of the corpus – syntactic parsing, a subset also for multi-word expressions. Fifth of the corpus: semantic roles."], diff --git a/corpora/manually-annotated-corpora/icepahc.json b/corpora/manually-annotated-corpora/icepahc.json index 0999bbf..b3f2dcd 100644 --- a/corpora/manually-annotated-corpora/icepahc.json +++ b/corpora/manually-annotated-corpora/icepahc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/62", "Family": "Manually annotated corpora", "Description": "This corpus contains Icelandic texts from the 12th through the 21st centuries – approximately 100,000 words from each century. 
The corpus is syntactically parsed following the UPenn scheme for historical texts.\nThe corpus is available for online search through treebankstudio.org and for download in different formats from a dedicated webpage.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "GNU LGPL", "Size": ["1 million tokens"], "Annotation": ["morphosyntactic tagging", "lemmatisation", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/jos1m.json b/corpora/manually-annotated-corpora/jos1m.json index cecf106..c183ca4 100644 --- a/corpora/manually-annotated-corpora/jos1m.json +++ b/corpora/manually-annotated-corpora/jos1m.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1037", "Family": "Manually annotated corpora", "Description": "This corpus contains sampled paragraphs from the Slovenian national corpus FidaPLUS. The corpus is morphosyntactically tagged following the MULTEXT-East Version 4 tagset.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-NC 4.0", "Size": ["1 million words"], "Annotation": ["morphosyntactic tagging", "lemmatisation"], diff --git a/corpora/manually-annotated-corpora/kas-biterm.json b/corpora/manually-annotated-corpora/kas-biterm.json index 8ab2ee2..6550be6 100644 --- a/corpora/manually-annotated-corpora/kas-biterm.json +++ b/corpora/manually-annotated-corpora/kas-biterm.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1199", "Family": "Manually annotated corpora", "Description": "This corpus contains PhD theses.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv", "eng"], + "Language": ["slv", "eng"], "Licence": "CC BY-SA 4.0", "Size": ["1,950 sentences", "78,500 tokens", "3,700 terms"], "Annotation": ["bi-lingual term extraction"], diff --git a/corpora/manually-annotated-corpora/kas-term.json b/corpora/manually-annotated-corpora/kas-term.json index 82308e9..d768c59 100644 --- 
a/corpora/manually-annotated-corpora/kas-term.json +++ b/corpora/manually-annotated-corpora/kas-term.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1198", "Family": "Manually annotated corpora", "Description": "This corpus contains term candidates from PhD theses in chemistry, computer science and political science.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["22,950 term candidates"], "Annotation": ["monolingual term extraction"], diff --git a/corpora/manually-annotated-corpora/kpwr.json b/corpora/manually-annotated-corpora/kpwr.json index 0e0fbad..93aad3c 100644 --- a/corpora/manually-annotated-corpora/kpwr.json +++ b/corpora/manually-annotated-corpora/kpwr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/270", "Family": "Manually annotated corpora", "Description": "This corpus contains texts in a variety of domains (blogs, science, stenographic recordings, etc.).\nThe corpus is available for download from the CLARIN-PL repository.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY-SA 3.0", "Size": ["447,000 tokens"], "Annotation": ["chunks and selected predicate-argument relations", "Named Entity recognition", "relations between named entities", "anaphora relations", "word senses", "events", "temporal expressions", "spatial relations between entities", "keywords and semantic roles within nominal and adjective phrases"], diff --git a/corpora/manually-annotated-corpora/lassy-klein.json b/corpora/manually-annotated-corpora/lassy-klein.json index 0430b40..97c5d64 100644 --- a/corpora/manually-annotated-corpora/lassy-klein.json +++ b/corpora/manually-annotated-corpora/lassy-klein.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/efc201791fadf20f67858b602553874b", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from the Dutch Language Institute and through the online 
environments PaQu and GrETEL.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "VAGUE", "Size": ["1 million tokens"], "Annotation": ["PoS tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/lvtb.json b/corpora/manually-annotated-corpora/lvtb.json index 77aec5a..9373dc7 100644 --- a/corpora/manually-annotated-corpora/lvtb.json +++ b/corpora/manually-annotated-corpora/lvtb.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12574/86", "Family": "Manually annotated corpora", "Description": "This treebank is manually annotated according to a hybrid dependency-constituency grammar.\nThe treebank is available for download from the CLARIN-LV repository.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "CC BY-SA 4.0", "Size": ["289,791 tokens", "17,127 sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/matas.json b/corpora/manually-annotated-corpora/matas.json index 5ec8d77..5c3f195 100644 --- a/corpora/manually-annotated-corpora/matas.json +++ b/corpora/manually-annotated-corpora/matas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/9", "Family": "Manually annotated corpora", "Description": "The corpus contains texts from various domains (documents, fiction, periodicals, scientific texts, wordforms).\nThis corpus is available for download from the CLARIN-LT repository.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN ACA", "Size": ["1.6 million words"], "Annotation": ["morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/morph-dist-estonian.json b/corpora/manually-annotated-corpora/morph-dist-estonian.json index e4f9417..f09c223 100644 --- a/corpora/manually-annotated-corpora/morph-dist-estonian.json +++ b/corpora/manually-annotated-corpora/morph-dist-estonian.json @@ -3,7 +3,7 @@ "URL": "http://doi.org/10.15155/1-00-0000-0000-0000-00085L", "Family": "Manually annotated corpora", "Description": "This corpus 
contains texts from the 1980s subcorpus of the Corpus of Written Estonian 1890-1990.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN_ACA-NC", "Size": ["513,000 tokens"], "Annotation": ["morphological disambiguation"], diff --git a/corpora/manually-annotated-corpora/multext-east.json b/corpora/manually-annotated-corpora/multext-east.json index 88b385d..54d946b 100644 --- a/corpora/manually-annotated-corpora/multext-east.json +++ b/corpora/manually-annotated-corpora/multext-east.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1043", "Family": "Manually annotated corpora", "Description": "This corpus contains 11 human translations of George Orwell’s Nineteen Eighty-Four, as well as the original text. The corpus is morphosyntactically tagged following the MULTEXT-East Version 4 tagset.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], + "Language": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], "Licence": "CC BY-NC-SA 4.0", "Size": ["80,000 sentences", "1 million words"], "Annotation": ["morphosyntactic tagging", "lemmatisation", "sentence alignment"], diff --git a/corpora/manually-annotated-corpora/nkjp1m.json b/corpora/manually-annotated-corpora/nkjp1m.json index 9b91ed5..d6fce68 100644 --- a/corpora/manually-annotated-corpora/nkjp1m.json +++ b/corpora/manually-annotated-corpora/nkjp1m.json @@ -3,7 +3,7 @@ "URL": "http://clip.ipipan.waw.pl/NationalCorpusOfPolish", "Family": "Manually annotated corpora", "Description": "This corpus is a manually annotated subset of the National Corpus of Polish.\nThe corpus is available for download from the Computational Linguistics in Poland website.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "GNU GPL 3", "Size": ["1 million tokens"], "Annotation": ["morphosyntactic tagging"], diff --git 
a/corpora/manually-annotated-corpora/nl2sh.json b/corpora/manually-annotated-corpora/nl2sh.json index 92f2110..8be7c38 100644 --- a/corpora/manually-annotated-corpora/nl2sh.json +++ b/corpora/manually-annotated-corpora/nl2sh.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1822", "Family": "Manually annotated corpora", "Description": "This corpus can be used to build and evaluate methods for knowledge extraction and representation based on a semantic hypergraph. Each sentence has natural language annotations and a dedicated semantic hyperedge. The majority of the sentences used in this dataset are taken from the following sources:\n