diff --git a/corpora/academic-corpora/aca-hum.json b/corpora/academic-corpora/aca-hum.json index 36c8f95..a735ac0 100644 --- a/corpora/academic-corpora/aca-hum.json +++ b/corpora/academic-corpora/aca-hum.json @@ -9,7 +9,7 @@ "Annotation": [], "Infrastructure": "CLARIN", "Access": { - "Concordancer": "https://spraakbanken.gu.se/korp/?corpus=sweachum" + "Concordancer": "https://spraakbanken.gu.se/korp/?corpus=sweachum", "Download": "http://hdl.handle.net/10794/49" }, "Publication":"" diff --git a/corpora/academic-corpora/aca-soc.json b/corpora/academic-corpora/aca-soc.json index 6ed0238..238d9f5 100644 --- a/corpora/academic-corpora/aca-soc.json +++ b/corpora/academic-corpora/aca-soc.json @@ -9,7 +9,7 @@ "Annotation": ["sentence segmentation"], "Infrastructure": "CLARIN", "Access": { - "Concordancer": "https://spraakbanken.gu.se/korp/?corpus=sweacsam" + "Concordancer": "https://spraakbanken.gu.se/korp/?corpus=sweacsam", "Download": "http://hdl.handle.net/10794/50" }, "Publication":"" diff --git a/corpora/academic-corpora/jezkor.json b/corpora/academic-corpora/jezkor.json index 6328c30..6d8af79 100644 --- a/corpora/academic-corpora/jezkor.json +++ b/corpora/academic-corpora/jezkor.json @@ -9,8 +9,8 @@ "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated for named entities and author/text metadata"], "Infrastructure": "CLARIN", "Access": { - "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=jezkor" - "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=jezkor" + "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=jezkor", + "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=jezkor", "Download": "http://hdl.handle.net/11356/1755" }, "Publication":"" diff --git a/corpora/academic-corpora/open-slo.json b/corpora/academic-corpora/open-slo.json index 5dc80cb..aecfd35 100644 --- a/corpora/academic-corpora/open-slo.json +++ 
b/corpora/academic-corpora/open-slo.json @@ -2,15 +2,15 @@ "Name": "Corpus of scientific texts from the Open Science Slovenia portal OSS 1.0", "URL": "http://hdl.handle.net/11356/1774", "Family": "Academic corpora", - "Description": "This corpus contains a large collection of scientific writing in the Slovenian language gathered from the Open Science Slovenia portal. It consists of over 150 thousand monographs, articles, diploma, master's and doctoral theses, advanced textbooks, reviews etc. mostly published between 2000 and 2022 by Slovenian universities, research institutions, etc. Texts are accompanied by metadata, i.e. author, supervisor (for theses), year of publication, publisher (mostly faculties of the various universities), type of publication (according to SICRIS classification), keywords, and CERIF and UDC codes. The texts were obtained directly from PDFs, so it should be noted that they can contain various types of character noise. The texts are linguistically annotated with the CLASSLA pipeline on the levels lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-spech and morphological features, and named entities. The corpus is distributed in CoNLL-U and vertical file formats, one file for each text. The text metadata is given as a TSV file.\nNote that there exist similar, but older and smaller corpora KAS 2.0 and KAS 1.0. These contain only theses and only up to 2018, but are cleaner and with more metadata. The repository also archives a number of KAS-derived datasets; pls. search for "KAS" to find them.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", + "Description": "This corpus contains a large collection of scientific writing in the Slovenian language gathered from the Open Science Slovenia portal. 
It consists of over 150 thousand monographs, articles, diploma, master's and doctoral theses, advanced textbooks, reviews etc. mostly published between 2000 and 2022 by Slovenian universities, research institutions, etc. Texts are accompanied by metadata, i.e. author, supervisor (for theses), year of publication, publisher (mostly faculties of the various universities), type of publication (according to SICRIS classification), keywords, and CERIF and UDC codes. The texts were obtained directly from PDFs, so it should be noted that they can contain various types of character noise. The texts are linguistically annotated with the CLASSLA pipeline on the levels of lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-speech and morphological features, and named entities. The corpus is distributed in CoNLL-U and vertical file formats, one file for each text. The text metadata is given as a TSV file.\nNote that there exist similar, but older and smaller corpora KAS 2.0 and KAS 1.0. These contain only theses and only up to 2018, but are cleaner and with more metadata. The repository also archives a number of KAS-derived datasets; pls. 
search for \"KAS\" to find them.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", "Languages": ["slv"], "License": "CC BY-SA", "Size": ["326 million tokens"], "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated for named entities and author/text metadata"], "Infrastructure": "CLARIN", "Access": { - "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=oss10" - "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=oss10" + "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=oss10", + "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=oss10", "Download": "http://hdl.handle.net/11356/1774" }, "Publication":"" diff --git a/corpora/academic-corpora/roysoc.json b/corpora/academic-corpora/roysoc.json index 2dbb212..56aeaef 100644 --- a/corpora/academic-corpora/roysoc.json +++ b/corpora/academic-corpora/roysoc.json @@ -9,7 +9,7 @@ "Annotation": ["PoS-tagged", "lemmatised", "normalised", "author and document metadata"], "Infrastructure": "CLARIN", "Access": { - "Concordancer": "http://fedora.clarin-d.uni-saarland.de/rsc_v4/access.html#cqpweb" + "Concordancer": "http://fedora.clarin-d.uni-saarland.de/rsc_v4/access.html#cqpweb", "Download": "http://fedora.clarin-d.uni-saarland.de/rsc_v4/access.html#download" }, "Publication": "https://www.zotero.org/groups/562080/items/FWYERQ4A" diff --git a/corpora/corpora-of-disordered-speech/adresso-challenge.json b/corpora/corpora-of-disordered-speech/adresso-challenge.json index 66af4ac..642a411 100644 --- a/corpora/corpora-of-disordered-speech/adresso-challenge.json +++ b/corpora/corpora-of-disordered-speech/adresso-challenge.json @@ -3,7 +3,7 @@ "URL": "https://sla.talkbank.org/TBB/dementia", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for 
the study of communication in dementia.\nAccess to the data in DementiaBank is password protected and restricted to members of the DementiaBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng", "deu", "cmn", "spa", "nan" (Taiwanese)], + "Languages": ["eng", "deu", "cmn", "spa", "Taiwanese"], "License": "email request for access", "Size": [], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/polish-cued.json b/corpora/corpora-of-disordered-speech/polish-cued.json index c8a9a84..26078de 100644 --- a/corpora/corpora-of-disordered-speech/polish-cued.json +++ b/corpora/corpora-of-disordered-speech/polish-cued.json @@ -9,7 +9,7 @@ "Annotation": ["CHAT format"], "Infrastructure": "CLARIN", "Access": { - "Download": ""https://hdl.handle.net/1839/dbcd8568-d17d-4861-94bb-aa553e943399 + "Download": "https://hdl.handle.net/1839/dbcd8568-d17d-4861-94bb-aa553e943399" }, "Publication": "" } diff --git a/corpora/historical-corpora/anno-cuneiform.json b/corpora/historical-corpora/anno-cuneiform.json index 9b2fbe3..3abe62e 100644 --- a/corpora/historical-corpora/anno-cuneiform.json +++ b/corpora/historical-corpora/anno-cuneiform.json @@ -8,7 +8,7 @@ "Size": ["1,600,563 tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged", "semantically annotated"], "Access": { - "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2019060601" + "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2019060601", "Download": "http://urn.fi/urn:nbn:fi:lb-2019111602" }, "Publication": "" diff --git a/corpora/historical-corpora/b4-hist-preach.json b/corpora/historical-corpora/b4-hist-preach.json index 9a22773..a453f6a 100644 --- a/corpora/historical-corpora/b4-hist-preach.json +++ b/corpora/historical-corpora/b4-hist-preach.json @@ -8,7 +8,7 @@ "Size": ["92,500 tokens"], "Annotation": ["tokenised", "syntactic and discursive annotation"], 
"Access": { - "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/sfb632" + "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/sfb632", "Download": "http://hdl.handle.net/11022/0000-0000-9B23-A" }, "Publication": "" diff --git a/corpora/historical-corpora/b4-ludolf.json b/corpora/historical-corpora/b4-ludolf.json index 31e9bc5..83bded3 100644 --- a/corpora/historical-corpora/b4-ludolf.json +++ b/corpora/historical-corpora/b4-ludolf.json @@ -8,7 +8,7 @@ "Size": ["6,690 tokens"], "Annotation": ["tokenised", "tagged for clause type and grammatical function"], "Access": { - "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/sfb632" + "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/sfb632", "Download": "http://hdl.handle.net/11022/0000-0000-9B22-B" }, "Publication": "" diff --git a/corpora/historical-corpora/b4-tatian.json b/corpora/historical-corpora/b4-tatian.json index 0bf5808..d162742 100644 --- a/corpora/historical-corpora/b4-tatian.json +++ b/corpora/historical-corpora/b4-tatian.json @@ -8,7 +8,7 @@ "Size": ["11,300 tokens"], "Annotation": ["tokenised", "MSD-tagged"], "Access": { - "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/sfb632" + "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/sfb632", "Download": "http://hdl.handle.net/11022/0000-0000-9B1E-1" }, "Publication": "" diff --git a/corpora/historical-corpora/dig-hist-slovene.json b/corpora/historical-corpora/dig-hist-slovene.json index 6982143..4a78332 100644 --- a/corpora/historical-corpora/dig-hist-slovene.json +++ b/corpora/historical-corpora/dig-hist-slovene.json @@ -8,7 +8,7 @@ "Size": ["17.7 million tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged"], "Access": { - "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=imp" + "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=imp", "Download": "http://hdl.handle.net/11356/1031" }, "Publication": "Erjavec (2015)." 
diff --git a/corpora/historical-corpora/ecco-tcp.json b/corpora/historical-corpora/ecco-tcp.json index 5982d1a..9ddf7a3 100644 --- a/corpora/historical-corpora/ecco-tcp.json +++ b/corpora/historical-corpora/ecco-tcp.json @@ -8,7 +8,7 @@ "Size": ["74 million tokens"], "Annotation": ["no linguistic annotation"], "Access": { - "Concordancer": "https://quod.lib.umich.edu/e/ecco/" + "Concordancer": "https://quod.lib.umich.edu/e/ecco/", "Download": "https://textcreationpartnership.org/tcp-texts/ecco-tcp-eighteenth-century-collections-online/" }, "Publication": "" diff --git a/corpora/historical-corpora/gysseling.json b/corpora/historical-corpora/gysseling.json index 42aaa24..0c7b621 100644 --- a/corpora/historical-corpora/gysseling.json +++ b/corpora/historical-corpora/gysseling.json @@ -8,7 +8,7 @@ "Size": ["1.5 million words"], "Annotation": ["PoS-tagged", "lemmatised"], "Access": { - "Concordancer": "https://corpusgysseling.ivdnt.org/corpus-frontend/Gysseling/search/" + "Concordancer": "https://corpusgysseling.ivdnt.org/corpus-frontend/Gysseling/search/", "Download": "http://hdl.handle.net/10032/tm-a2-j4" }, "Publication": "" diff --git a/corpora/historical-corpora/helsinki-eng.json b/corpora/historical-corpora/helsinki-eng.json index 30472ea..9888d44 100644 --- a/corpora/historical-corpora/helsinki-eng.json +++ b/corpora/historical-corpora/helsinki-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1477", "Family": "Historical corpora", "Description": "This corpus contains religious and fictional texts from 730 to 1710.\nSee the project page for a list of all the texts included in the corpus.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": [English (Old and Middle)], + "Languages": ["English (Old)", "English (Middle)"], "License": "Oxford Text Archive licence", "Size": ["240,000 words"], "Annotation": [], diff --git a/corpora/historical-corpora/hist-am-eng.json b/corpora/historical-corpora/hist-am-eng.json index 
8270d7b..213876e 100644 --- a/corpora/historical-corpora/hist-am-eng.json +++ b/corpora/historical-corpora/hist-am-eng.json @@ -3,8 +3,8 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017061925", "Family": "Historical corpora", "Description": "This corpus contains texts from 1810 to 2009.\nEach decade has roughly the same balance of fiction, popular magazine, newspaper, and non-fiction books.\nThe corpus is available through the concordancer Korp.", - "Languages": [English (American)], - "License": "CLARN ACA", + "Languages": ["English (American)"], + "License": "CLARIN ACA", "Size": ["385 million tokens"], "Annotation": ["tokenised"], "Access": { diff --git a/corpora/historical-corpora/late-modern-en-texts.json b/corpora/historical-corpora/late-modern-en-texts.json index 62ad2aa..d7bdb6b 100644 --- a/corpora/historical-corpora/late-modern-en-texts.json +++ b/corpora/historical-corpora/late-modern-en-texts.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0002-43F3-0", "Family": "Historical corpora", "Description": "This corpus contains texts written by British and Irish authors from 1710 to 1920.\nIn terms of genre, the texts correspond to narrative fiction and non-fiction, drama, letters, treatises, and miscellaneous written works.\nThe corpus is available for download from a CLARIN-D repository. 
", - "Languages": [English (Late Modern)], + "Languages": ["English (Late Modern)"], "License": "CC-BY-NC-SA 4.0", "Size": ["34 million words"], "Annotation": ["PoS-tagged"], diff --git a/corpora/historical-corpora/latinise.json b/corpora/historical-corpora/latinise.json index dd406aa..2f346ea 100644 --- a/corpora/historical-corpora/latinise.json +++ b/corpora/historical-corpora/latinise.json @@ -8,7 +8,7 @@ "Size": ["13.3 million tokens"], "Annotation": ["sentence segmented", "PoS-tagged", "lemmatized"], "Access": { - "Concordancer": "https://app.sketchengine.eu/#dashboard?corpname=preloaded%2Flatinise_4" + "Concordancer": "https://app.sketchengine.eu/#dashboard?corpname=preloaded%2Flatinise_4", "Download": "http://hdl.handle.net/11372/LRT-3170" }, "Publication": "McGillivray and Kilgarriff (2015)" diff --git a/corpora/historical-corpora/menota.json b/corpora/historical-corpora/menota.json index 18799ce..7e2bfde 100644 --- a/corpora/historical-corpora/menota.json +++ b/corpora/historical-corpora/menota.json @@ -3,12 +3,12 @@ "URL": "http://clarino.uib.no/menota/page", "Family": "Historical corpora", "Description": "This corpus contains Medieval Nordic texts.\nThe corpus is available for download and through the concordancer Corpuscle.", - "Languages": [Old Norse], + "Languages": ["Old Norse"], "License": "CC-BY", "Size": ["1.6 million tokens"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], "Access": { - "Concordancer": "http://clarino.uib.no/menota/concordance" + "Concordancer": "http://clarino.uib.no/menota/concordance", "Download": "http://clarino.uib.no/menota/catalogue" }, "Publication": "" diff --git a/corpora/historical-corpora/old-bailey.json b/corpora/historical-corpora/old-bailey.json index 0d78414..3c9787d 100644 --- a/corpora/historical-corpora/old-bailey.json +++ b/corpora/historical-corpora/old-bailey.json @@ -3,12 +3,12 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CFB-2", "Family": "Historical corpora", "Description": "This 
corpus contains proceedings of the Old Bailey (i.e., legal documents) from 1674 to 1913.\nThe corpus is available for download from the CLARIN-D repository and through the CQPConcordancer.\nFor the corpus manual, see Huber et al. (2016).", - "Languages": [English (Late Modern)], + "Languages": ["English (Late Modern)"], "License": "CC-BY-NC-SA 4.0", "Size": ["134 million words"], "Annotation": ["detailed sociobiographical, pragmatic and textual annotation"], "Access": { - "Concordancer": "http://corpora.clarin-d.uni-saarland.de/cqpweb" + "Concordancer": "http://corpora.clarin-d.uni-saarland.de/cqpweb", "Download": "http://fedora.clarin-d.uni-saarland.de/oldbailey/downloads.html" }, "Publication": "" diff --git a/corpora/historical-corpora/old-hungarian.json b/corpora/historical-corpora/old-hungarian.json index cfc05cc..36f5ce6 100644 --- a/corpora/historical-corpora/old-hungarian.json +++ b/corpora/historical-corpora/old-hungarian.json @@ -8,7 +8,7 @@ "Size": ["3 million tokens"], "Annotation": ["tokenised", "partially normalized", "partially MSD-tagged"], "Access": { - "Concordancer": "http://oldhungariancorpus.nytud.hu/en-search.html" + "Concordancer": "http://oldhungariancorpus.nytud.hu/en-search.html", "Download": "http://oldhungariancorpus.nytud.hu/en-codices.html" }, "Publication": "" diff --git a/corpora/historical-corpora/parsed-hist-pt.json b/corpora/historical-corpora/parsed-hist-pt.json index 638fdff..cfd67cb 100644 --- a/corpora/historical-corpora/parsed-hist-pt.json +++ b/corpora/historical-corpora/parsed-hist-pt.json @@ -8,7 +8,7 @@ "Size": ["3.3 million"], "Annotation": ["tokenised", "PoS-tagged (2 million)", "treebanked (1.2 million)"], "Access": { - "Concordancer": "http://www.tycho.iel.unicamp.br/~tycho/corpus/texts/csquery/en/csquery.html" + "Concordancer": "http://www.tycho.iel.unicamp.br/~tycho/corpus/texts/csquery/en/csquery.html", "Download": "http://www.tycho.iel.unicamp.br/~tycho/corpus/en/index.html" }, "Publication": "" diff --git 
a/corpora/historical-corpora/poldilemma.json b/corpora/historical-corpora/poldilemma.json index 27d63f2..8e5f9ff 100644 --- a/corpora/historical-corpora/poldilemma.json +++ b/corpora/historical-corpora/poldilemma.json @@ -1,7 +1,7 @@ { - "Name": ""PolDiLemma" Middle Polish Diachrone Lemmatised Corpus", + "Name": "\"PolDiLemma\" Middle Polish Diachrone Lemmatised Corpus", "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8C44-B", - "Family": "Historical corpora", + "Family": "Historical corpora", "Description": "This corpus contains political, religious and scientific texts from the 16th to the 18th century.\nThe corpus is available for download from the CLARIN-D repository.", "Languages": ["ces","lat","deu","pol"], "License": "CC BY-NC-SA 4.0", diff --git a/corpora/historical-corpora/ref-hist-slovene.json b/corpora/historical-corpora/ref-hist-slovene.json index 8be3c66..f3d3042 100644 --- a/corpora/historical-corpora/ref-hist-slovene.json +++ b/corpora/historical-corpora/ref-hist-slovene.json @@ -8,7 +8,7 @@ "Size": ["300,000 tokens"], "Annotation": ["manually tokenised", "lemmatised", "PoS-tagged", "modern synonyms for archaic words"], "Access": { - "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=goo300k" + "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=goo300k", "Download": "http://hdl.handle.net/11356/1025" }, "Publication": "Erjavec (2012)." diff --git a/corpora/historical-corpora/ref-mhd.json b/corpora/historical-corpora/ref-mhd.json index 0cbb843..1b0596f 100644 --- a/corpora/historical-corpora/ref-mhd.json +++ b/corpora/historical-corpora/ref-mhd.json @@ -8,7 +8,7 @@ "Size": ["2.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "normalised", "morphosyntactic description"], "Access": { - "Concordancer": "http://www.deutschestextarchiv.de/" + "Concordancer": "http://www.deutschestextarchiv.de/", "Download": "http://deutschestextarchiv.de/rem/" }, "Publication": "Klein and Dipper (2016)." 
diff --git a/corpora/historical-corpora/ref-mid-low-de.json b/corpora/historical-corpora/ref-mid-low-de.json index fed53c9..546dd99 100644 --- a/corpora/historical-corpora/ref-mid-low-de.json +++ b/corpora/historical-corpora/ref-mid-low-de.json @@ -8,7 +8,7 @@ "Size": ["200,700 tokens"], "Annotation": ["tokenised", "MSD-tagged"], "Access": { - "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/#_c=UmVOXzIwMTctMDYtMTU" + "Concordancer": "http://annis.corpora.uni-hamburg.de:8080/gui/#_c=UmVOXzIwMTctMDYtMTU", "Download": "http://hdl.handle.net/11022/0000-0007-C64C-5" }, "Publication": "Schröder (2014)." diff --git a/corpora/historical-corpora/roysoc-corp.json b/corpora/historical-corpora/roysoc-corp.json index 160e64a..da4794d 100644 --- a/corpora/historical-corpora/roysoc-corp.json +++ b/corpora/historical-corpora/roysoc-corp.json @@ -8,7 +8,7 @@ "Size": ["35 million tokens"], "Annotation": ["PoS-tagged using PennTreebank tagset", "lemmatised", "normalised"], "Access": { - "Concordancer": "http://fedora.clarin-d.uni-saarland.de/rsc_v4/access.html#cqpweb" + "Concordancer": "http://fedora.clarin-d.uni-saarland.de/rsc_v4/access.html#cqpweb", "Download": "http://fedora.clarin-d.uni-saarland.de/rsc_v4/access.html#download" }, "Publication": "" diff --git a/corpora/historical-corpora/saga.json b/corpora/historical-corpora/saga.json index f62a334..ef021e9 100644 --- a/corpora/historical-corpora/saga.json +++ b/corpora/historical-corpora/saga.json @@ -3,12 +3,12 @@ "URL": "https://clarin.is/en/resources/sagacorpus/", "Family": "Historical corpora", "Description": "This corpus contains Old Icelandic (Old Norse) Narrative texts from the 13th to the 15th century.\nThe corpus is available for download from CLARIN-IS and for search through the concordancer Korp.", - "Languages": [Icelandic (Old)], + "Languages": ["Icelandic (Old)"], "License": "CC-BY 4.0", "Size": ["1.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "normalized orthography"], 
"Access": { - "Concordancer": "https://malheildir.arnastofnun.is/?mode=fornrit#?lang=en&stats_reduce=word&isCaseInsensitive&searchBy=word&cqp=%5B%5D" + "Concordancer": "https://malheildir.arnastofnun.is/?mode=fornrit#?lang=en&stats_reduce=word&isCaseInsensitive&searchBy=word&cqp=%5B%5D", "Download": "http://www.malfong.is/index.php?dlid=2&lang=en" }, "Publication": "Rögnvaldsson and Helgadóttir (2011)" diff --git a/corpora/historical-corpora/sprakbanken-hist.json b/corpora/historical-corpora/sprakbanken-hist.json index 48d0e33..e248bb5 100644 --- a/corpora/historical-corpora/sprakbanken-hist.json +++ b/corpora/historical-corpora/sprakbanken-hist.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/korp/?mode=all_hist#?lang=en&stats_reduce=word&cqp=%5B%5D", "Family": "Historical corpora", "Description": "This collection of corpora contains – among others – diachronic legal texts, Bible translations, medieval letters, digitized newspapers from the Swedish National Library and 19th century fiction from the Swedish Literature Bank.\nThe corpora are available through the concordancer Korp.", - "Languages": [FIXME Swedish, German, French and others], + "Languages": ["swe", "deu", "fra", "and others"], "License": "CC-BY", "Size": ["1.34 billion tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "syntactically parsed", "word sense (for materials more recent than 1800)"], diff --git a/corpora/historical-corpora/yu1parl.json b/corpora/historical-corpora/yu1parl.json index a5c450c..b9ae7a5 100644 --- a/corpora/historical-corpora/yu1parl.json +++ b/corpora/historical-corpora/yu1parl.json @@ -8,8 +8,8 @@ "Size": ["34,542 utterances", "578,958 sentences", "13,271,885 words", "15,403 pages"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], "Access": { - "Concordancer (noSketch)": "https://www.clarin.si/ske/#dashboard?corpname=yu1parl" - "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=yu1parl" + "Concordancer (noSketch)": 
"https://www.clarin.si/ske/#dashboard?corpname=yu1parl", + "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=yu1parl", "Download": "http://hdl.handle.net/11356/1845" }, "Publication": "" diff --git a/corpora/literary-corpora/anth-me.json b/corpora/literary-corpora/anth-me.json index 5e2def0..dbf1ff6 100644 --- a/corpora/literary-corpora/anth-me.json +++ b/corpora/literary-corpora/anth-me.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1398", "Family": "Literary corpora", "Description": "This corpus contains literary texts from 1100 to 1400.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["enm"), "heb"], + "Languages": ["enm", "heb"], "License": "Oxford Text Archive Licence", "Size": ["4,000 words"], "Annotation": [], diff --git a/corpora/literary-corpora/bonnier-one.json b/corpora/literary-corpora/bonnier-one.json index afa4218..b1de7cf 100644 --- a/corpora/literary-corpora/bonnier-one.json +++ b/corpora/literary-corpora/bonnier-one.json @@ -8,7 +8,7 @@ "Size": ["6,578,675 tokens", "462,625 sentences"], "Annotation": ["sentence scrambling"], "Access": { - "Browse": "https://spraakbanken.gu.se/korp/#corpus=romi" + "Browse": "https://spraakbanken.gu.se/korp/#corpus=romi", "Download": "http://hdl.handle.net/10794/115" }, "Publication": "" diff --git a/corpora/literary-corpora/bonnier-two.json b/corpora/literary-corpora/bonnier-two.json index eadbf3d..0cd2b18 100644 --- a/corpora/literary-corpora/bonnier-two.json +++ b/corpora/literary-corpora/bonnier-two.json @@ -8,7 +8,7 @@ "Size": ["4,304,271 tokens", "298,361 sentences"], "Annotation": ["sentence scrambling"], "Access": { - "Browse": "https://spraakbanken.gu.se/korp/#corpus=romii" + "Browse": "https://spraakbanken.gu.se/korp/#corpus=romii", "Download": "http://hdl.handle.net/10794/116" }, "Publication": "" diff --git a/corpora/literary-corpora/ceal.json b/corpora/literary-corpora/ceal.json index d5c6719..2b0e8da 100644 --- 
a/corpora/literary-corpora/ceal.json +++ b/corpora/literary-corpora/ceal.json @@ -8,7 +8,7 @@ "Size": ["3 novels", "484,010 tokens"], "Annotation": ["MSD-tagged", "syntactically parsed"], "Access": { - "Browse (original)": "http://urn.fi/urn:nbn:fi:lb-2018011201" + "Browse (original)": "http://urn.fi/urn:nbn:fi:lb-2018011201", "Browse (scrambled)": "http://urn.fi/urn:nbn:fi:lb-2018011202" }, "Publication": "" diff --git a/corpora/literary-corpora/joh-jen.json b/corpora/literary-corpora/joh-jen.json index 2811d6a..1d718db 100644 --- a/corpora/literary-corpora/joh-jen.json +++ b/corpora/literary-corpora/joh-jen.json @@ -8,7 +8,7 @@ "Size": ["1,760,093 words", "8,489 pages"], "Annotation": ["unannotated"], "Access": { - "Browse": "http://johannesvjensen.dk/jensenonline/liste-over-vaerker/" + "Browse": "http://johannesvjensen.dk/jensenonline/liste-over-vaerker/", "Download": "http://hdl.handle.net/20.500.12115/20" }, "Publication": "" diff --git a/corpora/literary-corpora/kdsp.json b/corpora/literary-corpora/kdsp.json index 0def583..72abb0e 100644 --- a/corpora/literary-corpora/kdsp.json +++ b/corpora/literary-corpora/kdsp.json @@ -8,8 +8,8 @@ "Size": ["262 texts", "11 million words", "14 million tokens"], "Annotation": ["MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated with author and text metadata"], "Access": { - "Browse (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=kdsp" - "Browse (KonText)": "https://www.clarin.si/kontext/query?corpname=kdsp" + "Browse (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=kdsp", + "Browse (KonText)": "https://www.clarin.si/kontext/query?corpname=kdsp", "Download": "http://hdl.handle.net/11356/1823" }, "Publication": "" diff --git a/corpora/literary-corpora/multext1984.json b/corpora/literary-corpora/multext1984.json index f0b427b..d903cf2 100644 --- a/corpora/literary-corpora/multext1984.json +++ b/corpora/literary-corpora/multext1984.json @@ -1,8 +1,8 @@ { - "Name": "MULTEXT-East "1984" 
annotated corpus 4.0 ", + "Name": "MULTEXT-East \"1984\" annotated corpus 4.0 ", "URL": "http://hdl.handle.net/11356/1043", "Family": "Literary corpora", - "Description": "This is Parallel corpus of George Orwell's 1984 and its translations.\nThe corpus is available for download from CLARIN.SI.", + "Description": "This is a parallel corpus of George Orwell's 1984 and its translations.\nThe corpus is available for download from CLARIN.SI.", "Languages": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], "License": "CC BY-NC SA 4.0", "Size": ["12 texts", "79,718 sentences", "1,064,424 words"], diff --git a/corpora/literary-corpora/orig-est.json b/corpora/literary-corpora/orig-est.json index 1bc7ef3..a04ac39 100644 --- a/corpora/literary-corpora/orig-est.json +++ b/corpora/literary-corpora/orig-est.json @@ -2,7 +2,7 @@ "Name": "Collection of older original Estonian-language works of fiction", "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00088L", "Family": "Literary corpora", - "Description": "This corpus collects older Estonian literary texts published on "Kreutzwald's Century: the Estonian Cultural History Web". The electronically republished books, included in the collection, are based on the first editions of works by more important Estonian authors, published in 1854-1944.\nThe corpus is available for online browsing through an external interface.", + "Description": "This corpus collects older Estonian literary texts published on \"Kreutzwald's Century: the Estonian Cultural History Web\". 
The electronically republished books, included in the collection, are based on the first editions of works by more important Estonian authors, published in 1854-1944.\nThe corpus is available for online browsing through an external interface.", "Languages": ["est"], "License": "CLARIN ACA", "Size": ["173 texts"], diff --git a/corpora/literary-corpora/sol.json b/corpora/literary-corpora/sol.json index b240d04..e43b6b7 100644 --- a/corpora/literary-corpora/sol.json +++ b/corpora/literary-corpora/sol.json @@ -8,7 +8,7 @@ "Size": ["1,267,391 tokens", "69,270 sentences"], "Annotation": ["sentence scrambled"], "Access": { - "Browse": "https://spraakbanken.gu.se/korp/?mode=spanish#?corpus=one71" + "Browse": "https://spraakbanken.gu.se/korp/?mode=spanish#?corpus=one71", "Download": "http://hdl.handle.net/10794/80" }, "Publication": "" diff --git a/corpora/literary-corpora/strindberg.json b/corpora/literary-corpora/strindberg.json index 7f53f16..f4e7844 100644 --- a/corpora/literary-corpora/strindberg.json +++ b/corpora/literary-corpora/strindberg.json @@ -8,7 +8,7 @@ "Size": ["4,309,037 tokens", "321,759 sentences"], "Annotation": ["sentence scrambling"], "Access": { - "Browse": "https://spraakbanken.gu.se/korp/#?corpus=strindbergromaner" + "Browse": "https://spraakbanken.gu.se/korp/#?corpus=strindbergromaner", "Download": "http://hdl.handle.net/10794/79" }, "Publication": "" diff --git a/corpora/newspaper-corpora/8-sidor.json b/corpora/newspaper-corpora/8-sidor.json index eb6d946..567b34f 100644 --- a/corpora/newspaper-corpora/8-sidor.json +++ b/corpora/newspaper-corpora/8-sidor.json @@ -2,7 +2,7 @@ "Name": "8 sidor", "URL": "https://spraakbanken.gu.se/eng/resource/attasidor", "Family": "Newspaper corpora", - "Description": "This corpus contains articles from the Swedish newspaper 8 sidor from 2003 to 2012.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", + "Description": "This corpus contains articles from the 
Swedish newspaper 8 sidor from 2003 to 2012.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", "Languages": ["swe"], "License": "CC-BY", "Size": ["678,000 tokens"], diff --git a/corpora/newspaper-corpora/contemp-serbian.json b/corpora/newspaper-corpora/contemp-serbian.json index b02011a..927eef5 100644 --- a/corpora/newspaper-corpora/contemp-serbian.json +++ b/corpora/newspaper-corpora/contemp-serbian.json @@ -8,7 +8,7 @@ "Size": ["916 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], "Access": { - "Special": "For access, contact the resource manager." + "Special": "For access, contact the resource manager." }, "Publication": "" } diff --git a/corpora/newspaper-corpora/dn-1987.json b/corpora/newspaper-corpora/dn-1987.json index 77deab9..e879140 100644 --- a/corpora/newspaper-corpora/dn-1987.json +++ b/corpora/newspaper-corpora/dn-1987.json @@ -2,7 +2,7 @@ "Name": "DN 1987", "URL": "https://spraakbanken.gu.se/eng/resource/dn1987", "Family": "Newspaper corpora", - "Description": "This corpus contains articles from the Swedish newspaper Dagens Nyheter from 1987.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", + "Description": "This corpus contains articles from the Swedish newspaper Dagens Nyheter from 1987.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", "Languages": ["swe"], "License": "CC-BY", "Size": ["5 million tokens"], diff --git a/corpora/newspaper-corpora/makedonia.json b/corpora/newspaper-corpora/makedonia.json index 440b492..afc89e0 100644 --- a/corpora/newspaper-corpora/makedonia.json +++ b/corpora/newspaper-corpora/makedonia.json @@ -1,5 +1,5 @@ { - "Name": "Modern Greek Texts Corpus - "Makedonia" newspaper", + "Name": "Modern Greek Texts Corpus - \"Makedonia\" newspaper", "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-24FB-D", "Family": "Newspaper 
corpora", "Description": "This corpus contains newspaper articles in various topics (politics, economy, sports).\nThe corpus is available for download from the CLARIN:EL repository.", diff --git a/corpora/newspaper-corpora/parallel-global.json b/corpora/newspaper-corpora/parallel-global.json index 7bf500a..88a5fd6 100644 --- a/corpora/newspaper-corpora/parallel-global.json +++ b/corpora/newspaper-corpora/parallel-global.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25DD-E", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the https://globalvoices.org/ website, where volunteers publish and translate news stories in more than 40 languages.", - "Languages": [40 languages], + "Languages": ["40 languages"], "License": "CC BY", "Size": ["8 million units"], "Annotation": [], diff --git a/corpora/newspaper-corpora/setimes.json b/corpora/newspaper-corpora/setimes.json index 89399f5..d19812f 100644 --- a/corpora/newspaper-corpora/setimes.json +++ b/corpora/newspaper-corpora/setimes.json @@ -1,6 +1,6 @@ { - "Name": ""SETIMES - A parallel corpus of the Balkan languages, - "URL": ""http://hdl.grnet.gr/11500/ATHENA-0000-0000-2591-2, + "Name": "SETIMES - A parallel corpus of the Balkan languages", + "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2591-2", "Family": "Newspaper corpora", "Description": "This parallel corpus contains online news articles extracted from the SETimes webpage.\nThe corpus is available for download from the CLARIN:EL repository.", "Languages": ["ron","tur","srp","eng","bul", "mkd", "hrv","ell","sqi"], diff --git a/corpora/newspaper-corpora/ta-nea.json b/corpora/newspaper-corpora/ta-nea.json index 72c9503..484beb6 100644 --- a/corpora/newspaper-corpora/ta-nea.json +++ b/corpora/newspaper-corpora/ta-nea.json @@ -1,5 +1,5 @@ { - "Name": "Modern Greek Texts Corpus - "Ta Nea" newspaper", + "Name": "Modern Greek Texts Corpus - \"Ta Nea\" newspaper", "URL": 
"http://hdl.grnet.gr/11500/KEG-0000-0000-24F9-F", "Family": "Newspaper corpora", "Description": "This corpus contains newspaper articles in various topics (politics, economy, sports).\nThe corpus is available for download from the CLARIN:EL repository.", diff --git a/corpora/newspaper-corpora/timed-jsi-web.json b/corpora/newspaper-corpora/timed-jsi-web.json index 6be8255..42caf6d 100644 --- a/corpora/newspaper-corpora/timed-jsi-web.json +++ b/corpora/newspaper-corpora/timed-jsi-web.json @@ -3,7 +3,7 @@ "URL": "https://www.sketchengine.co.uk/jozef-stefan-institute-newsfeed-corpus/", "Family": "Newspaper corpora", "Description": "This corpus contains articles from newsfeed from 2014 to 2017.\nThe corpus is available through noSketchEingine.", - "Languages": [18 languages], + "Languages": ["18 languages"], "License": "", "Size": ["35 billion tokens"], "Annotation": ["tokenised", "PoS-tagged"], diff --git a/corpora/newspaper-corpora/tuebingen-tree.json b/corpora/newspaper-corpora/tuebingen-tree.json index 0c67c7a..2832726 100644 --- a/corpora/newspaper-corpora/tuebingen-tree.json +++ b/corpora/newspaper-corpora/tuebingen-tree.json @@ -6,7 +6,7 @@ "Languages": ["deu"], "License": "CLARIN RES", "Size": ["1.8 million tokens"], - "Annotation": ["tokenised", "MSD tagged", "lemmatised", "syntactic constituency", "named-entities"], + "Annotation": ["tokenised","MSD tagged","lemmatised","syntactic constituency","named-entities"], "Access": { "Concordancer": "https://weblicht.sfs.uni-tuebingen.de/weblichtwiki/index.php/Tundra" }, diff --git a/corpora/newspaper-corpora/zurich.json b/corpora/newspaper-corpora/zurich.json index d8c16fa..b5869b1 100644 --- a/corpora/newspaper-corpora/zurich.json +++ b/corpora/newspaper-corpora/zurich.json @@ -8,7 +8,7 @@ "Size": ["1.6 million tokens"], "Annotation": ["tokenised"], "Access": { - "Special": "For access, contact the authors." + "Special": "For access, contact the authors." }, "Publication": "" }