From ef1d1dde8c04c0ce81c60412860535e554957272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6nig?= Date: Fri, 6 Sep 2024 10:42:05 +0200 Subject: [PATCH] reverted all fields to strings to mirror CSVs --- corpora/cmc-corpora/comere.json | 2 +- corpora/cmc-corpora/contemp-blogs.json | 2 +- corpora/cmc-corpora/didi.json | 2 +- corpora/cmc-corpora/do-chat.json | 2 +- corpora/cmc-corpora/ebay-petit.json | 2 +- corpora/cmc-corpora/global-web-en.json | 2 +- corpora/cmc-corpora/hs-fi-news.json | 2 +- corpora/cmc-corpora/janes-blog.json | 2 +- corpora/cmc-corpora/janes-forum.json | 2 +- corpora/cmc-corpora/janes-news.json | 2 +- corpora/cmc-corpora/janes-tweet.json | 2 +- corpora/cmc-corpora/janes-wiki.json | 2 +- corpora/cmc-corpora/litis.json | 2 +- corpora/cmc-corpora/macocu.json | 4 ++-- corpora/cmc-corpora/monitor-slo-trendi.json | 2 +- corpora/cmc-corpora/paisa.json | 4 ++-- corpora/cmc-corpora/pdrs.json | 2 +- corpora/cmc-corpora/sfnet.json | 2 +- corpora/cmc-corpora/suomi24.json | 2 +- corpora/cmc-corpora/ylilauta.json | 2 +- corpora/corpora-of-disordered-speech/ssnce-tamil.json | 2 +- corpora/historical-corpora/letter-sinebrychoff.json | 2 +- corpora/reference-corpora/bnc.json | 2 +- corpora/reference-corpora/conae.json | 2 +- corpora/reference-corpora/dereko.json | 2 +- corpora/reference-corpora/enc2019.json | 2 +- corpora/reference-corpora/gigafida.json | 2 +- corpora/reference-corpora/riznica.json | 2 +- corpora/reference-corpora/sonar.json | 2 +- 29 files changed, 31 insertions(+), 31 deletions(-) diff --git a/corpora/cmc-corpora/comere.json b/corpora/cmc-corpora/comere.json index e551af0..e0b2730 100644 --- a/corpora/cmc-corpora/comere.json +++ b/corpora/cmc-corpora/comere.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains e-mails, forum posts, online chats, tweets and SMS.\nThe corpus is available for download from Ortolang.", "Languages": ["fra"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["80 million tokens"], "Annotation": ["tokenised", "mostly untagged"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/contemp-blogs.json b/corpora/cmc-corpora/contemp-blogs.json index 36b7e15..a101216 100644 --- a/corpora/cmc-corpora/contemp-blogs.json +++ b/corpora/cmc-corpora/contemp-blogs.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts.\nThe corpus is available for download from LINDAT.", "Languages": ["ces"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["1 million tokens"], "Annotation": ["tokenised", "sentence tagged"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/didi.json b/corpora/cmc-corpora/didi.json index 1d65f62..423f015 100644 --- a/corpora/cmc-corpora/didi.json +++ b/corpora/cmc-corpora/didi.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus consists of Facebook posts gathered from 136 Facebook users from South Tyrol. All texts are anonymised.\nThe corpus is available for download from the EURAC Research CLARIN repository.", "Languages": ["deu","ita","eng","lad"], - "License": "https://gitlab.inf.unibz.it/commul/var/eurac-licenses/-/raw/v1.0/EULA-CLARIN-ACA-BY-NC-NORED.md", + "License": "ACA-BY-NC-NORED 1.0", "Size": ["600,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/do-chat.json b/corpora/cmc-corpora/do-chat.json index f5cdf79..8f9c818 100644 --- a/corpora/cmc-corpora/do-chat.json +++ b/corpora/cmc-corpora/do-chat.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains online chats from 2000 to 2006\nThe corpus is available for download from the repository of CLARIN-D", "Languages": ["deu"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["1 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/ebay-petit.json b/corpora/cmc-corpora/ebay-petit.json index d0d3f6e..bb5dab8 100644 --- a/corpora/cmc-corpora/ebay-petit.json +++ b/corpora/cmc-corpora/ebay-petit.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains eBay listings from 2005, 2017, and 2018. The corpus is manually annotated.\nThe corpus is available for download from a dedicated webpage.", "Languages": ["fra"], - "License": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "License": "CC-BY-NC-SA 4.0", "Size": ["100,000 tokens"], "Annotation": ["see here"], "Infrastructure": "Other", diff --git a/corpora/cmc-corpora/global-web-en.json b/corpora/cmc-corpora/global-web-en.json index a1550b9..d0a144c 100644 --- a/corpora/cmc-corpora/global-web-en.json +++ b/corpora/cmc-corpora/global-web-en.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains texts from web-pages in United States, Great Britain, Australia, India, and 16 other countries. About 60% of the texts come from blogs.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", "Languages": ["eng"], - "License": "FIXME CLARIN RES (download); CLARIN ACA (online)", + "License": "CLARIN RES (download); CLARIN ACA (online)", "Size": ["1.8 billion words", "1.8 million texts"], "Annotation": "", "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/hs-fi-news.json b/corpora/cmc-corpora/hs-fi-news.json index ec7589a..9a13c88 100644 --- a/corpora/cmc-corpora/hs-fi-news.json +++ b/corpora/cmc-corpora/hs-fi-news.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains the domestic news of the Helsingin Sanomat website and their comments from 5 September 2011 to 4 September 2012.\nThe corpus has been syntactically parsed using TDT alpha.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", "Languages": ["fin"], - "License": "FIXME CLARIN ACA – NC", + "License": "CLARIN ACA – NC", "Size": ["8 million tokens", "593,760 sentences", "93,602 texts"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/janes-blog.json b/corpora/cmc-corpora/janes-blog.json index 06a18bb..cfb3661 100644 --- a/corpora/cmc-corpora/janes-blog.json +++ b/corpora/cmc-corpora/janes-blog.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts from RTV Slovenija and Publishwall.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText", "Languages": ["slv"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["34 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/janes-forum.json b/corpora/cmc-corpora/janes-forum.json index fc8ec95..20a8277 100644 --- a/corpora/cmc-corpora/janes-forum.json +++ b/corpora/cmc-corpora/janes-forum.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from Avtomobilizem.com, MedOver.net and RTV Slovenija.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", "Languages": ["slv"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["47 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/janes-news.json b/corpora/cmc-corpora/janes-news.json index f98aa87..fb9427f 100644 --- a/corpora/cmc-corpora/janes-news.json +++ b/corpora/cmc-corpora/janes-news.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains news comments from RTV Slovenija, Mladina and Reporter.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", "Languages": ["slv"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["14 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/janes-tweet.json b/corpora/cmc-corpora/janes-tweet.json index f5b7ba0..a3ba710 100644 --- a/corpora/cmc-corpora/janes-tweet.json +++ b/corpora/cmc-corpora/janes-tweet.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains tweets written by Slovenian Twitter users from 2013 to 2017.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", "Languages": ["slv"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["139 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/janes-wiki.json b/corpora/cmc-corpora/janes-wiki.json index 02f0635..bf9baf3 100644 --- a/corpora/cmc-corpora/janes-wiki.json +++ b/corpora/cmc-corpora/janes-wiki.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains Slovenian Wikipedia user and talk pages.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", "Languages": ["slv"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["5 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/litis.json b/corpora/cmc-corpora/litis.json index 5c57fe5..39e905a 100644 --- a/corpora/cmc-corpora/litis.json +++ b/corpora/cmc-corpora/litis.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from portals delfi.lt and lrytas.lt from 2010 to 2014.\nThe corpus is available for download from the CLARIN-LT repository.", "Languages": ["lit"], - "License": "FIXME CLARIN_ACA", + "License": "CLARIN_ACA", "Size": ["190,000 comments"], "Annotation": "", "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/macocu.json b/corpora/cmc-corpora/macocu.json index 3141b44..4974357 100644 --- a/corpora/cmc-corpora/macocu.json +++ b/corpora/cmc-corpora/macocu.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "These corpora are a collection containing web texts and were built by crawling national internet top-level domains (specified below) and by extending the crawl dynamically to other domains as well. The crawler is available at MaCoCu GitHub channel. Considerable effort was devoted into cleaning the extracted text to provide a high-quality web corpus. This was achieved by removing boilerplate and near-duplicated paragraphs, discarding very short texts as well as texts that are not in the target language. Furthermore, samples from the largest 1,500 domains were manually checked and bad domains, such as machine-translated domains, were removed.\nThe dataset is characterized by extensive metadata which allows filtering the dataset based on text quality and other criteria, making the corpus highly useful for corpus linguistics studies, as well as for training language models and other language technologies. In XML format, each document is accompanied by the following metadata: title, crawl date, url, domain, file type of the original document, distribution of languages inside the document, and a fluency score based on a language model. The text of each document is divided into paragraphs that are accompanied by metadata on the information whether a paragraph is a heading or not, metadata on the paragraph quality (labels, such as \"short\" or \"good\", assigned based on paragraph length, URL and stopword density via the jusText tool) and fluency (score between 0 and 1, assigned with the Monocleaner tool), the automatically identified language of the text in the paragraph, and information whether the paragraph contains sensitive information (identified via the Biroamer tool). As opposed to the previous version in the case of corpora in version 2.0, this version has more accurate metadata on languages of the texts, which was achieved by using Google's Compact Language Detector 2 (CLD2), a high-performance language detector supporting many languages. Other tools, used for web corpora creation and curation, have been updated as well, resulting in an even cleaner, as well as larger corpus.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be easily read with the prevert parser.", "Languages": ["sqi","bos","bul","cat","hrv","ell","isl","mkd","mlt","cnr","srp","tur","ukr","slv"], - "License": "https://creativecommons.org/publicdomain/zero/1.0/", + "License": "CC0 No Rights Reserved", "Size": "", "Annotation": ["annotated with extensive metadata"], "Infrastructure": "CLARIN", @@ -24,5 +24,5 @@ "Download (Turkish)": "http://hdl.handle.net/11356/1802", "Download (Ukrainian)": "http://hdl.handle.net/11356/1838" }, - "Publication": "FIXME Bañón et al. (2022)" + "Publication": "Bañón et al. (2022)" } diff --git a/corpora/cmc-corpora/monitor-slo-trendi.json b/corpora/cmc-corpora/monitor-slo-trendi.json index 27d3a8d..69756cd 100644 --- a/corpora/cmc-corpora/monitor-slo-trendi.json +++ b/corpora/cmc-corpora/monitor-slo-trendi.json @@ -12,5 +12,5 @@ "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=trendi", "Concordancer(KonText)": "https://www.clarin.si/kontext/query?corpname=trendi" }, - "Publication":"FIXME:Kosem (2022)#SEPKosem et al. (2022)" + "Publication":"Kosem (2022)#SEPKosem et al. (2022)" } diff --git a/corpora/cmc-corpora/paisa.json b/corpora/cmc-corpora/paisa.json index 1a3dcd0..0dd4d04 100644 --- a/corpora/cmc-corpora/paisa.json +++ b/corpora/cmc-corpora/paisa.json @@ -4,12 +4,12 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains approximately 380,000 documents coming from about 1,000 different websites, for a total of about 250 million words. Approximately 260,000 documents are from Wikipedia, approx. 5,600 from other Wikimedia Foundation projects. About 9,300 documents come from Indymedia, and we estimate that about 65,000 documents come from blog services.\nThe corpus is available for download from the EURAC Research CLARIN repository.", "Languages": ["ita"], - "License": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "License": "CC-BY-NC-SA 4.0", "Size": ["380,000 pages", "250 million words"], "Annotation": "", "Infrastructure": "CLARIN", "Access": { "Download": "http://hdl.handle.net/20.500.12124/3" }, - "Publication": "FIXME https://aclanthology.org/W14-0406/" + "Publication": "" } diff --git a/corpora/cmc-corpora/pdrs.json b/corpora/cmc-corpora/pdrs.json index 0bc9123..988f56a 100644 --- a/corpora/cmc-corpora/pdrs.json +++ b/corpora/cmc-corpora/pdrs.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains texts from the web obtained by crawling the .rs domain. Crawling has been done in September and October 2022 with BootCat. As search terms, appr. 2,800 word forms with a frequency between 5,000 and 500,000 in srWaC have been used. The texts are deduplicated, cyrillic texts have been transliterated into the Latin alphabet. The linguistic processing was done with the CLASSLA package for tokenization, lemmatization and morpho-syntactic tagging (both MULTEXT-East and Universal Dependencies).\nIn addition, some 80% of the URLs are manually tagged for 10 different types of sources (\"area\"): media (media outlets with several posts daily), inform (topic-centered sites with infrequent posts - maximum 3 per day), company (presentations of companies), state (websites of government bodies on nationa, regional and local level), forum (forum posts), portal (topic-centered portals without daily coverage), science (scientific publications), shop (with descriptions of products), database (knowledge bases, dictionaries, databases and similar) and community (NGOs, fan clubs, associations and other). The corpus is distributed in the CoNLL-U format in batches of appr. 2x50 mio. tokens.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through noSketchEngine and KonText concordancers.", "Languages": ["srp"], - "License": "https://creativecommons.org/licenses/by/4.0/", + "License": "CC-BY", "Size": ["715 million tokens"], "Annotation": ["tokenised", "MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated for text source"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/sfnet.json b/corpora/cmc-corpora/sfnet.json index 7666106..ccd6161 100644 --- a/corpora/cmc-corpora/sfnet.json +++ b/corpora/cmc-corpora/sfnet.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains written posts from the SFNET forum in Finnish from 2002 to 2003.\nThe PoS-tagging has been done with the FI-FDG Parser, which uses a computational implementation of Functional Dependency Grammar.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank)", "Languages": ["fin"], - "License": "FIXME CLARIN ACA – NC", + "License": "CLARIN ACA – NC", "Size": ["100 million words"], "Annotation": ["PoS-tagged", "sentence and word segmentation"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/suomi24.json b/corpora/cmc-corpora/suomi24.json index 29f92d6..d16b251 100644 --- a/corpora/cmc-corpora/suomi24.json +++ b/corpora/cmc-corpora/suomi24.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from the Suomi24 website from 2001 to 2016.\nThe corpus is available for download from the FIN-CLARIN repository and through the concordancer Korp.", "Languages": ["fin"], - "License": "FIXME CLARIN ACA", + "License": "CLARIN ACA", "Size": ["2.6 billion tokens"], "Annotation": ["tokenised", "MSD-tagged"], "Infrastructure": "CLARIN", diff --git a/corpora/cmc-corpora/ylilauta.json b/corpora/cmc-corpora/ylilauta.json index b49de61..e89824c 100644 --- a/corpora/cmc-corpora/ylilauta.json +++ b/corpora/cmc-corpora/ylilauta.json @@ -4,7 +4,7 @@ "Family": "Computer-mediated communication corpora", "Description": "The corpus contains text from discussions of the Ylilauta online discussion board from 2012 to 2014.\nThe corpus has been syntactically annotated with the TDT alpha parser, while the named entities have been assigned using the FiNER tool.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", "Languages": ["fin"], - "License": "https://creativecommons.org/licenses/by-nc/4.0/", + "License": "CC-BY-NC", "Size": ["26.9 million words"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed", "named entities"], "Infrastructure": "CLARIN", diff --git a/corpora/corpora-of-disordered-speech/ssnce-tamil.json b/corpora/corpora-of-disordered-speech/ssnce-tamil.json index 2cb6daf..ede1b46 100644 --- a/corpora/corpora-of-disordered-speech/ssnce-tamil.json +++ b/corpora/corpora-of-disordered-speech/ssnce-tamil.json @@ -4,7 +4,7 @@ "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of Tamil Dysarthric Speech.\nThe corpus contains approximately eight hours of Tamil speech data, time-aligned transcripts and metadata collected from 30 speakers (20 dysarthric speakers and 10 non-dysarthric speakers).\nThe non-dysarthric speakers consisted of five female and five male subjects. The dysarthric speakers (7 female, 13 male) reported a diagnosis of cerebral palsy and ranged in age from 12 years old to 37 years ol.\nIn total, each speaker recorded 365 utterances consisting of single words and of sentences that included a combination of common and uncommon Tamil phrases.\nThe corpus includes time-aligned phonetic transcripts for all collected speech data. Additional documentation includes phoneme mappings and speaker metadata. Audio data is presented as 16-bit 16kHz FLAC compressed linear pcm wav. Transcripts are presented as UTF-8 encoded plain text.", "Languages": ["tam"], - "License": "https://catalog.ldc.upenn.edu/license/the-ssnce-database-of-tamil-dysarthric-speech-agreement.pdf", + "License": "LDC", "Size": ["30 speakers"], "Annotation": ["phonetic"], "Infrastructure": "Other", diff --git a/corpora/historical-corpora/letter-sinebrychoff.json b/corpora/historical-corpora/letter-sinebrychoff.json index d2e1fe3..526b653 100644 --- a/corpora/historical-corpora/letter-sinebrychoff.json +++ b/corpora/historical-corpora/letter-sinebrychoff.json @@ -6,7 +6,7 @@ "Languages": ["fin", "swe"], "License": "CC-BY", "Size": ["8.6 million words"], - "Annotation": ["FIXME Finnish subset: MSD-tagged, syntactically parsed; Swedish subset: no linguistic annotation"], + "Annotation": ["Finnish subset: MSD-tagged, syntactically parsed; Swedish subset: no linguistic annotation"], "Infrastructure": "CLARIN", "Access": { "Concordancer": "http://kirjearkisto.siff.fi/Sinebrychoff/tabid/55/Default.aspx" diff --git a/corpora/reference-corpora/bnc.json b/corpora/reference-corpora/bnc.json index c583451..8622f4a 100644 --- a/corpora/reference-corpora/bnc.json +++ b/corpora/reference-corpora/bnc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2554", "Family": "Reference corpora", "Description": "This corpus includes English texts (fiction, magazines, newspapers, and academic writing) published between 1980 and 1993.\nThe corpus is encoded in TEI. Non-linguistic metadata include contextual and bibliographic information. Aside from written materials, the corpus also includes transcriptions of spoken language.\nThe corpus is available for online browsing through a dedicated concordancer and can be downloaded from the Oxford Text Archive (CLARIN-UK).", - "Languages": ["FIXME eng (British)"], + "Languages": ["English (British)"], "License": "BNC User Licence (restricted for the downloadable version)", "Size": ["100 million words"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/conae.json b/corpora/reference-corpora/conae.json index e6687d6..8d2fdd9 100644 --- a/corpora/reference-corpora/conae.json +++ b/corpora/reference-corpora/conae.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2019031901", "Family": "Reference corpora", "Description": "This corpus includes American English texts evenly divided into the spoken, fiction, magazine, newspaper, and academic genres (around 88 million words each) published between 1990 and 2012.\nThe corpus is available for download from the Finnish Language Bank as well as for online browsing through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["FIXME eng (American)"], + "Languages": ["English (American)"], "License": ["CLARIN ACA (online version)", "CLARIN RES (downloadable version)"], "Size": ["440 million words", "190,000 texts"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/dereko.json b/corpora/reference-corpora/dereko.json index 9d3cd13..c07289e 100644 --- a/corpora/reference-corpora/dereko.json +++ b/corpora/reference-corpora/dereko.json @@ -4,7 +4,7 @@ "Family": "Reference corpora", "Description": "This corpus includes German texts in a wide variety of genres published from 1947 onwards. Non-linguistic metadata include rich bibliographic information and partial layout information.\nPart of the corpus is available for download from a dedicated webpage (CLARIN-D distribution), while the entire corpus can be queried online through the COSMAS II platform.", "Languages": ["deu"], - "License": "https://creativecommons.org/licenses/by-sa/4.0/", + "License": "CC-BY-SA", "Size": ["31.7 billion words"], "Annotation": ["MSD-tagged", "lemmatized"], "Infrastructure": "CLARIN", diff --git a/corpora/reference-corpora/enc2019.json b/corpora/reference-corpora/enc2019.json index 1072a49..ca2606e 100644 --- a/corpora/reference-corpora/enc2019.json +++ b/corpora/reference-corpora/enc2019.json @@ -4,7 +4,7 @@ "Family": "Reference corpora", "Description": "This corpus includes Estonian texts published between 1990 and 2019. Amongst others, this corpus contains the Estonian Reference Corpus as a subcorpus.\nThe corpus is available for download from META-SHARE (CELR distribution).", "Languages": ["est"], - "License": "https://creativecommons.org/licenses/by-sa/4.0/", + "License": "CC-BY-SA", "Size": ["1.5 billion words"], "Annotation": ["MSD-tagged", "lemmatized"], "Infrastructure": "CLARIN", diff --git a/corpora/reference-corpora/gigafida.json b/corpora/reference-corpora/gigafida.json index 6528712..d9e1821 100644 --- a/corpora/reference-corpora/gigafida.json +++ b/corpora/reference-corpora/gigafida.json @@ -4,7 +4,7 @@ "Family": "Reference corpora", "Description": "This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2018. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.\nThe corpus is available for online browsing through the noSketch Engine concordancer (CLARIN.SI distribution), as well as through a dedicated search engine.", "Languages": ["slv"], - "License": "https://viri.cjvt.si/gigafida/System/About", + "License": "Individual terms of agreement", "Size": ["1.3 billion tokens", "1.1 billion words", "38,310 texts"], "Annotation": ["MSD-tagged", "lemmatized"], "Infrastructure": "CLARIN", diff --git a/corpora/reference-corpora/riznica.json b/corpora/reference-corpora/riznica.json index dbec68a..987134b 100644 --- a/corpora/reference-corpora/riznica.json +++ b/corpora/reference-corpora/riznica.json @@ -4,7 +4,7 @@ "Family": "Reference corpora", "Description": "This corpus includes Croatian texts taken from fiction (28%) and specialised texts (72%).\nThe corpus is available for online browsing via noSketch Engine and KonText and for download from the CLARIN.SI repository.", "Languages": ["hrv"], - "License": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "License": "CC-BY-NC-SA 4.0", "Size": ["101.8 million tokens", "85.3 million words", "4.7 million sentences", "14,781 texts"], "Annotation": ["sentence segmented", "PoS-tagged", "lemmatized"], "Infrastructure": "CLARIN", diff --git a/corpora/reference-corpora/sonar.json b/corpora/reference-corpora/sonar.json index 945d28f..0bacfa4 100644 --- a/corpora/reference-corpora/sonar.json +++ b/corpora/reference-corpora/sonar.json @@ -4,7 +4,7 @@ "Family": "Reference corpora", "Description": "This corpus includes representative Dutch texts (fiction, brochures, magazines, legal texts, newspapers, parliamentary proceedings, and computer-mediated communication).\nAside from written materials, the corpus also contains transcriptions of spoken language. The corpus is encoded in FoLiA.\nThe corpus is available for online browsing through the OpenSONAR concordancer and can be downloaded from the Dutch Language Institute (CLARIAH-NL).", "Languages": ["nld"], - "License": "https://ivdnt.org/images/stories/producten/voorwaarden/voorwaarden_sonar-corpus.pdf", + "License": "Terms of Agreement", "Size": ["500 million words"], "Annotation": ["PoS-tagged", "lemmatized", "named entities", "coreference annotation and annotation of spatial and temporal relations for the manually annotated SoNaR-1 subset "], "Infrastructure": "CLARIN",