From 24c9f1ef9dd28e2f6928110776a1c16b9440d81e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6nig?= Date: Thu, 17 Oct 2024 12:14:19 +0200 Subject: [PATCH] Languages -> Language --- corpora/academic-corpora/ac-lit.json | 2 +- corpora/academic-corpora/aca-hum.json | 2 +- corpora/academic-corpora/aca-soc.json | 2 +- corpora/academic-corpora/acl-anth.json | 2 +- corpora/academic-corpora/acnz.json | 2 +- corpora/academic-corpora/chambers-lb.json | 2 +- corpora/academic-corpora/czec-soc.json | 2 +- corpora/academic-corpora/eng-sci.json | 2 +- corpora/academic-corpora/est-sci.json | 2 +- corpora/academic-corpora/genia.json | 2 +- corpora/academic-corpora/jezkor.json | 2 +- corpora/academic-corpora/kas.json | 2 +- corpora/academic-corpora/kiap.json | 2 +- corpora/academic-corpora/lit-trans.json | 2 +- corpora/academic-corpora/modern-greek.json | 2 +- corpora/academic-corpora/muchmore.json | 2 +- corpora/academic-corpora/open-slo.json | 2 +- corpora/academic-corpora/orossimo.json | 2 +- corpora/academic-corpora/reading.json | 2 +- corpora/academic-corpora/roger.json | 2 +- corpora/academic-corpora/roysoc.json | 2 +- corpora/academic-corpora/scientext.json | 2 +- corpora/academic-corpora/span-eng.json | 2 +- corpora/academic-corpora/ufal-papers.json | 2 +- corpora/academic-corpora/uh-eng.json | 2 +- corpora/academic-corpora/uh-fin.json | 2 +- corpora/academic-corpora/uh-fra.json | 2 +- corpora/academic-corpora/uh-ger.json | 2 +- corpora/academic-corpora/uh-rus.json | 2 +- corpora/academic-corpora/uh-spa.json | 2 +- corpora/academic-corpora/uh-swe.json | 2 +- corpora/cmc-corpora/comere.json | 2 +- corpora/cmc-corpora/contemp-blogs.json | 2 +- corpora/cmc-corpora/dereko-news-wiki.json | 2 +- corpora/cmc-corpora/didi.json | 2 +- corpora/cmc-corpora/do-chat.json | 2 +- corpora/cmc-corpora/dwds-blogs.json | 2 +- corpora/cmc-corpora/ebay-petit.json | 2 +- corpora/cmc-corpora/flemish-teen-talk.json | 2 +- corpora/cmc-corpora/global-web-en.json | 2 +- 
corpora/cmc-corpora/heid.json | 2 +- corpora/cmc-corpora/hs-fi-news.json | 2 +- corpora/cmc-corpora/janes-blog.json | 2 +- corpora/cmc-corpora/janes-forum.json | 2 +- corpora/cmc-corpora/janes-news.json | 2 +- corpora/cmc-corpora/janes-tweet.json | 2 +- corpora/cmc-corpora/janes-wiki.json | 2 +- corpora/cmc-corpora/litis.json | 2 +- corpora/cmc-corpora/macocu.json | 2 +- corpora/cmc-corpora/mixed-newmedia.json | 2 +- corpora/cmc-corpora/monitor-at-tweets.json | 2 +- corpora/cmc-corpora/monitor-slo-trendi.json | 2 +- corpora/cmc-corpora/ntap-en.json | 2 +- corpora/cmc-corpora/ntap-fr.json | 2 +- corpora/cmc-corpora/paisa.json | 2 +- corpora/cmc-corpora/pdrs.json | 2 +- corpora/cmc-corpora/sfnet.json | 2 +- corpora/cmc-corpora/sms4science.json | 2 +- corpora/cmc-corpora/sonar-newmedia.json | 2 +- corpora/cmc-corpora/suomi24.json | 2 +- corpora/cmc-corpora/welsh-tweets.json | 2 +- corpora/cmc-corpora/whatsup-ch.json | 2 +- corpora/cmc-corpora/ylilauta.json | 2 +- corpora/corpora-of-disordered-speech/adhd-uva.json | 2 +- corpora/corpora-of-disordered-speech/adresso-challenge.json | 2 +- corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json | 2 +- corpora/corpora-of-disordered-speech/aphasiabank.json | 2 +- corpora/corpora-of-disordered-speech/asdbank.json | 2 +- .../corpora-of-disordered-speech/bil-deaf-ru-kentalis.json | 2 +- corpora/corpora-of-disordered-speech/cleft-dataset.json | 2 +- corpora/corpora-of-disordered-speech/copas.json | 2 +- corpora/corpora-of-disordered-speech/deaf-adults-ru.json | 2 +- .../corpora-of-disordered-speech/demcorpus-basilicata.json | 2 +- corpora/corpora-of-disordered-speech/ewa-db.json | 2 +- corpora/corpora-of-disordered-speech/fluencybank.json | 2 +- corpora/corpora-of-disordered-speech/itaasd.json | 2 +- corpora/corpora-of-disordered-speech/oplon.json | 2 +- corpora/corpora-of-disordered-speech/perceptual-voice-q.json | 2 +- corpora/corpora-of-disordered-speech/phonologyt-project.json | 2 +- 
corpora/corpora-of-disordered-speech/plan-v-aphasia.json | 2 +- corpora/corpora-of-disordered-speech/polish-cued.json | 2 +- corpora/corpora-of-disordered-speech/psychosisbank.json | 2 +- corpora/corpora-of-disordered-speech/raput.json | 2 +- corpora/corpora-of-disordered-speech/rhdbank.json | 2 +- corpora/corpora-of-disordered-speech/seed.json | 2 +- corpora/corpora-of-disordered-speech/sli-ru-kentalis.json | 2 +- corpora/corpora-of-disordered-speech/ssnce-tamil.json | 2 +- corpora/corpora-of-disordered-speech/star-sentences.json | 2 +- corpora/corpora-of-disordered-speech/star-speech-error.json | 2 +- corpora/corpora-of-disordered-speech/tbibank.json | 2 +- corpora/corpora-of-disordered-speech/torgo.json | 2 +- corpora/corpora-of-disordered-speech/uclass.json | 2 +- corpora/corpora-of-disordered-speech/ultraphonix.json | 2 +- corpora/corpora-of-disordered-speech/ultrax-2020.json | 2 +- corpora/corpora-of-disordered-speech/ultrax-disorders.json | 2 +- corpora/historical-corpora/15th-nt-trans.json | 2 +- corpora/historical-corpora/17th-18th-polish.json | 2 +- corpora/historical-corpora/19th-polish.json | 2 +- corpora/historical-corpora/agricola-db.json | 2 +- corpora/historical-corpora/aleksis-kivi.json | 2 +- corpora/historical-corpora/anno-cuneiform.json | 2 +- corpora/historical-corpora/anth-mid-eng.json | 2 +- corpora/historical-corpora/archer.json | 2 +- corpora/historical-corpora/austrian-baroque.json | 2 +- corpora/historical-corpora/b4-hist-preach.json | 2 +- corpora/historical-corpora/b4-ludolf.json | 2 +- corpora/historical-corpora/b4-tatian.json | 2 +- corpora/historical-corpora/bib-text-scots.json | 2 +- corpora/historical-corpora/brieven-buit.json | 2 +- corpora/historical-corpora/bundesblatt.json | 2 +- corpora/historical-corpora/carniolan-pa.json | 2 +- corpora/historical-corpora/ced.json | 2 +- corpora/historical-corpora/ceecs.json | 2 +- corpora/historical-corpora/chroniclItaly.json | 2 +- corpora/historical-corpora/chronopress.json | 2 +- 
corpora/historical-corpora/cipm.json | 2 +- corpora/historical-corpora/class-lib-nat-lib-fi.json | 2 +- corpora/historical-corpora/ddr-press.json | 2 +- corpora/historical-corpora/diacoris.json | 2 +- corpora/historical-corpora/diakorp.json | 2 +- corpora/historical-corpora/dig-hist-slovene.json | 2 +- corpora/historical-corpora/diorisis-ancient-greek.json | 2 +- corpora/historical-corpora/doec.json | 2 +- corpora/historical-corpora/dta.json | 2 +- corpora/historical-corpora/early-modern-fi.json | 2 +- corpora/historical-corpora/ecco-tcp.json | 2 +- corpora/historical-corpora/edinburgh-dost.json | 2 +- corpora/historical-corpora/eebo-tcp.json | 2 +- corpora/historical-corpora/efontes.json | 2 +- corpora/historical-corpora/en-nw-late-modern.json | 2 +- corpora/historical-corpora/evans-tcp.json | 2 +- corpora/historical-corpora/fin-classics.json | 2 +- corpora/historical-corpora/fin-folk.json | 2 +- corpora/historical-corpora/fin-gutenberg.json | 2 +- corpora/historical-corpora/fin-news-periodicals.json | 2 +- corpora/historical-corpora/frantext.json | 2 +- corpora/historical-corpora/germanc.json | 2 +- corpora/historical-corpora/grek-medieval.json | 2 +- corpora/historical-corpora/gysseling.json | 2 +- corpora/historical-corpora/hacossa.json | 2 +- corpora/historical-corpora/hansard.json | 2 +- corpora/historical-corpora/helsinki-eng.json | 2 +- corpora/historical-corpora/helsinki-old-scot.json | 2 +- corpora/historical-corpora/helsinki-scot.json | 2 +- corpora/historical-corpora/hist-am-eng.json | 2 +- corpora/historical-corpora/hist-lancaster.json | 2 +- corpora/historical-corpora/hist-welsh.json | 2 +- corpora/historical-corpora/hun-courts.json | 2 +- corpora/historical-corpora/hun-hist.json | 2 +- corpora/historical-corpora/impact-gt.json | 2 +- corpora/historical-corpora/lampeter-tracts.json | 2 +- corpora/historical-corpora/lancaster-newsbooks.json | 2 +- corpora/historical-corpora/late-modern-en-prose.json | 2 +- 
corpora/historical-corpora/late-modern-en-texts.json | 2 +- corpora/historical-corpora/latinise.json | 2 +- corpora/historical-corpora/letter-sinebrychoff.json | 2 +- corpora/historical-corpora/mannheim-hist.json | 2 +- corpora/historical-corpora/medi-charter.json | 2 +- corpora/historical-corpora/menota.json | 2 +- corpora/historical-corpora/midia.json | 2 +- corpora/historical-corpora/news-fin-17-18.json | 2 +- corpora/historical-corpora/news-fin-18-19.json | 2 +- corpora/historical-corpora/news-fin.json | 2 +- corpora/historical-corpora/notthingham-de-medicine.json | 2 +- corpora/historical-corpora/old-bailey.json | 2 +- corpora/historical-corpora/old-hungarian.json | 2 +- corpora/historical-corpora/old-lit-fin.json | 2 +- corpora/historical-corpora/orossimo.json | 2 +- corpora/historical-corpora/pamphlets-am.json | 2 +- corpora/historical-corpora/parsed-hist-pt.json | 2 +- corpora/historical-corpora/partonopeus-de-blois.json | 2 +- corpora/historical-corpora/pceec.json | 2 +- corpora/historical-corpora/pol-16th.json | 2 +- corpora/historical-corpora/pol-bf-1500.json | 2 +- corpora/historical-corpora/pol-lang-1960s.json | 2 +- corpora/historical-corpora/poldilemma.json | 2 +- corpora/historical-corpora/ref-hist-slovene.json | 2 +- corpora/historical-corpora/ref-mhd.json | 2 +- corpora/historical-corpora/ref-mid-low-de.json | 2 +- corpora/historical-corpora/roysoc-corp.json | 2 +- corpora/historical-corpora/sacoco.json | 2 +- corpora/historical-corpora/saga.json | 2 +- corpora/historical-corpora/sheffield-chin.json | 2 +- corpora/historical-corpora/sprakbanken-hist.json | 2 +- corpora/historical-corpora/sumerian-rev.json | 2 +- corpora/historical-corpora/swe-news-periodicals.json | 2 +- corpora/historical-corpora/syn-ref-fra.json | 2 +- corpora/historical-corpora/tlio.json | 2 +- corpora/historical-corpora/vvks.json | 2 +- corpora/historical-corpora/written-est.json | 2 +- corpora/historical-corpora/ycoe.json | 2 +- corpora/historical-corpora/ycoep.json | 2 +- 
corpora/historical-corpora/yu1parl.json | 2 +- corpora/legal-corpora/ann-czech-case-law.json | 2 +- corpora/legal-corpora/cabank-eng-scotus.json | 2 +- corpora/legal-corpora/covid-19-eur-lex-cef.json | 2 +- corpora/legal-corpora/covid-19-eur-lex-en-pt.json | 2 +- corpora/legal-corpora/czcdc.json | 2 +- corpora/legal-corpora/czech-legal-tree.json | 2 +- corpora/legal-corpora/deu-sub-mulcold.json | 2 +- corpora/legal-corpora/eng-sub-mulcold.json | 2 +- corpora/legal-corpora/english-acquis.json | 2 +- corpora/legal-corpora/est-law.json | 2 +- corpora/legal-corpora/fin-sub-firulex.json | 2 +- corpora/legal-corpora/fin-sub-jrc.json | 2 +- corpora/legal-corpora/igc-laws.json | 2 +- corpora/legal-corpora/jrc-acquis.json | 2 +- corpora/legal-corpora/jrc-eu-dgt.json | 2 +- corpora/legal-corpora/judicial-rhetoric.json | 2 +- corpora/legal-corpora/juridisch-nl.json | 2 +- corpora/legal-corpora/legal-est-min-just.json | 2 +- corpora/legal-corpora/legal-nynorsk-munic.json | 2 +- corpora/legal-corpora/lifr-law.json | 2 +- corpora/legal-corpora/likumi.json | 2 +- corpora/legal-corpora/lit-eu-law.json | 2 +- corpora/legal-corpora/maltese-acquis.json | 2 +- corpora/legal-corpora/meta-nord-dan.json | 2 +- corpora/legal-corpora/meta-nord-est.json | 2 +- corpora/legal-corpora/meta-nord-fin.json | 2 +- corpora/legal-corpora/meta-nord-isl.json | 2 +- corpora/legal-corpora/meta-nord-nor.json | 2 +- corpora/legal-corpora/meta-nord-swe.json | 2 +- corpora/legal-corpora/multieurlex.json | 2 +- corpora/legal-corpora/nor-acquis.json | 2 +- corpora/legal-corpora/old-bailey.json | 2 +- corpora/legal-corpora/rus-sub-firulex.json | 2 +- corpora/legal-corpora/rus-sub-mulcold.json | 2 +- corpora/literary-corpora/1000-novels.json | 2 +- corpora/literary-corpora/1000plus-novels.json | 2 +- corpora/literary-corpora/15c-castilian.json | 2 +- corpora/literary-corpora/1920-polish.json | 2 +- corpora/literary-corpora/aformes.json | 2 +- corpora/literary-corpora/anglosaxon.json | 2 +- 
corpora/literary-corpora/anth-me.json | 2 +- corpora/literary-corpora/bonnier-one.json | 2 +- corpora/literary-corpora/bonnier-two.json | 2 +- corpora/literary-corpora/ceal.json | 2 +- corpora/literary-corpora/classic-fin-lit.json | 2 +- corpora/literary-corpora/classic-fin.json | 2 +- corpora/literary-corpora/early-fin-lit.json | 2 +- corpora/literary-corpora/est-fiction.json | 2 +- corpora/literary-corpora/est-runic.json | 2 +- corpora/literary-corpora/etcsl.json | 2 +- corpora/literary-corpora/fin-folk.json | 2 +- corpora/literary-corpora/fin-gutenberg.json | 2 +- corpora/literary-corpora/fin-lit.json | 2 +- corpora/literary-corpora/greek-medieval.json | 2 +- corpora/literary-corpora/greek-thesaurus.json | 2 +- corpora/literary-corpora/joh-jen.json | 2 +- corpora/literary-corpora/kdsp.json | 2 +- corpora/literary-corpora/kivi.json | 2 +- corpora/literary-corpora/lat-lit-classic.json | 2 +- corpora/literary-corpora/ltcorpus.json | 2 +- corpora/literary-corpora/m-agricola.json | 2 +- corpora/literary-corpora/micro-pol.json | 2 +- corpora/literary-corpora/multext1984.json | 2 +- corpora/literary-corpora/norbok-children.json | 2 +- corpora/literary-corpora/norbok-fiction.json | 2 +- corpora/literary-corpora/nornyn-children.json | 2 +- corpora/literary-corpora/nornyn-fiction.json | 2 +- corpora/literary-corpora/north-saami.json | 2 +- corpora/literary-corpora/old-fin-lit.json | 2 +- corpora/literary-corpora/one-mil-cro.json | 2 +- corpora/literary-corpora/orig-est.json | 2 +- corpora/literary-corpora/parfin.json | 2 +- corpora/literary-corpora/parrus.json | 2 +- corpora/literary-corpora/prilit.json | 2 +- corpora/literary-corpora/rep-bastille.json | 2 +- corpora/literary-corpora/sol.json | 2 +- corpora/literary-corpora/strindberg.json | 2 +- corpora/literary-corpora/uhlcs.json | 2 +- corpora/literary-corpora/york-poetry.json | 2 +- corpora/manually-annotated-corpora/acl-rd-tex.json | 2 +- corpora/manually-annotated-corpora/alksnis.json | 2 +- 
corpora/manually-annotated-corpora/artificial-treebank.json | 2 +- corpora/manually-annotated-corpora/artur.json | 2 +- corpora/manually-annotated-corpora/aspect-term-czech.json | 2 +- corpora/manually-annotated-corpora/austrian-baroque.json | 2 +- corpora/manually-annotated-corpora/b4-heliand.json | 2 +- corpora/manually-annotated-corpora/bnc-sampler.json | 2 +- corpora/manually-annotated-corpora/bultreebank.json | 2 +- corpora/manually-annotated-corpora/cintil-deepbank.json | 2 +- corpora/manually-annotated-corpora/cintil-dependency.json | 2 +- corpora/manually-annotated-corpora/cintil-portugues.json | 2 +- corpora/manually-annotated-corpora/cintil-propbank.json | 2 +- corpora/manually-annotated-corpora/cintil-treebank.json | 2 +- .../manually-annotated-corpora/cmc-training-janes-norm.json | 2 +- .../manually-annotated-corpora/cmc-training-janes-tag.json | 2 +- corpora/manually-annotated-corpora/czech-legal-treebank.json | 2 +- corpora/manually-annotated-corpora/czech-ne-corpus.json | 2 +- corpora/manually-annotated-corpora/dep-anno-creg.json | 2 +- corpora/manually-annotated-corpora/est-treebank-coref.json | 2 +- corpora/manually-annotated-corpora/est-treebank.json | 2 +- corpora/manually-annotated-corpora/facebook-sentiment.json | 2 +- corpora/manually-annotated-corpora/fictree.json | 2 +- corpora/manually-annotated-corpora/fin-treebank-1.json | 2 +- corpora/manually-annotated-corpora/fin-treebank-2.json | 2 +- corpora/manually-annotated-corpora/finnsentiment.json | 2 +- corpora/manually-annotated-corpora/frenk-styria.json | 2 +- corpora/manually-annotated-corpora/greek-coref.json | 2 +- corpora/manually-annotated-corpora/greek-entailment.json | 2 +- corpora/manually-annotated-corpora/grug-para-tree.json | 2 +- corpora/manually-annotated-corpora/grundtvig.json | 2 +- corpora/manually-annotated-corpora/hamledt.json | 2 +- corpora/manually-annotated-corpora/hr500k-1.json | 2 +- corpora/manually-annotated-corpora/hr500k-2.json | 2 +- 
corpora/manually-annotated-corpora/icepahc.json | 2 +- corpora/manually-annotated-corpora/jos1m.json | 2 +- corpora/manually-annotated-corpora/kas-biterm.json | 2 +- corpora/manually-annotated-corpora/kas-term.json | 2 +- corpora/manually-annotated-corpora/kpwr.json | 2 +- corpora/manually-annotated-corpora/lassy-klein.json | 2 +- corpora/manually-annotated-corpora/lvtb.json | 2 +- corpora/manually-annotated-corpora/matas.json | 2 +- corpora/manually-annotated-corpora/morph-dist-estonian.json | 2 +- corpora/manually-annotated-corpora/multext-east.json | 2 +- corpora/manually-annotated-corpora/nkjp1m.json | 2 +- corpora/manually-annotated-corpora/nl2sh.json | 2 +- corpora/manually-annotated-corpora/norec.json | 2 +- corpora/manually-annotated-corpora/parseme.json | 2 +- corpora/manually-annotated-corpora/pol-coref.json | 2 +- corpora/manually-annotated-corpora/pol-dep-tree.json | 2 +- corpora/manually-annotated-corpora/pol-spatial.json | 2 +- corpora/manually-annotated-corpora/pol-summaries.json | 2 +- .../manually-annotated-corpora/prague-arabic-treebank.json | 2 +- .../prague-dependency-treebank.json | 2 +- .../manually-annotated-corpora/prague-discourse-treebank.json | 2 +- .../prague-eng-ces-dep-treebank.json | 2 +- corpora/manually-annotated-corpora/reldi-normtagner-hr.json | 2 +- corpora/manually-annotated-corpora/reldi-normtagner-sr.json | 2 +- corpora/manually-annotated-corpora/rsdo-def.json | 2 +- corpora/manually-annotated-corpora/sem-dis-est.json | 2 +- corpora/manually-annotated-corpora/sentinews.json | 2 +- corpora/manually-annotated-corpora/setimes-1-sr.json | 2 +- corpora/manually-annotated-corpora/setimes-2-sr.json | 2 +- .../slovak-dependency-treebank.json | 2 +- corpora/manually-annotated-corpora/slowic.json | 2 +- corpora/manually-annotated-corpora/sonar.json | 2 +- .../manually-annotated-corpora/speech-thought-writing.json | 2 +- corpora/manually-annotated-corpora/ssj500k.json | 2 +- corpora/manually-annotated-corpora/syn-ref-med-fra.json | 2 +- 
corpora/manually-annotated-corpora/szeged-treebank.json | 2 +- corpora/manually-annotated-corpora/szeged.json | 2 +- corpora/manually-annotated-corpora/tamil-dep.json | 2 +- corpora/manually-annotated-corpora/timel-ann-est-news.json | 2 +- corpora/manually-annotated-corpora/tree-iness.json | 2 +- corpora/manually-annotated-corpora/tueba-dz.json | 2 +- corpora/manually-annotated-corpora/turku-dep.json | 2 +- corpora/manually-annotated-corpora/twitter-sentiment.json | 2 +- corpora/manually-annotated-corpora/ud-estonian.json | 2 +- corpora/manually-annotated-corpora/uni-dep.json | 2 +- corpora/manually-annotated-corpora/vejica.json | 2 +- corpora/manually-annotated-corpora/wut-relations.json | 2 +- corpora/manually-annotated-corpora/xlime.json | 2 +- corpora/multimodal-corpora/bas-smartkom.json | 2 +- corpora/multimodal-corpora/bas-smartweb.json | 2 +- corpora/multimodal-corpora/bielefeld-sga.json | 2 +- corpora/multimodal-corpora/eva.json | 2 +- corpora/multimodal-corpora/eye-tracking.json | 2 +- corpora/multimodal-corpora/hindi-vis-genome.json | 2 +- corpora/multimodal-corpora/hun-multimodal.json | 2 +- corpora/multimodal-corpora/ifa-dialog.json | 2 +- corpora/multimodal-corpora/interactions-dialogales.json | 2 +- corpora/multimodal-corpora/mpi-esf.json | 2 +- corpora/multimodal-corpora/multimodal-text-comprehension.json | 2 +- corpora/multimodal-corpora/natural-media-mc.json | 2 +- corpora/multimodal-corpora/polimodal.json | 2 +- corpora/multimodal-corpora/tourist-brochures-helsinki.json | 2 +- corpora/multimodal-corpora/tv-news.json | 2 +- corpora/multimodal-corpora/unisa-isizulu.json | 2 +- corpora/multimodal-corpora/video-linked-thai-swe.json | 2 +- corpora/newspaper-corpora/8-sidor.json | 2 +- corpora/newspaper-corpora/accurat.json | 2 +- corpora/newspaper-corpora/chronopress.json | 2 +- corpora/newspaper-corpora/contemp-serbian.json | 2 +- corpora/newspaper-corpora/corp-news-texts.json | 2 +- corpora/newspaper-corpora/cripco.json | 2 +- 
corpora/newspaper-corpora/dagny.json | 2 +- corpora/newspaper-corpora/deu-newscrawl.json | 2 +- corpora/newspaper-corpora/dn-1987.json | 2 +- corpora/newspaper-corpora/est-republicain.json | 2 +- corpora/newspaper-corpora/europeana-at.json | 2 +- corpora/newspaper-corpora/europeana-de.json | 2 +- corpora/newspaper-corpora/europeana-ee.json | 2 +- corpora/newspaper-corpora/europeana-fi.json | 2 +- corpora/newspaper-corpora/europeana-lu.json | 2 +- corpora/newspaper-corpora/europeana-lv.json | 2 +- corpora/newspaper-corpora/europeana-ner.json | 2 +- corpora/newspaper-corpora/europeana-nl.json | 2 +- corpora/newspaper-corpora/europeana-pl.json | 2 +- corpora/newspaper-corpora/europeana-rs.json | 2 +- corpora/newspaper-corpora/ger-greek-press.json | 2 +- corpora/newspaper-corpora/gp-1994-2001-2011.json | 2 +- corpora/newspaper-corpora/hertha.json | 2 +- corpora/newspaper-corpora/idun.json | 2 +- corpora/newspaper-corpora/karelian-news.json | 2 +- corpora/newspaper-corpora/kvinnornas.json | 2 +- corpora/newspaper-corpora/larepubblica.json | 2 +- corpora/newspaper-corpora/lib-inf-centre.json | 2 +- corpora/newspaper-corpora/makedonia.json | 2 +- corpora/newspaper-corpora/mannheim-hist.json | 2 +- corpora/newspaper-corpora/morgonbris.json | 2 +- corpora/newspaper-corpora/news-nat-fin.json | 2 +- corpora/newspaper-corpora/news-ocr-fin.json | 2 +- corpora/newspaper-corpora/nor-news.json | 2 +- corpora/newspaper-corpora/parallel-global.json | 2 +- corpora/newspaper-corpora/rostratt.json | 2 +- corpora/newspaper-corpora/setimes.json | 2 +- corpora/newspaper-corpora/smittskydd.json | 2 +- corpora/newspaper-corpora/syn2006pub.json | 2 +- corpora/newspaper-corpora/syn2013pub.json | 2 +- corpora/newspaper-corpora/ta-nea.json | 2 +- corpora/newspaper-corpora/tiger.json | 2 +- corpora/newspaper-corpora/timed-jsi-web.json | 2 +- corpora/newspaper-corpora/tuebingen-tree.json | 2 +- corpora/newspaper-corpora/webbnyheter.json | 2 +- corpora/newspaper-corpora/witac.json | 2 +- 
corpora/newspaper-corpora/zurich.json | 2 +- corpora/oral-history-corpora/austrian-mediathek.json | 2 +- corpora/oral-history-corpora/bruzzone.json | 2 +- corpora/oral-history-corpora/fortunoff-archive.json | 2 +- corpora/oral-history-corpora/frauen-von-ravensbrueck.json | 2 +- corpora/oral-history-corpora/getuigenverhalen.json | 2 +- corpora/oral-history-corpora/us-holocaust-memorial.json | 2 +- corpora/oral-history-corpora/usc-shoah-foundation.json | 2 +- corpora/oral-history-corpora/vpro-selma-van-der-perre.json | 2 +- corpora/parallel-corpora/accurat.json | 2 +- corpora/parallel-corpora/aformes.json | 2 +- corpora/parallel-corpora/bul-tm.json | 2 +- corpora/parallel-corpora/bulgarian-x.json | 2 +- corpora/parallel-corpora/ces-eng-manual-word-align.json | 2 +- corpora/parallel-corpora/ces-eng-ufal-abstracts.json | 2 +- corpora/parallel-corpora/ces-pol-eng.json | 2 +- corpora/parallel-corpora/ces-slk-parallel.json | 2 +- corpora/parallel-corpora/civitas-gentium.json | 2 +- corpora/parallel-corpora/compara.json | 2 +- corpora/parallel-corpora/crater-2.json | 2 +- corpora/parallel-corpora/csenvi-pairwise.json | 2 +- corpora/parallel-corpora/czeng.json | 2 +- corpora/parallel-corpora/dgt-acquis.json | 2 +- corpora/parallel-corpora/dgt-tm-2016.json | 2 +- corpora/parallel-corpora/dgt-trans-mem.json | 2 +- corpora/parallel-corpora/dpc.json | 2 +- corpora/parallel-corpora/eac-trans-mem.json | 2 +- corpora/parallel-corpora/ecb-parallel.json | 2 +- corpora/parallel-corpora/ecdc-trans-mem.json | 2 +- corpora/parallel-corpora/elexis-wsd.json | 2 +- corpora/parallel-corpora/emea.json | 2 +- corpora/parallel-corpora/emel.json | 2 +- corpora/parallel-corpora/eng-ces-wikipedia.json | 2 +- corpora/parallel-corpora/eng-luganda.json | 2 +- corpora/parallel-corpora/eng-slk-parallel.json | 2 +- corpora/parallel-corpora/eng-swe-parallel.json | 2 +- corpora/parallel-corpora/eng-urdu-rel.json | 2 +- corpora/parallel-corpora/entam.json | 2 +- corpora/parallel-corpora/epic-uds.json | 2 
+- corpora/parallel-corpora/epic.json | 2 +- corpora/parallel-corpora/est-eng-parallel.json | 2 +- corpora/parallel-corpora/est-open-parallel.json | 2 +- corpora/parallel-corpora/eubookshop.json | 2 +- corpora/parallel-corpora/eur-const.json | 2 +- corpora/parallel-corpora/europarl-ell-eng.json | 2 +- corpora/parallel-corpora/europarl-qtleap-wsd-ned.json | 2 +- corpora/parallel-corpora/europarl-uds.json | 2 +- corpora/parallel-corpora/europarl.json | 2 +- corpora/parallel-corpora/fienwac.json | 2 +- corpora/parallel-corpora/free-trade-agreement.json | 2 +- corpora/parallel-corpora/frel.json | 2 +- corpora/parallel-corpora/glossologia.json | 2 +- corpora/parallel-corpora/hindencorp.json | 2 +- corpora/parallel-corpora/hrenwac.json | 2 +- corpora/parallel-corpora/ift-fr-gr.json | 2 +- corpora/parallel-corpora/intera-ell-eng.json | 2 +- corpora/parallel-corpora/intercorp.json | 2 +- corpora/parallel-corpora/interlingual-perspectives.json | 2 +- corpora/parallel-corpora/jrc-acquis.json | 2 +- corpora/parallel-corpora/kacenka.json | 2 +- corpora/parallel-corpora/kotus-fin-swe.json | 2 +- corpora/parallel-corpora/lila.json | 2 +- corpora/parallel-corpora/macocu.json | 2 +- corpora/parallel-corpora/mlcc.json | 2 +- corpora/parallel-corpora/mulcold.json | 2 +- corpora/parallel-corpora/multext-east.json | 2 +- corpora/parallel-corpora/multijur.json | 2 +- corpora/parallel-corpora/multiun.json | 2 +- corpora/parallel-corpora/musa.json | 2 +- corpora/parallel-corpora/naacl.json | 2 +- corpora/parallel-corpora/nor-spa-parallel.json | 2 +- corpora/parallel-corpora/opensubtitles.json | 2 +- corpora/parallel-corpora/opus-helsinki.json | 2 +- corpora/parallel-corpora/opus.json | 2 +- corpora/parallel-corpora/pages.json | 2 +- corpora/parallel-corpora/panacea-eng-fra-eng-ell.json | 2 +- corpora/parallel-corpora/para-eng-gle.json | 2 +- corpora/parallel-corpora/para-global-voices.json | 2 +- corpora/parallel-corpora/paracrawl.json | 2 +- corpora/parallel-corpora/parallel-bible.json 
| 2 +- corpora/parallel-corpora/parallel-kde4.json | 2 +- corpora/parallel-corpora/parallel-wiki.json | 2 +- corpora/parallel-corpora/parcor.json | 2 +- corpora/parallel-corpora/parfin.json | 2 +- corpora/parallel-corpora/parice.json | 2 +- corpora/parallel-corpora/parrus.json | 2 +- corpora/parallel-corpora/pelcra-clarin.json | 2 +- corpora/parallel-corpora/pelcra.json | 2 +- corpora/parallel-corpora/pol-bul-rus-parallel.json | 2 +- corpora/parallel-corpora/pol-lit-parallel.json | 2 +- corpora/parallel-corpora/qtleap-news.json | 2 +- corpora/parallel-corpora/qtleap.json | 2 +- corpora/parallel-corpora/qtlp-deu-ell-medical.json | 2 +- corpora/parallel-corpora/qtlp-eng-ell-automotive.json | 2 +- corpora/parallel-corpora/qtlp-eng-ell-medical.json | 2 +- corpora/parallel-corpora/qtlp-por-ell-automotive.json | 2 +- corpora/parallel-corpora/qtlp-por-ell-medical.json | 2 +- corpora/parallel-corpora/reveal-this.json | 2 +- corpora/parallel-corpora/scielo.json | 2 +- corpora/parallel-corpora/setimes-clarin.json | 2 +- corpora/parallel-corpora/setimes.json | 2 +- corpora/parallel-corpora/slenwac.json | 2 +- corpora/parallel-corpora/slk-eng-parallel.json | 2 +- corpora/parallel-corpora/spc.json | 2 +- corpora/parallel-corpora/srenwac.json | 2 +- corpora/parallel-corpora/szeged-parallel.json | 2 +- corpora/parallel-corpora/tatoeba.json | 2 +- corpora/parallel-corpora/ted-para.json | 2 +- corpora/parallel-corpora/tourism-eng-hrv.json | 2 +- corpora/parallel-corpora/tris.json | 2 +- corpora/parallel-corpora/ufal-nor-levantine.json | 2 +- corpora/parallel-corpora/umc-ces-rus-eng.json | 2 +- corpora/parallel-corpora/un-parallel.json | 2 +- corpora/parallel-corpora/up-tap-opennlp.json | 2 +- "corpora/parallel-corpora/\316\274topia.json" | 2 +- corpora/parliamentary-corpora/aalto-fin-parla.json | 2 +- corpora/parliamentary-corpora/archives-parlementaires.json | 2 +- corpora/parliamentary-corpora/assemblee-nationale.json | 2 +- corpora/parliamentary-corpora/at-parlamentsreden.json | 
2 +- corpora/parliamentary-corpora/bul-pol-jour-speech.json | 2 +- corpora/parliamentary-corpora/bundestag-europe.json | 2 +- corpora/parliamentary-corpora/cepic.json | 2 +- corpora/parliamentary-corpora/czech-parl-meetings.json | 2 +- corpora/parliamentary-corpora/czechparl.json | 2 +- corpora/parliamentary-corpora/danish-parliament.json | 2 +- corpora/parliamentary-corpora/dutchparl.json | 2 +- corpora/parliamentary-corpora/epic-uds.json | 2 +- corpora/parliamentary-corpora/europarl-ell-eng.json | 2 +- corpora/parliamentary-corpora/europarl.json | 2 +- corpora/parliamentary-corpora/german-pol-speeches.json | 2 +- corpora/parliamentary-corpora/gerparcor.json | 2 +- corpora/parliamentary-corpora/handeset.json | 2 +- corpora/parliamentary-corpora/hansard.json | 2 +- corpora/parliamentary-corpora/hellenic-parla.json | 2 +- corpora/parliamentary-corpora/house-of-commons-europe.json | 2 +- corpora/parliamentary-corpora/icelandic-parla.json | 2 +- corpora/parliamentary-corpora/kranjska.json | 2 +- corpora/parliamentary-corpora/large-czech-parl-hearings.json | 2 +- corpora/parliamentary-corpora/linkedsaeima.json | 2 +- corpora/parliamentary-corpora/lit-parla-attribution.json | 2 +- corpora/parliamentary-corpora/nor-parla-speech.json | 2 +- corpora/parliamentary-corpora/parlameter-hr9.json | 2 +- corpora/parliamentary-corpora/parlameter-sl.json | 2 +- corpora/parliamentary-corpora/parlamint-ana-30.json | 2 +- corpora/parliamentary-corpora/parlamint-en-ana-30.json | 2 +- corpora/parliamentary-corpora/parlasent-bcs.json | 2 +- corpora/parliamentary-corpora/parlat-beta.json | 2 +- corpora/parliamentary-corpora/parlspeech.json | 2 +- corpora/parliamentary-corpora/plenary-fin-parla.json | 2 +- corpora/parliamentary-corpora/pol-parla.json | 2 +- corpora/parliamentary-corpora/polminer.json | 2 +- corpora/parliamentary-corpora/proceedings-nor-parla.json | 2 +- corpora/parliamentary-corpora/ptparl.json | 2 +- corpora/parliamentary-corpora/riigikogu.json | 2 +- 
corpora/parliamentary-corpora/riksdag-open-data.json | 2 +- corpora/parliamentary-corpora/saeima.json | 2 +- corpora/parliamentary-corpora/siparl.json | 2 +- corpora/parliamentary-corpora/slovparl.json | 2 +- corpora/parliamentary-corpora/speeches-greek-parla.json | 2 +- corpora/parliamentary-corpora/talk-of-norway.json | 2 +- corpora/parliamentary-corpora/ukparl.json | 2 +- corpora/parliamentary-corpora/yu1parl.json | 2 +- corpora/reference-corpora/abnc.json | 2 +- corpora/reference-corpora/bnc.json | 2 +- corpora/reference-corpora/bnrc.json | 2 +- corpora/reference-corpora/ccgigafida.json | 2 +- corpora/reference-corpora/cckres.json | 2 +- corpora/reference-corpora/cnc.json | 2 +- corpora/reference-corpora/cogreek.json | 2 +- corpora/reference-corpora/con-lit.json | 2 +- corpora/reference-corpora/conae.json | 2 +- corpora/reference-corpora/corcencc.json | 2 +- corpora/reference-corpora/corpol.json | 2 +- corpora/reference-corpora/dereko.json | 2 +- corpora/reference-corpora/dia-greek.json | 2 +- corpora/reference-corpora/enc2019.json | 2 +- corpora/reference-corpora/erc.json | 2 +- corpora/reference-corpora/gigafida.json | 2 +- corpora/reference-corpora/gos.json | 2 +- corpora/reference-corpora/helnc.json | 2 +- corpora/reference-corpora/hunnc.json | 2 +- corpora/reference-corpora/ice-giga.json | 2 +- corpora/reference-corpora/kres.json | 2 +- corpora/reference-corpora/lbk.json | 2 +- corpora/reference-corpora/lvk2022.json | 2 +- corpora/reference-corpora/metafida.json | 2 +- corpora/reference-corpora/nnk.json | 2 +- corpora/reference-corpora/rcgd.json | 2 +- corpora/reference-corpora/riznica.json | 2 +- corpora/reference-corpora/sonar.json | 2 +- corpora/reference-corpora/syn2005.json | 2 +- corpora/reference-corpora/syn2010.json | 2 +- corpora/reference-corpora/syn2015.json | 2 +- corpora/sign-language-resources/adamorobe-lexicon.json | 2 +- corpora/sign-language-resources/adamorobe.json | 2 +- corpora/sign-language-resources/addictionlink.json | 2 +- 
corpora/sign-language-resources/balines-homesign.json | 2 +- corpora/sign-language-resources/becos.json | 2 +- corpora/sign-language-resources/bible-translations.json | 2 +- corpora/sign-language-resources/bsl-corpus.json | 2 +- corpora/sign-language-resources/bsl-lexicon.json | 2 +- corpora/sign-language-resources/catteau-2020.json | 2 +- corpora/sign-language-resources/consumer-info-fin.json | 2 +- corpora/sign-language-resources/content4all.json | 2 +- corpora/sign-language-resources/corlse.json | 2 +- corpora/sign-language-resources/corpus-dsl-dic.json | 2 +- corpora/sign-language-resources/corpus-fin-sl.json | 2 +- corpora/sign-language-resources/corpus-lsfb.json | 2 +- corpora/sign-language-resources/corpus-ngt.json | 2 +- corpora/sign-language-resources/creagest-acquisition.json | 2 +- corpora/sign-language-resources/creagest-dialogues.json | 2 +- corpora/sign-language-resources/csl-lexicon.json | 2 +- corpora/sign-language-resources/czech-sl-amateur.json | 2 +- corpora/sign-language-resources/czech-sl-prof.json | 2 +- corpora/sign-language-resources/degels1.json | 2 +- corpora/sign-language-resources/dgs-corpus.json | 2 +- corpora/sign-language-resources/dicta-sign-lexicon.json | 2 +- corpora/sign-language-resources/dicta-sign.json | 2 +- corpora/sign-language-resources/dogon.json | 2 +- corpora/sign-language-resources/echo-ngt-lex-f2.json | 2 +- corpora/sign-language-resources/echo-ngt-lex-m.json | 2 +- corpora/sign-language-resources/echo-ngt-lex-m2.json | 2 +- corpora/sign-language-resources/echo-ssl-lex-signer-lm.json | 2 +- corpora/sign-language-resources/echo.json | 2 +- corpora/sign-language-resources/exhibition-corpus.json | 2 +- corpora/sign-language-resources/fadwa-mhimdi.json | 2 +- corpora/sign-language-resources/fin-sl-learning.json | 2 +- corpora/sign-language-resources/giving-rec.json | 2 +- corpora/sign-language-resources/hotel-review-dutch.json | 2 +- corpora/sign-language-resources/hotel-review-flemish.json | 2 +- 
corpora/sign-language-resources/hotel-review-spanish.json | 2 +- corpora/sign-language-resources/hun-sl-corpus.json | 2 +- corpora/sign-language-resources/iprosla.json | 2 +- corpora/sign-language-resources/isignos.json | 2 +- corpora/sign-language-resources/italian-sl.json | 2 +- corpora/sign-language-resources/kata-kolok-child.json | 2 +- corpora/sign-language-resources/kata-kolok.json | 2 +- corpora/sign-language-resources/kipo.json | 4 ++-- corpora/sign-language-resources/ls-colin.json | 2 +- corpora/sign-language-resources/maurician-sl.json | 2 +- corpora/sign-language-resources/mediapi-skel.json | 2 +- corpora/sign-language-resources/mocap1.json | 2 +- corpora/sign-language-resources/news-fin-sl.json | 2 +- corpora/sign-language-resources/ngt-interactive.json | 2 +- corpora/sign-language-resources/noema-plus.json | 2 +- corpora/sign-language-resources/noema.json | 2 +- corpora/sign-language-resources/norwegian-sl.json | 2 +- corpora/sign-language-resources/phd-fusellier-souza.json | 2 +- corpora/sign-language-resources/phd-martinod.json | 2 +- corpora/sign-language-resources/pjm-corpus.json | 2 +- corpora/sign-language-resources/polytropon-para.json | 2 +- corpora/sign-language-resources/sign-hub-life-stories.json | 2 +- corpora/sign-language-resources/signes-en-famille.json | 2 +- corpora/sign-language-resources/signor-corpus.json | 2 +- corpora/sign-language-resources/signs-of-ireland,json | 2 +- corpora/sign-language-resources/swedish-sl-corpus.json | 2 +- .../sign-language-resources/tactile-swedish-sl-corpus.json | 2 +- corpora/sign-language-resources/turkish-sl.json | 2 +- corpora/sign-language-resources/vidi-sign-space.json | 2 +- corpora/sign-language-resources/visibase.json | 2 +- corpora/sign-language-resources/vlaamse-gt.json | 2 +- corpora/spoken-corpora/2nd-gen-israel-migrants.json | 2 +- corpora/spoken-corpora/aalto-dsp.json | 2 +- corpora/spoken-corpora/absolventinnen.json | 2 +- corpora/spoken-corpora/acwme.json | 2 +- 
corpora/spoken-corpora/agender.json | 2 +- corpora/spoken-corpora/air-traffic-ctrl.json | 2 +- corpora/spoken-corpora/alcebla.json | 2 +- corpora/spoken-corpora/ananas-mt.json | 2 +- corpora/spoken-corpora/arabic-speech.json | 2 +- corpora/spoken-corpora/asr-artur.json | 2 +- corpora/spoken-corpora/asr-parlaspeech-hr.json | 2 +- corpora/spoken-corpora/australiendeutsch.json | 2 +- corpora/spoken-corpora/babel.json | 2 +- corpora/spoken-corpora/bas-alcohol.json | 2 +- corpora/spoken-corpora/bas-regional-juves.json | 2 +- corpora/spoken-corpora/bas-siemens.json | 2 +- corpora/spoken-corpora/bas-sl-recog.json | 2 +- corpora/spoken-corpora/bas-smartweb-video.json | 2 +- corpora/spoken-corpora/bas-verbmobil-emo.json | 2 +- corpora/spoken-corpora/bas-ziptel.json | 2 +- corpora/spoken-corpora/bcms.json | 2 +- corpora/spoken-corpora/bea.json | 2 +- corpora/spoken-corpora/bel-tv-debates.json | 2 +- corpora/spoken-corpora/berliner-wende.json | 2 +- corpora/spoken-corpora/bielefeld-speech-and-gesture.json | 2 +- corpora/spoken-corpora/bigbrother.json | 2 +- corpora/spoken-corpora/bio-reise.json | 2 +- corpora/spoken-corpora/bits.json | 2 +- corpora/spoken-corpora/border-karelia.json | 2 +- corpora/spoken-corpora/boston-u-radio.json | 2 +- corpora/spoken-corpora/brothers.json | 2 +- corpora/spoken-corpora/buckeye.json | 2 +- corpora/spoken-corpora/budapest-socioling.json | 2 +- corpora/spoken-corpora/cans.json | 2 +- corpora/spoken-corpora/ci-articulation.json | 2 +- corpora/spoken-corpora/clapi.json | 2 +- corpora/spoken-corpora/clips-mt-manual.json | 2 +- corpora/spoken-corpora/clips.json | 2 +- .../consonant-cochlear-patients-diachronic.json | 2 +- corpora/spoken-corpora/consonant-cochlear-patients.json | 2 +- corpora/spoken-corpora/contemporary-french.json | 2 +- corpora/spoken-corpora/corpus-avip-api.json | 2 +- corpora/spoken-corpora/corpus-lip.json | 2 +- corpora/spoken-corpora/corpus-lips.json | 2 +- corpora/spoken-corpora/cosi.json | 2 +- 
corpora/spoken-corpora/czech-malach.json | 2 +- corpora/spoken-corpora/de-hochlautung.json | 2 +- corpora/spoken-corpora/de-koenig.json | 2 +- corpora/spoken-corpora/de-mundarten-ddr.json | 2 +- corpora/spoken-corpora/de-mundarten-ost.json | 2 +- corpora/spoken-corpora/de-mundarten-zwirner.json | 2 +- corpora/spoken-corpora/de-pfeffer.json | 2 +- corpora/spoken-corpora/dialekt.json | 2 +- corpora/spoken-corpora/dialogstrukturen.json | 2 +- corpora/spoken-corpora/doc-patient-ahus.json | 2 +- corpora/spoken-corpora/elfa.json | 2 +- corpora/spoken-corpora/emigranten-israel-wiener.json | 2 +- corpora/spoken-corpora/emigranten-israel.json | 2 +- corpora/spoken-corpora/eslora.json | 2 +- corpora/spoken-corpora/est-dialect.json | 2 +- corpora/spoken-corpora/est-emotional-speech.json | 2 +- corpora/spoken-corpora/est-spontaneous-speech.json | 2 +- corpora/spoken-corpora/exmeralda-demo.json | 2 +- corpora/spoken-corpora/fadac.json | 2 +- corpora/spoken-corpora/fin-broadcast.json | 2 +- corpora/spoken-corpora/fin-dialect-syntax.json | 2 +- corpora/spoken-corpora/fin-parliament.json | 2 +- corpora/spoken-corpora/followup-fin-dialects.json | 2 +- corpora/spoken-corpora/formtask.json | 2 +- corpora/spoken-corpora/forschung-gespr-de.json | 2 +- corpora/spoken-corpora/fra-parisien-2000.json | 2 +- corpora/spoken-corpora/gamli.json | 2 +- corpora/spoken-corpora/gender-neutral-de.json | 2 +- corpora/spoken-corpora/gesprochenes-wortkorpus.json | 2 +- corpora/spoken-corpora/gewiss.json | 2 +- corpora/spoken-corpora/gos-video.json | 2 +- corpora/spoken-corpora/gos.json | 2 +- corpora/spoken-corpora/gothenburg-dialogue.json | 2 +- corpora/spoken-corpora/griffith-australian.json | 2 +- corpora/spoken-corpora/grundstrukturen-freiburg.json | 2 +- corpora/spoken-corpora/habla.json | 2 +- corpora/spoken-corpora/hacaspa.json | 2 +- corpora/spoken-corpora/hamburg-modern.json | 2 +- corpora/spoken-corpora/hamcopolig.json | 2 +- corpora/spoken-corpora/hempel.json | 2 +- 
corpora/spoken-corpora/hral.json | 2 +- corpora/spoken-corpora/hun-broadcast-news.json | 2 +- corpora/spoken-corpora/hun-gigaword-spoken.json | 2 +- corpora/spoken-corpora/hun-kindergarten.json | 2 +- corpora/spoken-corpora/hun-reference-speech-db.json | 2 +- corpora/spoken-corpora/ifa-spoken.json | 2 +- corpora/spoken-corpora/jasmin.json | 2 +- corpora/spoken-corpora/juznevesti-sr.json | 2 +- corpora/spoken-corpora/karel-makon.json | 2 +- corpora/spoken-corpora/karl-eberhard.json | 2 +- corpora/spoken-corpora/kennsluromur.json | 2 +- corpora/spoken-corpora/konfliktgespraeche.json | 2 +- corpora/spoken-corpora/kontrastiv.json | 2 +- corpora/spoken-corpora/lang-in-migration.json | 4 ++-- corpora/spoken-corpora/lecture-speech.json | 2 +- corpora/spoken-corpora/lia.json | 2 +- corpora/spoken-corpora/lmu-asica.json | 2 +- corpora/spoken-corpora/long-spoken-fin.json | 2 +- corpora/spoken-corpora/medical-speech.json | 2 +- corpora/spoken-corpora/mehrsprachige-kinder.json | 2 +- corpora/spoken-corpora/multichannel-articulatory.json | 2 +- corpora/spoken-corpora/natural-media-motion-capture.json | 2 +- corpora/spoken-corpora/nautilus.json | 2 +- corpora/spoken-corpora/nordic-dialect.json | 2 +- corpora/spoken-corpora/north-wind-sun.json | 2 +- corpora/spoken-corpora/nota-oslo.json | 2 +- corpora/spoken-corpora/nslc.json | 2 +- .../spoken-corpora/onset-cochlear-patients-diachronic.json | 2 +- corpora/spoken-corpora/onset-cochlear-patients.json | 2 +- corpora/spoken-corpora/oral2008.json | 2 +- corpora/spoken-corpora/oral2013.json | 2 +- corpora/spoken-corpora/orleans.json | 2 +- corpora/spoken-corpora/ortofon-audio.json | 2 +- corpora/spoken-corpora/ortofon.json | 2 +- corpora/spoken-corpora/ovm.json | 2 +- corpora/spoken-corpora/parcorfull.json | 2 +- corpora/spoken-corpora/parlato-telegiornalistico.json | 2 +- corpora/spoken-corpora/pdtsl.json | 2 +- corpora/spoken-corpora/phattsessionz.json | 2 +- corpora/spoken-corpora/phon-contemp-fra.json | 2 +- 
corpora/spoken-corpora/phoncat.json | 2 +- corpora/spoken-corpora/phondat1.json | 2 +- corpora/spoken-corpora/phondat2.json | 2 +- corpora/spoken-corpora/prague-db.json | 2 +- corpora/spoken-corpora/radio-interviews.json | 2 +- corpora/spoken-corpora/radio-news.json | 2 +- corpora/spoken-corpora/route-to-a-wing.json | 2 +- corpora/spoken-corpora/russlanddeutsch.json | 2 +- corpora/spoken-corpora/rvg1_clarin.json | 2 +- corpora/spoken-corpora/samples-spoken-fin.json | 2 +- corpora/spoken-corpora/samromur.json | 2 +- corpora/spoken-corpora/sc1.json | 2 +- corpora/spoken-corpora/sc10.json | 2 +- corpora/spoken-corpora/sc2.json | 2 +- corpora/spoken-corpora/schweizer-jugend.json | 2 +- corpora/spoken-corpora/serbian-forms-of-address.json | 2 +- corpora/spoken-corpora/shc.json | 2 +- corpora/spoken-corpora/si100.json | 2 +- corpora/spoken-corpora/si1000.json | 2 +- .../spoken-corpora/sibilant-cochlear-patients-diachronic.json | 2 +- corpora/spoken-corpora/sibilant-cochlear-patients.json | 2 +- corpora/spoken-corpora/siebenbuergisch.json | 2 +- corpora/spoken-corpora/skolt-saami.json | 2 +- corpora/spoken-corpora/smartkom-home.json | 2 +- corpora/spoken-corpora/smartkom-mobil.json | 2 +- corpora/spoken-corpora/smartkom-public.json | 2 +- corpora/spoken-corpora/smartweb-motorbike.json | 2 +- corpora/spoken-corpora/spit-mdb.json | 2 +- corpora/spoken-corpora/spjallromur.json | 2 +- corpora/spoken-corpora/spoken-bnc2014.json | 2 +- corpora/spoken-corpora/spoken-estonian.json | 2 +- corpora/spoken-corpora/spoken-icelandic.json | 2 +- corpora/spoken-corpora/spoken-wikipedia.json | 2 +- corpora/spoken-corpora/talromur-2.json | 2 +- corpora/spoken-corpora/taus.json | 2 +- corpora/spoken-corpora/uraluid.json | 4 ++-- corpora/spoken-corpora/verbmobil-1.json | 2 +- corpora/spoken-corpora/verbmobil-2.json | 2 +- corpora/spoken-corpora/vienna-oxford.json | 2 +- corpora/spoken-corpora/vowel-cochlear-patients.json | 2 +- corpora/spoken-corpora/wissenschaftssprache.json | 2 +- 
corpora/spoken-corpora/zurich-tangram-bas.json | 2 +- corpora/spoken-corpora/zurich-tangram-uzh.json | 2 +- tools/corpus-query-tools/aconcorde.json | 2 +- tools/corpus-query-tools/antconc.json | 2 +- tools/corpus-query-tools/antpconc.json | 2 +- tools/corpus-query-tools/autosearch.json | 2 +- tools/corpus-query-tools/bncweb-lancaster.json | 2 +- tools/corpus-query-tools/casualconc.json | 2 +- tools/corpus-query-tools/catma.json | 2 +- tools/corpus-query-tools/chn.json | 2 +- tools/corpus-query-tools/cintil.json | 2 +- tools/corpus-query-tools/clan.json | 2 +- tools/corpus-query-tools/clark.json | 2 +- tools/corpus-query-tools/clic.json | 2 +- tools/corpus-query-tools/coanzse.json | 2 +- tools/corpus-query-tools/collocate.json | 2 +- tools/corpus-query-tools/compleat.json | 2 +- tools/corpus-query-tools/concgram.json | 2 +- tools/corpus-query-tools/concordancer-espanol.json | 2 +- tools/corpus-query-tools/concordancer-estonian.json | 2 +- tools/corpus-query-tools/concordancer-gysseling.json | 2 +- tools/corpus-query-tools/concordancer-hr-nat-corp.json | 2 +- tools/corpus-query-tools/concordancer-italian-heritage.json | 2 +- tools/corpus-query-tools/concordancer-middelnederlands.json | 2 +- tools/corpus-query-tools/concordancer-portuguese.json | 2 +- tools/corpus-query-tools/coquery.json | 2 +- tools/corpus-query-tools/corpkit.json | 2 +- tools/corpus-query-tools/corpus-explorer.json | 2 +- tools/corpus-query-tools/corpus-presenter.json | 2 +- tools/corpus-query-tools/corpus-workbench.json | 2 +- tools/corpus-query-tools/corpuscle.json | 2 +- tools/corpus-query-tools/cosmas-ii.json | 2 +- tools/corpus-query-tools/couranten.json | 2 +- tools/corpus-query-tools/cqpweb-lancaster.json | 2 +- tools/corpus-query-tools/dwds.json | 2 +- tools/corpus-query-tools/english-corpora.json | 2 +- tools/corpus-query-tools/exakt.json | 2 +- tools/corpus-query-tools/gate.json | 2 +- tools/corpus-query-tools/glossa.json | 2 +- tools/corpus-query-tools/gretel.json | 2 +- 
tools/corpus-query-tools/i-analyzer.json | 2 +- tools/corpus-query-tools/icecup.json | 2 +- tools/corpus-query-tools/iness.json | 2 +- tools/corpus-query-tools/intellitext.json | 2 +- tools/corpus-query-tools/kontext-clarin-si.json | 2 +- tools/corpus-query-tools/kontext-latvian.json | 2 +- tools/corpus-query-tools/kontext-lindat.json | 2 +- tools/corpus-query-tools/korap-corola.json | 2 +- tools/corpus-query-tools/korap-dereko.json | 2 +- tools/corpus-query-tools/korp-copenhagen.json | 2 +- tools/corpus-query-tools/korp-kielipankki.json | 2 +- tools/corpus-query-tools/korp-sprakbanken.json | 2 +- tools/corpus-query-tools/lancsbox.json | 2 +- tools/corpus-query-tools/liwc-22.json | 2 +- tools/corpus-query-tools/lncc.json | 2 +- tools/corpus-query-tools/monoconc.json | 2 +- tools/corpus-query-tools/nat-pol-ipi-pan.json | 2 +- tools/corpus-query-tools/nat-pol-pelcra.json | 2 +- tools/corpus-query-tools/nb-dh-lab.json | 2 +- tools/corpus-query-tools/nederlab.json | 2 +- tools/corpus-query-tools/nooj.json | 2 +- tools/corpus-query-tools/nosketch-clarin-si.json | 2 +- tools/corpus-query-tools/nosketch-engine.json | 2 +- tools/corpus-query-tools/nvivo.json | 2 +- tools/corpus-query-tools/opensonar.json | 2 +- tools/corpus-query-tools/paqu.json | 2 +- tools/corpus-query-tools/paraconc.json | 2 +- tools/corpus-query-tools/praaline.json | 2 +- tools/corpus-query-tools/prime-machine.json | 2 +- tools/corpus-query-tools/pyxmlconc.json | 2 +- tools/corpus-query-tools/qcat.json | 2 +- tools/corpus-query-tools/scattertext.json | 2 +- tools/corpus-query-tools/shebanq.json | 2 +- tools/corpus-query-tools/shinyconc.json | 2 +- tools/corpus-query-tools/simple-concordancer.json | 2 +- tools/corpus-query-tools/simple-corpus-tool.json | 2 +- tools/corpus-query-tools/skell.json | 2 +- tools/corpus-query-tools/sketchengine.json | 2 +- tools/corpus-query-tools/spaadia.json | 2 +- tools/corpus-query-tools/teitok.json | 2 +- tools/corpus-query-tools/textable.json | 2 +- 
tools/corpus-query-tools/textal.json | 2 +- tools/corpus-query-tools/textstat.json | 2 +- tools/corpus-query-tools/tsakorpus.json | 2 +- tools/corpus-query-tools/txm-online.json | 2 +- tools/corpus-query-tools/txm.json | 2 +- tools/corpus-query-tools/voyant-tools-dk.json | 2 +- tools/corpus-query-tools/voyant-tools-salidar.json | 2 +- tools/corpus-query-tools/voyant-tools.json | 2 +- tools/corpus-query-tools/webclark.json | 2 +- tools/corpus-query-tools/webcorp-learn.json | 2 +- tools/corpus-query-tools/webcorp-lse.json | 2 +- tools/corpus-query-tools/webcorp.json | 2 +- tools/corpus-query-tools/wmatrix.json | 2 +- tools/corpus-query-tools/word-cruncher.json | 2 +- tools/corpus-query-tools/wordless.json | 2 +- tools/corpus-query-tools/wordsmith-tools.json | 2 +- tools/corpus-query-tools/wordstatix.json | 2 +- 942 files changed, 945 insertions(+), 945 deletions(-) diff --git a/corpora/academic-corpora/ac-lit.json b/corpora/academic-corpora/ac-lit.json index cbe3bb5..2463143 100644 --- a/corpora/academic-corpora/ac-lit.json +++ b/corpora/academic-corpora/ac-lit.json @@ -3,7 +3,7 @@ "URL": "http://coralit.lt/en/node/18", "Family": "Academic corpora", "Description": "This corpus contains textbooks, scientific monographs, journal articles, abstracts, forewords, research reports, and master’s and PhD theses from the following disciplines:\nThe materials were published between 1999 and 2009. 
The corpus is encoded in TEI 5.\nThe corpus is available for online querying through a dedicated website.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "", "Size": ["9 million words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/academic-corpora/aca-hum.json b/corpora/academic-corpora/aca-hum.json index cdaaaef..eefdeac 100644 --- a/corpora/academic-corpora/aca-hum.json +++ b/corpora/academic-corpora/aca-hum.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/49", "Family": "Academic corpora", "Description": "This corpus contains academic texts from humanities disciplines published between 1997 and 2012. The corpus data are in the XML format and plain text.\nThe corpus is available for download from the SWECLARIN repository and for online querying through the concordancer Korp (SWECLARIN distribution).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY", "Size": ["14.5 million tokens"], "Annotation": [], diff --git a/corpora/academic-corpora/aca-soc.json b/corpora/academic-corpora/aca-soc.json index e606956..a0c7a2b 100644 --- a/corpora/academic-corpora/aca-soc.json +++ b/corpora/academic-corpora/aca-soc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/50", "Family": "Academic corpora", "Description": "This corpus contains academic texts from social sciences disciplines published between 1997 and 2012. 
The corpus data are in the XML format and plain text.\nThe corpus is available for download from the SWECLARIN repository and for online querying through the concordancer Korp (SWECLARIN distribution).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY", "Size": ["10.8 million tokens"], "Annotation": ["sentence segmentation"], diff --git a/corpora/academic-corpora/acl-anth.json b/corpora/academic-corpora/acl-anth.json index 93630aa..e9667fa 100644 --- a/corpora/academic-corpora/acl-anth.json +++ b/corpora/academic-corpora/acl-anth.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.35111/rfeg-z495", "Family": "Academic corpora", "Description": "This corpus contains research papers in computational linguistics published between 1979 and 2015. The corpus data are in the XML format.\nThe corpus is available for online querying through the Sketch Engine (log-in required) and for download from a dedicated website.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY SA", "Size": ["75 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "author/text metadata"], diff --git a/corpora/academic-corpora/acnz.json b/corpora/academic-corpora/acnz.json index 51b3058..279f526 100644 --- a/corpora/academic-corpora/acnz.json +++ b/corpora/academic-corpora/acnz.json @@ -3,7 +3,7 @@ "URL": "https://www.wgtn.ac.nz/lals/resources/academicwordlist/information/corpus", "Family": "Academic corpora", "Description": "This corpus contains journal articles, book chapters, course workbooks, laboratory manuals, and course notes from the following disciplines: arts, commerce, law, and biology.\nThis corpus is not available.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["3.5 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/chambers-lb.json b/corpora/academic-corpora/chambers-lb.json index 82c65ad..35b5c02 100644 --- a/corpora/academic-corpora/chambers-lb.json +++ b/corpora/academic-corpora/chambers-lb.json 
@@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2527", "Family": "Academic corpora", "Description": "This corpus contains research papers in the following disciplines:\n\nThe research papers were published between 1998 and 2006. This is a plain text corpus.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "Oxford Text Archive licence (academic use)", "Size": ["1 million words"], "Annotation": ["No annotation"], diff --git a/corpora/academic-corpora/czec-soc.json b/corpora/academic-corpora/czec-soc.json index de27949..18d39e0 100644 --- a/corpora/academic-corpora/czec-soc.json +++ b/corpora/academic-corpora/czec-soc.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-2703", "Family": "Academic corpora", "Description": "This corpus contains research papers in sociology published between 1993 and 2016. The corpus data are in the TSV format.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "MIT", "Size": ["3 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/eng-sci.json b/corpora/academic-corpora/eng-sci.json index 95b25e2..2b6bbf0 100644 --- a/corpora/academic-corpora/eng-sci.json +++ b/corpora/academic-corpora/eng-sci.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CF9-6", "Family": "Academic corpora", "Description": "This corpus contains journal articles in the following disciplines:\n\nThe articles were published in the 1970s, 1980s and the 200s.\nThe corpus is available for online querying through CQPWeb (CLARIN-D distribution).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "restricted", "Size": ["35 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "author/text metadata", "document structure"], diff --git a/corpora/academic-corpora/est-sci.json b/corpora/academic-corpora/est-sci.json index 4291b55..4b2ac95 100644 
--- a/corpora/academic-corpora/est-sci.json +++ b/corpora/academic-corpora/est-sci.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11297/1-00-0000-0000-0000-0002-4", "Family": "Academic corpora", "Description": "This corpus contains scientific articles and PhD theses. The corpus data are in the P5 format.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA-NC", "Size": ["5 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/genia.json b/corpora/academic-corpora/genia.json index 2cb3389..2013694 100644 --- a/corpora/academic-corpora/genia.json +++ b/corpora/academic-corpora/genia.json @@ -3,7 +3,7 @@ "URL": "http://www.geniaproject.org/genia-corpus", "Family": "Academic corpora", "Description": "This corpus contains journal paper abstracts in biomedicine. The corpus data are in various formats, e.g., PTB.\nThe corpus is available for download from PORTULAN.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "free but unspecified", "Size": ["437,000 words"], "Annotation": ["PoS-tagged", "syntactically parsed", "annotated for terms, events, semantic relations and coreference", "text metadata"], diff --git a/corpora/academic-corpora/jezkor.json b/corpora/academic-corpora/jezkor.json index cf68fd8..330abca 100644 --- a/corpora/academic-corpora/jezkor.json +++ b/corpora/academic-corpora/jezkor.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1755", "Family": "Academic corpora", "Description": "This corpus contains a collection of linguistic scientific writing in the Slovenian language. It consists of 43 monographs published between 2009 and 2022 by Fran Ramovš institute of Slovenian language and Založba ZRC, 267 papers published in the journal \"Jezikoslovni zapiski\" and 28 papers published in the journal \"Slovenski jezik\". 
Note that the texts were obtained directly from PDFs, so they contain various types of noise.\nThe corpus is linguistically annotated with the CLASSLA pipeline (https://github.com/clarinsi/classla) on the levels lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-spech and morphological features, and named entities. It is distributed in CoNLL-U and vertical file format, one file for each text. Text metadata consists of the author(s), title and year of publication.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY", "Size": ["9.3 million tokens"], "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated for named entities and author/text metadata"], diff --git a/corpora/academic-corpora/kas.json b/corpora/academic-corpora/kas.json index c47f829..0d182b2 100644 --- a/corpora/academic-corpora/kas.json +++ b/corpora/academic-corpora/kas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1448", "Family": "Academic corpora", "Description": "This corpus contains BA, MA, and PhD theses in humanities, social sciences, and natural sciences published between 2000 and 2018. The corpus data are in the TEI format.\nThe corpus is available for download from CLARIN.SI. 
Version 1.0 is also available for online querying through noSketch Engine and KonText (CLARIN.SI distribution).", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CLARIN.SI Licence ACA ID-BY-NC-INF-NORED 1.0", "Size": ["1.5 billion tokens"], "Annotation": ["MSD-tagged", "lemmatised", "marked for bilingual and monolingual term candidates"], diff --git a/corpora/academic-corpora/kiap.json b/corpora/academic-corpora/kiap.json index 7d31370..52ab3dc 100644 --- a/corpora/academic-corpora/kiap.json +++ b/corpora/academic-corpora/kiap.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D989-605B-8F10-5", "Family": "Academic corpora", "Description": "This comparable corpus contains research articles in economics, linguistics, and medicine published between 1992 and 2003.\nThe corpus is available for online browsing through the concordancer Corpuscle (CLARINO distribution).", - "Languages": ["eng","fra","nor"], + "Language": ["eng","fra","nor"], "Licence": "CC-BY 4.0", "Size": ["3.9 million tokens"], "Annotation": ["PoS-tagged"], diff --git a/corpora/academic-corpora/lit-trans.json b/corpora/academic-corpora/lit-trans.json index 3da38c2..682aa21 100644 --- a/corpora/academic-corpora/lit-trans.json +++ b/corpora/academic-corpora/lit-trans.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-24F2-6", "Family": "Academic corpora", "Description": "This corpus contains journal articles in literary and translation studies. 
This is a plain text corpus.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-SA", "Size": ["48,300 words"], "Annotation": [], diff --git a/corpora/academic-corpora/modern-greek.json b/corpora/academic-corpora/modern-greek.json index 4409310..b2a5813 100644 --- a/corpora/academic-corpora/modern-greek.json +++ b/corpora/academic-corpora/modern-greek.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-2502-4", "Family": "Academic corpora", "Description": "This corpus contains scientific texts in linguistics and dialectology. This is a plain text corpus.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-SA", "Size": ["113,000 words"], "Annotation": [], diff --git a/corpora/academic-corpora/muchmore.json b/corpora/academic-corpora/muchmore.json index 53fcb35..e27c18c 100644 --- a/corpora/academic-corpora/muchmore.json +++ b/corpora/academic-corpora/muchmore.json @@ -3,7 +3,7 @@ "URL": "http://muchmore.dfki.de/resources1.htm", "Family": "Academic corpora", "Description": "This paper contains journal paper abstracts from medical disciplines. 
The corpus is encoded in MuchMore XML.\nThe corpus is available for download from a dedicated website.", - "Languages": ["eng","deu"], + "Language": ["eng","deu"], "Licence": "free but unspecified", "Size": ["1 million tokens"], "Annotation": ["PoS/MSD-tagged", "phrase chunking", "semantic class and relations", "document structure"], diff --git a/corpora/academic-corpora/open-slo.json b/corpora/academic-corpora/open-slo.json index 751992d..ff6f12f 100644 --- a/corpora/academic-corpora/open-slo.json +++ b/corpora/academic-corpora/open-slo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1774", "Family": "Academic corpora", "Description": "This corpus contains a large collection of scientific writing in the Slovenian language gathered from the Open Science Slovenia portal. It consists of over 150 thousand monographs, articles, diploma, master's and doctoral theses, advanced textbooks, reviews etc. mostly published between 2000 and 2022 by Slovenian universities, research institutions, etc. Texts are accompanied by metadata, i.e. author, supervisor (for theses), year of publication, publisher (mostly faculties of the various universities), type of publication (according to SICRIS classification), keywords, and CERIF and UDC codes. The texts were obtained directly from PDFs, so it should be noted that they can contain various types of character noise. The texts are linguistically annotated with the CLASSLA pipeline on the levels lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-spech and morphological features, and named entities. The corpus is distributed in CoNLL-U and vertical file formats, one file for each text. The text metadata is given as a TSV file.\nNote that there exist similar, but older and smaller corpora KAS 2.0 and KAS 1.0. These contain only theses and only up to 2018, but are cleaner and with more metadata. The repository also archives a number of KAS-derived datasets; pls. 
search for \"KAS\" to find them.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA", "Size": ["326 million tokens"], "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated for named entities and author/text metadata"], diff --git a/corpora/academic-corpora/orossimo.json b/corpora/academic-corpora/orossimo.json index 1f11fe7..203eef1 100644 --- a/corpora/academic-corpora/orossimo.json +++ b/corpora/academic-corpora/orossimo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2410-5", "Family": "Academic corpora", "Description": "This corpus contains academic texts in the following disciplines:\nsocial sciences,
  • computer science,
  • economics,
  • linguistics,
  • photography,
  • law,
  • engineering,
  • history,
  • astronomy,
  • earth sciences and geology,
  • medicine and health, and
  • biology.
  • \nThe corpus is encoded in XML (XCES).\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY", "Size": ["2.5 million tokens"], "Annotation": ["marked for term candidates", "mixed structural annotation"], diff --git a/corpora/academic-corpora/reading.json b/corpora/academic-corpora/reading.json index bca7485..102cfb5 100644 --- a/corpora/academic-corpora/reading.json +++ b/corpora/academic-corpora/reading.json @@ -3,7 +3,7 @@ "URL": "http://www.reading.ac.uk/internal/appling/corpus.htm", "Family": "Academic corpora", "Description": "This corpus contains PhD theses from the following disciplines: agriculture, psychology, food science, technology, meteorology, and history. The data are encoded in ASCII and HTML.\nThe corpus is not available because it is restricted at present to staff and researchers at the University of Reading, and it is only available 'on-site'. However, it is possible for people outside the University to make use of the corpus on a Research Attachment arrangement.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "restricted", "Size": [], "Annotation": [], diff --git a/corpora/academic-corpora/roger.json b/corpora/academic-corpora/roger.json index 6ef2b26..16d0db9 100644 --- a/corpora/academic-corpora/roger.json +++ b/corpora/academic-corpora/roger.json @@ -3,7 +3,7 @@ "URL": "https://roger-corpus.org/", "Family": "Academic corpora", "Description": "The corpus contains academic papers from eight disciplines, written by the Romanian students in native Romanian and English L2.\nThe corpus was collected over a three-year period (2018–2021) with the help of 27 collaborators from nine Romanian universities.\nThe corpus is available for online querying through a dedicated platform developed at the CODHUS research centre from the West University of Timisoara.", - "Languages": ["eng","ron"], + "Language": ["eng","ron"], "Licence": "CC BY-NC-ND", "Size": ["3.3 
million words"], "Annotation": [], diff --git a/corpora/academic-corpora/roysoc.json b/corpora/academic-corpora/roysoc.json index 7f28ea0..577f5ea 100644 --- a/corpora/academic-corpora/roysoc.json +++ b/corpora/academic-corpora/roysoc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0001-7E8B-6", "Family": "Academic corpora", "Description": "This corpus contains journal articles published in Philosophical Transactions of the Royal Society of London between 1665 and 1869.\nThe corpus is available for online querying through CQPweb and for download from the CLARIN-D repository of the University of Saarland.", - "Languages": ["English (late and early modern)"], + "Language": ["English (late and early modern)"], "Licence": "CC BY", "Size": ["32 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "normalised", "author and document metadata"], diff --git a/corpora/academic-corpora/scientext.json b/corpora/academic-corpora/scientext.json index aad4055..1a4cace 100644 --- a/corpora/academic-corpora/scientext.json +++ b/corpora/academic-corpora/scientext.json @@ -3,7 +3,7 @@ "URL": "https://scientext.hypotheses.org/corpus", "Family": "Academic corpora", "Description": "This corpus contains scientific texts and argumentative essays in humanities, experimental sciences, and applied/technical sciences.\nThe corpus is available for online querying through a dedicated webpage.", - "Languages": ["fra","eng"], + "Language": ["fra","eng"], "Licence": "CC BY", "Size": ["20 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/span-eng.json b/corpora/academic-corpora/span-eng.json index 297e36e..80704de 100644 --- a/corpora/academic-corpora/span-eng.json +++ b/corpora/academic-corpora/span-eng.json @@ -3,7 +3,7 @@ "URL": 
"https://books.google.si/books?id=NZbWCgAAQBAJ&pg=PA178&lpg=PA178&dq=serac+corpus&source=bl&ots=A7F-vUMJsr&sig=ACfU3U1b8W_r944Bs8OviL9xauHtUoeqVg&hl=sl&sa=X&ved=2ahUKEwiRuq_5nczmAhXT5KYKHWUtBlcQ6AEwAHoECAUQAQ#v=onepage&q=serac%20corpus&f=false", "Family": "Academic corpora", "Description": "This corpus contains journal articles published between 2000 and 2010.\nThe corpus is unavailable.", - "Languages": ["spa","eng"], + "Language": ["spa","eng"], "Licence": "", "Size": ["5.7 million words"], "Annotation": [], diff --git a/corpora/academic-corpora/ufal-papers.json b/corpora/academic-corpora/ufal-papers.json index 34ce095..ec43f12 100644 --- a/corpora/academic-corpora/ufal-papers.json +++ b/corpora/academic-corpora/ufal-papers.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11234/1-1731", "Family": "Academic corpora", "Description": "This parallel corpus contains research paper abstracts in formal and applied linguistics. For each publication, the authors were obliged to provide both the original abstract in Czech or English, and its translation into English or Czech, respectively. 
The corpus data are in the TSV format.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces","eng"], + "Language": ["ces","eng"], "Licence": "CC BY", "Size": ["2 million words"], "Annotation": ["document aligned"], diff --git a/corpora/academic-corpora/uh-eng.json b/corpora/academic-corpora/uh-eng.json index 809bba0..3bc8fd7 100644 --- a/corpora/academic-corpora/uh-eng.json +++ b/corpora/academic-corpora/uh-eng.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102401", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY", "Size": ["200 million tokens"], "Annotation": ["PoS-tagged", "syntactically parsed"], diff --git a/corpora/academic-corpora/uh-fin.json b/corpora/academic-corpora/uh-fin.json index a39e030..eb1df4a 100644 --- a/corpora/academic-corpora/uh-fin.json +++ b/corpora/academic-corpora/uh-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016090601", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY", "Size": ["12.5 million tokens"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/academic-corpora/uh-fra.json b/corpora/academic-corpora/uh-fra.json index e1d5cae..4ed6bbc 100644 --- a/corpora/academic-corpora/uh-fra.json +++ b/corpora/academic-corpora/uh-fra.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102806", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp 
(FIN-CLARIN distribution).", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC BY", "Size": ["580,000 tokens"], "Annotation": [], diff --git a/corpora/academic-corpora/uh-ger.json b/corpora/academic-corpora/uh-ger.json index 9621143..008bac3 100644 --- a/corpora/academic-corpora/uh-ger.json +++ b/corpora/academic-corpora/uh-ger.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102807", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC BY", "Size": ["560,000 tokens"], "Annotation": ["No annotation"], diff --git a/corpora/academic-corpora/uh-rus.json b/corpora/academic-corpora/uh-rus.json index c483fd3..832db9a 100644 --- a/corpora/academic-corpora/uh-rus.json +++ b/corpora/academic-corpora/uh-rus.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102808", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["rus"], + "Language": ["rus"], "Licence": "CC BY", "Size": ["1.1 million words"], "Annotation": ["No annotation"], diff --git a/corpora/academic-corpora/uh-spa.json b/corpora/academic-corpora/uh-spa.json index 0d53738..b0a161f 100644 --- a/corpora/academic-corpora/uh-spa.json +++ b/corpora/academic-corpora/uh-spa.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102809", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "CC BY", "Size": ["2.3 million tokens"], "Annotation": ["No 
annotation"], diff --git a/corpora/academic-corpora/uh-swe.json b/corpora/academic-corpora/uh-swe.json index 0e2b296..79a0625 100644 --- a/corpora/academic-corpora/uh-swe.json +++ b/corpora/academic-corpora/uh-swe.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016102810", "Family": "Academic corpora", "Description": "This corpus contains MA and PhD theses published between 1999 and 2016.\nThe corpus is available for online querying through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY", "Size": ["105 million tokens"], "Annotation": [], diff --git a/corpora/cmc-corpora/comere.json b/corpora/cmc-corpora/comere.json index 789fb81..d427b4e 100644 --- a/corpora/cmc-corpora/comere.json +++ b/corpora/cmc-corpora/comere.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/comere", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains e-mails, forum posts, online chats, tweets and SMS.\nThe corpus is available for download from Ortolang.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": ["80 million tokens"], "Annotation": ["tokenised", "mostly untagged"], diff --git a/corpora/cmc-corpora/contemp-blogs.json b/corpora/cmc-corpora/contemp-blogs.json index e6de082..ccb0128 100644 --- a/corpora/cmc-corpora/contemp-blogs.json +++ b/corpora/cmc-corpora/contemp-blogs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-000E-011B-8", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY", "Size": ["1 million tokens"], "Annotation": ["tokenised", "sentence tagged"], diff --git a/corpora/cmc-corpora/dereko-news-wiki.json b/corpora/cmc-corpora/dereko-news-wiki.json index 84b52ca..2b9df32 100644 --- a/corpora/cmc-corpora/dereko-news-wiki.json +++ 
b/corpora/cmc-corpora/dereko-news-wiki.json @@ -3,7 +3,7 @@ "URL": "https://cosmas2.ids-mannheim.de/cosmas2-web/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains content from newsgroup posts and Wikipedia.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["670 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/didi.json b/corpora/cmc-corpora/didi.json index d49495d..eb9d9e2 100644 --- a/corpora/cmc-corpora/didi.json +++ b/corpora/cmc-corpora/didi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12124/7", "Family": "Computer-mediated communication corpora", "Description": "This corpus consists of Facebook posts gathered from 136 Facebook users from South Tyrol. All texts are anonymised.\nThe corpus is available for download from the EURAC Research CLARIN repository.", - "Languages": ["deu","ita","eng","lad"], + "Language": ["deu","ita","eng","lad"], "Licence": "ACA-BY-NC-NORED 1.0", "Size": ["600,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/do-chat.json b/corpora/cmc-corpora/do-chat.json index 2d3ff6d..e75a760 100644 --- a/corpora/cmc-corpora/do-chat.json +++ b/corpora/cmc-corpora/do-chat.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-203Z-0000-002D-ECC7-2", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains online chats from 2000 to 2006\nThe corpus is available for download from the repository of CLARIN-D", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY", "Size": ["1 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/dwds-blogs.json b/corpora/cmc-corpora/dwds-blogs.json index 07089cd..395abba 100644 --- a/corpora/cmc-corpora/dwds-blogs.json +++ b/corpora/cmc-corpora/dwds-blogs.json @@ -3,7 +3,7 @@ "URL": 
"https://www.dwds.de/r#group-Spezialkorpora", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["102 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/ebay-petit.json b/corpora/cmc-corpora/ebay-petit.json index 2ee61cc..4b55dcb 100644 --- a/corpora/cmc-corpora/ebay-petit.json +++ b/corpora/cmc-corpora/ebay-petit.json @@ -3,7 +3,7 @@ "URL": "https://www.uni-potsdam.de/langage/la-bank/ebay.php", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains eBay listings from 2005, 2017, and 2018. The corpus is manually annotated.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["100,000 tokens"], "Annotation": ["see here"], diff --git a/corpora/cmc-corpora/flemish-teen-talk.json b/corpora/cmc-corpora/flemish-teen-talk.json index 012bc3c..93a7d6f 100644 --- a/corpora/cmc-corpora/flemish-teen-talk.json +++ b/corpora/cmc-corpora/flemish-teen-talk.json @@ -3,7 +3,7 @@ "URL": "https://repository.uantwerpen.be/docman/irua/948a9a/159941.pdf", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains Facebook posts and WhatsApp messages from 2015 and 2016.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": ["2.9 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/global-web-en.json b/corpora/cmc-corpora/global-web-en.json index 6d69c1e..13ff928 100644 --- a/corpora/cmc-corpora/global-web-en.json +++ b/corpora/cmc-corpora/global-web-en.json @@ -3,7 +3,7 @@ "URL": "https://www.kielipankki.fi/corpora/glowbe/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains texts from web-pages in United States, Great Britain, 
Australia, India, and 16 other countries. About 60% of the texts come from blogs.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN RES (download); CLARIN ACA (online)", "Size": ["1.8 billion words", "1.8 million texts"], "Annotation": "", diff --git a/corpora/cmc-corpora/heid.json b/corpora/cmc-corpora/heid.json index 04909a0..d26c525 100644 --- a/corpora/cmc-corpora/heid.json +++ b/corpora/cmc-corpora/heid.json @@ -3,7 +3,7 @@ "URL": "https://www.researchgate.net/publication/311674809_Political_Discourse_in_Polish_Internet-Corpus_of_Highly_Emotive_Internet_Discussions", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains tweets.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["160 milllion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/hs-fi-news.json b/corpora/cmc-corpora/hs-fi-news.json index 0781dc5..949caed 100644 --- a/corpora/cmc-corpora/hs-fi-news.json +++ b/corpora/cmc-corpora/hs-fi-news.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052718", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains the domestic news of the Helsingin Sanomat website and their comments from 5 September 2011 to 4 September 2012.\nThe corpus has been syntactically parsed using TDT alpha.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA – NC", "Size": ["8 million tokens", "593,760 sentences", "93,602 texts"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed"], diff --git a/corpora/cmc-corpora/janes-blog.json b/corpora/cmc-corpora/janes-blog.json index 858a401..9d6a3e5 100644 --- a/corpora/cmc-corpora/janes-blog.json +++ 
b/corpora/cmc-corpora/janes-blog.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1138", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts from RTV Slovenija and Publishwall.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["34 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-forum.json b/corpora/cmc-corpora/janes-forum.json index 168a924..5e6c4fb 100644 --- a/corpora/cmc-corpora/janes-forum.json +++ b/corpora/cmc-corpora/janes-forum.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1139", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from Avtomobilizem.com, MedOver.net and RTV Slovenija.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["47 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-news.json b/corpora/cmc-corpora/janes-news.json index 32c92d7..658ab52 100644 --- a/corpora/cmc-corpora/janes-news.json +++ b/corpora/cmc-corpora/janes-news.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1140", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains news comments from RTV Slovenija, Mladina and Reporter.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["14 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-tweet.json 
b/corpora/cmc-corpora/janes-tweet.json index d0b0cff..a5a351d 100644 --- a/corpora/cmc-corpora/janes-tweet.json +++ b/corpora/cmc-corpora/janes-tweet.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1142", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains tweets written by Slovenian Twitter users from 2013 to 2017.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["139 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/janes-wiki.json b/corpora/cmc-corpora/janes-wiki.json index 4030cc7..988da9e 100644 --- a/corpora/cmc-corpora/janes-wiki.json +++ b/corpora/cmc-corpora/janes-wiki.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1137", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains Slovenian Wikipedia user and talk pages.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["5 million tokens"], "Annotation": ["tokenised", "sentence segmented", "MSD-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/litis.json b/corpora/cmc-corpora/litis.json index 3365491..3517181 100644 --- a/corpora/cmc-corpora/litis.json +++ b/corpora/cmc-corpora/litis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/11", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from portals delfi.lt and lrytas.lt from 2010 to 2014.\nThe corpus is available for download from the CLARIN-LT repository.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN_ACA", "Size": ["190,000 comments"], "Annotation": "", diff --git a/corpora/cmc-corpora/macocu.json 
b/corpora/cmc-corpora/macocu.json index 7795459..ccab5ee 100644 --- a/corpora/cmc-corpora/macocu.json +++ b/corpora/cmc-corpora/macocu.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1804", "Family": "Computer-mediated communication corpora", "Description": "These corpora are a collection containing web texts and were built by crawling national internet top-level domains (specified below) and by extending the crawl dynamically to other domains as well. The crawler is available at MaCoCu GitHub channel. Considerable effort was devoted into cleaning the extracted text to provide a high-quality web corpus. This was achieved by removing boilerplate and near-duplicated paragraphs, discarding very short texts as well as texts that are not in the target language. Furthermore, samples from the largest 1,500 domains were manually checked and bad domains, such as machine-translated domains, were removed.\nThe dataset is characterized by extensive metadata which allows filtering the dataset based on text quality and other criteria, making the corpus highly useful for corpus linguistics studies, as well as for training language models and other language technologies. In XML format, each document is accompanied by the following metadata: title, crawl date, url, domain, file type of the original document, distribution of languages inside the document, and a fluency score based on a language model. The text of each document is divided into paragraphs that are accompanied by metadata on the information whether a paragraph is a heading or not, metadata on the paragraph quality (labels, such as \"short\" or \"good\", assigned based on paragraph length, URL and stopword density via the jusText tool) and fluency (score between 0 and 1, assigned with the Monocleaner tool), the automatically identified language of the text in the paragraph, and information whether the paragraph contains sensitive information (identified via the Biroamer tool). 
As opposed to the previous version in the case of corpora in version 2.0, this version has more accurate metadata on languages of the texts, which was achieved by using Google's Compact Language Detector 2 (CLD2), a high-performance language detector supporting many languages. Other tools, used for web corpora creation and curation, have been updated as well, resulting in an even cleaner, as well as larger corpus.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be easily read with the prevert parser.", - "Languages": ["sqi","bos","bul","cat","hrv","ell","isl","mkd","mlt","cnr","srp","tur","ukr","slv"], + "Language": ["sqi","bos","bul","cat","hrv","ell","isl","mkd","mlt","cnr","srp","tur","ukr","slv"], "Licence": "CC0 No Rights Reserved", "Size": "", "Annotation": ["annotated with extensive metadata"], diff --git a/corpora/cmc-corpora/mixed-newmedia.json b/corpora/cmc-corpora/mixed-newmedia.json index f7f4300..9146432 100644 --- a/corpora/cmc-corpora/mixed-newmedia.json +++ b/corpora/cmc-corpora/mixed-newmedia.json @@ -3,7 +3,7 @@ "URL": "http://www.cl.ut.ee/korpused/segakorpus/uusmeedia/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains chat room messages, forum posts and news comments from 2000 to 2008\nThe corpus is available for download from a dedicated webpage associated with CLARIN Estonia and through a dedicated concordancer.", - "Languages": ["est"], + "Language": ["est"], "Licence": "", "Size": ["25 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/monitor-at-tweets.json b/corpora/cmc-corpora/monitor-at-tweets.json index 1583cf0..79a5b3b 100644 --- a/corpora/cmc-corpora/monitor-at-tweets.json +++ b/corpora/cmc-corpora/monitor-at-tweets.json @@ -3,7 +3,7 @@ "URL": "https://hal.archives-ouvertes.fr/hal-01323274/document", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains tweets from 2007 to 2017.", - 
"Languages": ["deu","eng"], + "Language": ["deu","eng"], "Licence": "", "Size": ["40 million tweets"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/cmc-corpora/monitor-slo-trendi.json b/corpora/cmc-corpora/monitor-slo-trendi.json index 2dc433b..2723e41 100644 --- a/corpora/cmc-corpora/monitor-slo-trendi.json +++ b/corpora/cmc-corpora/monitor-slo-trendi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1782", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains news from 107 different media websites, published by 72 different publishers, and is a monitor corpus of Slovene. Trendi 2023-02 covers the period from January 2019 to February 2023, complementing the Gigafida 2.0 reference corpus of written Slovene. All the contents of the Trendi corpus are at the moment obtained using the Jožef Stefan Institute Newsfeed service. The texts have been annotated using the CLASSLA-Stanza pipeline, including syntactic parsing according to the Universal Dependencies and Named Entities.\nAn important addition are topics or thematical categories, which have been automatically assigned to each text. There are 13 categories altogether: Arts and culture, Crime and accidents, Economy, Environment, Health, Leisure, Politics and Law, Science and Technology, Society, Sports, Weather, Entertainment, and Education. Text classification models are available at Text classification model SloBERTa-Trendi-Topics 1.0, Text classification model fastText-Trendi-Topics 1.0, and SloBERTa model. 
At the moment, the corpus is not available as a dataset due to copyright restrictions but we hope to make at least some of it available in the near future.\nThe corpus can be queried through noSketchEngine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "", "Size": ["700 million tokens"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed", "annotated for named entities and topics"], diff --git a/corpora/cmc-corpora/ntap-en.json b/corpora/cmc-corpora/ntap-en.json index 956f59c..864e2b2 100644 --- a/corpora/cmc-corpora/ntap-en.json +++ b/corpora/cmc-corpora/ntap-en.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/DAB8-BE65-64FD-4", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts that are related to climate change issues across science, politics, and the environment. The vast majority of the posts are from 2005 onwards.\nThe corpus is available for searching online through the Corpuscle concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["660,798,199 tokens"], "Annotation": "", diff --git a/corpora/cmc-corpora/ntap-fr.json b/corpora/cmc-corpora/ntap-fr.json index 0d60169..4cb2b62 100644 --- a/corpora/cmc-corpora/ntap-fr.json +++ b/corpora/cmc-corpora/ntap-fr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/DE48-00A5-6536-1", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains blog posts that are related to climate change issues across science, politics, and the environment. 
The vast majority of the posts are from 2005 onwards.\nThe corpus is available for searching online through the Corpuscle concordancer.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "", "Size": ["1,506,064,082 words"], "Annotation": "", diff --git a/corpora/cmc-corpora/paisa.json b/corpora/cmc-corpora/paisa.json index 04e206b..1029260 100644 --- a/corpora/cmc-corpora/paisa.json +++ b/corpora/cmc-corpora/paisa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12124/3", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains approximately 380,000 documents coming from about 1,000 different websites, for a total of about 250 million words. Approximately 260,000 documents are from Wikipedia, approx. 5,600 from other Wikimedia Foundation projects. About 9,300 documents come from Indymedia, and we estimate that about 65,000 documents come from blog services.\nThe corpus is available for download from the EURAC Research CLARIN repository.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["380,000 pages", "250 million words"], "Annotation": "", diff --git a/corpora/cmc-corpora/pdrs.json b/corpora/cmc-corpora/pdrs.json index b2e491e..a9c3a61 100644 --- a/corpora/cmc-corpora/pdrs.json +++ b/corpora/cmc-corpora/pdrs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1752", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains texts from the web obtained by crawling the .rs domain. Crawling has been done in September and October 2022 with BootCat. As search terms, appr. 2,800 word forms with a frequency between 5,000 and 500,000 in srWaC have been used. The texts are deduplicated, cyrillic texts have been transliterated into the Latin alphabet. 
The linguistic processing was done with the CLASSLA package for tokenization, lemmatization and morpho-syntactic tagging (both MULTEXT-East and Universal Dependencies).\nIn addition, some 80% of the URLs are manually tagged for 10 different types of sources (\"area\"): media (media outlets with several posts daily), inform (topic-centered sites with infrequent posts - maximum 3 per day), company (presentations of companies), state (websites of government bodies on nationa, regional and local level), forum (forum posts), portal (topic-centered portals without daily coverage), science (scientific publications), shop (with descriptions of products), database (knowledge bases, dictionaries, databases and similar) and community (NGOs, fan clubs, associations and other). The corpus is distributed in the CoNLL-U format in batches of appr. 2x50 mio. tokens.\nThe corpus is available for download from the Slovenian repository CLARIN.SI and can be queried through noSketchEngine and KonText concordancers.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC-BY", "Size": ["715 million tokens"], "Annotation": ["tokenised", "MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated for text source"], diff --git a/corpora/cmc-corpora/sfnet.json b/corpora/cmc-corpora/sfnet.json index 33413be..a1086dc 100644 --- a/corpora/cmc-corpora/sfnet.json +++ b/corpora/cmc-corpora/sfnet.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20150126", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains written posts from the SFNET forum in Finnish from 2002 to 2003.\nThe PoS-tagging has been done with the FI-FDG Parser, which uses a computational implementation of Functional Dependency Grammar.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank)", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA – NC", "Size": ["100 million words"], "Annotation": ["PoS-tagged", "sentence and word 
segmentation"], diff --git a/corpora/cmc-corpora/sms4science.json b/corpora/cmc-corpora/sms4science.json index 568bbbc..3255557 100644 --- a/corpora/cmc-corpora/sms4science.json +++ b/corpora/cmc-corpora/sms4science.json @@ -3,7 +3,7 @@ "URL": "http://sms4science.ch/Main/WebHome", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains around 25000 SMS from 2009.\nThe corpus comes in two different versions which are available through separate concordancers - SMS Navigator and ANNIS. The version accessible through ANNIS is more richly annotated and includes PoS-tagging, normalization, annotation of nonce borrowings, etc. Access through the concordancers requires free registration.", - "Languages": ["gsw","deu","fra","ita","roh"], + "Language": ["gsw","deu","fra","ita","roh"], "Licence": "", "Size": ["0.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/sonar-newmedia.json b/corpora/cmc-corpora/sonar-newmedia.json index b22fb92..0f26b00 100644 --- a/corpora/cmc-corpora/sonar-newmedia.json +++ b/corpora/cmc-corpora/sonar-newmedia.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/157d6fee6134f5beab09b159dd7c710a", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains tweets, chats and SMS from 2005 to 2012.\nThe corpus is available for searching online through the OpenSONAR environment.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN ACA", "Size": ["35 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/cmc-corpora/suomi24.json b/corpora/cmc-corpora/suomi24.json index 0875be4..e2511e9 100644 --- a/corpora/cmc-corpora/suomi24.json +++ b/corpora/cmc-corpora/suomi24.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017021506", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains forum posts from the Suomi24 website from 2001 to 
2016.\nThe corpus is available for download from the FIN-CLARIN repository and through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA", "Size": ["2.6 billion tokens"], "Annotation": ["tokenised", "MSD-tagged"], diff --git a/corpora/cmc-corpora/welsh-tweets.json b/corpora/cmc-corpora/welsh-tweets.json index 6024fd3..a368818 100644 --- a/corpora/cmc-corpora/welsh-tweets.json +++ b/corpora/cmc-corpora/welsh-tweets.json @@ -3,7 +3,7 @@ "URL": "http://techiaith.cymru/corpora/twitter/?lang=en", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains tweets.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["cym"], + "Language": ["cym"], "Licence": "unclear", "Size": ["7 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/cmc-corpora/whatsup-ch.json b/corpora/cmc-corpora/whatsup-ch.json index 3d54503..7a45939 100644 --- a/corpora/cmc-corpora/whatsup-ch.json +++ b/corpora/cmc-corpora/whatsup-ch.json @@ -3,7 +3,7 @@ "URL": "http://cmc-corpora.ch/", "Family": "Computer-mediated communication corpora", "Description": "This corpus contains 216 WhatsApp chats from 2014.\nThe corpus is accessible online through the ANNIS system.", - "Languages": ["gsw","deu","fra","ita","roh"], + "Language": ["gsw","deu","fra","ita","roh"], "Licence": "", "Size": ["5 million tokens"], "Annotation": "", diff --git a/corpora/cmc-corpora/ylilauta.json b/corpora/cmc-corpora/ylilauta.json index 9cc3018..1081342 100644 --- a/corpora/cmc-corpora/ylilauta.json +++ b/corpora/cmc-corpora/ylilauta.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015031802", "Family": "Computer-mediated communication corpora", "Description": "The corpus contains text from discussions of the Ylilauta online discussion board from 2012 to 2014.\nThe corpus has been syntactically annotated with the TDT alpha parser, while the named entities have been assigned using the FiNER tool.\nThe corpus 
is available for download from META-SHARE (the Finnish Language Bank) and for online browsing through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-NC", "Size": ["26.9 million words"], "Annotation": ["PoS-tagged", "lemmatised", "syntactically parsed", "named entities"], diff --git a/corpora/corpora-of-disordered-speech/adhd-uva.json b/corpora/corpora-of-disordered-speech/adhd-uva.json index 32f29c2..4856f03 100644 --- a/corpora/corpora-of-disordered-speech/adhd-uva.json +++ b/corpora/corpora-of-disordered-speech/adhd-uva.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-2766F32F-4305-4F13-A02C-F4A8F5216425", "Family": "Corpora of Disordered Speech", "Description": "This corpus aims to compare the language and executive functioning profiles of children with ADHD to children with Specific Language Impairment and children with Tourette’s Disorder.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["4 GB (67 recordings) of 26 Dutch children with ADHD, 19 Dutch children with SLI, 22 children Dutch controls"], "Annotation": ["Transcriptions (CHAT-format)"], diff --git a/corpora/corpora-of-disordered-speech/adresso-challenge.json b/corpora/corpora-of-disordered-speech/adresso-challenge.json index 561435c..02077dd 100644 --- a/corpora/corpora-of-disordered-speech/adresso-challenge.json +++ b/corpora/corpora-of-disordered-speech/adresso-challenge.json @@ -3,7 +3,7 @@ "URL": "https://sla.talkbank.org/TBB/dementia", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in dementia.\nAccess to the data in DementiaBank is password protected and restricted to members of the DementiaBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng", "deu", "cmn", "spa", "Taiwanese"], + "Language": ["eng", "deu", "cmn", "spa", "Taiwanese"], "Licence": "email request for access", "Size": [], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json b/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json index d5ba406..21e92fc 100644 --- a/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json +++ b/corpora/corpora-of-disordered-speech/ahoslabi-esophageal.json @@ -3,7 +3,7 @@ "URL": "https://catalog.elra.info/en-us/repository/browse/ELRA-S0413/", "Family": "Corpora of Disordered Speech", "Description": "This corpus primarily consists of recordings of 31 laryngectomees (27 males and 4 females) pronouncing 100 phonetically balanced sentences.\nEsophageal voices were recorded in a soundproof recording cubicle with a Neuman microphone.\nThe corpus also includes parallel recordings of the sentences by 9 healthy speakers (6 males and 3 females) to facilitate speech processing tasks that require small parallel corpora, such as voice conversion or synthetic speech adaptation. Apart from the sentences, the database also contains 4 sustained vowels and a small set of isolated words (14) which can be very valuable for research on esophageal speech analysis, diagnosis and evaluation. 
", - "Languages": ["Spanish, Castilian"], + "Language": ["Spanish, Castilian"], "Licence": "Non Commercial Use - ELRA END USER", "Size": ["10.8 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/aphasiabank.json b/corpora/corpora-of-disordered-speech/aphasiabank.json index 39a403f..e5dff53 100644 --- a/corpora/corpora-of-disordered-speech/aphasiabank.json +++ b/corpora/corpora-of-disordered-speech/aphasiabank.json @@ -3,7 +3,7 @@ "URL": "https://aphasia.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in aphasia.\n Access to the data in AphasiaBank is password protected and restricted to members of the AphasiaBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["yue", "hrv", "eng", "fra", "deu", "ell", "hun", "ita", "jpn", "cmn", "ron", "spa"], + "Language": ["yue", "hrv", "eng", "fra", "deu", "ell", "hun", "ita", "jpn", "cmn", "ron", "spa"], "Licence": "email request for access", "Size": ["380 MB transcripts", "827 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/asdbank.json b/corpora/corpora-of-disordered-speech/asdbank.json index 20fd36c..98c683d 100644 --- a/corpora/corpora-of-disordered-speech/asdbank.json +++ b/corpora/corpora-of-disordered-speech/asdbank.json @@ -3,7 +3,7 @@ "URL": "https://asd.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in autism-spectrum disorder.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["nld", "eng", "fra", "ell", "cmn", "spa"], + "Language": ["nld", "eng", "fra", "ell", "cmn", "spa"], "Licence": "open access", "Size": ["42 MB transcripts", "401 MB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json b/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json index cf109c2..e2e62de 100644 --- a/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json +++ b/corpora/corpora-of-disordered-speech/bil-deaf-ru-kentalis.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-F6BC06C4-B2AD-4ED8-8527-AB81F4EF4E8F", "Family": "Corpora of Disordered Speech", "Description": "The corpus is used for investigating the bilingual language and communication development of young deaf children in Sign Language of the Netherlands (SLN) and Dutch.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["4 GB complete video recordings. 1 GB selected parts video recordings. 0,1 GB selected parts transcripts. 
0,5 GB test and background data of 11 deaf children, longitudinal, 104 recordings"], "Annotation": [" CHAT-like format for 104 recordings"], diff --git a/corpora/corpora-of-disordered-speech/cleft-dataset.json b/corpora/corpora-of-disordered-speech/cleft-dataset.json index a5c6df6..4ff0601 100644 --- a/corpora/corpora-of-disordered-speech/cleft-dataset.json +++ b/corpora/corpora-of-disordered-speech/cleft-dataset.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/cleft/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound and audio recorded with children with cleft lip and palate.", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["11 speakers"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/copas.json b/corpora/corpora-of-disordered-speech/copas.json index e93ec3f..98dda2a 100644 --- a/corpora/corpora-of-disordered-speech/copas.json +++ b/corpora/corpora-of-disordered-speech/copas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-n3", "Family": "Corpora of Disordered Speech", "Description": "This corpus has been constructed within the framework of the project Speech Algorithms for Clinical and Educational applications (SPACE).", - "Languages": ["Dutch (Flemish)"], + "Language": ["Dutch (Flemish)"], "Licence": "Academic, bespoke", "Size": ["319 speakers of which 122 normal controls and 197 with a speech disorder. 
Corpus size: 1.3 GB"], "Annotation": ["Orthographic transcription"], diff --git a/corpora/corpora-of-disordered-speech/deaf-adults-ru.json b/corpora/corpora-of-disordered-speech/deaf-adults-ru.json index 75ab8b0..99aacfd 100644 --- a/corpora/corpora-of-disordered-speech/deaf-adults-ru.json +++ b/corpora/corpora-of-disordered-speech/deaf-adults-ru.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-97AF29EA-877D-422A-BAF7-25FA269351A6", "Family": "Corpora of Disordered Speech", "Description": "This corpus aims at the investigation of the acquisition of Dutch by deaf Dutch adults (late L1/early L2) and comparison to hearing Turkish and Moroccan-Arabic.", - "Languages": ["nld", "tur", "ary"], + "Language": ["nld", "tur", "ary"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["2GB of 46 deaf Dutch adults, 38 hearing Turkish adults, 24 hearing Moroccan adults, 10 Dutch controls"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json b/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json index 77feca7..f6eab6e 100644 --- a/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json +++ b/corpora/corpora-of-disordered-speech/demcorpus-basilicata.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/OPEN-989", "Family": "Corpora of Disordered Speech", "Description": "This corpus consists of semi-spontaneous speech data produced by elderly residents of the Basilicata region in Italy.\nIn total, 40 individuals participated: the patient group consists of 20 participants with a diagnosis of dementia (9 cases of Alzheimer’s disease, 2 patients with mixed dementia, 5 patients with not-further-specified dementia, 3 patients with vascular dementia, and 1 patient with frontotemporal dementia).\nthe control group consists of 20 healthy individuals matched for age, gender, and geographical origin. 
Three linguistic tasks were administered to all participants: two narrative tasks (the first one was about an excursion or a trip, and the second was about Christmas festivities), and an image description task. This resulted in 8 hours and 50 minutes of recorded semi-spontaneous speech, which was then transcribed, segmented, and annotated using ELAN. ", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "Processed data available by request", "Size": ["08:50 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/ewa-db.json b/corpora/corpora-of-disordered-speech/ewa-db.json index 72d3248..99b9b73 100644 --- a/corpora/corpora-of-disordered-speech/ewa-db.json +++ b/corpora/corpora-of-disordered-speech/ewa-db.json @@ -3,7 +3,7 @@ "URL": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0489/", "Family": "Corpora of Disordered Speech", "Description": "This corpus contains data from 3 clinical groups: Alzheimer's disease, Parkinson's disease, mild cognitive impairment, and a control group of healthy subjects.\nSpeech samples of each clinical group were obtained using the EWA smartphone application, which contains 4 different language tasks: sustained vowel phonation, diadochokinesis, object and action naming (30 objects and 30 actions), and picture description (two single pictures and three complex pictures).", - "Languages": ["slk"], + "Language": ["slk"], "Licence": "Non-commercial and commercial options", "Size": ["150 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/fluencybank.json b/corpora/corpora-of-disordered-speech/fluencybank.json index 8efcc14..5f500e0 100644 --- a/corpora/corpora-of-disordered-speech/fluencybank.json +++ b/corpora/corpora-of-disordered-speech/fluencybank.json @@ -3,7 +3,7 @@ "URL": "https://fluency.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This corpus is intended for the study of fluency development.\nParticipants include typically-developing 
monolingual and bilingual children, children and adults who stutter (C/AWS) or who clutter (C/AWC), and second language learners.\nAccess to the research data in FluencyBank is password protected and restricted to members of the FluencyBank consortium group, although a subset of the corpus is publicly available.\nData in TalkBank use a consistent XML-compatible representation called CHAT. All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["nld", "eng", "fra", "deu"], + "Language": ["nld", "eng", "fra", "deu"], "Licence": "email request for access", "Size": ["481 MB transcripts", "207 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/itaasd.json b/corpora/corpora-of-disordered-speech/itaasd.json index 056737f..2c20b40 100644 --- a/corpora/corpora-of-disordered-speech/itaasd.json +++ b/corpora/corpora-of-disordered-speech/itaasd.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/OPEN-990", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of semi-spontaneous speech produced by 34 children between 6 and 13 years of age, residents in the Campania region of Italy.#sepHalf of the participating children were diagnosed with high-functioning Autism Spectrum Disorder, and the other half were neurotypical children matched for age, gender, and geographical origin.#sepAll participants were administered three tasks: a complex image description task, a story-telling task, and a story-retelling task. This resulted in 4 hours and 19 minutes of recorded speech, which were then transcribed and annotated using ELAN. 
", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["04.19 hours"], "Annotation": ["Orthographic"], diff --git a/corpora/corpora-of-disordered-speech/oplon.json b/corpora/corpora-of-disordered-speech/oplon.json index ae39864..9266b73 100644 --- a/corpora/corpora-of-disordered-speech/oplon.json +++ b/corpora/corpora-of-disordered-speech/oplon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/ILC-992", "Family": "Corpora of Disordered Speech", "Description": "This corpus consists of semi-spontaneous speech data collected from 96 elderly participants who were divided into two groups: the pathological and the control group.\nThe pathological group refers to three categories: (i) 16 participants with amnestic Mild Cognitive Impairment (MCI), (ii) 16 participants with multiple-domain MCI, and (iii) 16 participants with Early Dementia (probable Alzheimer Dementia, Fronto-Temporal Dementia, Mixed Dementia, and Lewy Body Dementia).\nThe control group includes 48 healthy individuals matched for gender, age, educational level, and geographical origin. The corpus was subjected to PoS Tagging and Dependency Parsing (CoNLL format). ", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["06:50 hours"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/perceptual-voice-q.json b/corpora/corpora-of-disordered-speech/perceptual-voice-q.json index 093bf4c..71365a7 100644 --- a/corpora/corpora-of-disordered-speech/perceptual-voice-q.json +++ b/corpora/corpora-of-disordered-speech/perceptual-voice-q.json @@ -3,7 +3,7 @@ "URL": "https://data.mendeley.com/datasets/9dz247gnyb/4", "Family": "Corpora of Disordered Speech", "Description": "This corpus contains voice samples which have been rated by experienced voice professionals (at least 3 different raters with a minimum of 2 years’ clinical experience) in order to provide educators with standardized materials to better train pre-service clinical voice professionals. 
", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC 4.0", "Size": ["296 audio files of varying sizes"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/phonologyt-project.json b/corpora/corpora-of-disordered-speech/phonologyt-project.json index 638938a..c1b17db 100644 --- a/corpora/corpora-of-disordered-speech/phonologyt-project.json +++ b/corpora/corpora-of-disordered-speech/phonologyt-project.json @@ -3,7 +3,7 @@ "URL": "https://phonodevelopment.sites.olt.ubc.ca/", "Family": "Corpora of Disordered Speech", "Description": "This corpus is used for investigating the phonological development across languages, and to evaluate intervention outcomes given a nonlinear phonological approach and ultrasound intervention outcomes across speech disorders.", - "Languages": ["eng", "fra", "spa", "cmn", "yue", "slv"], + "Language": ["eng", "fra", "spa", "cmn", "yue", "slv"], "Licence": "CC 4.0 Non-commercial", "Size": ["4 speakers for transcription resource"], "Annotation": ["Phonemic and phonetic transcription"], diff --git a/corpora/corpora-of-disordered-speech/plan-v-aphasia.json b/corpora/corpora-of-disordered-speech/plan-v-aphasia.json index be13c97..dcdc58d 100644 --- a/corpora/corpora-of-disordered-speech/plan-v-aphasia.json +++ b/corpora/corpora-of-disordered-speech/plan-v-aphasia.json @@ -3,7 +3,7 @@ "URL": "https://planv-project.gr/", "Family": "Corpora of Disordered Speech", "Description": "This corpus contains spoken discourse data collected from Greek-speaking People with Aphasia (PWA) and from neurotypical adults.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY 4.0", "Size": ["1.84 MB"], "Annotation": ["Sentence", "utterance", "clause", "POS"], diff --git a/corpora/corpora-of-disordered-speech/polish-cued.json b/corpora/corpora-of-disordered-speech/polish-cued.json index 49405ba..fc1770e 100644 --- a/corpora/corpora-of-disordered-speech/polish-cued.json +++ b/corpora/corpora-of-disordered-speech/polish-cued.json 
@@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/dbcd8568-d17d-4861-94bb-aa553e943399", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of recordings of the DIA (Dutch Intelligibilty Assessment).\nThe corpus also contains a variety of other samples like reading passages, isolated sentences and recordings of spontaneous speech.\nThe corpus contains samples of 187 speakers with a speech disorder and samples of 122 speakers without a speech disorder. ", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "open access or through email request for access", "Size": ["20 children (11 girls and 9 boys)"], "Annotation": ["CHAT format"], diff --git a/corpora/corpora-of-disordered-speech/psychosisbank.json b/corpora/corpora-of-disordered-speech/psychosisbank.json index de0993b..8e69c45 100644 --- a/corpora/corpora-of-disordered-speech/psychosisbank.json +++ b/corpora/corpora-of-disordered-speech/psychosisbank.json @@ -3,7 +3,7 @@ "URL": "https://psychosis.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus intended for the study of language in psychosis.\nThe site is noted as under construction.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["English (various dialects)", "spa"], + "Language": ["English (various dialects)", "spa"], "Licence": "email request for access", "Size": ["Not available"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/raput.json b/corpora/corpora-of-disordered-speech/raput.json index abd23e1..af3a0cc 100644 --- a/corpora/corpora-of-disordered-speech/raput.json +++ b/corpora/corpora-of-disordered-speech/raput.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1435", "Family": "Corpora of Disordered Speech", "Description": "The corpus consists of texts produced by nonprofessional typical speakers and speakers with different language disorders (developmental language disorder, dyslexia, traumatic brain injury, aphasia, other).\nRoughly half of the corpus consists of texts of typical speakers, and the other half of speakers with language disorders.\nLanguage samples were elicited by six groups of tasks representing different writing styles (descriptive, expository, narrative, and letter) and different levels of formality.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC-BY-SA 4.0", "Size": ["6760 texts", "34469 sentences", "426187 tokens"], "Annotation": ["MULTEXT-East tagset"], diff --git a/corpora/corpora-of-disordered-speech/rhdbank.json b/corpora/corpora-of-disordered-speech/rhdbank.json index 1b93e77..3c6de75 100644 --- a/corpora/corpora-of-disordered-speech/rhdbank.json +++ b/corpora/corpora-of-disordered-speech/rhdbank.json @@ -3,7 +3,7 @@ "URL": "https://rhd.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in people with Right Hemisphere Damage (RHD).\nAccess to the data in RHDBank is password protected and restricted to members of the RHDBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng", "spa"], + "Language": ["eng", "spa"], "Licence": "email request for access", "Size": ["30 MB transcripts", "28 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/seed.json b/corpora/corpora-of-disordered-speech/seed.json index 8c16353..4c293e0 100644 --- a/corpora/corpora-of-disordered-speech/seed.json +++ b/corpora/corpora-of-disordered-speech/seed.json @@ -3,7 +3,7 @@ "URL": "https://osf.io/ygc8n/", "Family": "Corpora of Disordered Speech", "Description": "This corpus includes recordings of single words and continuous speech samples that provide examples of speakers with and without speech disorders.", - "Languages": ["English (American)"], + "Language": ["English (American)"], "Licence": "Access by registration", "Size": [], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json b/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json index d72e3cd..cb695b1 100644 --- a/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json +++ b/corpora/corpora-of-disordered-speech/sli-ru-kentalis.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-97AF29EA-877D-422A-BAF7-25FA269351A6", "Family": "Corpora of Disordered Speech", "Description": "The corpus has been collected to investigate of the expression of spatial relations by children with SLI and normally developing children in their spoken language production. 
", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB (Transcriptions), CLARIN RESTRICTED (Recordings)", "Size": ["2 GB"], "Annotation": ["Praat transcripts"], diff --git a/corpora/corpora-of-disordered-speech/ssnce-tamil.json b/corpora/corpora-of-disordered-speech/ssnce-tamil.json index 78049c8..103dc99 100644 --- a/corpora/corpora-of-disordered-speech/ssnce-tamil.json +++ b/corpora/corpora-of-disordered-speech/ssnce-tamil.json @@ -3,7 +3,7 @@ "URL": "https://catalog.ldc.upenn.edu/LDC2021S04", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of Tamil Dysarthric Speech.\nThe corpus contains approximately eight hours of Tamil speech data, time-aligned transcripts and metadata collected from 30 speakers (20 dysarthric speakers and 10 non-dysarthric speakers).\nThe non-dysarthric speakers consisted of five female and five male subjects. The dysarthric speakers (7 female, 13 male) reported a diagnosis of cerebral palsy and ranged in age from 12 years old to 37 years ol.\nIn total, each speaker recorded 365 utterances consisting of single words and of sentences that included a combination of common and uncommon Tamil phrases.\nThe corpus includes time-aligned phonetic transcripts for all collected speech data. Additional documentation includes phoneme mappings and speaker metadata. Audio data is presented as 16-bit 16kHz FLAC compressed linear pcm wav. 
Transcripts are presented as UTF-8 encoded plain text.", - "Languages": ["tam"], + "Language": ["tam"], "Licence": "LDC", "Size": ["30 speakers"], "Annotation": ["phonetic"], diff --git a/corpora/corpora-of-disordered-speech/star-sentences.json b/corpora/corpora-of-disordered-speech/star-sentences.json index 9b70322..b08d0b3 100644 --- a/corpora/corpora-of-disordered-speech/star-sentences.json +++ b/corpora/corpora-of-disordered-speech/star-sentences.json @@ -3,7 +3,7 @@ "URL": "https://www.seeingspeech.ac.uk/speechstar/disordered-child-speech-sentences-database/", "Family": "Corpora of Disordered Speech", "Description": "This is a collection of multiple audio-articulatory speech-disorder corpora.\nDatabase items are composite videos containing (i) midsagittal tongue movement, imaged with ultrasound tongue imaging (UTI), (ii) optional profile lip movement, recorded with a headset-mounted camera, and (iii) synchronised audio.\nRecordings in this database are of sentences produced by child speakers (aged 6,1-13,4) who were either reading orthographic stimuli from a screen, or repeating sentences produced by a researcher. 
Diagnoses are based on clinicians' reports.", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "CC BY-NC-ND", "Size": ["18 speakers"], "Annotation": ["orthographic", "phonemic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/star-speech-error.json b/corpora/corpora-of-disordered-speech/star-speech-error.json index 85a649b..9b4b3ec 100644 --- a/corpora/corpora-of-disordered-speech/star-speech-error.json +++ b/corpora/corpora-of-disordered-speech/star-speech-error.json @@ -3,7 +3,7 @@ "URL": "https://www.seeingspeech.ac.uk/speechstar/child-speech-error-database/", "Family": "Corpora of Disordered Speech", "Description": "This is a collection of multiple audio-articulatory speech disorder corpora.\nThe corpus is constituted of composite videos containing (i) midsagittal tongue movement, imaged with ultrasound tongue imaging (UTI), (ii) optional profile lip movement, recorded with a headset-mounted camera, and (iii) synchronised audio.\nRecordings in this database are of single words, or short phrases, produced by child speakers who were either reading orthographic stimuli from a screen, naming pictures, or repeating words produced by a researcher. 
Phonemic transcriptions are provided in order that those who are not familiar with the (rhotic) central Scottish accent can be aware of the speech sound targets.", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "CC BY-NC-ND", "Size": ["162 audio files"], "Annotation": ["orthographic", "phonemic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/tbibank.json b/corpora/corpora-of-disordered-speech/tbibank.json index 70a3873..ece2c80 100644 --- a/corpora/corpora-of-disordered-speech/tbibank.json +++ b/corpora/corpora-of-disordered-speech/tbibank.json @@ -3,7 +3,7 @@ "URL": "https://tbi.talkbank.org/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of multimedia interactions for the study of communication in people with traumatic brain injury.\nAccess to the data in TBIBank is password protected and restricted to members of the TBIBank consortium group.\nData in TalkBank use a consistent XML-compatible representation called CHAT. 
All of the data is transcribed in CHAT and CA/CHAT formats.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "email request for access", "Size": ["63 MB transcripts", "98 GB media"], "Annotation": ["CHAT and CA/CHAT"], diff --git a/corpora/corpora-of-disordered-speech/torgo.json b/corpora/corpora-of-disordered-speech/torgo.json index addd044..725894a 100644 --- a/corpora/corpora-of-disordered-speech/torgo.json +++ b/corpora/corpora-of-disordered-speech/torgo.json @@ -3,7 +3,7 @@ "URL": "http://www.cs.toronto.edu/~complingweb/data/TORGO/torgo.html", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of dysarthric articulation and consists of aligned acoustics and measured 3D articulatory features from speakers with either cerebral palsy (CP) or amyotrophic lateral sclerosis (ALS), which are two of the most prevalent causes of speech disability, and matched controls.\nThis dataset contains 2000 samples for dysarthric males, dysarthric females, non-dysarthric males, and non-dysarthric females.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY", "Size": ["Originally TORGO database contains 18GB of data"], "Annotation": [], diff --git a/corpora/corpora-of-disordered-speech/uclass.json b/corpora/corpora-of-disordered-speech/uclass.json index d40a012..7f303df 100644 --- a/corpora/corpora-of-disordered-speech/uclass.json +++ b/corpora/corpora-of-disordered-speech/uclass.json @@ -3,7 +3,7 @@ "URL": "https://www.uclass.psychol.ucl.ac.uk/", "Family": "Corpora of Disordered Speech", "Description": "This corpus consists of data from a study by Howell, Davis, Bartrip, and Wormald (2004).\nThe study looked at the fluency-enhancing effects of speaking at the same time as a frequency shifted version of the voice.\nThere were 14 speakers and four recording per speaker making 56 files in all. 
Recording are in SFS format.\nThe four recordings for a speaker were for two texts and two readings of each text.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "open access", "Size": ["56 files"], "Annotation": ["None"], diff --git a/corpora/corpora-of-disordered-speech/ultraphonix.json b/corpora/corpora-of-disordered-speech/ultraphonix.json index 90b704f..aefc934 100644 --- a/corpora/corpora-of-disordered-speech/ultraphonix.json +++ b/corpora/corpora-of-disordered-speech/ultraphonix.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/uxssd/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound and audio recordings from children with speech sound disorders. It contains data from 20 speakers (16 male, 4 female), aged 6-13 years. ", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["19 hours"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/ultrax-2020.json b/corpora/corpora-of-disordered-speech/ultrax-2020.json index 681f67a..c829802 100644 --- a/corpora/corpora-of-disordered-speech/ultrax-2020.json +++ b/corpora/corpora-of-disordered-speech/ultrax-2020.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/ux2020/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound tongue imaging and audio data, gathered from children with speech sound disorders by speech and language therapists in hospital environments.\n11 female speakers and 26 male, aged 5-12 years. There is one recording per child.\nThe following metadata are available for each recording: speech waveform, raw ultrasound data, ultrasound parameters, and prompt text with date/time of utterance recording. 
", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["37 speakers"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/corpora-of-disordered-speech/ultrax-disorders.json b/corpora/corpora-of-disordered-speech/ultrax-disorders.json index 2c9c286..00f81ed 100644 --- a/corpora/corpora-of-disordered-speech/ultrax-disorders.json +++ b/corpora/corpora-of-disordered-speech/ultrax-disorders.json @@ -3,7 +3,7 @@ "URL": "https://ultrasuite.github.io/data/uxssd/", "Family": "Corpora of Disordered Speech", "Description": "This is a corpus of ultrasound and audio recordings from children with speech sound disorders.\nIt contains data from 8 speakers (2 female and 6 male), aged 5-10 years. ", - "Languages": ["English (Scottish)"], + "Language": ["English (Scottish)"], "Licence": "open access", "Size": ["11 hours"], "Annotation": ["Orthographic", "phonetic"], diff --git a/corpora/historical-corpora/15th-nt-trans.json b/corpora/historical-corpora/15th-nt-trans.json index a337130..86e486a 100644 --- a/corpora/historical-corpora/15th-nt-trans.json +++ b/corpora/historical-corpora/15th-nt-trans.json @@ -3,7 +3,7 @@ "URL": "http://stnt.ijp.pan.pl/", "Family": "Historical corpora", "Description": "This corpus contains Biblical texts from 1380 to 1500.\nThis corpus is available through a dedicated concordancer.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": ["400,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/17th-18th-polish.json b/corpora/historical-corpora/17th-18th-polish.json index dd6949d..7a5276a 100644 --- a/corpora/historical-corpora/17th-18th-polish.json +++ b/corpora/historical-corpora/17th-18th-polish.json @@ -3,7 +3,7 @@ "URL": "https://www.korba.edu.pl/query_corpus/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1601 to 1772.\nThe corpus is available through a dedicated concordancer.\nA 
manually annotated subset is available here.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["13.5 million tokens"], "Annotation": ["tokenised", "partially PoS-tagged", "structural annotation"], diff --git a/corpora/historical-corpora/19th-polish.json b/corpora/historical-corpora/19th-polish.json index bf27cd3..440f37c 100644 --- a/corpora/historical-corpora/19th-polish.json +++ b/corpora/historical-corpora/19th-polish.json @@ -3,7 +3,7 @@ "URL": "http://korpus19.nlp.ipipan.waw.pl/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1830 to 1918.\nThe corpus is available for download through a dedicated webpage.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["625,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "transliteration", "transcription"], diff --git a/corpora/historical-corpora/agricola-db.json b/corpora/historical-corpora/agricola-db.json index d87d1b9..71bacd1 100644 --- a/corpora/historical-corpora/agricola-db.json +++ b/corpora/historical-corpora/agricola-db.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730170", "Family": "Historical corpora", "Description": "This corpus contains texts from 1544 to 1551 written by the clergyman Mikael Agricola.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-ND", "Size": ["428,300 tokens"], "Annotation": ["tokenised", "PoS-tagged", "morphological components and syntactic function"], diff --git a/corpora/historical-corpora/aleksis-kivi.json b/corpora/historical-corpora/aleksis-kivi.json index dff8c1c..16c8a4b 100644 --- a/corpora/historical-corpora/aleksis-kivi.json +++ b/corpora/historical-corpora/aleksis-kivi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405274", "Family": "Historical corpora", "Description": "This corpus contains the works by Finnish author Aleksis Kivi from 1855 to 1871.\nThe corpus is available 
through the concordancer Korp.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY-NC", "Size": ["413,700 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/anno-cuneiform.json b/corpora/historical-corpora/anno-cuneiform.json index 94b5109..403e102 100644 --- a/corpora/historical-corpora/anno-cuneiform.json +++ b/corpora/historical-corpora/anno-cuneiform.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2018071121", "Family": "Historical corpora", "Description": "This corpus contains cuneiform texts from Ancient history.\nThe texts come from the Oracc project and include collections such as the Corpus of Ancient Mesopotamian Scholarship, The Digital Corpus of Cuneiform Lexical Texts, and Royal Inscriptions of Babylonia online.\nThe corpus is available through the concordancer Korp and for download from the repository of FIN-CLARIN.", - "Languages": ["akk"], + "Language": ["akk"], "Licence": "CC-BY-SA", "Size": ["1,600,563 tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged", "semantically annotated"], diff --git a/corpora/historical-corpora/anth-mid-eng.json b/corpora/historical-corpora/anth-mid-eng.json index 0a3c5ac..9c7c1b6 100644 --- a/corpora/historical-corpora/anth-mid-eng.json +++ b/corpora/historical-corpora/anth-mid-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1398", "Family": "Historical corpora", "Description": "This corpus contains literary texts from 1100 to 1400.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["enm","heb"], + "Language": ["enm","heb"], "Licence": "Oxford Text Archive licence", "Size": ["4000 words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/archer.json b/corpora/historical-corpora/archer.json index 9ff21a4..c3686de 100644 --- a/corpora/historical-corpora/archer.json +++ b/corpora/historical-corpora/archer.json @@ -3,7 +3,7 @@ "URL": 
"http://www.projects.alc.manchester.ac.uk/archer/", "Family": "Historical corpora", "Description": "The corpus contains texts from 1600 to 1999.\nThe corpus is available through the CQPConcordancer. ", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/austrian-baroque.json b/corpora/historical-corpora/austrian-baroque.json index 3566d2f..15d6a0c 100644 --- a/corpora/historical-corpora/austrian-baroque.json +++ b/corpora/historical-corpora/austrian-baroque.json @@ -3,7 +3,7 @@ "URL": "https://acdh.oeaw.ac.at/abacus/", "Family": "Historical corpora", "Description": "This corpus contains sermons from 1650 to 1750.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["200,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "named entities"], diff --git a/corpora/historical-corpora/b4-hist-preach.json b/corpora/historical-corpora/b4-hist-preach.json index 932950c..c009591 100644 --- a/corpora/historical-corpora/b4-hist-preach.json +++ b/corpora/historical-corpora/b4-hist-preach.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B23-A", "Family": "Historical corpora", "Description": "This corpus contains sermons from an Upper German (Balvarian-Alemannic) dialect area.\nThe corpus is available for download from the repository of the University of Hamburg and through the ANNIS environment.", - "Languages": ["gmh"], + "Language": ["gmh"], "Licence": "CLARIN ACA", "Size": ["92,500 tokens"], "Annotation": ["tokenised", "syntactic and discursive annotation"], diff --git a/corpora/historical-corpora/b4-ludolf.json b/corpora/historical-corpora/b4-ludolf.json index 2830e14..7afcac1 100644 --- a/corpora/historical-corpora/b4-ludolf.json +++ b/corpora/historical-corpora/b4-ludolf.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B22-B", "Family": "Historical corpora", 
"Description": "This corpus contains texts from a journey diary from 1350.\nThe corpus is available for download from the repository of the University of Hamburg and through the ANNIS environment.", - "Languages": ["gmh"], + "Language": ["gmh"], "Licence": "CLARIN ACA", "Size": ["6,690 tokens"], "Annotation": ["tokenised", "tagged for clause type and grammatical function"], diff --git a/corpora/historical-corpora/b4-tatian.json b/corpora/historical-corpora/b4-tatian.json index b801bda..4c1b068 100644 --- a/corpora/historical-corpora/b4-tatian.json +++ b/corpora/historical-corpora/b4-tatian.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B1E-1", "Family": "Historical corpora", "Description": "This corpus contains the OHG Tatian, which is one of the largest prose texts from the Old High German period.\nThe corpus is available for download and through a concordancer from the repository of the University of Hamburg.", - "Languages": ["lat", "goh"], + "Language": ["lat", "goh"], "Licence": "CC-BY", "Size": ["11,300 tokens"], "Annotation": ["tokenised", "MSD-tagged"], diff --git a/corpora/historical-corpora/bib-text-scots.json b/corpora/historical-corpora/bib-text-scots.json index 554b8d0..dd73bb9 100644 --- a/corpora/historical-corpora/bib-text-scots.json +++ b/corpora/historical-corpora/bib-text-scots.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1713", "Family": "Historical corpora", "Description": "This corpus contains Biblical texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["sco"], + "Language": ["sco"], "Licence": "Oxford Text Archive licence", "Size": ["35,506 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/brieven-buit.json b/corpora/historical-corpora/brieven-buit.json index 7d0a980..fb84cd2 100644 --- a/corpora/historical-corpora/brieven-buit.json +++ b/corpora/historical-corpora/brieven-buit.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/10032/f6d68fed217ef7364a32c431396ac465", "Family": "Historical corpora", "Description": "This corpus contains 40,000 letters from the 17th to the 19th century.\nThese letters were sent home by sailors and others from abroad but also vice versa by those staying behind who needed to keep in touch with their loved ones. Many letters did not reach their destinations: they were taken as loot by privateers and confiscated by the High Court of Admiralty during the wars fought between The Netherlands and England\nThe corpus is available through a dedicated concordancer.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB", "Size": ["460,000 words"], "Annotation": ["lemmatised", "PoS-tagged", "grammatically tagged"], diff --git a/corpora/historical-corpora/bundesblatt.json b/corpora/historical-corpora/bundesblatt.json index a473d63..5078ccf 100644 --- a/corpora/historical-corpora/bundesblatt.json +++ b/corpora/historical-corpora/bundesblatt.json @@ -3,7 +3,7 @@ "URL": "https://feuille-federale.unige.ch/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1849 to 2014.\nThe corpus is available through the CQPWeb concordancer.", - "Languages": ["deu","fra","ita"], + "Language": ["deu","fra","ita"], "Licence": "", "Size": ["203,585,806 tokens (German)", "239,125,036 tokens (French)", "85,223,085 tokens (Italian)"], "Annotation": ["tokenised", "syntactically-parsed"], diff --git a/corpora/historical-corpora/carniolan-pa.json b/corpora/historical-corpora/carniolan-pa.json index 8eda22e..d385795 100644 --- a/corpora/historical-corpora/carniolan-pa.json +++ b/corpora/historical-corpora/carniolan-pa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1824", "Family": "Historical corpora", "Description": "The corpus contains meeting proceedings of 694 sessions of the Carniolan Provincial Assembly from 1861 to 1913.\nThe source data (scanned and OCR processed pdf documents) originally come from The Digital 
Library of Slovenia dLib.si and History of Slovenia - SIstory portals. The documents are bilingual, in Slovenian and German, depending on the speaker. German was first typeset in the Gothic script and later on in Latin.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Language was detected on the sentence level, roughly 58% sentences are in Slovenian and 42% in German. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using Trankit for Slovenian and German, while Lingua is used for language detection.\nThe documents are in the Parla-CLARIN compliant TEI XML format. Each session in one file.", - "Languages": ["deu", "slv"], + "Language": ["deu", "slv"], "Licence": "CC-BY 4.0", "Size": ["10.9 million words"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/ced.json b/corpora/historical-corpora/ced.json index 30c27a2..96f9847 100644 --- a/corpora/historical-corpora/ced.json +++ b/corpora/historical-corpora/ced.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2507", "Family": "Historical corpora", "Description": "This corpus contains dialogues from literary and didactic works from 1560 to 1760.\n There are five text-types in the CED. The text-types representative of constructed dialogue are drama comedy, didactic works (language manuals and other handbooks) and fiction; the text-types representative of authentic dialogue are trial proceedings and witness depositions. 
In addition, a small group of miscellaneous dialogic texts is included in the collection.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["1.2 million words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/ceecs.json b/corpora/historical-corpora/ceecs.json index d38413b..3a6673f 100644 --- a/corpora/historical-corpora/ceecs.json +++ b/corpora/historical-corpora/ceecs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2461", "Family": "Historical corpora", "Description": "This corpus contains 1147 letters from 1418 to 1680.\nThe corpus was created from the larger Corpus of Early English Correspondence.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["450,000 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/chroniclItaly.json b/corpora/historical-corpora/chroniclItaly.json index 3aa3b28..57a8d3c 100644 --- a/corpora/historical-corpora/chroniclItaly.json +++ b/corpora/historical-corpora/chroniclItaly.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.24416/uu01-t4ymow", "Family": "Historical corpora", "Description": "This corpus contains Italian language newspapers published in the United States between 1898 and 1920. The corpus includes seven Italian language newspapers published in California, Massachusetts, Pennsylvania, Vermont, and West Virginia. 
The collection includes the following titles: L’Italia, Cronaca sovversiva, La libera parola, The patriot, La ragione, La rassegna, and La sentinella del West Virginia.\nThe corpus is available for download from the repository of the University of Utrecht.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "ODC Attribution License (ODC-By)", "Size": ["16.6 million words"], "Annotation": ["unannotated"], diff --git a/corpora/historical-corpora/chronopress.json b/corpora/historical-corpora/chronopress.json index 7d90141..47423d4 100644 --- a/corpora/historical-corpora/chronopress.json +++ b/corpora/historical-corpora/chronopress.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/260", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1945 to 1954.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY-SA", "Size": ["16 million tokens"], "Annotation": [], diff --git a/corpora/historical-corpora/cipm.json b/corpora/historical-corpora/cipm.json index 0fa6795..9c4f8ea 100644 --- a/corpora/historical-corpora/cipm.json +++ b/corpora/historical-corpora/cipm.json @@ -3,7 +3,7 @@ "URL": "http://cipm.fcsh.unl.pt/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 9th to the 16th century.\nThe corpus is available through a dedicated concordancer (restricted access).", - "Languages": ["por"], + "Language": ["por"], "Licence": "", "Size": ["2 million tokens"], "Annotation": ["tokenised", "PoS-tagged"], diff --git a/corpora/historical-corpora/class-lib-nat-lib-fi.json b/corpora/historical-corpora/class-lib-nat-lib-fi.json index 7f412bd..fc42b30 100644 --- a/corpora/historical-corpora/class-lib-nat-lib-fi.json +++ b/corpora/historical-corpora/class-lib-nat-lib-fi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2018051701", "Family": "Historical corpora", "Description": "This corpus will contain literary texts 
from 1549 to 1944.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/ddr-press.json b/corpora/historical-corpora/ddr-press.json index b1ecd73..e481055 100644 --- a/corpora/historical-corpora/ddr-press.json +++ b/corpora/historical-corpora/ddr-press.json @@ -3,7 +3,7 @@ "URL": "https://clarin.bbaw.de/en/corpus/", "Family": "Historical corpora", "Description": "This corpus contains newspaper texts from 1945 to 1994.\nThe corpus is available through a concordancer provided by CLARIN-D.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/diacoris.json b/corpora/historical-corpora/diacoris.json index 02649f9..d3aa78c 100644 --- a/corpora/historical-corpora/diacoris.json +++ b/corpora/historical-corpora/diacoris.json @@ -3,7 +3,7 @@ "URL": "http://corpora.dslo.unibo.it/coris_ita.html", "Family": "Historical corpora", "Description": "This corpus contains texts from 1861 to 1945.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/diakorp.json b/corpora/historical-corpora/diakorp.json index 7ee47fb..a74b6e7 100644 --- a/corpora/historical-corpora/diakorp.json +++ b/corpora/historical-corpora/diakorp.json @@ -3,7 +3,7 @@ "URL": "http://wiki.korpus.cz/doku.php/en:cnk:diakorp", "Family": "Historical corpora", "Description": "This corpus contains texts from the 14th to the 20th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY-NC-SA", "Size": ["4 million tokens"], "Annotation": ["basic structural markup"], diff --git a/corpora/historical-corpora/dig-hist-slovene.json b/corpora/historical-corpora/dig-hist-slovene.json index dfb3bd2..e81471b 100644 --- 
a/corpora/historical-corpora/dig-hist-slovene.json +++ b/corpora/historical-corpora/dig-hist-slovene.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1031", "Family": "Historical corpora", "Description": "This corpus contains 658 unique texts from 1584 to 1919.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancer KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY-SA 4.0", "Size": ["17.7 million tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged"], diff --git a/corpora/historical-corpora/diorisis-ancient-greek.json b/corpora/historical-corpora/diorisis-ancient-greek.json index ae32eb0..7394b7d 100644 --- a/corpora/historical-corpora/diorisis-ancient-greek.json +++ b/corpora/historical-corpora/diorisis-ancient-greek.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-4769", "Family": "Historical corpora", "Description": "This corpus consists of 820 texts spanning between the beginnings of the Ancient Greek literary tradition (Homer) to the fifth century AD.\nThe texts are sourced from the Perseus Canonical Greek Lit Repository, \"The Little Sailing\" digital library, and the Bibliotheca Augustana digital library.\nThe corpus is available for download from Figshare.", - "Languages": ["grc"], + "Language": ["grc"], "Licence": "CC BY 4.0", "Size": ["10.2 million words"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/doec.json b/corpora/historical-corpora/doec.json index 2f7a906..c3d11ce 100644 --- a/corpora/historical-corpora/doec.json +++ b/corpora/historical-corpora/doec.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2488", "Family": "Historical corpora", "Description": "This corpus contains 3037 texts from 600 to 1150.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang","lat"], + "Language": ["ang","lat"], "Licence": "Oxford Text Archive licence", "Size": [], "Annotation": 
["no linguistic annotation"], diff --git a/corpora/historical-corpora/dta.json b/corpora/historical-corpora/dta.json index b7ddbdc..6576c96 100644 --- a/corpora/historical-corpora/dta.json +++ b/corpora/historical-corpora/dta.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11120/0000-0005-0ABA-F", "Family": "Historical corpora", "Description": "This corpus contains texts from the 17th to the 20th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN PUB", "Size": ["215,168,761 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/early-modern-fi.json b/corpora/historical-corpora/early-modern-fi.json index ad47d1c..00ee32e 100644 --- a/corpora/historical-corpora/early-modern-fi.json +++ b/corpora/historical-corpora/early-modern-fi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730147", "Family": "Historical corpora", "Description": "This corpus contains texts from 1809 to 1899.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin", "rus", "deu", "lat"], + "Language": ["fin", "rus", "deu", "lat"], "Licence": "EUPL v.1.1 SA", "Size": ["8.6 million words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/ecco-tcp.json b/corpora/historical-corpora/ecco-tcp.json index 73c80e6..f123611 100644 --- a/corpora/historical-corpora/ecco-tcp.json +++ b/corpora/historical-corpora/ecco-tcp.json @@ -3,7 +3,7 @@ "URL": "https://textcreationpartnership.org/tcp-texts/ecco-tcp-eighteenth-century-collections-online/", "Family": "Historical corpora", "Description": "This corpus contains texts (literature, philosophy, politics, religion, geography, science and all other areas of human endeavour) from 1700 to 1800.\nThe corpus is available for download from a dedicated webpage and through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": 
"CC-0", "Size": ["74 million tokens"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/edinburgh-dost.json b/corpora/historical-corpora/edinburgh-dost.json index 304a1dc..0a40140 100644 --- a/corpora/historical-corpora/edinburgh-dost.json +++ b/corpora/historical-corpora/edinburgh-dost.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/0701", "Family": "Historical corpora", "Description": "This corpus contains texts from 1450 to 1600.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["877,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/eebo-tcp.json b/corpora/historical-corpora/eebo-tcp.json index 380eff6..dba9d9b 100644 --- a/corpora/historical-corpora/eebo-tcp.json +++ b/corpora/historical-corpora/eebo-tcp.json @@ -3,7 +3,7 @@ "URL": "https://textcreationpartnership.org/tcp-texts/eebo-tcp-early-english-books-online/", "Family": "Historical corpora", "Description": "This corpus contains texts (literature, philosophy, politics, religion, geography, science and all other areas of human endeavour) from 1450 to 1750.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-0", "Size": ["766 million tokens"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/efontes.json b/corpora/historical-corpora/efontes.json index 2ca177b..2c07f5e 100644 --- a/corpora/historical-corpora/efontes.json +++ b/corpora/historical-corpora/efontes.json @@ -3,7 +3,7 @@ "URL": "http://scriptores.pl/efontes/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 11th to the middle of the 16th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": ["5 million tokens"], "Annotation": 
["tokenised", "lemmatised"], diff --git a/corpora/historical-corpora/en-nw-late-modern.json b/corpora/historical-corpora/en-nw-late-modern.json index 565222b..94e8727 100644 --- a/corpora/historical-corpora/en-nw-late-modern.json +++ b/corpora/historical-corpora/en-nw-late-modern.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2468", "Family": "Historical corpora", "Description": "This corpus contains texts from 1761 to 1790.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["300,000 words"], "Annotation": ["COCOA-style"], diff --git a/corpora/historical-corpora/evans-tcp.json b/corpora/historical-corpora/evans-tcp.json index 889dcd1..596a87e 100644 --- a/corpora/historical-corpora/evans-tcp.json +++ b/corpora/historical-corpora/evans-tcp.json @@ -3,7 +3,7 @@ "URL": "https://textcreationpartnership.org/tcp-texts/evans-tcp-evans-early-american-imprints/", "Family": "Historical corpora", "Description": "This corpus contains American texts from 1640 to 1821.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-0", "Size": ["766 million tokens"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/fin-classics.json b/corpora/historical-corpora/fin-classics.json index 7a4ade7..fe4692b 100644 --- a/corpora/historical-corpora/fin-classics.json +++ b/corpora/historical-corpora/fin-classics.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730186", "Family": "Historical corpora", "Description": "This corpus contains literary texts from 1880 to 1949.\nIn terms of genre, the texts correspond to prose fiction, plays, poetry and aphorisms.\nThe corpus is available through the concordancer Korp (FIN-CLARIN).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "EUPL v.1.1 SA", "Size": ["1.5 million words"], "Annotation": [], 
diff --git a/corpora/historical-corpora/fin-folk.json b/corpora/historical-corpora/fin-folk.json index 1769380..467c143 100644 --- a/corpora/historical-corpora/fin-folk.json +++ b/corpora/historical-corpora/fin-folk.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052712", "Family": "Historical corpora", "Description": "This corpus contains poems from 1564 to 1939.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], + "Language": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], "Licence": "CC-BY-NC", "Size": ["7.1 million words"], "Annotation": ["normalised (added diacritics)"], diff --git a/corpora/historical-corpora/fin-gutenberg.json b/corpora/historical-corpora/fin-gutenberg.json index 22a4bb2..8d7cdd2 100644 --- a/corpora/historical-corpora/fin-gutenberg.json +++ b/corpora/historical-corpora/fin-gutenberg.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014100301", "Family": "Historical corpora", "Description": "This corpus contains books published up to 1925 that are made available through the Gutenberg project.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY", "Size": ["34.5 million words"], "Annotation": [], diff --git a/corpora/historical-corpora/fin-news-periodicals.json b/corpora/historical-corpora/fin-news-periodicals.json index a60e249..f61f0d8 100644 --- a/corpora/historical-corpora/fin-news-periodicals.json +++ b/corpora/historical-corpora/fin-news-periodicals.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016050302", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1840 to 2011.\nFor a comprehensive list of newspapers included in the corpus, see here.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC_BY-SA", "Size": ["5.2 billion tokens"], 
"Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/frantext.json b/corpora/historical-corpora/frantext.json index cb48215..eaa105b 100644 --- a/corpora/historical-corpora/frantext.json +++ b/corpora/historical-corpora/frantext.json @@ -3,7 +3,7 @@ "URL": "https://www.frantext.fr/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 10th to the 21st century.\nThe corpus is available through a dedicated concordancer (restricted access).", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "", "Size": ["300 million words"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/germanc.json b/corpora/historical-corpora/germanc.json index 3006ecb..4882cda 100644 --- a/corpora/historical-corpora/germanc.json +++ b/corpora/historical-corpora/germanc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2544", "Family": "Historical corpora", "Description": "This corpus contains personal letters, sermons and fictional, scholarly (i.e., humanities), scientific and legal texts from 1650 to 1800.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["700,000 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/grek-medieval.json b/corpora/historical-corpora/grek-medieval.json index 077dc76..c33f456 100644 --- a/corpora/historical-corpora/grek-medieval.json +++ b/corpora/historical-corpora/grek-medieval.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-251D-7", "Family": "Historical corpora", "Description": "This corpus contains texts from the 4th to the 16th century.\nThe texts belong to the following categories: religious, poetical-literary, political, and historical texts, as well as hymns and epigrams.\nThe corpus is available for download from the clarin:el repository. 
", - "Languages": ["grc"], + "Language": ["grc"], "Licence": "CC-BY", "Size": ["3.4 million words"], "Annotation": [], diff --git a/corpora/historical-corpora/gysseling.json b/corpora/historical-corpora/gysseling.json index 6bf9354..b8acfe8 100644 --- a/corpora/historical-corpora/gysseling.json +++ b/corpora/historical-corpora/gysseling.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-j4", "Family": "Historical corpora", "Description": "This corpus contains texts from the 13th century.\nThe texts were prepared and originally published in the 1970s and 1980s by the Ghent linguist Maurits Gysseling.\nThe corpus is available for download from the Instituut voor de Nederlandse Taal and through a dedicated concordancer.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "INT Licence for researchers", "Size": ["1.5 million words"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/hacossa.json b/corpora/historical-corpora/hacossa.json index a2638b7..eb535b1 100644 --- a/corpora/historical-corpora/hacossa.json +++ b/corpora/historical-corpora/hacossa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9D16-7", "Family": "Historical corpora", "Description": "This corpus contains texts written in the Late Old Swedish period (from 1375 to 1550).\nThe corpus is available for download from the repository of the University of Hamburg.", - "Languages": ["eng", "deu", "lat", "non", "swe"], + "Language": ["eng", "deu", "lat", "non", "swe"], "Licence": "CLARIN RES", "Size": ["128,000 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/hansard.json b/corpora/historical-corpora/hansard.json index 75b3d16..f855320 100644 --- a/corpora/historical-corpora/hansard.json +++ b/corpora/historical-corpora/hansard.json @@ -3,7 +3,7 @@ "URL": "https://www.clarin.ac.uk/hansard-corpus", "Family": "Historical corpora", "Description": "This corpus contains parliamentary 
debates from 1803 to 2005.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["1.6 billion tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "semantic tags"], diff --git a/corpora/historical-corpora/helsinki-eng.json b/corpora/historical-corpora/helsinki-eng.json index 4dbfc1a..50e25a8 100644 --- a/corpora/historical-corpora/helsinki-eng.json +++ b/corpora/historical-corpora/helsinki-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1477", "Family": "Historical corpora", "Description": "This corpus contains religious and fictional texts from 730 to 1710.\nSee the project page for a list of all the texts included in the corpus.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["English (Old)", "English (Middle)"], + "Language": ["English (Old)", "English (Middle)"], "Licence": "Oxford Text Archive licence", "Size": ["240,000 words"], "Annotation": [], diff --git a/corpora/historical-corpora/helsinki-old-scot.json b/corpora/historical-corpora/helsinki-old-scot.json index 765da34..53a85a5 100644 --- a/corpora/historical-corpora/helsinki-old-scot.json +++ b/corpora/historical-corpora/helsinki-old-scot.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2081", "Family": "Historical corpora", "Description": "This corpus contains texts of different domains and genres (e.g., burgh records, diaries, pamphlets, scientific treatises, sermons) from 1450 to 1700.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["sco"], + "Language": ["sco"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["1,940,706 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/helsinki-scot.json b/corpora/historical-corpora/helsinki-scot.json index 471c1a4..afc35ed 100644 --- a/corpora/historical-corpora/helsinki-scot.json +++ b/corpora/historical-corpora/helsinki-scot.json @@ -3,7 +3,7 
@@ "URL": "http://urn.fi/urn:nbn:fi:lb-201411071", "Family": "Historical corpora", "Description": "This corpus contains personal correspondence from 1540 to 1750.\nthe corpus consists of transcripts of original letter manuscripts. The texts are reproduced without any modernisation or normalisation. Language-external variables such as date, region, gender, addressee, hand and script type have been coded.\nThe writers originate from fifteen different regions of Scotland. A fifth of the correspondents in the corpus are women.\nThe corpus is available through the concordancer Korp.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN ACA", "Size": ["500,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/hist-am-eng.json b/corpora/historical-corpora/hist-am-eng.json index ccfb90e..1449b1f 100644 --- a/corpora/historical-corpora/hist-am-eng.json +++ b/corpora/historical-corpora/hist-am-eng.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017061925", "Family": "Historical corpora", "Description": "This corpus contains texts from 1810 to 2009.\nEach decade has roughly the same balance of fiction, popular magazine, newspaper, and non-fiction books.\nThe corpus is available through the concordancer Korp.", - "Languages": ["English (American)"], + "Language": ["English (American)"], "Licence": "CLARIN ACA", "Size": ["385 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/hist-lancaster.json b/corpora/historical-corpora/hist-lancaster.json index 328635c..902fe28 100644 --- a/corpora/historical-corpora/hist-lancaster.json +++ b/corpora/historical-corpora/hist-lancaster.json @@ -3,7 +3,7 @@ "URL": "https://cqpweb.lancs.ac.uk/", "Family": "Historical corpora", "Description": "The corpus contains texts in various domains (e.g., fiction, newspaper texts, religious texts) from 1500 on.\nThe corpus is available through the CQPConcordancer.", - "Languages": ["eng"], + "Language": ["eng"], 
"Licence": "", "Size": [], "Annotation": ["tokenised", "PoS-tagged", "partial semantic tagging (USAS system)"], diff --git a/corpora/historical-corpora/hist-welsh.json b/corpora/historical-corpora/hist-welsh.json index 18dd50d..5e4d924 100644 --- a/corpora/historical-corpora/hist-welsh.json +++ b/corpora/historical-corpora/hist-welsh.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-883", "Family": "Historical corpora", "Description": "This corpus contains 30 texts from 1500 to 1850.\nThe corpus is available for download from a dedicated website and through a dedicated concordancer.", - "Languages": ["cym"], + "Language": ["cym"], "Licence": "", "Size": ["420,000 words"], "Annotation": [], diff --git a/corpora/historical-corpora/hun-courts.json b/corpora/historical-corpora/hun-courts.json index a6c2bec..3accae9 100644 --- a/corpora/historical-corpora/hun-courts.json +++ b/corpora/historical-corpora/hun-courts.json @@ -3,7 +3,7 @@ "URL": "http://tmk.nytud.hu/about.php", "Family": "Historical corpora", "Description": "This corpus contains private letters and testimonies from the 16th to the 18th  century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["850,000 words"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "sociolinguistic metadata"], diff --git a/corpora/historical-corpora/hun-hist.json b/corpora/historical-corpora/hun-hist.json index 4812be1..a93e8ce 100644 --- a/corpora/historical-corpora/hun-hist.json +++ b/corpora/historical-corpora/hun-hist.json @@ -3,7 +3,7 @@ "URL": "http://clara.nytud.hu/mtsz/run.cgi/first_form", "Family": "Historical corpora", "Description": "This corpus contains historical texts from the 18th century to the 2000s.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["30 million words"], "Annotation": [], diff --git 
a/corpora/historical-corpora/impact-gt.json b/corpora/historical-corpora/impact-gt.json index 6347dc3..2dc71fb 100644 --- a/corpora/historical-corpora/impact-gt.json +++ b/corpora/historical-corpora/impact-gt.json @@ -3,7 +3,7 @@ "URL": "https://szukajwslownikach.uw.edu.pl/IMPACT_GT_1/", "Family": "Historical corpora", "Description": "This corpus contains texts from 1570 to 1756.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["1.5 million tokens"], "Annotation": ["transcription"], diff --git a/corpora/historical-corpora/lampeter-tracts.json b/corpora/historical-corpora/lampeter-tracts.json index 91933aa..fc9ada1 100644 --- a/corpora/historical-corpora/lampeter-tracts.json +++ b/corpora/historical-corpora/lampeter-tracts.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/3193", "Family": "Historical corpora", "Description": "This corpus contains tracts from 1640 to 1740.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["50,797,916 words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/lancaster-newsbooks.json b/corpora/historical-corpora/lancaster-newsbooks.json index 482e28d..162058d 100644 --- a/corpora/historical-corpora/lancaster-newsbooks.json +++ b/corpora/historical-corpora/lancaster-newsbooks.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2531", "Family": "Historical corpora", "Description": "This corpus contains two collections of English printed pamphlets, books, and newspapers from 1654 to 1655.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["3,001,604 words"], "Annotation": [], diff --git a/corpora/historical-corpora/late-modern-en-prose.json 
b/corpora/historical-corpora/late-modern-en-prose.json index a46dfe1..5d67dd3 100644 --- a/corpora/historical-corpora/late-modern-en-prose.json +++ b/corpora/historical-corpora/late-modern-en-prose.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2077", "Family": "Historical corpora", "Description": "This corpus contains fictional texts from 1837 to 1926.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["580,056 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/late-modern-en-texts.json b/corpora/historical-corpora/late-modern-en-texts.json index 393b8b1..754cb71 100644 --- a/corpora/historical-corpora/late-modern-en-texts.json +++ b/corpora/historical-corpora/late-modern-en-texts.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0002-43F3-0", "Family": "Historical corpora", "Description": "This corpus contains texts written by British and Irish authors from 1710 to 1920.\nIn terms of genre, the texts correspond to narrative fiction and non-fiction, drama, letters, treatises, and miscellaneous written works.\nThe corpus is available for download from a CLARIN-D repository. ", - "Languages": ["English (Late Modern)"], + "Language": ["English (Late Modern)"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["34 million words"], "Annotation": ["PoS-tagged"], diff --git a/corpora/historical-corpora/latinise.json b/corpora/historical-corpora/latinise.json index 1a3eaf0..3a83668 100644 --- a/corpora/historical-corpora/latinise.json +++ b/corpora/historical-corpora/latinise.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-3170", "Family": "Historical corpora", "Description": "This corpus consists of Latin texts from the 2nd century B.C. to the 21st century. 
Non-linguistic metadata include information on genre, title, century and specific date.\nThe corpus is available for download from LINDAT and for search online through Sketch Engine.", - "Languages": ["lat"], + "Language": ["lat"], "Licence": "CC BY-NC-SA 4.0", "Size": ["13.3 million tokens"], "Annotation": ["sentence segmented", "PoS-tagged", "lemmatized"], diff --git a/corpora/historical-corpora/letter-sinebrychoff.json b/corpora/historical-corpora/letter-sinebrychoff.json index ac34e9b..08576c6 100644 --- a/corpora/historical-corpora/letter-sinebrychoff.json +++ b/corpora/historical-corpora/letter-sinebrychoff.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407303", "Family": "Historical corpora", "Description": "This corpus contains letters from 1895 to 1909.\nThe corpus is available through a dedicated online search environment.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": ["8.6 million words"], "Annotation": ["Finnish subset: MSD-tagged, syntactically parsed; Swedish subset: no linguistic annotation"], diff --git a/corpora/historical-corpora/mannheim-hist.json b/corpora/historical-corpora/mannheim-hist.json index 9dcecff..eb8d0ca 100644 --- a/corpora/historical-corpora/mannheim-hist.json +++ b/corpora/historical-corpora/mannheim-hist.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-01B8-AE41-41A4-DC01-5", "Family": "Newspaper corpora", "Description": "This corpus contains articles from 21 German newspapers from the 18th and 19th century.\nThe corpus is available for download from the CLARIN-D repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["3532 pages", "4.1 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/medi-charter.json b/corpora/historical-corpora/medi-charter.json index 2217bf6..7b543ca 100644 --- a/corpora/historical-corpora/medi-charter.json +++ b/corpora/historical-corpora/medi-charter.json @@ -3,7 +3,7 @@ 
"URL": "http://hdl.handle.net/11234/1-1952", "Family": "Historical corpora", "Description": "This corpus contains Latin charters created in the era of John the Bling, King of Bohemia.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces","lat"], + "Language": ["ces","lat"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["57 chapters"], "Annotation": ["manually-tagged", "named entities"], diff --git a/corpora/historical-corpora/menota.json b/corpora/historical-corpora/menota.json index a3e69fc..6ca0ef3 100644 --- a/corpora/historical-corpora/menota.json +++ b/corpora/historical-corpora/menota.json @@ -3,7 +3,7 @@ "URL": "http://clarino.uib.no/menota/page", "Family": "Historical corpora", "Description": "This corpus contains Medieval Nordic texts.\nThe corpus is available for download and through the concordancer Corpuscle.", - "Languages": ["Old Norse"], + "Language": ["Old Norse"], "Licence": "CC-BY", "Size": ["1.6 million tokens"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/historical-corpora/midia.json b/corpora/historical-corpora/midia.json index bb294d0..ab228f8 100644 --- a/corpora/historical-corpora/midia.json +++ b/corpora/historical-corpora/midia.json @@ -3,7 +3,7 @@ "URL": "http://www.corpusmidia.unito.it/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 13th to the 20th century.\nThe corpus is available through a dedicated concordancer", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC-BY-NC 4.0", "Size": ["7.5 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/news-fin-17-18.json b/corpora/historical-corpora/news-fin-17-18.json index 7c64504..632eb32 100644 --- a/corpora/historical-corpora/news-fin-17-18.json +++ b/corpora/historical-corpora/news-fin-17-18.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015051201", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1771 
to 1874.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/historical-corpora/news-fin-18-19.json b/corpora/historical-corpora/news-fin-18-19.json index 1e3b039..362ff6a 100644 --- a/corpora/historical-corpora/news-fin-18-19.json +++ b/corpora/historical-corpora/news-fin-18-19.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201801192", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1875 to 1920.\nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CLARIN ACA", "Size": ["8.7 billion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/news-fin.json b/corpora/historical-corpora/news-fin.json index 089f6c4..f214427 100644 --- a/corpora/historical-corpora/news-fin.json +++ b/corpora/historical-corpora/news-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405276", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1770 to 2011.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY", "Size": ["8.7 billion words"], "Annotation": [], diff --git a/corpora/historical-corpora/notthingham-de-medicine.json b/corpora/historical-corpora/notthingham-de-medicine.json index fed772d..f0058c9 100644 --- a/corpora/historical-corpora/notthingham-de-medicine.json +++ b/corpora/historical-corpora/notthingham-de-medicine.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2562", "Family": "Historical corpora", "Description": "This corpus contains medical writing from 1500 to 1700.\nThe texts are taken primarily from digital facsimile copies available online via the University of Würzburg’s library interface, particularly from the subcategory of pertaining to gynaecology.\nThe corpus is 
available for download from the Oxford Text Archive.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["120,000 tokens"], "Annotation": ["TEI Lite markup", "no linguistic annotation"], diff --git a/corpora/historical-corpora/old-bailey.json b/corpora/historical-corpora/old-bailey.json index 9dd9482..b51a42c 100644 --- a/corpora/historical-corpora/old-bailey.json +++ b/corpora/historical-corpora/old-bailey.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CFB-2", "Family": "Historical corpora", "Description": "This corpus contains proceedings of the Old Bailey (i.e., legal documents) from 1674 to 1913.\nThe corpus is available for download from the CLARIN-D repository and through the CQPConcordancer.\nFor the corpus manual, see Huber et al. (2016).", - "Languages": ["English (Late Modern)"], + "Language": ["English (Late Modern)"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["134 million words"], "Annotation": ["detailed sociobiographical, pragmatic and textual annotation"], diff --git a/corpora/historical-corpora/old-hungarian.json b/corpora/historical-corpora/old-hungarian.json index 860409a..be42d32 100644 --- a/corpora/historical-corpora/old-hungarian.json +++ b/corpora/historical-corpora/old-hungarian.json @@ -3,7 +3,7 @@ "URL": "http://oldhungariancorpus.nytud.hu/en-descr.html", "Family": "Historical corpora", "Description": "This corpus contains texts (codices, letters) from the 12th to the 17th century.\nThe corpus is available for download from a dedicated webpage and through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["3 million tokens"], "Annotation": ["tokenised", "partially normalized", "partially MSD-tagged"], diff --git a/corpora/historical-corpora/old-lit-fin.json b/corpora/historical-corpora/old-lit-fin.json index d86e208..b08d513 100644 --- a/corpora/historical-corpora/old-lit-fin.json +++ b/corpora/historical-corpora/old-lit-fin.json 
@@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407165", "Family": "Historical corpora", "Description": "This corpus contains both literary and non-literary texts from 1543 to 1810.\nIn terms of genre, the texts correspond to bible translations and religious texts (for instance, all of the clergyman Mikael Agricola's Finnish works), legal texts, poems, and texts concerning agriculture, nature, health, and so on.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "EUPL v.1.1 SA", "Size": ["4.1 million words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/orossimo.json b/corpora/historical-corpora/orossimo.json index f2993a8..a332429 100644 --- a/corpora/historical-corpora/orossimo.json +++ b/corpora/historical-corpora/orossimo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-240F-8", "Family": "Historical corpora", "Description": "This corpus contains historic academic texts.\nThe corpus is available for download from the clarin:el repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY", "Size": ["553,000 tokens"], "Annotation": ["structural annotation (paragraph)"], diff --git a/corpora/historical-corpora/pamphlets-am.json b/corpora/historical-corpora/pamphlets-am.json index 9ec5ea7..da284a5 100644 --- a/corpora/historical-corpora/pamphlets-am.json +++ b/corpora/historical-corpora/pamphlets-am.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2021", "Family": "Historical corpora", "Description": "This corpus contains pamphlets of the American Revolution from 1750 to 1776.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["431,013 words"], "Annotation": [], diff --git a/corpora/historical-corpora/parsed-hist-pt.json b/corpora/historical-corpora/parsed-hist-pt.json index 
ecfde89..4ebbb26 100644 --- a/corpora/historical-corpora/parsed-hist-pt.json +++ b/corpora/historical-corpora/parsed-hist-pt.json @@ -3,7 +3,7 @@ "URL": "http://www.tycho.iel.unicamp.br/", "Family": "Historical corpora", "Description": "This corpus contains 76 texts written by authors born between 1380 and 1881.\nThe corpus is available for download and through a dedicated concordancer.", - "Languages": ["por"], + "Language": ["por"], "Licence": "", "Size": ["3.3 million"], "Annotation": ["tokenised", "PoS-tagged (2 million)", "treebanked (1.2 million)"], diff --git a/corpora/historical-corpora/partonopeus-de-blois.json b/corpora/historical-corpora/partonopeus-de-blois.json index 288e88b..6fc874b 100644 --- a/corpora/historical-corpora/partonopeus-de-blois.json +++ b/corpora/historical-corpora/partonopeus-de-blois.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2499", "Family": "Historical corpora", "Description": "This corpus contains transcriptions of the manuscripts and fragments of the romance Partonopeus de Blois.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["fro"], + "Language": ["fro"], "Licence": "CC BY-NC-SA 3.0", "Size": ["21,736,766 words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/historical-corpora/pceec.json b/corpora/historical-corpora/pceec.json index 3003726..b82abf0 100644 --- a/corpora/historical-corpora/pceec.json +++ b/corpora/historical-corpora/pceec.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2510", "Family": "Historical corpora", "Description": "This corpus contains correspondence from around 1410 to 1681.\nThere are 4970 personal letters by 666 writers. 
The letters have been selected to be as socially representative of the literate social ranks of the time as possible.\nThis corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Oxford Text Archive licence", "Size": ["2.2 million words"], "Annotation": ["tokenised", "PoS-tagged", "syntactically parsed"], diff --git a/corpora/historical-corpora/pol-16th.json b/corpora/historical-corpora/pol-16th.json index 7e1e260..e268a42 100644 --- a/corpora/historical-corpora/pol-16th.json +++ b/corpora/historical-corpora/pol-16th.json @@ -3,7 +3,7 @@ "URL": "https://spxvi.edu.pl/korpus/", "Family": "Historical corpora", "Description": "This corpus contains texts from the 16th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": [], "Annotation": ["lemmatised", "transliteration"], diff --git a/corpora/historical-corpora/pol-bf-1500.json b/corpora/historical-corpora/pol-bf-1500.json index 1f09c3e..4ed07c8 100644 --- a/corpora/historical-corpora/pol-bf-1500.json +++ b/corpora/historical-corpora/pol-bf-1500.json @@ -3,7 +3,7 @@ "URL": "https://ijp.pan.pl/publikacje-elektroniczne/korpus-tekstow-staropolskich", "Family": "Historical corpora", "Description": "This corpus contains texts until 1500.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["pol","lat"], + "Language": ["pol","lat"], "Licence": "", "Size": ["620,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/pol-lang-1960s.json b/corpora/historical-corpora/pol-lang-1960s.json index 75dab1a..3ec99ce 100644 --- a/corpora/historical-corpora/pol-lang-1960s.json +++ b/corpora/historical-corpora/pol-lang-1960s.json @@ -3,7 +3,7 @@ "URL": "ihttp://hdl.handle.net/20.500.14106/2482", "Family": "Historical corpora", "Description": "This corpus contains essays, news articles, and scientific and literary texts 
from 1963 to 1967.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["500,000 words"], "Annotation": ["MSD-tagged"], diff --git a/corpora/historical-corpora/poldilemma.json b/corpora/historical-corpora/poldilemma.json index df6bef6..c60c766 100644 --- a/corpora/historical-corpora/poldilemma.json +++ b/corpora/historical-corpora/poldilemma.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8C44-B", "Family": "Historical corpora", "Description": "This corpus contains political, religious and scientific texts from the 16th to the 18th century.\nThe corpus is available for download from the CLARIN-D repository.", - "Languages": ["ces","lat","deu","pol"], + "Language": ["ces","lat","deu","pol"], "Licence": "CC BY-NC-SA 4.0", "Size": ["7 million tokens"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/historical-corpora/ref-hist-slovene.json b/corpora/historical-corpora/ref-hist-slovene.json index 4b076af..83e86f1 100644 --- a/corpora/historical-corpora/ref-hist-slovene.json +++ b/corpora/historical-corpora/ref-hist-slovene.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1025", "Family": "Historical corpora", "Description": "This corpus contains 89 unique texts from 1584 to 1899.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancer KonText.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY 4.0", "Size": ["300,000 tokens"], "Annotation": ["manually tokenised", "lemmatised", "PoS-tagged", "modern synonyms for archaic words"], diff --git a/corpora/historical-corpora/ref-mhd.json b/corpora/historical-corpora/ref-mhd.json index 1ebb8e0..24ce7fb 100644 --- a/corpora/historical-corpora/ref-mhd.json +++ b/corpora/historical-corpora/ref-mhd.json @@ -3,7 +3,7 @@ "URL": "http://deutschestextarchiv.de/rem/", "Family": "Historical corpora", "Description": "This corpus 
contains texts from 1050 to 1350.\nThe corpus is available for download from the Deutsches Text Archiv and through a concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-SA 4.0", "Size": ["2.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "normalised", "morphosyntactic description"], diff --git a/corpora/historical-corpora/ref-mid-low-de.json b/corpora/historical-corpora/ref-mid-low-de.json index ce458fb..79bedbc 100644 --- a/corpora/historical-corpora/ref-mid-low-de.json +++ b/corpora/historical-corpora/ref-mid-low-de.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0007-C64C-5", "Family": "Historical corpora", "Description": "This corpus contains texts from the 13th century to the middle of the 17th century.\nThe corpus is available for download from the repository of the University of Hamburg through the ANNIS environment.", - "Languages": ["gml"], + "Language": ["gml"], "Licence": "CC-BY", "Size": ["200,700 tokens"], "Annotation": ["tokenised", "MSD-tagged"], diff --git a/corpora/historical-corpora/roysoc-corp.json b/corpora/historical-corpora/roysoc-corp.json index 327de17..a9889c3 100644 --- a/corpora/historical-corpora/roysoc-corp.json +++ b/corpora/historical-corpora/roysoc-corp.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0001-7E8B-6", "Family": "Historical corpora", "Description": "This corpus contains articles from the  Philosophical Transactions of the Royal Society of London journal from 1665 to 1869.\nThe corpus is available for download from the CLARIN-D repository as well as through a concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY-NC-SA-4.0", "Size": ["35 million tokens"], "Annotation": ["PoS-tagged using PennTreebank tagset", "lemmatised", "normalised"], diff --git a/corpora/historical-corpora/sacoco.json b/corpora/historical-corpora/sacoco.json index 0a25e56..f8ca803 100644 --- a/corpora/historical-corpora/sacoco.json +++ 
b/corpora/historical-corpora/sacoco.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-001F-7C43-1", "Family": "Historical corpora", "Description": "This corpus contains historical cookbook recipes from  1569 to 1800, as well as contemporary ones from 2012.\nThe corpus is available through the CQPweb concordancer provided by CLARIN-D.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-NC-SA-3.0", "Size": ["436,000 tokens"], "Annotation": ["PoS-tagged using the STTS tagset", "lemmatised", "normalised"], diff --git a/corpora/historical-corpora/saga.json b/corpora/historical-corpora/saga.json index fd98201..583e977 100644 --- a/corpora/historical-corpora/saga.json +++ b/corpora/historical-corpora/saga.json @@ -3,7 +3,7 @@ "URL": "https://clarin.is/en/resources/sagacorpus/", "Family": "Historical corpora", "Description": "This corpus contains Old Icelandic (Old Norse) Narrative texts from the 13th to the 15th century.\nThe corpus is available for download from CLARIN-IS and for search through the concordancer Korp.", - "Languages": ["Icelandic (Old)"], + "Language": ["Icelandic (Old)"], "Licence": "CC-BY 4.0", "Size": ["1.5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "normalized orthography"], diff --git a/corpora/historical-corpora/sheffield-chin.json b/corpora/historical-corpora/sheffield-chin.json index 392f104..7d8fae9 100644 --- a/corpora/historical-corpora/sheffield-chin.json +++ b/corpora/historical-corpora/sheffield-chin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2481", "Family": "Historical corpora", "Description": "This corpus contains three texts (two non-fictional and one fictional) from the Medieval and Modern Chinese periods.\nThe text \"Zhuzi Yulei\" is genre-wise similar to sermons and vernacular dialogues, and is representative of Medieval Chinese. 
The two other texts are the novel \"Shuihu Zhuan\", which is from the Ming Dynasty (1368–1644), and the novel \"Rulin Waishi\", which is from the Quing Dynasty (1644–1911).\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["zho"], + "Language": ["zho"], "Licence": "CC-BY-NC-SA 3.0", "Size": ["148,876 words"], "Annotation": ["no annotation"], diff --git a/corpora/historical-corpora/sprakbanken-hist.json b/corpora/historical-corpora/sprakbanken-hist.json index 084d588..4f3e5f1 100644 --- a/corpora/historical-corpora/sprakbanken-hist.json +++ b/corpora/historical-corpora/sprakbanken-hist.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/korp/?mode=all_hist#?lang=en&stats_reduce=word&cqp=%5B%5D", "Family": "Historical corpora", "Description": "This collection of corpora contains – among others – diachronic legal texts, Bible translations, medieval letters, digitized newspapers from the Swedish National Library and 19th century fiction from the Swedish Literature Bank.\nThe corpora are available through the concordancer Korp.", - "Languages": ["swe", "deu", "fra", "and others"], + "Language": ["swe", "deu", "fra", "and others"], "Licence": "CC-BY", "Size": ["1.34 billion tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "syntactically parsed", "word sense (for materials more recent than 1800)"], diff --git a/corpora/historical-corpora/sumerian-rev.json b/corpora/historical-corpora/sumerian-rev.json index bae57f6..c435846 100644 --- a/corpora/historical-corpora/sumerian-rev.json +++ b/corpora/historical-corpora/sumerian-rev.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2518", "Family": "Historical corpora", "Description": "This corpus contains transliterations and English translations of 394 Sumerian compositions from approximately 2100 to 1700 BCE.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng", "sux"], + "Language": ["eng", "sux"], "Licence": 
"CC-BY-NC-SA 3.0", "Size": ["5,151,373 words"], "Annotation": ["Each word form in the composite transliterations has been assigned to a lexeme which is specified by a citation form, word class information and basic English translation."], diff --git a/corpora/historical-corpora/swe-news-periodicals.json b/corpora/historical-corpora/swe-news-periodicals.json index 7c76e36..48fe10a 100644 --- a/corpora/historical-corpora/swe-news-periodicals.json +++ b/corpora/historical-corpora/swe-news-periodicals.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016050301", "Family": "Historical corpora", "Description": "This corpus contains newspaper articles from 1770 to 1950.\nThe corpus is available through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY-SA.", "Size": ["3.5 billion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/historical-corpora/syn-ref-fra.json b/corpora/historical-corpora/syn-ref-fra.json index 5f227cf..5ca658a 100644 --- a/corpora/historical-corpora/syn-ref-fra.json +++ b/corpora/historical-corpora/syn-ref-fra.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1007-0000-0000-9D2B-0", "Family": "Historical corpora", "Description": "This corpus contains texts from the 9th to the 13th century.\nThe syntactic categories of the SRCMF annotation and the grammatical principles of the annotation are explained in detail in the documentation.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["fro"], + "Language": ["fro"], "Licence": "CLARIN ACA", "Size": ["245,000 tokens"], "Annotation": ["tokenised", "syntactically-parsed"], diff --git a/corpora/historical-corpora/tlio.json b/corpora/historical-corpora/tlio.json index 9689101..6e166ee 100644 --- a/corpora/historical-corpora/tlio.json +++ b/corpora/historical-corpora/tlio.json @@ -3,7 +3,7 @@ "URL": " http://tlio.ovi.cnr.it/TLIO/", "Family": "Historical corpora", "Description": "This corpus contains early Italian 
texts before 1375.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["23 million tokens"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/historical-corpora/vvks.json b/corpora/historical-corpora/vvks.json index ea536e6..2c61dcc 100644 --- a/corpora/historical-corpora/vvks.json +++ b/corpora/historical-corpora/vvks.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017082101", "Family": "Historical corpora", "Description": "This corpus contains literary texts from 1543 to 1791.\nThis corpus complements the Corpus of Old Literary Finnish available through FIN-CLARIN.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-NC-ND", "Size": ["48 texts"], "Annotation": [], diff --git a/corpora/historical-corpora/written-est.json b/corpora/historical-corpora/written-est.json index 508ca70..47cb65e 100644 --- a/corpora/historical-corpora/written-est.json +++ b/corpora/historical-corpora/written-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11297/1-00-0000-0000-0000-0002-6", "Family": "Historical corpora", "Description": "This corpus covers secular and religious texts from the 16th to the 18th century.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY", "Size": ["2 million tokens"], "Annotation": ["tokenised, 16.-18. century texts have been tagged with contemporary Estonian, morphological and language information. 19. 
century texts are unannotated."], diff --git a/corpora/historical-corpora/ycoe.json b/corpora/historical-corpora/ycoe.json index 61065ea..c925fac 100644 --- a/corpora/historical-corpora/ycoe.json +++ b/corpora/historical-corpora/ycoe.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2462", "Family": "Historical corpora", "Description": "This corpus contains fictional texts from 600 to 1150.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang","lat"], + "Language": ["ang","lat"], "Licence": "Oxford Text Archive licence", "Size": ["1.5 million words"], "Annotation": ["syntactically-parsed"], diff --git a/corpora/historical-corpora/ycoep.json b/corpora/historical-corpora/ycoep.json index 16a0040..6732cc3 100644 --- a/corpora/historical-corpora/ycoep.json +++ b/corpora/historical-corpora/ycoep.json @@ -3,7 +3,7 @@ "URL": "ihttp://hdl.handle.net/20.500.14106/2425", "Family": "Historical corpora", "Description": "This corpus contains poems from 730 to 1710.\nThe corpus contains a selection of poems taken from the Old English subpart of the Helsinki Corpus of English Texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang"], + "Language": ["ang"], "Licence": "Oxford Text Archive licence", "Size": ["71,500 words"], "Annotation": ["syntactically-parsed"], diff --git a/corpora/historical-corpora/yu1parl.json b/corpora/historical-corpora/yu1parl.json index 2ae3137..a297b25 100644 --- a/corpora/historical-corpora/yu1parl.json +++ b/corpora/historical-corpora/yu1parl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1845", "Family": "Historical corpora", "Description": "This historical parliamentary corpus contains meeting proceedings of the National Representation of the Kingdom of Yugoslavia from 191 to 1939. The corpus comprises 714 sessions.\nThe source data (scanned images of printed Stenographic Minutes) come from the History of Slovenia - SIstory portal. 
The images were OCR processed and the results saved as pdf, docx and txt. The documents are multilingual, in Serbo-Croatian and Slovenian, depending on the speaker. Serbo-Croatian is typeset in the Cyrillic (Serbian) or in the Latin (Croatian) alphabet.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Lingua was used for language detection on the sentence level. Roughly 59% of sentences are in Serbian (Cyrillic script), 38% in Croatian (Latin script) and 3% in Slovenian. Some sentences in German and French were also detected. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using CLASSLA for Serbian, Croatian and Slovenian. Words in Serbian (Cyrillic script) have lemmas in Latin script.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["hrv", "srp", "slv"], + "Language": ["hrv", "srp", "slv"], "Licence": "CC BY 4.0", "Size": ["34,542 utterances", "578,958 sentences", "13,271,885 words", "15,403 pages"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/legal-corpora/ann-czech-case-law.json b/corpora/legal-corpora/ann-czech-case-law.json index 78ede92..1e7e28c 100644 --- a/corpora/legal-corpora/ann-czech-case-law.json +++ b/corpora/legal-corpora/ann-czech-case-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-3008", "Family": "Legal corpora", "Description": "This corpus consists of 350 manually annotated decisions at Czech top-tier courts (Supreme Court, Supreme Administrative Court, Constitutional Court). Each decision has been manually annotated by two trained annotators; the corpus is primarily developed as training and testing materials for reference recognition tasks. 
See also the variant of this corpus annotated for segmentation tasks.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY 4.0", "Size": [], "Annotation": ["legal references (identifier of court decision; author of law book or article, etc.)"], diff --git a/corpora/legal-corpora/cabank-eng-scotus.json b/corpora/legal-corpora/cabank-eng-scotus.json index 80ec727..47661a0 100644 --- a/corpora/legal-corpora/cabank-eng-scotus.json +++ b/corpora/legal-corpora/cabank-eng-scotus.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.21415/T5Z315", "Family": "Legal corpora", "Description": "This corpus consists of transcripts and recordings of oral arguments at the Supreme Court of the United States.\nThe transcripts and audio recordings are aligned at the utterance level; the utterances are annotated based on speaker role (the primary one being Justice) and name, as well as gender.\nThe corpus is part of the CABank collection and available for download from and online browsing through TalkBank.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-NC-SA 3.0", "Size": [], "Annotation": ["speaker segmentation", "sociolinguistic annotation"], diff --git a/corpora/legal-corpora/covid-19-eur-lex-cef.json b/corpora/legal-corpora/covid-19-eur-lex-cef.json index 6a4574e..8fb2297 100644 --- a/corpora/legal-corpora/covid-19-eur-lex-cef.json +++ b/corpora/legal-corpora/covid-19-eur-lex-cef.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000D-FE69-0", "Family": "Legal corpora", "Description": "This is a multilingual corpus of the European Union Law pertaining to COVID-19 period.\nThe corpus is available for download from the PORTULAN repository.", - "Languages": ["mlt", "hun", "lit", "lav", "pol", "por", "eng", "slv", "ell", "Spanish (Castilian)", "ron", "slk", "Moldavian", "swe", "bul", "ita", "deu", "hrv", "fra", "Dutch (Flemish)", "ces", "fin", "dan", "Irish", "est"], + "Language": 
["mlt", "hun", "lit", "lav", "pol", "por", "eng", "slv", "ell", "Spanish (Castilian)", "ron", "slk", "Moldavian", "swe", "bul", "ita", "deu", "hrv", "fra", "Dutch (Flemish)", "ces", "fin", "dan", "Irish", "est"], "Licence": "CC BY", "Size": ["475,931 translation pairs"], "Annotation": [], diff --git a/corpora/legal-corpora/covid-19-eur-lex-en-pt.json b/corpora/legal-corpora/covid-19-eur-lex-en-pt.json index d537908..5564d3b 100644 --- a/corpora/legal-corpora/covid-19-eur-lex-en-pt.json +++ b/corpora/legal-corpora/covid-19-eur-lex-en-pt.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000D-FE66-3", "Family": "Legal corpora", "Description": "This is a parallel corpus of the European Union Law pertaining to COVID-19 period.\nThe corpus is available for download from the PORTULAN repository.", - "Languages": ["eng", "por"], + "Language": ["eng", "por"], "Licence": "CC BY", "Size": ["21,000 units"], "Annotation": [], diff --git a/corpora/legal-corpora/czcdc.json b/corpora/legal-corpora/czcdc.json index f707f8a..904fcfe 100644 --- a/corpora/legal-corpora/czcdc.json +++ b/corpora/legal-corpora/czcdc.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-3052", "Family": "Legal corpora", "Description": "This corpus consists of around 237,000 court decisions from three top-tier courts (Supreme, Supreme Administrative, and Constitutional) in Czechia, published between 1993 and 2018.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC 4.0", "Size": ["460 million words"], "Annotation": ["unannotated"], diff --git a/corpora/legal-corpora/czech-legal-tree.json b/corpora/legal-corpora/czech-legal-tree.json index cfd5114..b0ce6a8 100644 --- a/corpora/legal-corpora/czech-legal-tree.json +++ b/corpora/legal-corpora/czech-legal-tree.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2498", "Family": "Legal corpora", "Description": "This corpus consists of two legal documents: 
Accounting Act (563/1991 Coll., as amended) and Decree on Double-entry Accounting for undertakers (500/2002 Coll., as amended).\nThe corpus is available for download from LINDAT and online browsing through the treebank viewer PML-TQ and the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["1128 sentences"], "Annotation": ["manual syntactic annotation; manual annotation of entities from the accouting domain and relations definition, obligation, right"], diff --git a/corpora/legal-corpora/deu-sub-mulcold.json b/corpora/legal-corpora/deu-sub-mulcold.json index ae3ca79..674e004 100644 --- a/corpora/legal-corpora/deu-sub-mulcold.json +++ b/corpora/legal-corpora/deu-sub-mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042606", "Family": "Legal corpora", "Description": "This corpus, which is a subcorpus of MULCOLD (see also the Parallel corpora resource family) contains international conventions and treaties.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN Distribution).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC BY-ND", "Size": ["198,035 tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/eng-sub-mulcold.json b/corpora/legal-corpora/eng-sub-mulcold.json index 82fc515..14e0a30 100644 --- a/corpora/legal-corpora/eng-sub-mulcold.json +++ b/corpora/legal-corpora/eng-sub-mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042605", "Family": "Legal corpora", "Description": "This corpus, which is a subcorpus of MULCOLD (see also the Parallel corpora resource family) contains international conventions and treaties.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN Distribution).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-ND", "Size": ["359,874 tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/english-acquis.json 
b/corpora/legal-corpora/english-acquis.json index 34db725..76d1132 100644 --- a/corpora/legal-corpora/english-acquis.json +++ b/corpora/legal-corpora/english-acquis.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D50A-A", "Family": "Legal corpora", "Description": "This corpus contains selected texts from the Acquis Communautaire between the 1950s and today, translated to English.\nThe corpus is available for download from PORTULAN.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "MIT (academic)", "Size": ["34.6 million tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/est-law.json b/corpora/legal-corpora/est-law.json index 2285726..34d2a86 100644 --- a/corpora/legal-corpora/est-law.json +++ b/corpora/legal-corpora/est-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11297/1-00-0000-0000-0000-0002-2", "Family": "Legal corpora", "Description": "This corpus contains Estonian laws (1.8 million tokens) as well as European legislation (9.6 million tokens) translated into Estonian.\nThe corpus is available for download from a dedicated webpage hosted by CLARIN Estonia.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN PUB", "Size": ["11 million tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/fin-sub-firulex.json b/corpora/legal-corpora/fin-sub-firulex.json index d0dfdf5..b5c2890 100644 --- a/corpora/legal-corpora/fin-sub-firulex.json +++ b/corpora/legal-corpora/fin-sub-firulex.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042604", "Family": "Legal corpora", "Description": "This is the Finnish subcorpus of FiRuLex, which contains juridical texts in Russian and Finnish.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN distribution)", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY-ND", "Size": ["1.5 million tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/fin-sub-jrc.json 
b/corpora/legal-corpora/fin-sub-jrc.json index cde2273..0faafd3 100644 --- a/corpora/legal-corpora/fin-sub-jrc.json +++ b/corpora/legal-corpora/fin-sub-jrc.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042710", "Family": "Legal corpora", "Description": "This is the legal subcorpus of the Helsinki Korp Version of the Finnish TreeBank 3.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN distribution) and for download from the Finnish Language Bank.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY", "Size": ["44.1 million tokens"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/igc-laws.json b/corpora/legal-corpora/igc-laws.json index efe6c58..f7397f3 100644 --- a/corpora/legal-corpora/igc-laws.json +++ b/corpora/legal-corpora/igc-laws.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/116", "Family": "Legal corpora", "Description": "IGC-Laws is a subcorpus of the The Icelandic Gigaword Corpus (see also CLARIN reference corpora). IGC-Laws contains 1) the Icelandic laws, 2) explanatory reports and observations extracted from bills submitted to Althingi, and 3) parliamentary proposals and resolutions. The corpus comes in two formats. 
One contains the texts untokenized and untagged while the other has been tokenized, PoS-tagged and lemmatized.\nThe corpus is available for download from the CLARIN-IS repository.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["2,2 million sentences", "40,6 million words"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/jrc-acquis.json b/corpora/legal-corpora/jrc-acquis.json index 23a6509..493e5ec 100644 --- a/corpora/legal-corpora/jrc-acquis.json +++ b/corpora/legal-corpora/jrc-acquis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/ATHENA-0000-0000-25C9-4", "Family": "Legal corpora", "Description": "This is a parallel corpus of Acquis Communautaire, which is the total body of European Union law applicable in European member states.\nMost texts have been manually classified according to the EUROVOC subject domains so that the collection can also be used to train and test multi-label classification algorithms and keyword-assignment software. The corpus is encoded in XML, according to the Text Encoding Initiative Guidelines. Due to the large number of parallel texts in many languages, the JRC-Acquis is particularly suitable to carry out all types of cross-language research, as well as to test and benchmark text analysis software across different languages (for instance for alignment, sentence splitting and term extraction). 
The sentence-level alignment was done using the hunalign tool.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["bul", "ces", "dan", "deu", "eng", "spa", "est", "fin", "fra", "hun", "ita", "lit", "lav", "mlt", "nld", "pol", "por", "ron", "slk", "slv", "swe"], + "Language": ["bul", "ces", "dan", "deu", "eng", "spa", "est", "fin", "fra", "hun", "ita", "lit", "lav", "mlt", "nld", "pol", "por", "ron", "slk", "slv", "swe"], "Licence": "CC BY 4.0", "Size": ["1 billion words"], "Annotation": ["paragraph and sentence alignment"], diff --git a/corpora/legal-corpora/jrc-eu-dgt.json b/corpora/legal-corpora/jrc-eu-dgt.json index 6c203a6..4f7fa3c 100644 --- a/corpora/legal-corpora/jrc-eu-dgt.json +++ b/corpora/legal-corpora/jrc-eu-dgt.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1197", "Family": "Legal corpora", "Description": "", - "Languages": ["bul", "hrv", "ces", "dan", "nld", "eng", "est", "fin", "fra", "deu", "hun", "gle", "ita", "lav", "lit", "ell", "pol", "por", "ron", "slk", "slv", "spa", "swe"], + "Language": ["bul", "hrv", "ces", "dan", "nld", "eng", "est", "fin", "fra", "deu", "hun", "gle", "ita", "lav", "lit", "ell", "pol", "por", "ron", "slk", "slv", "spa", "swe"], "Licence": "CC BY 4.0", "Size": ["2.1 billion tokens"], "Annotation": ["syntactically parsed (Universal Dependencies)"], diff --git a/corpora/legal-corpora/judicial-rhetoric.json b/corpora/legal-corpora/judicial-rhetoric.json index 0624b51..29ad413 100644 --- a/corpora/legal-corpora/judicial-rhetoric.json +++ b/corpora/legal-corpora/judicial-rhetoric.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/CLARIN-EL-0000-0000-6114-C", "Family": "Legal corpora", "Description": "This corpus consists of transcriptions of defendants’ and witnesses’ speeches in criminal cases of rape, attempted rape, murder, and attempted murder.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": 
"CC BY-NC-ND 4.0", "Size": [], "Annotation": [], diff --git a/corpora/legal-corpora/juridisch-nl.json b/corpora/legal-corpora/juridisch-nl.json index 18ef41c..86a37e2 100644 --- a/corpora/legal-corpora/juridisch-nl.json +++ b/corpora/legal-corpora/juridisch-nl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-u2", "Family": "Legal corpora", "Description": "This corpus contains legal texts from 1814 to 1989, compiled year by year.\nThe corpus is available for online browsing on a dedicated webpage", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB", "Size": ["5,856 texts"], "Annotation": ["lemmatised", "PoS-tagged"], diff --git a/corpora/legal-corpora/legal-est-min-just.json b/corpora/legal-corpora/legal-est-min-just.json index 224353d..c34d915 100644 --- a/corpora/legal-corpora/legal-est-min-just.json +++ b/corpora/legal-corpora/legal-est-min-just.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000D-FAD1-D ", "Family": "Legal corpora", "Description": "This corpus contains Estonian-English translations of the Acts of Estonian law.\nThe corpus is available for download from PORTULAN.", - "Languages": ["Estonian-English"], + "Language": ["Estonian-English"], "Licence": "CC BY", "Size": ["47,000 units"], "Annotation": [], diff --git a/corpora/legal-corpora/legal-nynorsk-munic.json b/corpora/legal-corpora/legal-nynorsk-munic.json index 3163c9d..2f1f523 100644 --- a/corpora/legal-corpora/legal-nynorsk-munic.json +++ b/corpora/legal-corpora/legal-nynorsk-munic.json @@ -3,7 +3,7 @@ "URL": "https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-60/", "Family": "Legal corpora", "Description": "This corpus contains 50,000 legal documents and meeting minutes collected with the web crawler Veidemann. 
Around 88.5 million words are in Nynork, while the rest are in Bokmal (Bokmål).\nThe corpus is available for download from the Norwegian Language Bank.", - "Languages": ["Norwegian (Nynorsk and Bokmål)"], + "Language": ["Norwegian (Nynorsk and Bokmål)"], "Licence": "CC0 1.0 Universal", "Size": ["127 million words"], "Annotation": [], diff --git a/corpora/legal-corpora/lifr-law.json b/corpora/legal-corpora/lifr-law.json index bfededf..2b55314 100644 --- a/corpora/legal-corpora/lifr-law.json +++ b/corpora/legal-corpora/lifr-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-5020", "Family": "Legal corpora", "Description": "This is a corpus of Czech legal and administrative texts with measured reading comprehension and a subjective expert annotation of diverse textual properties based on the Hamburg Comprehensibility Concept.\nThe corpus is comprised of 18 documents in total; that is, six different texts from the legal/administration domain, each in three versions: the original and two paraphrases. Each such document triple shares one reading-comprehension test administered to at least thirty readers of random gender, educational background, and age. 
The data set also captures basic demographic information about each reader, their familiarity with the topic, and their subjective assessment of the stylistic properties of the given document, roughly corresponding to the key text properties identified by the Hamburg Comprehensibility Concept.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY 4.0", "Size": ["17601 tokens"], "Annotation": ["textual annotation"], diff --git a/corpora/legal-corpora/likumi.json b/corpora/legal-corpora/likumi.json index 2fe0940..d9f4ac0 100644 --- a/corpora/legal-corpora/likumi.json +++ b/corpora/legal-corpora/likumi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12574/65", "Family": "Legal corpora", "Description": "The corpus contains all legal acts of the Republic of Latvia published on the website likumi.lv (until February 2022).\nThe corpus is available for download from the CLARIN.LV repository.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "CC BY 4.0", "Size": ["116 million tokens", "73 million words"], "Annotation": [], diff --git a/corpora/legal-corpora/lit-eu-law.json b/corpora/legal-corpora/lit-eu-law.json index c853c17..16fbfad 100644 --- a/corpora/legal-corpora/lit-eu-law.json +++ b/corpora/legal-corpora/lit-eu-law.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/18", "Family": "Legal corpora", "Description": "This corpus contains primary and secondary European law acts (32 texts) translated into Lithuanian.\nThe corpus is available for download from CLARIN-LT.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN PUB", "Size": ["274,460 words"], "Annotation": [], diff --git a/corpora/legal-corpora/maltese-acquis.json b/corpora/legal-corpora/maltese-acquis.json index 40b00fc..ea6df06 100644 --- a/corpora/legal-corpora/maltese-acquis.json +++ b/corpora/legal-corpora/maltese-acquis.json @@ -3,7 +3,7 @@ "URL": 
"https://hdl.handle.net/21.11129/0000-000B-D4FD-9", "Family": "Legal corpora", "Description": "", - "Languages": ["mlt"], + "Language": ["mlt"], "Licence": "MIT (academic)", "Size": ["20.9 million tokens"], "Annotation": [], diff --git a/corpora/legal-corpora/meta-nord-dan.json b/corpora/legal-corpora/meta-nord-dan.json index d2863b6..a87373e 100644 --- a/corpora/legal-corpora/meta-nord-dan.json +++ b/corpora/legal-corpora/meta-nord-dan.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9BE-2D15-4C1C-1", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC BY 4.0", "Size": ["102 sentences", "1799 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-est.json b/corpora/legal-corpora/meta-nord-est.json index 6bc53cb..b2fed71 100644 --- a/corpora/legal-corpora/meta-nord-est.json +++ b/corpora/legal-corpora/meta-nord-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9D1-EE49-223F-3", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY 4.0", "Size": ["78 sentences", "1443 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-fin.json b/corpora/legal-corpora/meta-nord-fin.json index 80e5fe8..35c9f57 100644 --- a/corpora/legal-corpora/meta-nord-fin.json +++ b/corpora/legal-corpora/meta-nord-fin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9D1-FD1D-3174-1", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank. 
The corpus is syntactically parsed using the FinnTreeBank 2 schema and is available for download and online browsing through INESS (CLARINO).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY 4.0", "Size": ["122 sentences", "1464 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-isl.json b/corpora/legal-corpora/meta-nord-isl.json index 5d6084e..86420b8 100644 --- a/corpora/legal-corpora/meta-nord-isl.json +++ b/corpora/legal-corpora/meta-nord-isl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D9D2-09F8-20E3-8", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["73 sentences", "1880 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-nor.json b/corpora/legal-corpora/meta-nord-nor.json index bb9f0a1..0247757 100644 --- a/corpora/legal-corpora/meta-nord-nor.json +++ b/corpora/legal-corpora/meta-nord-nor.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D937-A55E-278E-1", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CC BY 4.0", "Size": ["101 sentences", "1862 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/meta-nord-swe.json b/corpora/legal-corpora/meta-nord-swe.json index db3cbfe..75555c3 100644 --- a/corpora/legal-corpora/meta-nord-swe.json +++ b/corpora/legal-corpora/meta-nord-swe.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11495/D9D3-24E5-429B-9", "Family": "Legal corpora", "Description": "This is a subcorpus of the META-NORD Acquis Parallel Treebank.\nThe corpus is available for download and online browsing through INESS (CLARINO).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC BY 4.0", "Size": ["102 sentences", "1982 words"], "Annotation": ["syntactically parsed (constituency)", "sentence/phrase/word segmentation"], diff --git a/corpora/legal-corpora/multieurlex.json b/corpora/legal-corpora/multieurlex.json index 57064bd..9991e5a 100644 --- a/corpora/legal-corpora/multieurlex.json +++ b/corpora/legal-corpora/multieurlex.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/CLARIN-EL-0000-0000-61A7-6", "Family": "Legal corpora", "Description": "This corpus consists of 65,000 European laws in 23 official European languages. Each law has been annotated with the EuroVoc concept labels.\nThe corpus is available for download from the repository of CLARIN:EL.", - "Languages": ["fin", "slk", "lit", "hrv", "slv", "est", "lav", "mlt", "eng", "deu", "fra", "ita", "spa", "Castilian", "pol", "ron", "Moldavian", "Moldovan", "nld", "Flemish", "ell", "hun", "por", "ces", "swe", "bul", "dan"], + "Language": ["fin", "slk", "lit", "hrv", "slv", "est", "lav", "mlt", "eng", "deu", "fra", "ita", "spa", "Castilian", "pol", "ron", "Moldavian", "Moldovan", "nld", "Flemish", "ell", "hun", "por", "ces", "swe", "bul", "dan"], "Licence": "CC BY", "Size": [], "Annotation": ["conceptual annotation"], diff --git a/corpora/legal-corpora/nor-acquis.json b/corpora/legal-corpora/nor-acquis.json index 453abb8..0012ad1 100644 --- a/corpora/legal-corpora/nor-acquis.json +++ b/corpora/legal-corpora/nor-acquis.json @@ -3,7 +3,7 @@ "URL": "https://www.nb.no/sprakbanken/ressurskatalog/oai-nb-no-sbr-2/", "Family": "Legal corpora", "Description": "This corpus contains Norwegian translations of 5414 documents in Acquis Communautaire.\nThe corpus is available for download from the 
Norwegian Language Bank.", - "Languages": ["Norwegian (Bokmål and Nynorsk)"], + "Language": ["Norwegian (Bokmål and Nynorsk)"], "Licence": "CC BY-NC 4.0", "Size": ["14 million words"], "Annotation": [], diff --git a/corpora/legal-corpora/old-bailey.json b/corpora/legal-corpora/old-bailey.json index e1ba8e3..208c55e 100644 --- a/corpora/legal-corpora/old-bailey.json +++ b/corpora/legal-corpora/old-bailey.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0023-8CFB-2", "Family": "Legal corpora", "Description": "This historical corpus consists of Proceedings of the Old Bailey; the Old Bailey was London’s central criminal court between 1674 and 1913. The corpus consists of texts from 1970 to 1913, and is annotated for detailed utterance-level sociolinguistic annotation at the following three levels: sociobiographical speaker information (gender, age, occupation, social class), pragmatic information (speaker role in the courtroom such as judge, witness, etc.), and metatextual information (the scribe, printer, and publisher of the individual Proceeding).\nThe corpus is available for download from CLARIN-D (Saarland University) and for online browsing through CQPWeb.", - "Languages": ["English (Late Modern)"], + "Language": ["English (Late Modern)"], "Licence": "CC BY-NC-SA 4.0", "Size": ["24.4 million words"], "Annotation": ["sociolinguistic annotation"], diff --git a/corpora/legal-corpora/rus-sub-firulex.json b/corpora/legal-corpora/rus-sub-firulex.json index c9f70b3..0ea3590 100644 --- a/corpora/legal-corpora/rus-sub-firulex.json +++ b/corpora/legal-corpora/rus-sub-firulex.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042603", "Family": "Legal corpora", "Description": "This is the Russian subcorpus of FiRuLex, which contains juridical texts in Russian and Finnish.\nThe corpus is available for online browsing through the concordancer Korp (FIN-CLARIN distribution)", - "Languages": ["rus"], + "Language": ["rus"], "Licence": "CC BY-ND", 
"Size": ["1.2 million tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/legal-corpora/rus-sub-mulcold.json b/corpora/legal-corpora/rus-sub-mulcold.json index 5fcbfc7..8ffee1f 100644 --- a/corpora/legal-corpora/rus-sub-mulcold.json +++ b/corpora/legal-corpora/rus-sub-mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016042607", "Family": "Legal corpora", "Description": "This corpus, which is a subcorpus of MULCOLD (see also the Parallel corpora resource family) contains international conventions and treaties.\nThe corpus can be accessed online through the concordancer Korp (FIN-CLARIN Distribution).", - "Languages": ["rus"], + "Language": ["rus"], "Licence": "CC BY-ND", "Size": ["198,035 tokens"], "Annotation": ["lemmatised", "MSD-tagged"], diff --git a/corpora/literary-corpora/1000-novels.json b/corpora/literary-corpora/1000-novels.json index 6eef07b..fb60b97 100644 --- a/corpora/literary-corpora/1000-novels.json +++ b/corpora/literary-corpora/1000-novels.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/312", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY 4.0", "Size": ["1000 texts"], "Annotation": [], diff --git a/corpora/literary-corpora/1000plus-novels.json b/corpora/literary-corpora/1000plus-novels.json index ebbe8ce..c3b761d 100644 --- a/corpora/literary-corpora/1000plus-novels.json +++ b/corpora/literary-corpora/1000plus-novels.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/699", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY-SA 3.0", "Size": ["1000 texts", "17,352,826 words"], "Annotation": [], diff --git a/corpora/literary-corpora/15c-castilian.json b/corpora/literary-corpora/15c-castilian.json index b938e52..2ca1993 100644 --- 
a/corpora/literary-corpora/15c-castilian.json +++ b/corpora/literary-corpora/15c-castilian.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-873", "Family": "Literary corpora", "Description": "This is a lyric corpus of 15th century cancioneros.\nThe corpus is available for online browsing through an external interface.", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/1920-polish.json b/corpora/literary-corpora/1920-polish.json index a06e32b..18ae818 100644 --- a/corpora/literary-corpora/1920-polish.json +++ b/corpora/literary-corpora/1920-polish.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/57", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC-BY 3.0", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/aformes.json b/corpora/literary-corpora/aformes.json index f3fd03b..3281572 100644 --- a/corpora/literary-corpora/aformes.json +++ b/corpora/literary-corpora/aformes.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-2575-3", "Family": "Literary corpora", "Description": "This corpus contains fiction texts from a journal of undergraduate creative writing at the Faculty of English Language and Literature.\nThe corpus is available for download from clarin:el.", - "Languages": ["ell","eng"], + "Language": ["ell","eng"], "Licence": "CC-BY-NC", "Size": ["376,250 words"], "Annotation": [], diff --git a/corpora/literary-corpora/anglosaxon.json b/corpora/literary-corpora/anglosaxon.json index 1cc5a54..3287ad6 100644 --- a/corpora/literary-corpora/anglosaxon.json +++ b/corpora/literary-corpora/anglosaxon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-867", "Family": "Literary corpora", "Description": "This corpus is available for online browsing through an external interface.", - "Languages": ["ang"], + 
"Language": ["ang"], "Licence": "", "Size": [], "Annotation": ["none"], diff --git a/corpora/literary-corpora/anth-me.json b/corpora/literary-corpora/anth-me.json index 4ce8fce..e5f48e6 100644 --- a/corpora/literary-corpora/anth-me.json +++ b/corpora/literary-corpora/anth-me.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/1398", "Family": "Literary corpora", "Description": "This corpus contains literary texts from 1100 to 1400.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["enm", "heb"], + "Language": ["enm", "heb"], "Licence": "Oxford Text Archive Licence", "Size": ["4,000 words"], "Annotation": [], diff --git a/corpora/literary-corpora/bonnier-one.json b/corpora/literary-corpora/bonnier-one.json index dd82862..9545a3a 100644 --- a/corpora/literary-corpora/bonnier-one.json +++ b/corpora/literary-corpora/bonnier-one.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/115", "Family": "Literary corpora", "Description": "This corpus presents 69 Bonnier novels from 1976-77.\nThe corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY 4.0", "Size": ["6,578,675 tokens", "462,625 sentences"], "Annotation": ["sentence scrambling"], diff --git a/corpora/literary-corpora/bonnier-two.json b/corpora/literary-corpora/bonnier-two.json index 543cc3f..37b7fc5 100644 --- a/corpora/literary-corpora/bonnier-two.json +++ b/corpora/literary-corpora/bonnier-two.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/116", "Family": "Literary corpora", "Description": "This corpus presents 60 Bonnier novels from 1980-81.\nThe corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY 4.0", "Size": ["4,304,271 tokens", "298,361 sentences"], "Annotation": ["sentence scrambling"], diff --git a/corpora/literary-corpora/ceal.json 
b/corpora/literary-corpora/ceal.json index 017283e..2f4c80b 100644 --- a/corpora/literary-corpora/ceal.json +++ b/corpora/literary-corpora/ceal.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016110901", "Family": "Literary corpora", "Description": "This corpus contains Finnish translations of the following three texts: Jane Austen: Ylpeys ja ennakkoluulo (Pride and Prejudice), translated by Kersti Juva, Teos 2013; Henry James: Washingtonin aukio (Washington Square), translated by Kersti Juva, Otava 2003; Charles Dickens: Kolea talo (Bleak House), translated by Kersti Juva, Tammi, 2006.\nThe corpus is available for online browsing through Korp in two versions - Version 1 (Sentences and Paragraphs in the Original Order) and Version 2 (Scrambled Paragraphs).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN RES + NC", "Size": ["3 novels", "484,010 tokens"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/classic-fin-lit.json b/corpora/literary-corpora/classic-fin-lit.json index f6d054c..c4f5bf7 100644 --- a/corpora/literary-corpora/classic-fin-lit.json +++ b/corpora/literary-corpora/classic-fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-773", "Family": "Literary corpora", "Description": "This corpus contains works by established Finnish fiction writers from the 1880s to the 1930s. 
There are different types of prose and plays, as well as lyrics and aphorisms.\nThis corpus is available for online browsing through an external interface.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "", "Size": ["1,456,658 words"], "Annotation": [], diff --git a/corpora/literary-corpora/classic-fin.json b/corpora/literary-corpora/classic-fin.json index 55b4a99..7835873 100644 --- a/corpora/literary-corpora/classic-fin.json +++ b/corpora/literary-corpora/classic-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2018051701", "Family": "Literary corpora", "Description": "This corpus contains literary texts from 1549 to 1944.\nThe corpus is available for online browsing through FIN-CLARIN.", - "Languages": ["fin","swe"], + "Language": ["fin","swe"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/early-fin-lit.json b/corpora/literary-corpora/early-fin-lit.json index b92b9df..7d89b0d 100644 --- a/corpora/literary-corpora/early-fin-lit.json +++ b/corpora/literary-corpora/early-fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-772", "Family": "Literary corpora", "Description": "The corpus of Early Modern Finnish contains Finnish-language works in various fields published during the 19th century, annual issues of the oldest periodicals and newspapers, almanac and decree texts, and some dictionaries. An effort has been made to include the earliest, most important and (based on the number of reprints, for example) most widely distributed works. The selection of publications has also been made with a view to achieving the widest possible thematic coverage, although more works originally written in Finnish have been included than translations. These have been alphabetised by the name of their translator, seasonal publications by their title, and other works by their author. 
The Finnish translations of unknown authors are in the Anonymous folder, the texts of unknown authors in the Other folder. The materials cover the period between Old and Modern Finnish and a little beyond. The earliest book dates from 1809, the latest from 1891, but there are texts of the regulations right up to the end of the century. However, most of the material is from 1810-1880. This later material can also be found in the Classics corpus.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/est-fiction.json b/corpora/literary-corpora/est-fiction.json index 09c3580..12d2210 100644 --- a/corpora/literary-corpora/est-fiction.json +++ b/corpora/literary-corpora/est-fiction.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-0007EL", "Family": "Literary corpora", "Description": "This corpus contains texts from 1990 onwards.\nThe corpus is available for download from CELR.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA - NC", "Size": ["5,768,504 words"], "Annotation": [], diff --git a/corpora/literary-corpora/est-runic.json b/corpora/literary-corpora/est-runic.json index 89a219a..7417e3e 100644 --- a/corpora/literary-corpora/est-runic.json +++ b/corpora/literary-corpora/est-runic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-0008FL", "Family": "Literary corpora", "Description": "These are the oldest text recordings of Estonian runic songs (the text recordings were created in the 19th century and in the first decades of the 20th century). 
In addition to the runic songs, the database also has songs of transitional form and end-rhymed songs (about 6000).\nThe corpus is available for online browsing through an external interface.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA", "Size": ["92,134 texts"], "Annotation": [], diff --git a/corpora/literary-corpora/etcsl.json b/corpora/literary-corpora/etcsl.json index 121680b..ec2e52f 100644 --- a/corpora/literary-corpora/etcsl.json +++ b/corpora/literary-corpora/etcsl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-874", "Family": "Literary corpora", "Description": "This corpus presents a selection of nearly 400 literary compositions recorded on sources which come from ancient Mesopotamia and date to the late third and early second millennia BCE.\nThe corpus is available for online browsing through an external interface.", - "Languages": ["sux"], + "Language": ["sux"], "Licence": "", "Size": ["400 literary compositions"], "Annotation": [], diff --git a/corpora/literary-corpora/fin-folk.json b/corpora/literary-corpora/fin-folk.json index 614b361..993f97e 100644 --- a/corpora/literary-corpora/fin-folk.json +++ b/corpora/literary-corpora/fin-folk.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052712", "Family": "Literary corpora", "Description": "This corpus contains poems from 1564 to 1939.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], + "Language": ["fin", "krl", "lud", "lat", "swe", "olo", "izh", "vot"], "Licence": "CC-BY-NC", "Size": ["7.1 million words"], "Annotation": ["unannotated"], diff --git a/corpora/literary-corpora/fin-gutenberg.json b/corpora/literary-corpora/fin-gutenberg.json index 2f6dd05..b78b710 100644 --- a/corpora/literary-corpora/fin-gutenberg.json +++ b/corpora/literary-corpora/fin-gutenberg.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014100301", "Family": "Literary corpora", 
"Description": "This corpus contains Finnish books made available by the Gutenberg project. The texts have not been linguistically annotated.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY", "Size": ["34,487,420 words"], "Annotation": [], diff --git a/corpora/literary-corpora/fin-lit.json b/corpora/literary-corpora/fin-lit.json index 6bd08d1..0cd64ad 100644 --- a/corpora/literary-corpora/fin-lit.json +++ b/corpora/literary-corpora/fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730186", "Family": "Literary corpora", "Description": "This corpus contains prose fiction, plays, poetry and aphorisms (some written originally in Swedish) of established Finnish authors published from 1880s to 1949.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "EUPL v.1.1 SA", "Size": ["1,500,000 words"], "Annotation": ["syntactically parsed (TDT alpha)", "named entities (FiNER)", "MSD-tagged", "lemmatized"], diff --git a/corpora/literary-corpora/greek-medieval.json b/corpora/literary-corpora/greek-medieval.json index dd51dcd..d1d4e46 100644 --- a/corpora/literary-corpora/greek-medieval.json +++ b/corpora/literary-corpora/greek-medieval.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-251D-7", "Family": "Literary corpora", "Description": "This corpus contains medieval texts contains written material covering the period from the 4th till the 16th century A.D. 
The texts can be classified into the following categories: religious, poetical-literary, political-historical, hymns, epigrams.\nThe corpus is available for download from clarin:el.", - "Languages": ["ell","grc"], + "Language": ["ell","grc"], "Licence": "CC-BY-NC", "Size": ["3,419,553 words"], "Annotation": [], diff --git a/corpora/literary-corpora/greek-thesaurus.json b/corpora/literary-corpora/greek-thesaurus.json index 91b40d0..3c3946d 100644 --- a/corpora/literary-corpora/greek-thesaurus.json +++ b/corpora/literary-corpora/greek-thesaurus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E3-8", "Family": "Literary corpora", "Description": "This corpus contains prose, poetry, drama, and essays from the 18th century onwards.\nThe corpus is available for online browsing through a dedicated webpage.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "proprietary", "Size": ["1 million tokens"], "Annotation": ["semantic"], diff --git a/corpora/literary-corpora/joh-jen.json b/corpora/literary-corpora/joh-jen.json index bb07d26..0d9df39 100644 --- a/corpora/literary-corpora/joh-jen.json +++ b/corpora/literary-corpora/joh-jen.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12115/20", "Family": "Literary corpora", "Description": "This corpus presents the collected works of the Danish author Johannes Jensen.\nThe corpus is available for download from CLARIN-DK and for online browsing through a dedicated concordancer.", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC BY-SA 4.0", "Size": ["1,760,093 words", "8,489 pages"], "Annotation": ["unannotated"], diff --git a/corpora/literary-corpora/kdsp.json b/corpora/literary-corpora/kdsp.json index b21341e..629bb85 100644 --- a/corpora/literary-corpora/kdsp.json +++ b/corpora/literary-corpora/kdsp.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1823", "Family": "Literary corpora", "Description": "This corpus contains 262 texts of longer older Slovenian narrative 
prose. The texts were published between 1836 and 1918 and are at least 20,000 words long.\nThe texts have bibliographical metadata (author name, title, year of publication, length) and are classified according to the decade of publication, length, text type, text subtype, theme, and level of canonicity (texts by those authors included in school textbooks after 1980 and/or included in the Collected writings of Slovenian poets and writers, are marked with a high degree of canonicity). The metadata about the authors of the texts are provided with their gender, occupation, and years of birth and death. The corpus texts come from three digital sources, and each text is marked for its source. They are Wikisource (145 texts), the ELTeC corpus (96 texts), and the dLib digital library (21 texts). The corpus is provided in two variants, one containing running text and the other with added linguistic analyses. These comprise tokens, sentences, lemmas, MULTEXT-East morphosyntactic descriptions and Universal Dependencies morphological features. The linguistic annotation was performed with the CLASSLA program. 
The source format of the corpus is TEI/XML, with two derived formats also available: one is plain text, and the other vertical files, as used by concordancers, like the CWB.\nThe corpus is available for download from CLARIN.SI as well as through the noSketchEngine and KonText concordancers.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY 4.0", "Size": ["262 texts", "11 million words", "14 million tokens"], "Annotation": ["MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated with author and text metadata"], diff --git a/corpora/literary-corpora/kivi.json b/corpora/literary-corpora/kivi.json index 80f6802..d6c3141 100644 --- a/corpora/literary-corpora/kivi.json +++ b/corpora/literary-corpora/kivi.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405274", "Family": "Literary corpora", "Description": "This corpus contains all the known letters, manuscripts and published works by Finnish author Aleksis Kivi (1834–1872). Most of the texts were written in Finnish while some of the letters and manuscripts are in Swedish. 
The time coverage of the texts: 1855-1871.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin","swe"], + "Language": ["fin","swe"], "Licence": "CC-BY-NC", "Size": ["413,735 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/lat-lit-classic.json b/corpora/literary-corpora/lat-lit-classic.json index 67a052f..759791f 100644 --- a/corpora/literary-corpora/lat-lit-classic.json +++ b/corpora/literary-corpora/lat-lit-classic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-184", "Family": "Literary corpora", "Description": "This corpus presents classics from the end of the 19th century to the beginning of the 20th century.", - "Languages": ["lat"], + "Language": ["lat"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/literary-corpora/ltcorpus.json b/corpora/literary-corpora/ltcorpus.json index 600e1ed..a52c650 100644 --- a/corpora/literary-corpora/ltcorpus.json +++ b/corpora/literary-corpora/ltcorpus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net//21.11115/0000-000B-D33D-3", "Family": "Literary corpora", "Description": "This corpus contains 70 copyright-free classics (61 Portugal and 9 Brazil) published before 1940.\nThe corpus is available for download from PORTULAN.", - "Languages": ["por"], + "Language": ["por"], "Licence": "CLARIN RES", "Size": ["1,781,083 words"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/literary-corpora/m-agricola.json b/corpora/literary-corpora/m-agricola.json index 76e4212..b68f1d3 100644 --- a/corpora/literary-corpora/m-agricola.json +++ b/corpora/literary-corpora/m-agricola.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730170", "Family": "Literary corpora", "Description": "This corpus contains the Finnish parts of Mikael Agricola’s works (Abckiria, Rukouskiria, Se Wsi testamenti, Käsikiria, Messu, Piina, Psaltari, Veisut, Profeetat).\nThe corpus is available for online browsing through 
Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-ND", "Size": ["83,678 sentences", "428,314 tokens", "38,308 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/micro-pol.json b/corpora/literary-corpora/micro-pol.json index df1e570..df374ba 100644 --- a/corpora/literary-corpora/micro-pol.json +++ b/corpora/literary-corpora/micro-pol.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/604", "Family": "Literary corpora", "Description": "This corpus is available for download from CLARIN-PL.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "plWordNet", "Size": [], "Annotation": ["unannotated"], diff --git a/corpora/literary-corpora/multext1984.json b/corpora/literary-corpora/multext1984.json index d6eca95..18db9fb 100644 --- a/corpora/literary-corpora/multext1984.json +++ b/corpora/literary-corpora/multext1984.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1043", "Family": "Literary corpora", "Description": "This is a parallel corpus of George Orwell's 1984 and its translations.\nThe corpus is available for download from CLARIN.SI.", - "Languages": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], + "Language": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], "Licence": "CC BY-NC SA 4.0", "Size": ["12 texts", "79,718 sentences", "1,064,424 words"], "Annotation": ["sentence-alignment", "MSD tagging"], diff --git a/corpora/literary-corpora/norbok-children.json b/corpora/literary-corpora/norbok-children.json index 1dd20d1..5cfbb3a 100644 --- a/corpora/literary-corpora/norbok-children.json +++ b/corpora/literary-corpora/norbok-children.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D988-1F83-B1F5-1", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nob"], 
+ "Language": ["nob"], "Licence": "CLARIN ACA", "Size": ["4,111,213 words", "389,564 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/norbok-fiction.json b/corpora/literary-corpora/norbok-fiction.json index 4c52e97..5fd0fa8 100644 --- a/corpora/literary-corpora/norbok-fiction.json +++ b/corpora/literary-corpora/norbok-fiction.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D988-2078-6447-1", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nob"], + "Language": ["nob"], "Licence": "CLARIN ACA", "Size": ["26,903,637 words", "2,469,916 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/nornyn-children.json b/corpora/literary-corpora/nornyn-children.json index d42831e..6846830 100644 --- a/corpora/literary-corpora/nornyn-children.json +++ b/corpora/literary-corpora/nornyn-children.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D963-33EA-65BD-0", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nno"], + "Language": ["nno"], "Licence": "CLARIN ACA", "Size": ["1,043,260 words", "106,434 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/nornyn-fiction.json b/corpora/literary-corpora/nornyn-fiction.json index 414c2e2..3cc07c1 100644 --- a/corpora/literary-corpora/nornyn-fiction.json +++ b/corpora/literary-corpora/nornyn-fiction.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/D985-7B94-F361-1", "Family": "Literary corpora", "Description": "This corpus, which is based on OCR data from the National Library of Norway, is available for online browsing through INESS.", - "Languages": ["nno"], + "Language": ["nno"], "Licence": "CLARIN ACA", "Size": 
["2,884,376 words", "260,285 sentences"], "Annotation": ["syntactically parsed"], diff --git a/corpora/literary-corpora/north-saami.json b/corpora/literary-corpora/north-saami.json index f6fe3e0..7137740 100644 --- a/corpora/literary-corpora/north-saami.json +++ b/corpora/literary-corpora/north-saami.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014032620", "Family": "Literary corpora", "Description": "This corpus contains Kerttu Vuolab's novel Cheppari cháráhus.\nThe corpus is available for online browsing through the TAITO shell.", - "Languages": ["sme"], + "Language": ["sme"], "Licence": "CLARIN RES +NC +NORED +PLAN", "Size": ["17,830 words"], "Annotation": [], diff --git a/corpora/literary-corpora/old-fin-lit.json b/corpora/literary-corpora/old-fin-lit.json index 3f11433..8e59b16 100644 --- a/corpora/literary-corpora/old-fin-lit.json +++ b/corpora/literary-corpora/old-fin-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-776", "Family": "Literary corpora", "Description": "This corpus contains various works published during the Swedish rule (from the 16th century to about 1810), extensive manuscripts from that period (most of which were later printed), as well as individual almanac and decree texts, sermons and poetry.\nThis corpus is available for online browsing through an external interface.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "", "Size": ["3,428,618 words"], "Annotation": [], diff --git a/corpora/literary-corpora/one-mil-cro.json b/corpora/literary-corpora/one-mil-cro.json index 9877216..60549e5 100644 --- a/corpora/literary-corpora/one-mil-cro.json +++ b/corpora/literary-corpora/one-mil-cro.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-234", "Family": "Literary corpora", "Description": "The corpus is listed in the LINDAT repository. 
", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "", "Size": ["1 million tokens"], "Annotation": [], diff --git a/corpora/literary-corpora/orig-est.json b/corpora/literary-corpora/orig-est.json index 70a1265..3d9dd40 100644 --- a/corpora/literary-corpora/orig-est.json +++ b/corpora/literary-corpora/orig-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00088L", "Family": "Literary corpora", "Description": "This corpus collects older Estonian literary texts published on \"Kreutzwald's Century: the Estonian Cultural History Web\". The electronically republished books, included in the collection, are based on the first editions of works by more important Estonian authors, published in 1854-1944.\nThe corpus is available for online browsing through an external interface.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA", "Size": ["173 texts"], "Annotation": [], diff --git a/corpora/literary-corpora/parfin.json b/corpora/literary-corpora/parfin.json index 74db151..b532ec8 100644 --- a/corpora/literary-corpora/parfin.json +++ b/corpora/literary-corpora/parfin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016121610", "Family": "Literary corpora", "Description": "This corpus contains Finnish literary texts from 1990-2010 and their translations into Russian aligned at sentence level.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin","rus"], + "Language": ["fin","rus"], "Licence": "CLARIN RES +NC +INF +ND", "Size": ["2,044,172 tokens"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/literary-corpora/parrus.json b/corpora/literary-corpora/parrus.json index 3324d8c..e8e6b7a 100644 --- a/corpora/literary-corpora/parrus.json +++ b/corpora/literary-corpora/parrus.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730173", "Family": "Literary corpora", "Description": "This corpus contains Russian literary texts (classical 
literature & 20th century) and their translations into Finnish aligned at paragraph level.\nThe corpus is available for online browsing through Korp.", - "Languages": ["fin","rus"], + "Language": ["fin","rus"], "Licence": "CLARIN RES +NC +INF +ND", "Size": ["5,900,000 tokens"], "Annotation": ["MSD-tagged, syntactically parsed"], diff --git a/corpora/literary-corpora/prilit.json b/corpora/literary-corpora/prilit.json index b5971d2..a95078e 100644 --- a/corpora/literary-corpora/prilit.json +++ b/corpora/literary-corpora/prilit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1319", "Family": "Literary corpora", "Description": "This corpus contains texts of older Slovenian narrative prose by 12 authors.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY 4.0", "Size": ["43 texts", "1,275,209 tokens"], "Annotation": ["word modernisation", "lemmatisation", "syntactic annotation (Universal Dependencies)"], diff --git a/corpora/literary-corpora/rep-bastille.json b/corpora/literary-corpora/rep-bastille.json index 4a230cc..d0877f8 100644 --- a/corpora/literary-corpora/rep-bastille.json +++ b/corpora/literary-corpora/rep-bastille.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AUTH-0000-0000-24DC-0", "Family": "Literary corpora", "Description": "This corpus contains République-Bastille, a novel by Melpo Axioti. This French text is of particular linguistic interest since it is a text written in a language other than the author's mother tongue and is suited for research on bilingualism and self-translation. 
It would be worth measuring the naturalness of the language with computational tools, for example.\nThe corpus is available for download from clarin:el.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": ["37,965 words"], "Annotation": [], diff --git a/corpora/literary-corpora/sol.json b/corpora/literary-corpora/sol.json index 5c1dd2b..14a4f35 100644 --- a/corpora/literary-corpora/sol.json +++ b/corpora/literary-corpora/sol.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/80", "Family": "Literary corpora", "Description": "This corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "CC-BY 4.0", "Size": ["1,267,391 tokens", "69,270 sentences"], "Annotation": ["sentence scrambled"], diff --git a/corpora/literary-corpora/strindberg.json b/corpora/literary-corpora/strindberg.json index 730ba17..37a7205 100644 --- a/corpora/literary-corpora/strindberg.json +++ b/corpora/literary-corpora/strindberg.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10794/79", "Family": "Literary corpora", "Description": "This corpus presents the collected works of August Strindberg.\nThe corpus is available for download from SWE-CLARIN and for online browsing through Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY 4.0", "Size": ["4,309,037 tokens", "321,759 sentences"], "Annotation": ["sentence scrambling"], diff --git a/corpora/literary-corpora/uhlcs.json b/corpora/literary-corpora/uhlcs.json index 1eb9c24..aa6b49a 100644 --- a/corpora/literary-corpora/uhlcs.json +++ b/corpora/literary-corpora/uhlcs.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014032622", "Family": "Literary corpora", "Description": "This corpus contains samples of Finnish literature published by the WSOY publishing company in the 1990.\nThe corpus is available online through FIN-CLARIN.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN RES", 
"Size": ["68,425 words"], "Annotation": ["tagged"], diff --git a/corpora/literary-corpora/york-poetry.json b/corpora/literary-corpora/york-poetry.json index 8b136f0..7d1d5c2 100644 --- a/corpora/literary-corpora/york-poetry.json +++ b/corpora/literary-corpora/york-poetry.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2425", "Family": "Literary corpora", "Description": "This corpus contains a selection of poetic texts (71,490 words) from the Old English Section of the Helsinki Corpus of English Texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["ang"], + "Language": ["ang"], "Licence": "Restricted", "Size": ["71,490 words"], "Annotation": ["MSD-tagged", "syntactically parsed"], diff --git a/corpora/manually-annotated-corpora/acl-rd-tex.json b/corpora/manually-annotated-corpora/acl-rd-tex.json index f41bb52..4a3b293 100644 --- a/corpora/manually-annotated-corpora/acl-rd-tex.json +++ b/corpora/manually-annotated-corpora/acl-rd-tex.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-1661", "Family": "Manually annotated corpora", "Description": "This corpus contains 6818 terms extracted from abstracts of computational linguistics papers.\nThe corpus is available for download from LINDAT and through KonText.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-NC-SA 4.0", "Size": ["33216 tokens"], "Annotation": ["terminology extraction/classification"], diff --git a/corpora/manually-annotated-corpora/alksnis.json b/corpora/manually-annotated-corpora/alksnis.json index bbe4215..6bbfa73 100644 --- a/corpora/manually-annotated-corpora/alksnis.json +++ b/corpora/manually-annotated-corpora/alksnis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/10", "Family": "Manually annotated corpora", "Description": "Syntactic parsing follows the rules of the Prague Dependency Treebank\nThis corpus is available for download from the CLARIN-LT repository. 
The second version is available upon request.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN PUB", "Size": ["2,355 sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/artificial-treebank.json b/corpora/manually-annotated-corpora/artificial-treebank.json index f556ced..381191e 100644 --- a/corpora/manually-annotated-corpora/artificial-treebank.json +++ b/corpora/manually-annotated-corpora/artificial-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2616", "Family": "Manually annotated corpora", "Description": "This syntactic parsing follows the Universal Dependencies schema.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces", "eng", "fin", "rus", "slk"], + "Language": ["ces", "eng", "fin", "rus", "slk"], "Licence": "Licence Universal dependencies v2.1", "Size": ["106,000 tokens", "10,604 sentences"], "Annotation": ["syntactic parsing", "mark-up of elliptical constructions"], diff --git a/corpora/manually-annotated-corpora/artur.json b/corpora/manually-annotated-corpora/artur.json index fc5ff74..4a6bbf2 100644 --- a/corpora/manually-annotated-corpora/artur.json +++ b/corpora/manually-annotated-corpora/artur.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1772", "Family": "Manually annotated corpora", "Description": "This corpus was designed for the needs of developing automatic speech recognition for the Slovenian language. The complete database includes 1,067 hours of speech, of which 884 hours are transcribed, while the remaining 183 hours are recordings only.\nThe audio files are available in a separate repository entry. Transcriptions are available in the original TRS format of the Transcriber 1.5.1 tool which was used for making the transcriptions. All transcriptions were made manually or manually corrected.\nThe data are structured as follows:
    1. Artur-B, read speech, 573 hours in total.\nIt includes: (1a) Artur-B-Brani, 485 hours: Readings of sentences which were pre-selected from a 10% increment in the Gigafida 2.0 corpus. The sentences were chosen in such a way that they reflect the natural or the actual distribution of triphones in the words. They were distributed between 1,000 speakers, so that we recorded approx. 30 min in read form from each speaker. The speakers were balanced according to gender, age, region, and a small proportion of speakers were non-native speakers of Slovene. Each sentence is its own audio file and has a corresponding transcription file. (1b) Artur-B-Crkovani, 10 hours: Spellings. Speakers were asked to spell abbreviations and personal names and surnames, all chosen so that all Slovene letters were covered, plus the most common foreign letters. (1c) Artur-B-Studio, 51 hours: Designed for the development of speech synthesis. The sentences were read in a studio by a single speaker. Each sentence is its own audio file and has a corresponding transcription file. (1d) Artur-B-Izloceno, 27 hours: The recordings include different types of errors, typically, incorrect reading of sentences or a noisy environment.
    2. (2) Artur-J, public speech, 62 hours in total.\nIt includes: (2a) Artur-J-Splosni, 62 hours: media recordings, online recordings of conferences, workshops, education videos, etc.
    3. (3) Artur-N, private speech, 74 hours in total.\nIt includes: (3a) Artur-N-Obrazi, 6 hours: Speakers were asked to describe faces in pictures. Designed for a face-description domain-specific speech recognition. (3b) Artur-N-PDom, 7 hours: Speakers were asked to read pre-written sentences, as well as to express instructions for a potential smart-home system freely. Designed for a smart-home domain-specific speech recognition. (3c) Artur-N-Prosti, 61 hours: Monologues and dialogues between two persons, recorded for the purposes of the Artur database creation. Speakers were asked to converse or explain freely on casual topics.
    4. (4) Artur-P, parliamentary speech, 201 hours in total.\nIt includes: (4a) Artur-P-SejeDZ, 201 hours: Speech from the Slovene National Assembly.
    5. \nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["884 hours"], "Annotation": ["orthographically transcribed speech"], diff --git a/corpora/manually-annotated-corpora/aspect-term-czech.json b/corpora/manually-annotated-corpora/aspect-term-czech.json index e63fffa..1894fbe 100644 --- a/corpora/manually-annotated-corpora/aspect-term-czech.json +++ b/corpora/manually-annotated-corpora/aspect-term-czech.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1507", "Family": "Manually annotated corpora", "Description": "This corpus contains online user-product reviews.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 3.0", "Size": ["2200 reviews"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/austrian-baroque.json b/corpora/manually-annotated-corpora/austrian-baroque.json index de281b8..683bd61 100644 --- a/corpora/manually-annotated-corpora/austrian-baroque.json +++ b/corpora/manually-annotated-corpora/austrian-baroque.json @@ -3,7 +3,7 @@ "URL": "https://acdh.oeaw.ac.at/abacus/", "Family": "Manually annotated corpora", "Description": "This historical corpus contains sermons from 1650 to 1750. For linguistic annotation, each individual token was automatically assigned to a morphosyntactic word class using the TreeTagger software. As a classification system, the 54-part Stuttgart-Tübingen TagSet (STTS) was used. For lemmatization , a normalized basic word form was used for each token and the Duden and the German dictionary by Jacob and Wilhelm Grimm were used as reference works. 
The part-of-speech tagging and lemmatization was then manually checked.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["200,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "named entities"], diff --git a/corpora/manually-annotated-corpora/b4-heliand.json b/corpora/manually-annotated-corpora/b4-heliand.json index 8e42cad..2f10df6 100644 --- a/corpora/manually-annotated-corpora/b4-heliand.json +++ b/corpora/manually-annotated-corpora/b4-heliand.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-9B24-9", "Family": "Manually annotated corpora", "Description": "This corpus contains historical German texts.\nThe corpus is available for download from the HZSK repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY", "Size": ["3495 tokens"], "Annotation": ["PoS tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/bnc-sampler.json b/corpora/manually-annotated-corpora/bnc-sampler.json index 5288cee..846ad5e 100644 --- a/corpora/manually-annotated-corpora/bnc-sampler.json +++ b/corpora/manually-annotated-corpora/bnc-sampler.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2551", "Family": "Manually annotated corpora", "Description": "The corpus was manually post-edited to correct the PoS tags automatically assigned by CLAWS.\nThe corpus is available for online querying via CQPWeb (registration required) for download from the Oxford Text Archive", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "BNC Licence", "Size": ["2 million tokens"], "Annotation": ["PoS tagging"], diff --git a/corpora/manually-annotated-corpora/bultreebank.json b/corpora/manually-annotated-corpora/bultreebank.json index cc6d5e2..30779ab 100644 --- a/corpora/manually-annotated-corpora/bultreebank.json +++ b/corpora/manually-annotated-corpora/bultreebank.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11495/D93F-C6E9-65D9-2", "Family": "Manually annotated corpora", "Description": "This corpus is available for download through the concordancer Corpuscle.", - "Languages": ["bul"], + "Language": ["bul"], "Licence": "MS-NC-NoReD", "Size": ["214,000 tokens"], "Annotation": ["morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/cintil-deepbank.json b/corpora/manually-annotated-corpora/cintil-deepbank.json index c208ff2..7441897 100644 --- a/corpora/manually-annotated-corpora/cintil-deepbank.json +++ b/corpora/manually-annotated-corpora/cintil-deepbank.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D34F-F", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the PORTULAN CLARIN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": ["PoS-tagging", "syntactic parsing", "grammatical functions", "logical forms"], diff --git a/corpora/manually-annotated-corpora/cintil-dependency.json b/corpora/manually-annotated-corpora/cintil-dependency.json index 05f618b..70a3648 100644 --- a/corpora/manually-annotated-corpora/cintil-dependency.json +++ b/corpora/manually-annotated-corpora/cintil-dependency.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D31C-8", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the PORTULAN CLARIN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": ["morphosyntactic tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/cintil-portugues.json b/corpora/manually-annotated-corpora/cintil-portugues.json index 9b203f9..e3c2672 100644 --- 
a/corpora/manually-annotated-corpora/cintil-portugues.json +++ b/corpora/manually-annotated-corpora/cintil-portugues.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D33B-5", "Family": "Manually annotated corpora", "Description": "The corpus contains transcriptions of spoken communication as well as written texts from several genres (news, literature, magazines, etc.).\nThe corpus is available for download from the CLARIN PORTULAN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "CLARIN RES", "Size": ["1 million tokens"], "Annotation": ["morphosyntactic tagging", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/cintil-propbank.json b/corpora/manually-annotated-corpora/cintil-propbank.json index e40d7ef..1075ab1 100644 --- a/corpora/manually-annotated-corpora/cintil-propbank.json +++ b/corpora/manually-annotated-corpora/cintil-propbank.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D300-6", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the ELRA catalogue.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": ["syntactic parsing", "phrase semantic roles"], diff --git a/corpora/manually-annotated-corpora/cintil-treebank.json b/corpora/manually-annotated-corpora/cintil-treebank.json index 7d0cf4c..417c408 100644 --- a/corpora/manually-annotated-corpora/cintil-treebank.json +++ b/corpora/manually-annotated-corpora/cintil-treebank.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D2FE-A", "Family": "Manually annotated corpora", "Description": "This corpus contains literary and newspaper texts.\nThe corpus is available for download from the PORTULAN CLARIN repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "MS-NC-No ReD-ND", "Size": ["110,000 tokens"], "Annotation": 
["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/cmc-training-janes-norm.json b/corpora/manually-annotated-corpora/cmc-training-janes-norm.json index b1e0619..404dc85 100644 --- a/corpora/manually-annotated-corpora/cmc-training-janes-norm.json +++ b/corpora/manually-annotated-corpora/cmc-training-janes-norm.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1084", "Family": "Manually annotated corpora", "Description": "This corpus is partially also manually annotated with MSD tags and lemmatized.\nThe corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["184,755 tokens"], "Annotation": ["normalization"], diff --git a/corpora/manually-annotated-corpora/cmc-training-janes-tag.json b/corpora/manually-annotated-corpora/cmc-training-janes-tag.json index e0e003e..71b7a80 100644 --- a/corpora/manually-annotated-corpora/cmc-training-janes-tag.json +++ b/corpora/manually-annotated-corpora/cmc-training-janes-tag.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1123", "Family": "Manually annotated corpora", "Description": "This corpus contains computer-mediated communication (CMC). 
The corpus is morphosyntactically tagged following the MULTEXT-East Version 5 tagset.\nThe corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["75,000 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "word normalisation", "morphosyntactic tagging", "lemmatisation", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/czech-legal-treebank.json b/corpora/manually-annotated-corpora/czech-legal-treebank.json index 9a5e6ee..d1449f3 100644 --- a/corpora/manually-annotated-corpora/czech-legal-treebank.json +++ b/corpora/manually-annotated-corpora/czech-legal-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2498", "Family": "Manually annotated corpora", "Description": "This corpus contains legal texts.\nThe corpus is available through the concordance KonText, the PML-TQ tool and for download from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["1121 sentences"], "Annotation": ["syntactic parsing", "labelling of semantic entities"], diff --git a/corpora/manually-annotated-corpora/czech-ne-corpus.json b/corpora/manually-annotated-corpora/czech-ne-corpus.json index 6bd133f..d1ca72c 100644 --- a/corpora/manually-annotated-corpora/czech-ne-corpus.json +++ b/corpora/manually-annotated-corpora/czech-ne-corpus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-1B04-C", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 3.0", "Size": ["5868 sentences", "35220 NEs"], "Annotation": ["Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/dep-anno-creg.json b/corpora/manually-annotated-corpora/dep-anno-creg.json index c5bdf31..46140b4 100644 
--- a/corpora/manually-annotated-corpora/dep-anno-creg.json +++ b/corpora/manually-annotated-corpora/dep-anno-creg.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-2CA4-6", "Family": "Manually annotated corpora", "Description": "This corpus consists of answers to reading comprehension questions written by American college students learning German.\nThe corpus is available for download from the Tübingen CLARIN Repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["109 sentences"], "Annotation": ["PoS tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/est-treebank-coref.json b/corpora/manually-annotated-corpora/est-treebank-coref.json index c6a8773..eca4443 100644 --- a/corpora/manually-annotated-corpora/est-treebank-coref.json +++ b/corpora/manually-annotated-corpora/est-treebank-coref.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-0016AL", "Family": "Manually annotated corpora", "Description": "This corpus contains newspaper texts plus one scientific medical text.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "GPL", "Size": ["107,000 words"], "Annotation": ["anaphora relations"], diff --git a/corpora/manually-annotated-corpora/est-treebank.json b/corpora/manually-annotated-corpora/est-treebank.json index 2cc8a4a..3780a2e 100644 --- a/corpora/manually-annotated-corpora/est-treebank.json +++ b/corpora/manually-annotated-corpora/est-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00080L", "Family": "Manually annotated corpora", "Description": "The corpus contains fictional and newspaper texts.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN_ACA", "Size": ["1,000 sentences"], "Annotation": ["syntactic parsing"], diff --git 
a/corpora/manually-annotated-corpora/facebook-sentiment.json b/corpora/manually-annotated-corpora/facebook-sentiment.json index 76eabd0..7c92a8d 100644 --- a/corpora/manually-annotated-corpora/facebook-sentiment.json +++ b/corpora/manually-annotated-corpora/facebook-sentiment.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0022-FE82-7", "Family": "Manually annotated corpora", "Description": "This corpus contains Facebook posts.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-SA 3.0", "Size": ["10,000 Facebook posts"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/fictree.json b/corpora/manually-annotated-corpora/fictree.json index a3f60a2..cdac007 100644 --- a/corpora/manually-annotated-corpora/fictree.json +++ b/corpora/manually-annotated-corpora/fictree.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2517", "Family": "Manually annotated corpora", "Description": "This corpus contains fictional texts.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["12760 sentences"], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/fin-treebank-1.json b/corpora/manually-annotated-corpora/fin-treebank-1.json index 0c0b5bd..0e09089 100644 --- a/corpora/manually-annotated-corpora/fin-treebank-1.json +++ b/corpora/manually-annotated-corpora/fin-treebank-1.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2016011501", "Family": "Manually annotated corpora", "Description": "This corpus contains 19,000 sentences from the Large Grammar of Finnish.\nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY 3.0", "Size": ["160,000 tokens"], 
"Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/fin-treebank-2.json b/corpora/manually-annotated-corpora/fin-treebank-2.json index da5d46c..804a601 100644 --- a/corpora/manually-annotated-corpora/fin-treebank-2.json +++ b/corpora/manually-annotated-corpora/fin-treebank-2.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407163", "Family": "Manually annotated corpora", "Description": "This corpus contains 19,000 sentences from the Large Grammar of Finnish.\nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY 3.0", "Size": ["160,000 tokens"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/finnsentiment.json b/corpora/manually-annotated-corpora/finnsentiment.json index 5d486b9..1e4c19b 100644 --- a/corpora/manually-annotated-corpora/finnsentiment.json +++ b/corpora/manually-annotated-corpora/finnsentiment.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2023012701", "Family": "Manually annotated corpora", "Description": "This corpus contains sentences from Finnish social media that have been manually annotated for sentiment polarity by three native annotators.\nThe corpus is available for download from META-SHARE (the Finnish Language Bank).", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC BY", "Size": ["27,000 sentences"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/frenk-styria.json b/corpora/manually-annotated-corpora/frenk-styria.json index 5d42c35..1060627 100644 --- a/corpora/manually-annotated-corpora/frenk-styria.json +++ b/corpora/manually-annotated-corpora/frenk-styria.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1202", "Family": "Manually annotated corpora", "Description": "This corpus contains news comments from the website 24sata.hr.\nThe corpus is available for download from CLARIN.SI.", - "Languages": 
["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["407.5 million words"], "Annotation": ["sentiment analysis (socially unacceptable discourse)"], diff --git a/corpora/manually-annotated-corpora/greek-coref.json b/corpora/manually-annotated-corpora/greek-coref.json index afb8084..f5f2157 100644 --- a/corpora/manually-annotated-corpora/greek-coref.json +++ b/corpora/manually-annotated-corpora/greek-coref.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25DC-F", "Family": "Manually annotated corpora", "Description": "In addition to coreference, the corpus is annotated for identity and bridging relations.\nIn addition to coreference, the corpus is annotated for identity and bridging relations.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC-SA", "Size": ["62,988 tokens"], "Annotation": ["coreference"], diff --git a/corpora/manually-annotated-corpora/greek-entailment.json b/corpora/manually-annotated-corpora/greek-entailment.json index 3cac88b..7e07ce7 100644 --- a/corpora/manually-annotated-corpora/greek-entailment.json +++ b/corpora/manually-annotated-corpora/greek-entailment.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23DB-2", "Family": "Manually annotated corpora", "Description": "This corpus contains texts from the domains of politics, law and travel.\nThis corpus is available for download from the clarin:el repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY", "Size": ["600 sentence-pairs"], "Annotation": ["logical entailment"], diff --git a/corpora/manually-annotated-corpora/grug-para-tree.json b/corpora/manually-annotated-corpora/grug-para-tree.json index aba4677..b330e4f 100644 --- a/corpora/manually-annotated-corpora/grug-para-tree.json +++ b/corpora/manually-annotated-corpora/grug-para-tree.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-246C-0000-0006-C150-9", "Family": "Manually annotated corpora", "Description": "The corpus is 
syntactically parsed following the TIGER guidelines.\nThe corpus is available for download from a dedicated website provided by the CLARIN-D consortium.", - "Languages": ["kat", "ukr", "rus", "deu"], + "Language": ["kat", "ukr", "rus", "deu"], "Licence": "CC-BY", "Size": ["10,400 sentence pairs"], "Annotation": ["syntactic parsing", "PoS tagging"], diff --git a/corpora/manually-annotated-corpora/grundtvig.json b/corpora/manually-annotated-corpora/grundtvig.json index 60e5778..69803bd 100644 --- a/corpora/manually-annotated-corpora/grundtvig.json +++ b/corpora/manually-annotated-corpora/grundtvig.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12115/31", "Family": "Manually annotated corpora", "Description": "This corpus contains the literary works of the Danish bishop N.F.S Grundtvig.\nThe corpus is available for download from the CLARIN-DK repository.", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC BY-NC 4.0", "Size": ["11,417,194 words"], "Annotation": ["linked data (places, persons, bible citations, etc.)"], diff --git a/corpora/manually-annotated-corpora/hamledt.json b/corpora/manually-annotated-corpora/hamledt.json index 36da328..6baa04a 100644 --- a/corpora/manually-annotated-corpora/hamledt.json +++ b/corpora/manually-annotated-corpora/hamledt.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1508", "Family": "Manually annotated corpora", "Description": "This treebank collection is available for download from LINDAT.\nThe treebanks can be individually queried through KonText and the treebank tool PML-TQ. We list them here by language:\n
      1. Arabic (KonText, PML-TQ)
      2. Bengali (KonText)
      3. Catalan (KonText)
      4. Czech (KonText, PML-TQ)
      5. Dutch (KonText, PML-TQ)
      6. English (KonText)
      7. Estonian (KonText, PML-TQ)
      8. German (KonText)
      9. Greek (KonText)
      10. Hindi (KonText)
      11. Latin (KonText, PML-TQ)
      12. Persian (KonText, PML-TQ)
      13. Polish (KonText, PML-TQ)
      14. Portuguese (KonText, PML-TQ)
      15. Romanian (KonText, PML-TQ)
      16. Russian (KonText)
      17. Slovenian (KonText, PML-TQ)
      18. Spanish (KonText)
      19. Tamil (KonText, PML-TQ)
      ", - "Languages": ["19 languages"], + "Language": ["19 languages"], "Licence": "HamleDT 3.0 Licence Terms", "Size": ["19 treebanks"], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/hr500k-1.json b/corpora/manually-annotated-corpora/hr500k-1.json index 8ce7475..fe8e034 100644 --- a/corpora/manually-annotated-corpora/hr500k-1.json +++ b/corpora/manually-annotated-corpora/hr500k-1.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1183", "Family": "Manually annotated corpora", "Description": "This corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["500,000 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "morphosyntactic tagging", "lemmatisation", "Named Entity recognition", "Half of corpus also syntactically parsed"], diff --git a/corpora/manually-annotated-corpora/hr500k-2.json b/corpora/manually-annotated-corpora/hr500k-2.json index e7713ad..2092d36 100644 --- a/corpora/manually-annotated-corpora/hr500k-2.json +++ b/corpora/manually-annotated-corpora/hr500k-2.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1792", "Family": "Manually annotated corpora", "Description": "This training corpus contains about 500,000 tokens manually annotated on the levels of tokenisation, sentence segmentation, morphosyntactic tagging, lemmatisation and named entities. About half of the corpus is also manually annotated with syntactic dependencies. A subset of the syntactically annotated corpus is also annotated for multi-word expressions. 
Furthermore, about a fifth of the corpus is annotated with semantic role labels.\nThe annotation formalisms followed in the hr500k corpus are (1) the MULTEXT-East V6 morphosyntactic specifications for the Serbo-Croatian macro-language, (2) the UDv2 Guidelines, (3) the Janes annotation guidelines for named entities, (4) the PARSEME guidelines for annotating multi-word expressions and (4) the semantic role labelling annotation protocol for Slovenian and Croatian.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["499,635 tokens"], "Annotation": ["fully – tokenisation, sentence segmentation, morphosyntactic tagging, and lemmatisation, named entities. Half of the corpus – syntactic parsing, a subset also for multi-word expressions. Fifth of the corpus: semantic roles."], diff --git a/corpora/manually-annotated-corpora/icepahc.json b/corpora/manually-annotated-corpora/icepahc.json index 0999bbf..b3f2dcd 100644 --- a/corpora/manually-annotated-corpora/icepahc.json +++ b/corpora/manually-annotated-corpora/icepahc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/62", "Family": "Manually annotated corpora", "Description": "This corpus contains Icelandic texts from the 12th through the 21st centuries – approximately 100,000 words from each century. 
The corpus is syntactically parsed following the UPenn scheme for historical texts.\nThe corpus is available for online search through treebankstudio.org and for download in different formats from a dedicated webpage.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "GNU LGPL", "Size": ["1 million tokens"], "Annotation": ["morphosyntactic tagging", "lemmatisation", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/jos1m.json b/corpora/manually-annotated-corpora/jos1m.json index cecf106..c183ca4 100644 --- a/corpora/manually-annotated-corpora/jos1m.json +++ b/corpora/manually-annotated-corpora/jos1m.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1037", "Family": "Manually annotated corpora", "Description": "This corpus contains sampled paragraphs from the Slovenian national corpus FidaPLUS. The corpus is morphosyntactically tagged following the MULTEXT-East Version 4 tagset.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-NC 4.0", "Size": ["1 million words"], "Annotation": ["morphosyntactic tagging", "lemmatisation"], diff --git a/corpora/manually-annotated-corpora/kas-biterm.json b/corpora/manually-annotated-corpora/kas-biterm.json index 8ab2ee2..6550be6 100644 --- a/corpora/manually-annotated-corpora/kas-biterm.json +++ b/corpora/manually-annotated-corpora/kas-biterm.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1199", "Family": "Manually annotated corpora", "Description": "This corpus contains PHD theses.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv", "eng"], + "Language": ["slv", "eng"], "Licence": "CC BY-SA 4.0", "Size": ["1,950 sentences", "78,500 tokens", "3,700 terms"], "Annotation": ["bi-lingual term extraction"], diff --git a/corpora/manually-annotated-corpora/kas-term.json b/corpora/manually-annotated-corpora/kas-term.json index 82308e9..d768c59 100644 --- 
a/corpora/manually-annotated-corpora/kas-term.json +++ b/corpora/manually-annotated-corpora/kas-term.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1198", "Family": "Manually annotated corpora", "Description": "This corpus contains term candidates from PhD theses in chemistry, computer science and political science.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["22,950 term candidates"], "Annotation": ["monolingual term extraction"], diff --git a/corpora/manually-annotated-corpora/kpwr.json b/corpora/manually-annotated-corpora/kpwr.json index 0e0fbad..93aad3c 100644 --- a/corpora/manually-annotated-corpora/kpwr.json +++ b/corpora/manually-annotated-corpora/kpwr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/270", "Family": "Manually annotated corpora", "Description": "This corpus contains texts in a variety of domains (blogs, science, stenographic recordings, etc.).\nThe corpus is available for download from the CLARIN-PL repository.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY-SA 3.0", "Size": ["447,000 tokens"], "Annotation": ["chunks and selected predicate-argument relations", "Named Entity recognition", "relations between named entities", "anaphora relations", "word senses", "events", "temporal expressions", "spatial relations between entities", "keywords and semantic roles within nominal and adjective phrases"], diff --git a/corpora/manually-annotated-corpora/lassy-klein.json b/corpora/manually-annotated-corpora/lassy-klein.json index 0430b40..97c5d64 100644 --- a/corpora/manually-annotated-corpora/lassy-klein.json +++ b/corpora/manually-annotated-corpora/lassy-klein.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/efc201791fadf20f67858b602553874b", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from the Dutch Language Institute and through the online 
environments PaQu and GrETEL.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "VAGUE", "Size": ["1 million tokens"], "Annotation": ["PoS tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/lvtb.json b/corpora/manually-annotated-corpora/lvtb.json index 77aec5a..9373dc7 100644 --- a/corpora/manually-annotated-corpora/lvtb.json +++ b/corpora/manually-annotated-corpora/lvtb.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12574/86", "Family": "Manually annotated corpora", "Description": "This treebank is manually annotated according to a hybrid dependency-constituency grammar.\nThe treebank is available for download from the CLARIN-LV repository.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "CC BY-SA 4.0", "Size": ["289,791 tokens", "17,127 sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/matas.json b/corpora/manually-annotated-corpora/matas.json index 5ec8d77..5c3f195 100644 --- a/corpora/manually-annotated-corpora/matas.json +++ b/corpora/manually-annotated-corpora/matas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/9", "Family": "Manually annotated corpora", "Description": "The corpus contains texts from various domains (documents, fiction, periodicals, scientific texts, wordforms).\nThis corpus is available for download from the CLARIN-LT repository.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN ACA", "Size": ["1.6 million words"], "Annotation": ["morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/morph-dist-estonian.json b/corpora/manually-annotated-corpora/morph-dist-estonian.json index e4f9417..f09c223 100644 --- a/corpora/manually-annotated-corpora/morph-dist-estonian.json +++ b/corpora/manually-annotated-corpora/morph-dist-estonian.json @@ -3,7 +3,7 @@ "URL": "http://doi.org/10.15155/1-00-0000-0000-0000-00085L", "Family": "Manually annotated corpora", "Description": "This corpus 
contains texts from the 1980s subcorpus of the Corpus of Written Estonian 1890-1990.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN_ACA-NC", "Size": ["513,000 tokens"], "Annotation": ["morphological disambiguation"], diff --git a/corpora/manually-annotated-corpora/multext-east.json b/corpora/manually-annotated-corpora/multext-east.json index 88b385d..54d946b 100644 --- a/corpora/manually-annotated-corpora/multext-east.json +++ b/corpora/manually-annotated-corpora/multext-east.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1043", "Family": "Manually annotated corpora", "Description": "This corpus contains 11 human translations of George Orwell’s Nineteen Eighty-Four, as well as the original text. The corpus is morphosyntactically tagged following the MULTEXT-East Version 4 tagset.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], + "Language": ["bul", "ces", "eng", "est", "hun", "mkd", "fas", "pol", "ron", "srp", "slk", "slv"], "Licence": "CC BY-NC-SA 4.0", "Size": ["80,000 sentences", "1 million words"], "Annotation": ["morphosyntactic tagging", "lemmatisation", "sentence alignment"], diff --git a/corpora/manually-annotated-corpora/nkjp1m.json b/corpora/manually-annotated-corpora/nkjp1m.json index 9b91ed5..d6fce68 100644 --- a/corpora/manually-annotated-corpora/nkjp1m.json +++ b/corpora/manually-annotated-corpora/nkjp1m.json @@ -3,7 +3,7 @@ "URL": "http://clip.ipipan.waw.pl/NationalCorpusOfPolish", "Family": "Manually annotated corpora", "Description": "This corpus is a manually annotated subset of the National Corpus of Polish.\nThe corpus is available for download from the Computational Linguistics in Poland website.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "GNU GPL 3", "Size": ["1 million tokens"], "Annotation": ["morphosyntactic tagging"], diff --git 
a/corpora/manually-annotated-corpora/nl2sh.json b/corpora/manually-annotated-corpora/nl2sh.json index 92f2110..8be7c38 100644 --- a/corpora/manually-annotated-corpora/nl2sh.json +++ b/corpora/manually-annotated-corpora/nl2sh.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1822", "Family": "Manually annotated corpora", "Description": "This corpus can be used to build and evaluate methods for knowledge extraction and representation based on a semantic hypergraph. Each sentence has natural language annotations and dedicated semantic hyperedge. Majority of the sentences used in this dataset are taken from the following sources:\n
        \n
      • John Eastwood, Oxford Guide to English Grammar, Oxford University Press, 2002.
      • \n
      • Andrew Redford, An Introduction to English Sentence Structure, Cambridge University Press, 2009.
      • \n
      • Essential English Grammar, Philip Gucker, Dover Publications, Inc. New York, 1966.
      • \nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN.SI Licence ACA ID-BY-NC-INF-NORED", "Size": ["6,851 tokens"], "Annotation": ["semantic role labelling", "coreference", "tokenisation", "PoS-tagging", "lemmatisation", "syntactic dependencies", "named entities"], diff --git a/corpora/manually-annotated-corpora/norec.json b/corpora/manually-annotated-corpora/norec.json index 976acb6..35cdd6e 100644 --- a/corpora/manually-annotated-corpora/norec.json +++ b/corpora/manually-annotated-corpora/norec.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11509/124", "Family": "Manually annotated corpora", "Description": "This corpus contains reviews in different domains (e.g., literature, videogames, etc.).\nThe corpus is available for download from the CLARINO repository.", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CC BY-NC 3.0", "Size": ["14.8 million tokens"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/parseme.json b/corpora/manually-annotated-corpora/parseme.json index 05ee42c..6735144 100644 --- a/corpora/manually-annotated-corpora/parseme.json +++ b/corpora/manually-annotated-corpora/parseme.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-5124", "Family": "Manually annotated corpora", "Description": "This multilingual resource contains corpora in which verbal multi-word expressions (MWEs) have been manually annotated. Verbal MWEs include idioms (let the cat out of the bag), light-verb constructions (make a decision), verb-particle constructions (give up), inherently reflexive verbs (help oneself), and multi-verb constructions (make do).\nThe 1.0 versions of the PARSEME corpora can be queried individually through KonText. We provide the individual links to each corpus:\n
        1. Parseme VMWE 1.0 – Czech
        2. Parseme VMWE 1.0 – German
        3. Parseme VMWE 1.0 – Greek
        4. Parseme VMWE 1.0 – Spanish
        5. Parseme VMWE 1.0 – Persian
        6. Parseme VMWE 1.0 – French
        7. Parseme VMWE 1.0 – Hungarian
        8. Parseme VMWE 1.0 – Italian
        9. Parseme VMWE 1.0 – Maltese
        10. Parseme VMWE 1.0 – Polish
        11. Parseme VMWE 1.0 – Portuguese
        12. Parseme VMWE 1.0 – Romanian
        13. Parseme VMWE 1.0 – Slovenian
        14. Parseme VMWE 1.0 – Swedish
        15. Parseme VMWE 1.0 – Turkish
        ", - "Languages": ["ara", "eus", "bul", "zho", "hrv", "ces", "eng", "fra", "deu", "heb", "hin", "hun", "gle", "ita", "lit", "mlt", "ell", "fas", "pol", "por", "ron", "srp", "slv", "spa", "swe", "tur"], + "Language": ["ara", "eus", "bul", "zho", "hrv", "ces", "eng", "fra", "deu", "heb", "hin", "hun", "gle", "ita", "lit", "mlt", "ell", "fas", "pol", "por", "ron", "srp", "slv", "spa", "swe", "tur"], "Licence": "PARSEME Shared Task Data (v. 1.1) Agreement", "Size": ["5.8 million tokens"], "Annotation": ["identification of verbal multi-word expressions (idioms, light-verb constructions, verb-particle constructions, inherently reflexive verbs, multi-verb constructions)"], diff --git a/corpora/manually-annotated-corpora/pol-coref.json b/corpora/manually-annotated-corpora/pol-coref.json index a17b30d..b223204 100644 --- a/corpora/manually-annotated-corpora/pol-coref.json +++ b/corpora/manually-annotated-corpora/pol-coref.json @@ -3,7 +3,7 @@ "URL": "http://zil.ipipan.waw.pl/PolishCoreferenceCorpus", "Family": "Manually annotated corpora", "Description": "This corpus contains texts in a variety of domains (magazines, fiction literature, non-fiction literature, computer-mediated communication, academic writing, etc.).\nThe corpus is available for download and online browsing.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY 3", "Size": ["540,000 tokens"], "Annotation": ["coreference"], diff --git a/corpora/manually-annotated-corpora/pol-dep-tree.json b/corpora/manually-annotated-corpora/pol-dep-tree.json index 621b47b..59add21 100644 --- a/corpora/manually-annotated-corpora/pol-dep-tree.json +++ b/corpora/manually-annotated-corpora/pol-dep-tree.json @@ -3,7 +3,7 @@ "URL": "http://zil.ipipan.waw.pl/PDB", "Family": "Manually annotated corpora", "Description": "This corpus also contains sentences showing certain problematic syntactic phenomena – sentences with ellipsis, comparative constructions, constructions with the bi-functional subordinating 
conjunction jako, etc. The syntactic parsing follows the Universal Dependencies schema.\nThe first version of the corpus is available for download from the Computational Linguistics in Poland website. The second version is available upon request.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY-NC-SA 4.0", "Size": ["22,000 trees", "351,000 tokens"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/pol-spatial.json b/corpora/manually-annotated-corpora/pol-spatial.json index 0a72bc2..ca78a7f 100644 --- a/corpora/manually-annotated-corpora/pol-spatial.json +++ b/corpora/manually-annotated-corpora/pol-spatial.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/543", "Family": "Manually annotated corpora", "Description": "This corpus contains travel blogs.\nThe corpus is available for download from the CLARIN-PL repository.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY-SA 4.0", "Size": ["46,000 tokens"], "Annotation": ["Named Entity recognition (spatial expressions)"], diff --git a/corpora/manually-annotated-corpora/pol-summaries.json b/corpora/manually-annotated-corpora/pol-summaries.json index fed012f..8ab9c7d 100644 --- a/corpora/manually-annotated-corpora/pol-summaries.json +++ b/corpora/manually-annotated-corpora/pol-summaries.json @@ -3,7 +3,7 @@ "URL": "http://zil.ipipan.waw.pl/PolishSummariesCorpus", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from the ZIL IPI PAN repository.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY 3", "Size": ["10845 summaries"], "Annotation": ["summarization"], diff --git a/corpora/manually-annotated-corpora/prague-arabic-treebank.json b/corpora/manually-annotated-corpora/prague-arabic-treebank.json index 62742ac..05f71de 100644 --- a/corpora/manually-annotated-corpora/prague-arabic-treebank.json +++ b/corpora/manually-annotated-corpora/prague-arabic-treebank.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11858/00-097C-0000-0001-4872-3", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from the LINDAT repository.", - "Languages": ["ara"], + "Language": ["ara"], "Licence": "CC BY-NC-SA 3.0", "Size": [], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/prague-dependency-treebank.json b/corpora/manually-annotated-corpora/prague-dependency-treebank.json index 4139fb5..067ad12 100644 --- a/corpora/manually-annotated-corpora/prague-dependency-treebank.json +++ b/corpora/manually-annotated-corpora/prague-dependency-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2621", "Family": "Manually annotated corpora", "Description": "This corpus is manually annotated at several levels – aside from syntactic parsing and morphological information, it is annotation for sentence information structure, multiword expression, coreference, bridging relations and discourse relations.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["2 million words"], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/prague-discourse-treebank.json b/corpora/manually-annotated-corpora/prague-discourse-treebank.json index 2d5994e..641ed27 100644 --- a/corpora/manually-annotated-corpora/prague-discourse-treebank.json +++ b/corpora/manually-annotated-corpora/prague-discourse-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1905", "Family": "Manually annotated corpora", "Description": "This corpus is a subset of the Prague Dependency Treebank 3.5\nThe corpus is available through the PML-TQ tool.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY", "Size": ["49,500 sentences"], "Annotation": ["syntactic parsing", "mark-up of discourse phenomena enriched by the 
annotation of secondary connectives"], diff --git a/corpora/manually-annotated-corpora/prague-eng-ces-dep-treebank.json b/corpora/manually-annotated-corpora/prague-eng-ces-dep-treebank.json index d90089b..5e8c9ff 100644 --- a/corpora/manually-annotated-corpora/prague-eng-ces-dep-treebank.json +++ b/corpora/manually-annotated-corpora/prague-eng-ces-dep-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1664", "Family": "Manually annotated corpora", "Description": "This corpus is an extended version of Prague Czech-English Dependency Treebank 2.0, with added mark-up of coreference. The syntactic parsing follows the PDT 2.0 style.\nThe corpus is available for download from the LINDAT repository. The version without coreference annotation is available through the concordancer KonText and the PML-TQ tool (Czech part only).", - "Languages": ["ces", "eng"], + "Language": ["ces", "eng"], "Licence": "CC-BY-NC-SA + LDC99T42 (restricted use)", "Size": ["49,000 sentences"], "Annotation": ["syntactic parsing", "mark-up of coreference"], diff --git a/corpora/manually-annotated-corpora/reldi-normtagner-hr.json b/corpora/manually-annotated-corpora/reldi-normtagner-hr.json index 37a0362..2848f2b 100644 --- a/corpora/manually-annotated-corpora/reldi-normtagner-hr.json +++ b/corpora/manually-annotated-corpora/reldi-normtagner-hr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1793", "Family": "Manually annotated corpora", "Description": "This corpus contains manually annotated Croatian tweets. It is meant as a gold-standard training and testing dataset for tokenisation, sentence segmentation, word normalisation, morphosyntactic tagging, lemmatisation and named entity recognition of non-standard Serbian. 
Each tweet is also annotated for its automatically assigned standardness levels (T = technical standardness, L = linguistic standardness).\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC BY 4.0", "Size": ["89,855 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "word normalisation", "morphosyntactic tagging", "lemmatisation", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/reldi-normtagner-sr.json b/corpora/manually-annotated-corpora/reldi-normtagner-sr.json index 540aa93..47e7ab4 100644 --- a/corpora/manually-annotated-corpora/reldi-normtagner-sr.json +++ b/corpora/manually-annotated-corpora/reldi-normtagner-sr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1794", "Family": "Manually annotated corpora", "Description": "This corpus contains manually annotated Serbian tweets. It is meant as a gold-standard training and testing dataset for tokenisation, sentence segmentation, word normalisation, morphosyntactic tagging, lemmatisation and named entity recognition of non-standard Serbian. 
Each tweet is also annotated for its automatically assigned standardness levels (T = technical standardness, L = linguistic standardness).\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC BY 4.0", "Size": ["92,271 tokens"], "Annotation": ["morphosyntactic tagging", "tokenisation", "sentence segmentation", "word normalisation", "lemmatisation", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/rsdo-def.json b/corpora/manually-annotated-corpora/rsdo-def.json index c7f7c7f..c4bb3b1 100644 --- a/corpora/manually-annotated-corpora/rsdo-def.json +++ b/corpora/manually-annotated-corpora/rsdo-def.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1841", "Family": "Manually annotated corpora", "Description": "This corpus contains sentences extracted from the Corpus of term-annotated texts RSDO5 1.1, which contains texts with annotated terms from four different domains: biomechanics, linguistics, chemistry, and veterinary science. The file and sentence identifiers are the same as in the original RSDO corpus. The labels added to the sentences included in the dataset denote: 0: Non-definition; 1: Weak definition; 2: Definition.\nThe dataset consists of two parts: 1. RSDO-def-random employed a random sampling strategy, with 14 definitions, 98 weak-definitions and 849 non-definitions; and 2. RSDO-def-larger added sentences to the random one by the pattern-based definition extraction as presented in Pollak et al. (2014). It contains 169 definitions, 214 weak-definitions and 872 non-definitions. Both parts were manually annotated by five terminographers. In case of discrepancies between annotators, a consensus was reached and the final label was confirmed by all five annotators. 
Duplicates were removed in both parts.\nThe criteria for annotation are based on the standard ISO 1087-1:2000 (E/F) Terminology Work - Vocabulary, Part 1, Theory and Application, which explains a definition as follows: \"Representation of a concept by a descriptive statement which serves to differentiate it from related concepts\". Weak definition labels were assigned if the extracted sentences contained a term and at least one delimiting feature without a superordinate concept, or sentences consisting of superordinate concepts without delimiting features but with some typical examples. Instances were labeled as Non-definition if the sentence with the extracted concept did not contain any information about the concept or its delimiting features.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["2,216 sentences"], "Annotation": ["term definition evaluation"], diff --git a/corpora/manually-annotated-corpora/sem-dis-est.json b/corpora/manually-annotated-corpora/sem-dis-est.json index 48af951..6ed833f 100644 --- a/corpora/manually-annotated-corpora/sem-dis-est.json +++ b/corpora/manually-annotated-corpora/sem-dis-est.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00081L", "Family": "Manually annotated corpora", "Description": "The corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA", "Size": ["375,733 tokens"], "Annotation": ["word sense disambiguation"], diff --git a/corpora/manually-annotated-corpora/sentinews.json b/corpora/manually-annotated-corpora/sentinews.json index 888c672..1bb2821 100644 --- a/corpora/manually-annotated-corpora/sentinews.json +++ b/corpora/manually-annotated-corpora/sentinews.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1110", "Family": "Manually annotated corpora", "Description": "This corpus contains news 
articles.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["10,427 articles"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/setimes-1-sr.json b/corpora/manually-annotated-corpora/setimes-1-sr.json index a7606ab..4a7eefe 100644 --- a/corpora/manually-annotated-corpora/setimes-1-sr.json +++ b/corpora/manually-annotated-corpora/setimes-1-sr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1200", "Family": "Manually annotated corpora", "Description": "This corpus contains posts from the Southeast European Times news portal, which is now defunct. The syntactic parsing follows the Universal Dependencies framework.\nThe corpus is available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC BY-SA 4.0", "Size": ["87,000 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "morphosyntactic tagging", "lemmatisation", "syntactic parsing", "Named Entity recognition"], diff --git a/corpora/manually-annotated-corpora/setimes-2-sr.json b/corpora/manually-annotated-corpora/setimes-2-sr.json index 1834cdf..22572ab 100644 --- a/corpora/manually-annotated-corpora/setimes-2-sr.json +++ b/corpora/manually-annotated-corpora/setimes-2-sr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1843", "Family": "Manually annotated corpora", "Description": "This training corpus contains around 100,000 tokens manually annotated on the levels of tokenisation, sentence segmentation, morphosyntactic tagging, lemmatisation, syntactic dependencies, and named entities. The annotation formalisms followed in the SETimes.SR corpus are (1) MULTEXT-East V6 morphosyntactic specifications, (2) the UDv2 Guidelines, and (3) Janes annotation guidelines for named entities. 
The difference to the previous version of the corpus are (1) the extension of the corpus with 502 sentences from various news sources and (2) improvements in the annotations of the corpus.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC BY-SA 4.0", "Size": ["97,673 tokens"], "Annotation": ["tokenisation", "sentence segmentation", "morphosyntactic tagging", "lemmatisation", "syntactic dependencies", "named entities"], diff --git a/corpora/manually-annotated-corpora/slovak-dependency-treebank.json b/corpora/manually-annotated-corpora/slovak-dependency-treebank.json index 8b972d5..2aecde8 100644 --- a/corpora/manually-annotated-corpora/slovak-dependency-treebank.json +++ b/corpora/manually-annotated-corpora/slovak-dependency-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1822", "Family": "Manually annotated corpora", "Description": "This syntactic parsing is modelled after the Prague Dependency Treebank.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-SA 4.0", "Size": ["106,000 tokens", "10,600 sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/slowic.json b/corpora/manually-annotated-corpora/slowic.json index ee31bad..2dbaaaf 100644 --- a/corpora/manually-annotated-corpora/slowic.json +++ b/corpora/manually-annotated-corpora/slowic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1781", "Family": "Manually annotated corpora", "Description": "The SloWIC dataset is a Slovenian dataset for the Word in Context task. Each example in the dataset contains a target word with multiple meanings and two sentences that both contain the target word. Each example is also annotated with a label that shows if both sentences use the same meaning of the target word. 
The dataset contains 1808 manually annotated sentence pairs and additional 13150 automatically annotated pairs to help with training larger models. The dataset is stored in the JSON format following the format used in the SuperGLUE version of the Word in Context task.\nEach example contains the following data fields:\n
          \n
        • word: The target word with multiple meanings
        • \n
        • sentence1: The first sentence containing the target word
        • \n
        • sentence2: The second sentence containing the target word
        • \n
        • idx: The index of the example in the dataset
        • \n
        • label: Label showing if the sentences contain the same meaning of the target word
        • \n
        • start1: Start of the target word in the first sentence
        • \n
        • start2: Start of the target word in the second sentence
        • \n
        • end1: End of the target word in the first sentence
        • \n
        • end2: End of the target word in the second sentence
        • \n
        • version: The version of the annotation
        • \n
        • manual_annotation: Boolean showing if the label was manually annotated
        • \n
        • group: The group of annotators that labelled the example
        • \n
        ", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["14,958 items"], "Annotation": ["word sense disambiguation"], diff --git a/corpora/manually-annotated-corpora/sonar.json b/corpora/manually-annotated-corpora/sonar.json index 421f46a..dae1ddb 100644 --- a/corpora/manually-annotated-corpora/sonar.json +++ b/corpora/manually-annotated-corpora/sonar.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-h5", "Family": "Manually annotated corpora", "Description": "This is a manually annotated subset of the much larger (approx. 500 million words) SoNaR corpus.\nThe corpus is available for download from the Dutch Language Institute.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": ["1 million words"], "Annotation": ["PoS tagging", "syntactic parsing", "semantic role labelling"], diff --git a/corpora/manually-annotated-corpora/speech-thought-writing.json b/corpora/manually-annotated-corpora/speech-thought-writing.json index e165090..6309102 100644 --- a/corpora/manually-annotated-corpora/speech-thought-writing.json +++ b/corpora/manually-annotated-corpora/speech-thought-writing.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2540", "Family": "Manually annotated corpora", "Description": "This corpus contains literary, newspaper and biography texts.\nThe corpus is available for download from the Oxford Text Archive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-NC-SA 3.0", "Size": ["260,000 words"], "Annotation": ["identification of reported speech"], diff --git a/corpora/manually-annotated-corpora/ssj500k.json b/corpora/manually-annotated-corpora/ssj500k.json index b293bae..0648a02 100644 --- a/corpora/manually-annotated-corpora/ssj500k.json +++ b/corpora/manually-annotated-corpora/ssj500k.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1181", "Family": "Manually annotated corpora", "Description": "This corpus contains standard Slovenian.\nThe corpus is 
available through the concordancers KonText and noSketchEngine and for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-NC-SA 4.0", "Size": ["586,000 tokens"], "Annotation": ["fully – tokenisation, sentence segmentation, morphosyntactic tagging, and lemmatisation. Half of the corpus – syntactic parsing, Named Entity recognition, and verbal multiword expression tagging. Quarter of corpus: semantic roles"], diff --git a/corpora/manually-annotated-corpora/syn-ref-med-fra.json b/corpora/manually-annotated-corpora/syn-ref-med-fra.json index 2d47483..c8f23e7 100644 --- a/corpora/manually-annotated-corpora/syn-ref-med-fra.json +++ b/corpora/manually-annotated-corpora/syn-ref-med-fra.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1007-0000-0000-9D2B-0", "Family": "Manually annotated corpora", "Description": "This corpus contains Old French texts.\nThe corpus is available for download from the IMS CLARIN-D repository.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CLARIN ACA", "Size": ["245,000 words"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/szeged-treebank.json b/corpora/manually-annotated-corpora/szeged-treebank.json index 0e83b24..1050e0b 100644 --- a/corpora/manually-annotated-corpora/szeged-treebank.json +++ b/corpora/manually-annotated-corpora/szeged-treebank.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-201", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from a dedicated webpage.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "licence agreement", "Size": ["82,000 sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/szeged.json b/corpora/manually-annotated-corpora/szeged.json index b257dce..8e34fa2 100644 --- a/corpora/manually-annotated-corpora/szeged.json +++ b/corpora/manually-annotated-corpora/szeged.json @@ -3,7 +3,7 
@@ "URL": "http://hdl.handle.net/11372/LRT-347", "Family": "Manually annotated corpora", "Description": "This corpus is available for download from a dedicated webpage.\nTo download the versions of the Szeged Corpus and Szeged Treebank, you are obliged to fill and send a Licence Agreement.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "Licence agreement", "Size": ["1.5 million tokens"], "Annotation": ["morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/tamil-dep.json b/corpora/manually-annotated-corpora/tamil-dep.json index ee87ec2..7412d0b 100644 --- a/corpora/manually-annotated-corpora/tamil-dep.json +++ b/corpora/manually-annotated-corpora/tamil-dep.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1453", "Family": "Manually annotated corpora", "Description": "The syntactic parsing follows the rules of the https://ufal.mff.cuni.cz/pdt/.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["tam"], + "Language": ["tam"], "Licence": "CC BY-NC-SA 3.0", "Size": ["600 sentences"], "Annotation": ["syntactic parsing", "morphosyntactic tagging"], diff --git a/corpora/manually-annotated-corpora/timel-ann-est-news.json b/corpora/manually-annotated-corpora/timel-ann-est-news.json index 5ed7d69..c0f073e 100644 --- a/corpora/manually-annotated-corpora/timel-ann-est-news.json +++ b/corpora/manually-annotated-corpora/timel-ann-est-news.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-0015CL", "Family": "Manually annotated corpora", "Description": "This corpus contains newspaper articles.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY-SA", "Size": ["22,000 words"], "Annotation": ["morphosyntactic tagging", "syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/tree-iness.json b/corpora/manually-annotated-corpora/tree-iness.json index 8d95894..a430c8c 100644 --- 
a/corpora/manually-annotated-corpora/tree-iness.json +++ b/corpora/manually-annotated-corpora/tree-iness.json @@ -3,7 +3,7 @@ "URL": "http://clarino.uib.no/iness/page", "Family": "Manually annotated corpora", "Description": "This is a collection of treebanks made available through the Infrastructure for the Exploration of Syntax and Semantics (INESS).\nThe corpora are available for online querying through INESS.", - "Languages": ["71 languages"], + "Language": ["71 languages"], "Licence": "CC-BY", "Size": ["532 treebanks"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/tueba-dz.json b/corpora/manually-annotated-corpora/tueba-dz.json index ab107da..adf0b0c 100644 --- a/corpora/manually-annotated-corpora/tueba-dz.json +++ b/corpora/manually-annotated-corpora/tueba-dz.json @@ -3,7 +3,7 @@ "URL": "https://uni-tuebingen.de/en/faculties/faculty-of-humanities/departments/modern-languages/department-of-linguistics/chairs/general-and-computational-linguistics/ressources/corpora/tueba-dz/", "Family": "Manually annotated corpora", "Description": "This corpus contains newspaper articles.\nThe corpus is available for download from the Tübingen CLARIN Repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["1.9 million tokens"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/turku-dep.json b/corpora/manually-annotated-corpora/turku-dep.json index 32f9de6..eddc2a7 100644 --- a/corpora/manually-annotated-corpora/turku-dep.json +++ b/corpora/manually-annotated-corpora/turku-dep.json @@ -3,7 +3,7 @@ "URL": "http://bionlp.utu.fi/fintreebank.html", "Family": "Manually annotated corpora", "Description": "The syntactic parsing follows the Universal Dependencies schema.\nThe corpus is available for download from the Turku BioNLP Group. 
", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-SA", "Size": ["204,000 tokens"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/twitter-sentiment.json b/corpora/manually-annotated-corpora/twitter-sentiment.json index 1cf8325..92a697b 100644 --- a/corpora/manually-annotated-corpora/twitter-sentiment.json +++ b/corpora/manually-annotated-corpora/twitter-sentiment.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1054", "Family": "Manually annotated corpora", "Description": "This corpus contains Tweet IDs with sentiment annotations.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["sqi", "bos", "bul", "hrv", "eng", "deu", "hun", "pol", "por", "rus", "srp", "slk", "slv", "spa", "swe"], + "Language": ["sqi", "bos", "bul", "hrv", "eng", "deu", "hun", "pol", "por", "rus", "srp", "slk", "slv", "spa", "swe"], "Licence": "CC BY-SA 4.0", "Size": ["1.6 million tweets"], "Annotation": ["sentiment analysis"], diff --git a/corpora/manually-annotated-corpora/ud-estonian.json b/corpora/manually-annotated-corpora/ud-estonian.json index fe59418..7cd2973 100644 --- a/corpora/manually-annotated-corpora/ud-estonian.json +++ b/corpora/manually-annotated-corpora/ud-estonian.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00168L", "Family": "Manually annotated corpora", "Description": "This corpus contains fictional, newspaper and scientific texts. 
The syntactic parsing follows the Universal Dependencies schema.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY-SA", "Size": ["434,000 tokens"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/uni-dep.json b/corpora/manually-annotated-corpora/uni-dep.json index 389c677..9f5eb84 100644 --- a/corpora/manually-annotated-corpora/uni-dep.json +++ b/corpora/manually-annotated-corpora/uni-dep.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-5150", "Family": "Manually annotated corpora", "Description": "This corpus collection contains treebanks following the Universal Dependencies framework.\nThe corpus collection is available for download from the LINDAT repository.\nThe individual treebanks in Universal Dependencies 2.3 can also be queried through the concordancer KonText and the treebank query tool PML-TQ. Below we provide links to these search environments for all the treebanks. For a detailed description of the treebanks, see the Universal Dependencies project page.\n
        1. UD_Akkadian-PISANDUB (KonText)
        2. UD_Amharic-ATT (KonText, PML-TQ)
        3. UD_Armenian-ArmTDP (KonText, PML-TQ)
        4. UD_Breton-KEB (KonText, PML-TQ)
        5. UD_Buryat-BDT (KonText, PML-TQ)
        6. UD_Cantonese-HK (KonText, PML-TQ)
        7. UD_Chinese-HK (KonText, PML-TQ)
        8. UD_Chinese-CFL (KonText, PML-TQ)
        9. UD_Coptic-Scriptorium (KonText, PML-TQ)
        10. UD_Croatian-SET (KonText, PML-TQ)
        11. UD_English-ESL (KonText, PML-TQ)
        12. UD_Faroese-OFT (KonText, PML-TQ)
        13. UD_Galician-TreeGal (KonText, PML-TQ)
        14. UD_Hindi_English-HIENCS (KonText)
        15. UD_Kazakh-KTB 2.2 (KonText, PML-TQ)
        16. UD_Komi_Zyrian-Lattice (KonText, PML-TQ)
        17. UD_Komi_Zyrian-IKDP (KonText, PML-TQ)
        18. UD_Kurmanji-MG (KonText, PML-TQ)
        19. UD_Lithuanian-HSE (KonText, PML-TQ)
        20. UD_Maltese-MUDT (KonText, PML-TQ)
        21. UD_Marathi-UFAL (KonText, PML-TQ)
        22. UD_Naija-NSC (KonText, PML-TQ)
        23. UD_Persian-Seraji (KonText, PML-TQ)
        24. UD_Russian-Taiga (KonText, PML-TQ)
        25. UD_Sanskrit-UFAL (KonText, PML-TQ)
        26. UD_Serbian-SET (KonText, PML-TQ)  
        27. UD_Slovenian-SST (KonText, PML-TQ)
        28. UD_Tagalog-TRG (KonText, PML-TQ)
        29. UD_Telugu-MTG (KonText, PML-TQ)
        30. UD_Ukrainian-IU (KonText, PML-TQ)
        31. UD_Upper_Sorbian-UFAL (KonText, PML-TQ)
        32. UD_Uyghur-UDT (KonText, PML-TQ)
        33. UD_Warlpiri-UFAL (KonText, PML-TQ)
        34. UD_Yoruba-YTB (KonText, PML-TQ)
        35. UD_Afrikaans-AfriBooms (KonText)
        36. UD_Ancient_Greek-PROIEL (KonText)
        37. UD_Ancient_Greek-Perseus (KonText, PML-TQ)
        38. UD_Arabic-PADT (KonText, PML-TQ)
        39. UD_Arabic-PUD (KonText, PML-TQ)
        40. UD_Arabic-NYUAD (KonText)
        41. UD_Bambara-CRB (KonText, PML-TQ)
        42. UD_Basque-BDT (KonText, PML-TQ)
        43. UD_Belarusian-HSE  (KonText, PML-TQ)
        44. UD_Bulgarian-BTB (KonText, PML-TQ)
        45. UD_Catalan-AnCora (KonText, PML-TQ)
        46. UD_Chinese-GSD (KonText, PML-TQ)
        47. UD_Chinese-PUD (KonText, PML-TQ)
        48. UD_Czech-PDT  (KonText, PML-TQ)
        49. UD_Czech-CAC  (KonText, PML-TQ)
        50. UD_Czech-FicTree  (KonText, PML-TQ)
        51. UD_Czech-PUD (KonText, PML-TQ)
        52. UD_Czech-CLTT (KonText, PML-TQ)
        53. UD_Danish-DDT (KonText, PML-TQ)
        54. UD_Dutch-Alpino (KonText, PML-TQ)
        55. UD_Dutch-LassySmall (KonText, PML-TQ)
        56. UD_English-ParTUT (KonText, PML-TQ)
        57. UD_English-GUM (KonText, PML-TQ)
        58. UD_English-EWT (KonText, PML-TQ)
        59. UD_English-PUD (KonText, PML-TQ)
        60. UD_English-LinES (KonText, PML-TQ)
        61. UD_Erzya-JR (KonText, PML-TQ)
        62. UD_Finnish-FTB (KonText, PML-TQ)
        63. UD_Finnish-TDT (KonText, PML-TQ)
        64. UD_Finnish-PUD (KonText, PML-TQ)
        65. UD_French-ParTUT (KonText, PML-TQ)
        66. UD_French-GSD (KonText, PML-TQ)
        67. UD_French-Sequoia (KonText, PML-TQ)
        68. UD_French-Spoken (KonText, PML-TQ)
        69. UD_French-PUD (KonText, PML-TQ)
        70. UD_French-FTB (KonText)
        71. UD_Galician-CTG (KonText, PML-TQ)
        72. UD_German-GSD  (KonText, PML-TQ)
        73. UD_German-PUD (KonText, PML-TQ)
        74. UD_Gothic-PROIEL (KonText, PML-TQ)
        75. UD_Greek-GDT (KonText, PML-TQ)
        76. UD_Hebrew-HTB (KonText, PML-TQ)
        77. UD_Hindi-HDTB (KonText, PML-TQ)
        78. UD_Hindi-PUD (KonText, PML-TQ)
        79. UD_Hungarian-Szeged (KonText, PML-TQ)
        80. UD_Indonesian-GSD (KonText, PML-TQ)
        81. UD_Indonesian-PUD  (KonText, PML-TQ)
        82. UD_Irish-IDT  (KonText, PML-TQ)
        83. UD_Italian-ISDT (KonText, PML-TQ)
        84. UD_Italian-ParTUT (KonText, PML-TQ)
        85. UD_Italian-PUD (KonText, PML-TQ)
        86. UD_Japanese-GSD (KonText, PML-TQ)
        87. UD_Japanese-PUD (KonText, PML-TQ)
        88. UD_Japanese-Modern (KonText, PML-TQ)
        89. UD_Korean-Kaist (KonText, PML-TQ)
        90. UD_Korean-GSD (KonText, PML-TQ)
        91. UD_Korean-PUD (KonText, PML-TQ)
        92. UD_Latin-PROIEL (KonText, PML-TQ)
        93. UD_Latin-ITTB (KonText, PML-TQ)
        94. UD_Latin-Perseus (KonText, PML-TQ)
        95. UD_Latvian-LVTB (KonText, PML-TQ)
        96. UD_North_Sami-Giella (KonText, PML-TQ)
        97. UD_Norwegian-Bokmaal (KonText, PML-TQ)
        98. UD_Norwegian-Nynorsk (KonText, PML-TQ)
        99. UD_Norwegian-NynorskLIA (KonText, PML-TQ)
        100. UD_Old_Church_Slavonic-PROIEL (KonText, PML-TQ)
        101. UD_Old_French-SRCMF (KonText, PML-TQ)
        102. UD_Polish-LFG (KonText, PML-TQ)
        103. UD_Polish-SZ (KonText, PML-TQ)
        104. UD_Portuguese-Bosque (KonText, PML-TQ)
        105. UD_Portuguese-GSD (KonText, PML-TQ)
        106. UD_Portuguese-PUD (KonText, PML-TQ)
        107. UD_Romanian-RRT (KonText, PML-TQ)
        108. UD_Romanian-Nonstandard (KonText, PML-TQ)
        109. UD_Russian-GSD (KonText, PML-TQ)
        110. UD_Russian-PUD (KonText, PML-TQ)
        111. UD_Russian-SynTagRus (KonText, PML-TQ)
        112. UD_Slovak-SNK (KonText, PML-TQ)
        113. UD_Slovenian-SSJ (KonText, PML-TQ)
        114. UD_Spanish-AnCora (KonText, PML-TQ)
        115. UD_Spanish-GSD (KonText, PML-TQ)
        116. UD_Spanish-PUD (KonText, PML-TQ)
        117. UD_Swedish-Talbanken (KonText, PML-TQ)
        118. UD_Swedish-LinES (KonText, PML-TQ)
        119. UD_Swedish-PUD (KonText, PML-TQ)
        120. UD_Swedish_Sign_Language-SSLC (KonText, PML-TQ)
        121. UD_Tamil-TTB (KonText, PML-TQ)
        122. UD_Thai-PUD (KonText, PML-TQ)
        123. UD_Turkish-IMST (KonText, PML-TQ)
        124. UD_Turkish-PUD (KonText, PML-TQ)
        125. UD_Urdu-UDTB (KonText, PML-TQ)
        126. UD_Vietnamese-VTB (KonText, PML-TQ)
        ", - "Languages": ["75 languages"], + "Language": ["75 languages"], "Licence": "Licence Universal Dependencies v2.12 ", "Size": ["30 million tokens", "30.6 million words", "1.8 million sentences"], "Annotation": ["syntactic parsing"], diff --git a/corpora/manually-annotated-corpora/vejica.json b/corpora/manually-annotated-corpora/vejica.json index f8f6e22..86c4545 100644 --- a/corpora/manually-annotated-corpora/vejica.json +++ b/corpora/manually-annotated-corpora/vejica.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1185", "Family": "Manually annotated corpora", "Description": "This corpus contains texts from various Slovenian corpora (KUST, Šolar aLektorm JANES-Vejican Wikpedia.\nThe corpus is available for download from CLARIN.SI.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-NC-SA 4.0", "Size": ["104,000 sentences"], "Annotation": ["comma placement"], diff --git a/corpora/manually-annotated-corpora/wut-relations.json b/corpora/manually-annotated-corpora/wut-relations.json index ad68445..b98bfa4 100644 --- a/corpora/manually-annotated-corpora/wut-relations.json +++ b/corpora/manually-annotated-corpora/wut-relations.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/305", "Family": "Manually annotated corpora", "Description": "This corpus contains news items.\nThe corpus is available for download from the CLARIN.PL repository.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CC BY-SA 3.0", "Size": ["5654 sentences"], "Annotation": ["relations between sentences - Cross-document Structure Theory (CST)"], diff --git a/corpora/manually-annotated-corpora/xlime.json b/corpora/manually-annotated-corpora/xlime.json index 005dc88..69ba26d 100644 --- a/corpora/manually-annotated-corpora/xlime.json +++ b/corpora/manually-annotated-corpora/xlime.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1078", "Family": "Manually annotated corpora", "Description": "This corpus contains Tweets.\nThe corpus is available for 
download from the CLARIN.SI repository.", - "Languages": ["deu", "ita", "spa"], + "Language": ["deu", "ita", "spa"], "Licence": "MIT License", "Size": ["364,000 tokens"], "Annotation": ["PoS tagging", "Named Entity recognition", "sentiment analysis"], diff --git a/corpora/multimodal-corpora/bas-smartkom.json b/corpora/multimodal-corpora/bas-smartkom.json index 9bfecd3..dd567ae 100644 --- a/corpora/multimodal-corpora/bas-smartkom.json +++ b/corpora/multimodal-corpora/bas-smartkom.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-DB6B-2", "Family": "Manually annotated corpora", "Description": "This corpus contains multi modal recordings of 86 actors who use the SmartKom system. SmartKom Public is comparable to a traditional public phone booth but equipped with additional intelligent communication devices. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. 
The instruction was kept to a minimum, in fact the user only knew that the system is able to understand speech, gestures and even mimic expressions and should more or less communicate like a human.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["15 hours"], "Annotation": ["orthography", "phonology", "speaker turn", "noise", "prosody", "emotion", "hand gesture", "facial expression"], diff --git a/corpora/multimodal-corpora/bas-smartweb.json b/corpora/multimodal-corpora/bas-smartweb.json index dbe09c0..1c3874b 100644 --- a/corpora/multimodal-corpora/bas-smartweb.json +++ b/corpora/multimodal-corpora/bas-smartweb.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C059-C", "Family": "Manually annotated corpora", "Description": "The corpus contains a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. The recordings include 156 field recordings using a hand-held UMTS device (one person, SmartWeb Handheld Corpus SHC), 99 field recordings with video capture of the primary speaker and a secondary speaker (SmartWeb Video Corpus SVC) as well as 36 mobile recordings performed on a BMW motorbike (one speaker, SmartWeb Motorbike Corpus SMC).\nThe corpus is available for download from the BAS CLARIN-D repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["36 hours"], "Annotation": ["orthography", "phonology", "speaker turn", "noise", "prosody", "gaze direction"], diff --git a/corpora/multimodal-corpora/bielefeld-sga.json b/corpora/multimodal-corpora/bielefeld-sga.json index df4bf2d..f0d40ef 100644 --- a/corpora/multimodal-corpora/bielefeld-sga.json +++ b/corpora/multimodal-corpora/bielefeld-sga.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-DEC1-C", "Family": "Manually annotated corpora", "Description": "This corpus contains 25 dialogues of interlocutors (50), who engage in a spatial 
communication task combining direction-giving and sight description. The stimulus is a model of a town presented in a Virtual Reality (VR) environment. Upon finishing a “bus ride” through the VR town along five landmarks, a router explained the route as well as the wayside landmarks to an unknown and naive follower.\nThe corpus is available for download from the BAS CLARIN-D repository.", - "Languages": ["deu", "eng"], + "Language": ["deu", "eng"], "Licence": "CLARIN ACA", "Size": ["9881 isolated words", "1764 gestures"], "Annotation": ["alignment of speech and gestures"], diff --git a/corpora/multimodal-corpora/eva.json b/corpora/multimodal-corpora/eva.json index 8fa9dbb..67046ca 100644 --- a/corpora/multimodal-corpora/eva.json +++ b/corpora/multimodal-corpora/eva.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11356/1311", "Family": "Manually annotated corpora", "Description": "This corpus contains one episode of an audio/video session plus corresponding orthographic transcriptions with a duration of 57 minutes. The multi-party spontaneous discourse in the recording is from an entertaining evening TV-talk show A si ti tut not padu, broadcasted by the POP-TV Slovene commercial TV station in 2008, and represents a part of the Slovene spoken corpus GOS.\nIn addition to the original transcription and morphosyntactic annotation from the GOS corpus, the following layers of information are added:
        • statement sentiment
        • phrase breaks within statements
        • prominence of statements
        • sentences within the statement
        • sentence sentiment
        • sentence type
        • speaker visibility on the scene
        • gesture units
        • gesture phrases
        • emotions
        • semiotic intent
        • dialogue role
        \nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-NC-SA 4.0", "Size": ["57 minutes"], "Annotation": ["MSD-tagged", "non-verbal and verbal elements of communication"], diff --git a/corpora/multimodal-corpora/eye-tracking.json b/corpora/multimodal-corpora/eye-tracking.json index e7750e1..c68fe6c 100644 --- a/corpora/multimodal-corpora/eye-tracking.json +++ b/corpora/multimodal-corpora/eye-tracking.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/F35713E0-CE29-4BCA-98E9-1F2E3E912909", "Family": "Manually annotated corpora", "Description": "The corpus is available for download from the Language Archive (CLARIAH-NL).", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "restricted", "Size": [], "Annotation": [], diff --git a/corpora/multimodal-corpora/hindi-vis-genome.json b/corpora/multimodal-corpora/hindi-vis-genome.json index c138512..32a5550 100644 --- a/corpora/multimodal-corpora/hindi-vis-genome.json +++ b/corpora/multimodal-corpora/hindi-vis-genome.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2997", "Family": "Manually annotated corpora", "Description": "This corpus contains short English segments (captions) from Visual Genome along with associated images. 
The English texts are automatically translated to Hindi with manual post-editing, taking the associated images into account.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["hin", "eng"], + "Language": ["hin", "eng"], "Licence": "CC BY-NC-SA 4.0", "Size": ["32,925 items", "32,535 images", "32925 sentences", "322,000 words"], "Annotation": [], diff --git a/corpora/multimodal-corpora/hun-multimodal.json b/corpora/multimodal-corpora/hun-multimodal.json index 60bc625..970893c 100644 --- a/corpora/multimodal-corpora/hun-multimodal.json +++ b/corpora/multimodal-corpora/hun-multimodal.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-001A-E17C-1", "Family": "Manually annotated corpora", "Description": "This corpus contains video and audio recordings of conversations divided into two major parts: a simulated job interview and a guided dialogue about personal topics. The participants are university students (54 females, 67 males) mostly involving the same interviewer in both scenarios.\nThe corpus is available for online browsing through the MTA RIL Language Archive Serve (HUN-CLARIN distribution) and for download from the Language Archive (CLARIAH-NL).", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "open and restricted", "Size": ["50 hours"], "Annotation": ["non-verbal and verbal elements of communication"], diff --git a/corpora/multimodal-corpora/ifa-dialog.json b/corpora/multimodal-corpora/ifa-dialog.json index 480c8bd..e1754c1 100644 --- a/corpora/multimodal-corpora/ifa-dialog.json +++ b/corpora/multimodal-corpora/ifa-dialog.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-735", "Family": "Manually annotated corpora", "Description": "This corpus contains annotated video recordings of friendly Face-to-Face dialogues. It is modelled on the Face-to-Face dialogues in the Spoken Dutch Corpus (CGN). 
The procedures and design of the corpus were adapted to make this corpus useful for other researchers of Dutch speech. For this corpus 20 dialogue conversations of 15 minutes were recorded and annotated, in total 5 hours of speech. To stay close to the Face-to-Face dialogues in the CGN, pairs of well-acquainted participants were selected, either good friends, relatives, or long-time colleagues. The participants were allowed to talk about any topic they wanted.\nThe corpus is available for download from a dedicated webpage (hosted by CLARIAH-NL).", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "GNU general public license", "Size": ["5 hours"], "Annotation": ["functional annotation of dialogue utterances", "annotated gaze direction"], diff --git a/corpora/multimodal-corpora/interactions-dialogales.json b/corpora/multimodal-corpora/interactions-dialogales.json index 16b5956..0e88035 100644 --- a/corpora/multimodal-corpora/interactions-dialogales.json +++ b/corpora/multimodal-corpora/interactions-dialogales.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/sldr000027/v2", "Family": "Manually annotated corpora", "Description": "A demo version of this corpus is available for download (videos and transcriptions) from the ORTOLANG repository.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "", "Size": ["8 hours"], "Annotation": ["prosody", "interpausal units", "gestures", "syntax"], diff --git a/corpora/multimodal-corpora/mpi-esf.json b/corpora/multimodal-corpora/mpi-esf.json index a23d180..0207ce8 100644 --- a/corpora/multimodal-corpora/mpi-esf.json +++ b/corpora/multimodal-corpora/mpi-esf.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-426", "Family": "Manually annotated corpora", "Description": "This corpus was built under the ESF Foreign Language Speakers project. 
It contains a lot of annotated audio recordings containing multimodal interaction.", - "Languages": ["nld", "eng", "fra", "deu", "swe"], + "Language": ["nld", "eng", "fra", "deu", "swe"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/multimodal-corpora/multimodal-text-comprehension.json b/corpora/multimodal-corpora/multimodal-text-comprehension.json index 607c31a..d7d3655 100644 --- a/corpora/multimodal-corpora/multimodal-text-comprehension.json +++ b/corpora/multimodal-corpora/multimodal-text-comprehension.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2546-8", "Family": "Manually annotated corpora", "Description": "This corpus contains reading comprehension exercises in a high school setting involving 2 high school students and their teacher. The goal of the sessions is to represent how the interaction between a teacher and more than one students is performed: what is the structure of the conversation, how turn-taking is coordinated, what are the multimodal feedback and attention signals the speakers employ.\nThe corpus is available for download from CLARIN:EL.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC BY-NC-SA", "Size": [], "Annotation": ["orthographic transcription", "gaze/head/eye/lip movements"], diff --git a/corpora/multimodal-corpora/natural-media-mc.json b/corpora/multimodal-corpora/natural-media-mc.json index 8a5cd05..f37f515 100644 --- a/corpora/multimodal-corpora/natural-media-mc.json +++ b/corpora/multimodal-corpora/natural-media-mc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C34C-8", "Family": "Manually annotated corpora", "Description": "This corpus contains data from 18 participants, whose task was to describe nine objects each to an experimenter, without using everyday vocabulary about forms, sizes or objects. 
The participants were recorded on audio and several video cameras, and their hand movements were recorded using an optical VICON motion capture system.\nThe corpus is available for download from the BAS CLARIN-D repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["3 hours"], "Annotation": ["gesture types", "meta-information about encoding (e.g., difficult to encode)"], diff --git a/corpora/multimodal-corpora/polimodal.json b/corpora/multimodal-corpora/polimodal.json index 4fbbe87..60f6f19 100644 --- a/corpora/multimodal-corpora/polimodal.json +++ b/corpora/multimodal-corpora/polimodal.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11752/OPEN-534", "Family": "Manually annotated corpora", "Description": "This corpus includes the transcripts of 56 TV face-to-face interviews (14 hours total) taken from several broadcasts of the Italian political talk show Mezz'ora, from 24 September 2017 to 14 January 2018, aired on the Rai 3 channel.\nThe audio signal has been transcribed using a semi-supervised speech-to-text methodology (Google API+ manual correction). 
Annotation has been done using XML as markup language and following the TEI standard for Speech Transcripts in terms of utterances.\nThe corpus is available for download from the ILC4CLARIN repository.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC BY-NC-SA 4.0", "Size": ["100,870 tokens"], "Annotation": ["utterance phenomena", "gesture annotations (facial, hand, body posture)"], diff --git a/corpora/multimodal-corpora/tourist-brochures-helsinki.json b/corpora/multimodal-corpora/tourist-brochures-helsinki.json index 90123fd..65fe7a9 100644 --- a/corpora/multimodal-corpora/tourist-brochures-helsinki.json +++ b/corpora/multimodal-corpora/tourist-brochures-helsinki.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015030301", "Family": "Manually annotated corpora", "Description": "This corpus contains tourist brochures produced by the city of Helsinki, Finland, is fully annotated using XML schema provided for the Genre and Multimodality (GeM) model (Bateman 2008).\nThe corpus is available for download from FIN-CLARIN.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA", "Size": ["58 double pages"], "Annotation": ["content", "layout", "graphic", "typographic appearance", "rhetorical structure"], diff --git a/corpora/multimodal-corpora/tv-news.json b/corpora/multimodal-corpora/tv-news.json index 060b4cc..d31bda6 100644 --- a/corpora/multimodal-corpora/tv-news.json +++ b/corpora/multimodal-corpora/tv-news.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00093L", "Family": "Manually annotated corpora", "Description": "This corpus contains video and audio recordings and their transcriptions.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY-SA", "Size": ["30 hours"], "Annotation": [], diff --git a/corpora/multimodal-corpora/unisa-isizulu.json b/corpora/multimodal-corpora/unisa-isizulu.json index 
4b7758d..d385282 100644 --- a/corpora/multimodal-corpora/unisa-isizulu.json +++ b/corpora/multimodal-corpora/unisa-isizulu.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/20.500.12185/230", "Family": "Manually annotated corpora", "Description": "The corpus is unavailable.", - "Languages": ["zul"], + "Language": ["zul"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/multimodal-corpora/video-linked-thai-swe.json b/corpora/multimodal-corpora/video-linked-thai-swe.json index 607a799..a950b57 100644 --- a/corpora/multimodal-corpora/video-linked-thai-swe.json +++ b/corpora/multimodal-corpora/video-linked-thai-swe.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10050/00-0000-0000-0000-0002-7@view", "Family": "Manually annotated corpora", "Description": "This corpus consists of 60 transcripts from interactions in everyday contexts between 6 children and their caregivers (10 transcripts per child), recorded longitudinally, for the period when the children are 18 to 27 months of age. All six children are growing up in middle class environments, in Sweden and Thailand (Bangkok area) respectively. 
The videos of the corpus are linked to the transcripts, on an utterance-by-utterance basis using the software CLAN (MacWhinney 2020).\nThe corpus is available for online browsing (CLARIN K-Centre Lund University Humanities Lab).", - "Languages": ["swe", "tha"], + "Language": ["swe", "tha"], "Licence": "", "Size": [], "Annotation": ["video-transcription alignment", "word segmentation", "phonetic transcription"], diff --git a/corpora/newspaper-corpora/8-sidor.json b/corpora/newspaper-corpora/8-sidor.json index b6b10ad..b649a25 100644 --- a/corpora/newspaper-corpora/8-sidor.json +++ b/corpora/newspaper-corpora/8-sidor.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/attasidor", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the Swedish newspaper 8 sidor from 2003 to 2012.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["678,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "parsed", "compounds"], diff --git a/corpora/newspaper-corpora/accurat.json b/corpora/newspaper-corpora/accurat.json index d338800..7811eb6 100644 --- a/corpora/newspaper-corpora/accurat.json +++ b/corpora/newspaper-corpora/accurat.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23BF-2", "Family": "Newspaper corpora", "Description": "This comparable corpus contains sentence pairs extracted from news comparable corpora.\nThe corpus is available for download from the CLARIN:EL repository.\nLanguage pairs: English-Croatian, English- Greek, English-Estonian, English-Latvian, English-Lithuanian, English-Romanian, English-Slovenian, Greek-Romanian, Latvian-Lithuanian, Romanian-German, Romanian-Lithuanian and German-English", - "Languages": ["eng","hrv","ell","est","lav","lit","ron","slv","deu"], + "Language": ["eng","hrv","ell","est","lav","lit","ron","slv","deu"], "Licence": "CC BY", 
"Size": ["23,820 sentences"], "Annotation": [], diff --git a/corpora/newspaper-corpora/chronopress.json b/corpora/newspaper-corpora/chronopress.json index 08aa909..641dd17 100644 --- a/corpora/newspaper-corpora/chronopress.json +++ b/corpora/newspaper-corpora/chronopress.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/260", "Family": "Newspaper corpora", "Description": "This corpus contains articles from various Polish newspapers from 1945 and 1962.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "CLARIN PUB", "Size": ["20 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "named entities"], diff --git a/corpora/newspaper-corpora/contemp-serbian.json b/corpora/newspaper-corpora/contemp-serbian.json index d648bc8..e3630f0 100644 --- a/corpora/newspaper-corpora/contemp-serbian.json +++ b/corpora/newspaper-corpora/contemp-serbian.json @@ -3,7 +3,7 @@ "URL": "http://metashare.elda.org/repository/browse/corpus-of-contemporary-serbian-newpapers-and-magazines/210858448b2a11e2b539001517144592b76e35aee8794c51bd3016f1e57e765e/", "Family": "Newspaper corpora", "Description": "This corpus contains articles from over a 100 Serbian newspapers from 2004 to 2012.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC-BY", "Size": ["916 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/newspaper-corpora/corp-news-texts.json b/corpora/newspaper-corpora/corp-news-texts.json index 1e5450b..999ec86 100644 --- a/corpora/newspaper-corpora/corp-news-texts.json +++ b/corpora/newspaper-corpora/corp-news-texts.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730175", "Family": "Newspaper corpora", "Description": "This corpus contains articles from a variety of Swedish, English and Finnish newspapers.\nThe corpus can be found in the FIN-CLARIN repository although its availability and licence are still under negotiation.", - "Languages": 
["swe","fin","eng"], + "Language": ["swe","fin","eng"], "Licence": "under negotiation", "Size": ["435 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/newspaper-corpora/cripco.json b/corpora/newspaper-corpora/cripco.json index fa9760f..1d25235 100644 --- a/corpora/newspaper-corpora/cripco.json +++ b/corpora/newspaper-corpora/cripco.json @@ -3,7 +3,7 @@ "URL": "https://hlt-nlp.fbk.eu/technologies/cripco", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the Italian newspaper L’Adige from 1999 to 2006.\nThe corpus is available for download through META-SHARE.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "proprietary", "Size": ["43,000 documents"], "Annotation": ["coreference resolution"], diff --git a/corpora/newspaper-corpora/dagny.json b/corpora/newspaper-corpora/dagny.json index 4f38ecf..1942da6 100644 --- a/corpora/newspaper-corpora/dagny.json +++ b/corpora/newspaper-corpora/dagny.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/ub-kvt-dagny#tabs=information", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Dagny from 1886 to 1913.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["8.1 million tokens"], "Annotation": ["tokenized", "PoS-tagged", "parsed"], diff --git a/corpora/newspaper-corpora/deu-newscrawl.json b/corpora/newspaper-corpora/deu-newscrawl.json index 3e6a8b3..c2c1b58 100644 --- a/corpora/newspaper-corpora/deu-newscrawl.json +++ b/corpora/newspaper-corpora/deu-newscrawl.json @@ -3,7 +3,7 @@ "URL": "http://corpora.uni-leipzig.de/en?corpusId=deu_newscrawl_2011", "Family": "Newspaper corpora", "Description": "This corpus contains articles from various German newspapers from 2011.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["deu"], + "Language": ["deu"], 
"Licence": "", "Size": ["426 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/newspaper-corpora/dn-1987.json b/corpora/newspaper-corpora/dn-1987.json index 2116b3c..700f54e 100644 --- a/corpora/newspaper-corpora/dn-1987.json +++ b/corpora/newspaper-corpora/dn-1987.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/dn1987", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the Swedish newspaper Dagens Nyheter from 1987.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["5 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "parsed", "compounds"], diff --git a/corpora/newspaper-corpora/est-republicain.json b/corpora/newspaper-corpora/est-republicain.json index b18d899..a32cb74 100644 --- a/corpora/newspaper-corpora/est-republicain.json +++ b/corpora/newspaper-corpora/est-republicain.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/est_republicain", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the French newspaper l'Est Républicain from 1999 to 2003.\nThe corpus is available for download from Ortolang.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": [], "Annotation": ["MSD-tagged", "lemmatised"], diff --git a/corpora/newspaper-corpora/europeana-at.json b/corpora/newspaper-corpora/europeana-at.json index a59898c..f0fb1f6 100644 --- a/corpora/newspaper-corpora/europeana-at.json +++ b/corpora/newspaper-corpora/europeana-at.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/VC-1091", "Family": "Newspaper corpora", "Description": "This corpus contains 147,515 issues of 77 newspapers published in Austria between 1683 and 1930.", - "Languages": ["deu","ell","hrv"], + "Language": ["deu","ell","hrv"], "Licence": "Public", "Size": ["2,351,079,191 words"], "Annotation": [], diff --git 
a/corpora/newspaper-corpora/europeana-de.json b/corpora/newspaper-corpora/europeana-de.json index 109c3c7..8917cf0 100644 --- a/corpora/newspaper-corpora/europeana-de.json +++ b/corpora/newspaper-corpora/europeana-de.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/VC-1098", "Family": "Newspaper corpora", "Description": "This corpus contains 126,564 issues of 11 newspapers published in Germany (chiefly Berlin and Hamburg) between 1792 and 1945.", - "Languages": ["deu", "eng"], + "Language": ["deu", "eng"], "Licence": "Public", "Size": ["5,593,768,847 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-ee.json b/corpora/newspaper-corpora/europeana-ee.json index 544dabe..3bd2a9a 100644 --- a/corpora/newspaper-corpora/europeana-ee.json +++ b/corpora/newspaper-corpora/europeana-ee.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.34733/vc-1086", "Family": "Newspaper corpora", "Description": "This corpus contains 92,558 issues of 40 newspapers published in Estonia between 1852 and 1946.", - "Languages": ["est", "rus", "deu"], + "Language": ["est", "rus", "deu"], "Licence": "Public", "Size": ["351,656,185 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-fi.json b/corpora/newspaper-corpora/europeana-fi.json index 77ac91c..a223396 100644 --- a/corpora/newspaper-corpora/europeana-fi.json +++ b/corpora/newspaper-corpora/europeana-fi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/VC-1095", "Family": "Newspaper corpora", "Description": "This corpus contains 24,164 issues of 10 newspapers published in Finland between 1900 and 1910.", - "Languages": ["fin","swe"], + "Language": ["fin","swe"], "Licence": "Public", "Size": ["393,776,815 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-lu.json b/corpora/newspaper-corpora/europeana-lu.json index f749e34..dbbe9ef 100644 --- a/corpora/newspaper-corpora/europeana-lu.json +++ b/corpora/newspaper-corpora/europeana-lu.json @@ -3,7 +3,7 @@ 
"URL": "http://hdl.handle.net/11372/VC-1096", "Family": "Newspaper corpora", "Description": "This corpus contains 1225 issues of 2 newspapers published in Luxembourg between 1704 and 1794.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "Public", "Size": ["29,266,765 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-lv.json b/corpora/newspaper-corpora/europeana-lv.json index e5a56a4..6ac2659 100644 --- a/corpora/newspaper-corpora/europeana-lv.json +++ b/corpora/newspaper-corpora/europeana-lv.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/VC-1092", "Family": "Newspaper corpora", "Description": "This corpus contains 67,870 issues of 77 newspapers published in Latvia between 1868 and 1955.", - "Languages": ["lav","rus","deu","pol","est"], + "Language": ["lav","rus","deu","pol","est"], "Licence": "Public", "Size": ["964,243,746 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-ner.json b/corpora/newspaper-corpora/europeana-ner.json index 040811d..d74cd43 100644 --- a/corpora/newspaper-corpora/europeana-ner.json +++ b/corpora/newspaper-corpora/europeana-ner.json @@ -3,7 +3,7 @@ "URL": "https://github.com/EuropeanaNewspapers/ner-corpora", "Family": "Newspaper corpora", "Description": "This corpus contains articles from Europeana newspapers for the following time periods: 1811-1856 for the Dutch subcorpus, 1871-1916 for the French subcorpus, and 1926 for the German subcorpus.\nThe corpus is available for download from the KB Lab.", - "Languages": ["nld","fra","deu"], + "Language": ["nld","fra","deu"], "Licence": "CC-ZERO", "Size": ["500, 000 tokens (182,483 Dutch; 207,000 French;  96,735 German)"], "Annotation": ["named entities"], diff --git a/corpora/newspaper-corpora/europeana-nl.json b/corpora/newspaper-corpora/europeana-nl.json index b489465..9fd8c59 100644 --- a/corpora/newspaper-corpora/europeana-nl.json +++ b/corpora/newspaper-corpora/europeana-nl.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11372/VC-1097", "Family": "Newspaper corpora", "Description": "This corpus contains 4266 issues of 164 newspapers r published in the Netherlands between 1618 and 1940.", - "Languages": ["nld","fra","eng","spa","heb","fry","deu","pan","ara"], + "Language": ["nld","fra","eng","spa","heb","fry","deu","pan","ara"], "Licence": "Public", "Size": ["2,869,483,985 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-pl.json b/corpora/newspaper-corpora/europeana-pl.json index 58df20f..66a605f 100644 --- a/corpora/newspaper-corpora/europeana-pl.json +++ b/corpora/newspaper-corpora/europeana-pl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/VC-1094", "Family": "Newspaper corpora", "Description": "This corpus contains 15,130 issues of 10 newspapers published in Poland between 1866 and 1939.", - "Languages": ["pol","deu","ukr","rus"], + "Language": ["pol","deu","ukr","rus"], "Licence": "Public", "Size": ["181,102,489 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/europeana-rs.json b/corpora/newspaper-corpora/europeana-rs.json index b325197..c90ac7e 100644 --- a/corpora/newspaper-corpora/europeana-rs.json +++ b/corpora/newspaper-corpora/europeana-rs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/VC-1093", "Family": "Newspaper corpora", "Description": "This corpus contains 22,087 issues of 44 newspapers published in Serbia between 1830 and 1944.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "Public", "Size": ["338,080,416 words"], "Annotation": [], diff --git a/corpora/newspaper-corpora/ger-greek-press.json b/corpora/newspaper-corpora/ger-greek-press.json index f9ac78d..cae4375 100644 --- a/corpora/newspaper-corpora/ger-greek-press.json +++ b/corpora/newspaper-corpora/ger-greek-press.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0007-E18D-C", "Family": "Newspaper corpora", "Description": "The corpus consists of newspaper articles from three Greek newspapers (Ta 
Nea, Risospastis, and To Vima) dealing with Germany from the Greek perspective.\nBibliographical information is encoded in the path to the file: It is composed of title of the newspaper, year, month, day, and rubric. The lemmata are stored in a separate tree of the same structure, the text files in that tree contain one lemma per line.\nThe corpus is available for download from CLARIN-D (Saarland University B-centre).", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "", "Size": ["3.5 million tokens", "7650 texts"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/newspaper-corpora/gp-1994-2001-2011.json b/corpora/newspaper-corpora/gp-1994-2001-2011.json index 5153742..6a8ebcc 100644 --- a/corpora/newspaper-corpora/gp-1994-2001-2011.json +++ b/corpora/newspaper-corpora/gp-1994-2001-2011.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resources", "Family": "Newspaper corpora", "Description": "This group of corpora contain articles from the Swedish newspaper Göteborgsposten from 1994 and from 2001 to  2011.\nThe corpora are available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["271 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "parsed", "compounds"], diff --git a/corpora/newspaper-corpora/hertha.json b/corpora/newspaper-corpora/hertha.json index 2268afe..088aa94 100644 --- a/corpora/newspaper-corpora/hertha.json +++ b/corpora/newspaper-corpora/hertha.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/ub-kvt-hertha#tabs=information", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Hertha from 1914 to 2015.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["3.8 million tokens"], "Annotation": ["tokenized", 
"PoS-tagged", "parsed"], diff --git a/corpora/newspaper-corpora/idun.json b/corpora/newspaper-corpora/idun.json index ecadce4..b35de5b 100644 --- a/corpora/newspaper-corpora/idun.json +++ b/corpora/newspaper-corpora/idun.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/ub-kvt-idun", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Idun from 1887 to 1917.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "", "Size": ["2 million tokens"], "Annotation": ["tokenized", "PoS-tagged", "parsed"], diff --git a/corpora/newspaper-corpora/karelian-news.json b/corpora/newspaper-corpora/karelian-news.json index 0c05547..fc9054f 100644 --- a/corpora/newspaper-corpora/karelian-news.json +++ b/corpora/newspaper-corpora/karelian-news.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014092601", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the Finnish newspaper Karjalan Sanomat from 2012 to 2014.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA", "Size": ["500,000 tokens"], "Annotation": [], diff --git a/corpora/newspaper-corpora/kvinnornas.json b/corpora/newspaper-corpora/kvinnornas.json index a8444a7..d84d18d 100644 --- a/corpora/newspaper-corpora/kvinnornas.json +++ b/corpora/newspaper-corpora/kvinnornas.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/ub-kvt-kvt", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Kvinnornas Tidning for the period between 1921 and 1925.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["5.5 million tokens"], "Annotation": ["tokenized", "PoS-tagged", 
"parsed"], diff --git a/corpora/newspaper-corpora/larepubblica.json b/corpora/newspaper-corpora/larepubblica.json index 4410946..b50be5f 100644 --- a/corpora/newspaper-corpora/larepubblica.json +++ b/corpora/newspaper-corpora/larepubblica.json @@ -3,7 +3,7 @@ "URL": "https://docs.sslmit.unibo.it/doku.php?id=corpora:repubblica", "Family": "Newspaper corpora", "Description": "The corpus contains articles from the Italian newspaper La Repubblica.\nThe corpus is available through the noSketch Engine concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC-BY", "Size": ["380 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/newspaper-corpora/lib-inf-centre.json b/corpora/newspaper-corpora/lib-inf-centre.json index fc00c5c..6c730f9 100644 --- a/corpora/newspaper-corpora/lib-inf-centre.json +++ b/corpora/newspaper-corpora/lib-inf-centre.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AUTH-0000-0000-2C4C-B", "Family": "Newspaper corpora", "Description": "The corpus contains newspaper articles.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC-SA", "Size": ["20 units"], "Annotation": [], diff --git a/corpora/newspaper-corpora/makedonia.json b/corpora/newspaper-corpora/makedonia.json index 84b45ac..00ae840 100644 --- a/corpora/newspaper-corpora/makedonia.json +++ b/corpora/newspaper-corpora/makedonia.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-24FB-D", "Family": "Newspaper corpora", "Description": "This corpus contains newspaper articles in various topics (politics, economy, sports).\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC-SA", "Size": ["3 million tokens"], "Annotation": [], diff --git a/corpora/newspaper-corpora/mannheim-hist.json b/corpora/newspaper-corpora/mannheim-hist.json index 
ce38940..2a742ea 100644 --- a/corpora/newspaper-corpora/mannheim-hist.json +++ b/corpora/newspaper-corpora/mannheim-hist.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-01B8-AE41-41A4-DC01-5", "Family": "Newspaper corpora", "Description": "This corpus contains articles from 21 German newspapers from the 18th and 19th century.\nThe corpus is available for download from the CLARIN-D repository.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["4.1 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/newspaper-corpora/morgonbris.json b/corpora/newspaper-corpora/morgonbris.json index be320c0..1f1734f 100644 --- a/corpora/newspaper-corpora/morgonbris.json +++ b/corpora/newspaper-corpora/morgonbris.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/ub-kvt-morgonbris", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Morgonbris from 1904 to 1924.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["3.5 million tokens"], "Annotation": ["tokenized", "PoS-tagged", "parsed"], diff --git a/corpora/newspaper-corpora/news-nat-fin.json b/corpora/newspaper-corpora/news-nat-fin.json index e478984..b4ac8fc 100644 --- a/corpora/newspaper-corpora/news-nat-fin.json +++ b/corpora/newspaper-corpora/news-nat-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405276", "Family": "Newspaper corpora", "Description": "This corpus contains articles from a large variety of Finnish and Swedish newspapers (over 100 for each language) from 1770 to 2011.\nThe corpus can be accessed through the concordancer Korp.", - "Languages": ["swe","fin"], + "Language": ["swe","fin"], "Licence": "CC-BY", "Size": ["8.8 billion tokens"], "Annotation": ["tokenised", "MSD-tagged", "syntactically parsed"], diff --git 
a/corpora/newspaper-corpora/news-ocr-fin.json b/corpora/newspaper-corpora/news-ocr-fin.json index 19636f2..235b4c1 100644 --- a/corpora/newspaper-corpora/news-ocr-fin.json +++ b/corpora/newspaper-corpora/news-ocr-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015051201", "Family": "Newspaper corpora", "Description": "This corpus contains articles from a large variety of Finnish and Swedish newspapers (over 100 for each language) from 1771 to 1874.\nThe corpus can be downloaded from FIN-CLARIN.", - "Languages": ["swe","fin"], + "Language": ["swe","fin"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/newspaper-corpora/nor-news.json b/corpora/newspaper-corpora/nor-news.json index 338faca..0dac58a 100644 --- a/corpora/newspaper-corpora/nor-news.json +++ b/corpora/newspaper-corpora/nor-news.json @@ -3,7 +3,7 @@ "URL": "http://avis.uib.no/", "Family": "Newspaper corpora", "Description": "This corpus contains articles from 24 Norwegian newspapers from 1998 onwards.\nThe corpus is available through the concordancer Corpuscle.", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CC-BY", "Size": ["700 million tokens"], "Annotation": ["multitagged"], diff --git a/corpora/newspaper-corpora/parallel-global.json b/corpora/newspaper-corpora/parallel-global.json index 8d629be..93c79ec 100644 --- a/corpora/newspaper-corpora/parallel-global.json +++ b/corpora/newspaper-corpora/parallel-global.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25DD-E", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the https://globalvoices.org/ website, where volunteers publish and translate news stories in more than 40 languages.", - "Languages": ["40 languages"], + "Language": ["40 languages"], "Licence": "CC BY", "Size": ["8 million units"], "Annotation": [], diff --git a/corpora/newspaper-corpora/rostratt.json b/corpora/newspaper-corpora/rostratt.json index 103ea2e..d8533ab 100644 --- 
a/corpora/newspaper-corpora/rostratt.json +++ b/corpora/newspaper-corpora/rostratt.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/ub-kvt-rostratt", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Rösträtt för Kvinnor from 1912 to 1919.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["2.2 million tokens"], "Annotation": ["tokenized", "PoS-tagged", "parsed"], diff --git a/corpora/newspaper-corpora/setimes.json b/corpora/newspaper-corpora/setimes.json index f622a83..d20c6cd 100644 --- a/corpora/newspaper-corpora/setimes.json +++ b/corpora/newspaper-corpora/setimes.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2591-2", "Family": "Newspaper corpora", "Description": "This parallel corpus contains online news articles extracted from the SETimes webpage.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ron","tur","srp","eng","bul", "mkd", "hrv","ell","sqi"], + "Language": ["ron","tur","srp","eng","bul", "mkd", "hrv","ell","sqi"], "Licence": "Open For Reuse With Restrictions", "Size": ["341.83 million tokens"], "Annotation": ["sentence-aligned"], diff --git a/corpora/newspaper-corpora/smittskydd.json b/corpora/newspaper-corpora/smittskydd.json index 3e9509a..fcdc7ba 100644 --- a/corpora/newspaper-corpora/smittskydd.json +++ b/corpora/newspaper-corpora/smittskydd.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/smittskydd", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the newspaper Smittskyd from 2002 to 2010.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["691,000 tokens"], "Annotation": ["tokenized", "PoS-tagged", 
"parsed"], diff --git a/corpora/newspaper-corpora/syn2006pub.json b/corpora/newspaper-corpora/syn2006pub.json index 1c33f52..b478aad 100644 --- a/corpora/newspaper-corpora/syn2006pub.json +++ b/corpora/newspaper-corpora/syn2006pub.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-1358-3", "Family": "Newspaper corpora", "Description": "This corpus contains articles from 11 Czech newspapers from 1989 to 2004.\nThe corpus is available for download from the Czech repository LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY", "Size": ["300 million tokens"], "Annotation": ["tokenised", "lemmatised", "PoS-tagged"], diff --git a/corpora/newspaper-corpora/syn2013pub.json b/corpora/newspaper-corpora/syn2013pub.json index fc50f72..b19c8a9 100644 --- a/corpora/newspaper-corpora/syn2013pub.json +++ b/corpora/newspaper-corpora/syn2013pub.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-3B09-4", "Family": "Newspaper corpora", "Description": "This corpus contains articles from Czech newspapers from 2005 to 2009.\nThe corpus is available for download from the Czech repository LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Czech National Corpus (Shuffled Corpus Data)", "Size": ["935 million tokens"], "Annotation": ["tokenised", "lemmatised", "MSD-tagged"], diff --git a/corpora/newspaper-corpora/ta-nea.json b/corpora/newspaper-corpora/ta-nea.json index 425326b..f174486 100644 --- a/corpora/newspaper-corpora/ta-nea.json +++ b/corpora/newspaper-corpora/ta-nea.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/KEG-0000-0000-24F9-F", "Family": "Newspaper corpora", "Description": "This corpus contains newspaper articles in various topics (politics, economy, sports).\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC-SA", "Size": ["2 million words"], "Annotation": [], diff --git 
a/corpora/newspaper-corpora/tiger.json b/corpora/newspaper-corpora/tiger.json index 07345b9..f0d6223 100644 --- a/corpora/newspaper-corpora/tiger.json +++ b/corpora/newspaper-corpora/tiger.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11022/1007-0000-0000-8E50-6", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the German newspaper Frankfurter Rundschau.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN PUB", "Size": ["900,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "parsed", "lemmatised"], diff --git a/corpora/newspaper-corpora/timed-jsi-web.json b/corpora/newspaper-corpora/timed-jsi-web.json index ff782c8..151d911 100644 --- a/corpora/newspaper-corpora/timed-jsi-web.json +++ b/corpora/newspaper-corpora/timed-jsi-web.json @@ -3,7 +3,7 @@ "URL": "https://www.sketchengine.co.uk/jozef-stefan-institute-newsfeed-corpus/", "Family": "Newspaper corpora", "Description": "This corpus contains articles from newsfeed from 2014 to 2017.\nThe corpus is available through noSketchEingine.", - "Languages": ["18 languages"], + "Language": ["18 languages"], "Licence": "", "Size": ["35 billion tokens"], "Annotation": ["tokenised", "PoS-tagged"], diff --git a/corpora/newspaper-corpora/tuebingen-tree.json b/corpora/newspaper-corpora/tuebingen-tree.json index ea50eb4..8dcc55f 100644 --- a/corpora/newspaper-corpora/tuebingen-tree.json +++ b/corpora/newspaper-corpora/tuebingen-tree.json @@ -3,7 +3,7 @@ "URL": "https://uni-tuebingen.de/en/faculties/faculty-of-humanities/departments/modern-languages/department-of-linguistics/chairs/general-and-computational-linguistics/ressources/corpora/tueba-dz/", "Family": "Newspaper corpora", "Description": "This corpus contains articles from the German newspaper Die Tageszeitung.\nThe corpus is available through a dedicated concordancer with an institutional account.", - "Languages": ["deu"], + "Language": ["deu"], 
"Licence": "CLARIN RES", "Size": ["1.8 million tokens"], "Annotation": ["tokenised","MSD tagged","lemmatised","syntactic constituency","named-entities"], diff --git a/corpora/newspaper-corpora/webbnyheter.json b/corpora/newspaper-corpora/webbnyheter.json index a24810f..baa0686 100644 --- a/corpora/newspaper-corpora/webbnyheter.json +++ b/corpora/newspaper-corpora/webbnyheter.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resource/webbnyheter2001", "Family": "Newspaper corpora", "Description": "This corpus contains articles from various Swedish online newspapers from 2001 to 2013.\nThe corpus is available for download from Språkbanken and can be accessed through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["272 million tokens"], "Annotation": ["tokenized", "PoS-tagged", "parsed"], diff --git a/corpora/newspaper-corpora/witac.json b/corpora/newspaper-corpora/witac.json index 114b32a..e3a7fd7 100644 --- a/corpora/newspaper-corpora/witac.json +++ b/corpora/newspaper-corpora/witac.json @@ -3,7 +3,7 @@ "URL": "https://hlt-nlp.fbk.eu/technologies/witac-newsreader-wikinews-italian-corpus", "Family": "Newspaper corpora", "Description": "This corpus contains Italian translations of 120 English Wikinews articles.\nThe corpus is available for download from a dedicated website.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CC-BY", "Size": ["40,231 tokens"], "Annotation": ["entities, events, event factuality, temporal information, semantic roles, and intra-document and cross-document event and entity coreference"], diff --git a/corpora/newspaper-corpora/zurich.json b/corpora/newspaper-corpora/zurich.json index fae21ab..0ae598b 100644 --- a/corpora/newspaper-corpora/zurich.json +++ b/corpora/newspaper-corpora/zurich.json @@ -3,7 +3,7 @@ "URL": "https://varieng.helsinki.fi/CoRD/corpora/ZEN/", "Family": "Newspaper corpora", "Description": "This corpus contains articles from various English 
newspapers (mainly newspapers from London) from the 17th and 18th century. ", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "public", "Size": ["1.6 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/oral-history-corpora/austrian-mediathek.json b/corpora/oral-history-corpora/austrian-mediathek.json index b12f6fb..1c7c46c 100644 --- a/corpora/oral-history-corpora/austrian-mediathek.json +++ b/corpora/oral-history-corpora/austrian-mediathek.json @@ -3,7 +3,7 @@ "URL": "https://www.mediathek.at/", "Family": "Oral history corpora", "Description": "The corpus contains 8 interviews involving the ex-deportee Aloisia Hofinger, Anna Redlinger, Antonia Bruha, Erika Gugig, Eva Gutfreund, Friederike Furch, Irma Trksak (split in two parts). The interviews are open access but for most interviews registraton and creating a ticket are required. Part of the interviews is directly accessible, but for some you have to register and request access. Below you can find the metadata of the interviews. ", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "Open access. Access requires registration and a ticket. Access is restricted to streaming, and to seven days only. Download is probited.", "Size": ["8 interviews related to Ravensbrück"], "Annotation": ["None"], diff --git a/corpora/oral-history-corpora/bruzzone.json b/corpora/oral-history-corpora/bruzzone.json index 108e137..1c90242 100644 --- a/corpora/oral-history-corpora/bruzzone.json +++ b/corpora/oral-history-corpora/bruzzone.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/e24ae5a4-be49-4c31-a7b8-b0c9ed84029e", "Family": "Oral History Corpora", "Description": "The corpus contains four interviews involving five ex-deportee in the female-only Nazi concentration camp of Ravensbrück (Lidia Rolfi, Bianca Paganini, Livia Borsi, Lina and Nella Baroncini). By clicking on the download button you are brought to a page with a description of the four interviews. 
By clicking on the name of the interviewee you are brought to a page with a brief description and extensive metadata of the single interview. On the right you can see the corresponding audio- and text files marked in orange. This means they are restricted and you need to contact the owner, prof. Silvia Calamai (silvia.calamai@unisi.it) to get access to this data. The files marked in green are short clips that you can download directly.\nThe interviews were originally recorded on a series of analog supports by the Italian oral historian Anna Maria Bruzzone (Mondoví, 1925 – Turin, 2015) in preparation for the book Le Donne di Ravensbrück (\"The Women of Ravensbrück\", Einaudi, first edition 1978). The corpus is composed of 27 distinct audio files, deriving from the digitisation of an equal number of sides from the original 14 audiocassettes. The original supports are storied within the Archivio Storico of the Ospedale Neuropsichiatrico di Arezzo (Siena University), Arezzo (AR), Italy. Links to richly described metadata files are available in the repository of the Max Planck Institute for Psycholinguistics: ", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "Restricted access (CLARIN RES+PLAN+BY+NC+INF+PRIV+NORED+ND+DEP+*)", "Size": ["4 interviews related to Ravensbrück"], "Annotation": ["Orthographic"], diff --git a/corpora/oral-history-corpora/fortunoff-archive.json b/corpora/oral-history-corpora/fortunoff-archive.json index efb70cc..cb874df 100644 --- a/corpora/oral-history-corpora/fortunoff-archive.json +++ b/corpora/oral-history-corpora/fortunoff-archive.json @@ -3,7 +3,7 @@ "URL": "https://fortunoff.aviaryplatform.com/", "Family": "Oral history corpora", "Description": "A single interview with the ex-deportee Clementine U. You need to request an account to get access to The Fortunoff collection. To get to the single interview use the search function in the online database. 
Below you can find the metadata) of the interview.", - "Languages": ["English (mainly)", "fra", "heb", "slk", "deu", "others"], + "Language": ["English (mainly)", "fra", "heb", "slk", "deu", "others"], "Licence": "Restricted access (account required)", "Size": ["1 interview related to Ravensbrück"], "Annotation": ["None"], diff --git a/corpora/oral-history-corpora/frauen-von-ravensbrueck.json b/corpora/oral-history-corpora/frauen-von-ravensbrueck.json index f30a91a..58e3a24 100644 --- a/corpora/oral-history-corpora/frauen-von-ravensbrueck.json +++ b/corpora/oral-history-corpora/frauen-von-ravensbrueck.json @@ -3,7 +3,7 @@ "URL": "https://videoarchiv-ravensbrueck.de/de/einfuehrung", "Family": "Oral history corpora", "Description": "The corpus contains 7 interviews with Aenne Meier, Frieda Hummler, Gertrud Frühschütz, Gertrud Müller, Gretel Pressl, Maria Zeh, Victoria Ahrend Kupersztajn. You need to request an account to get access to Loretta Walz’ collection of video interviews. To get to the single interviews use the search function in the online database. Below you can find the metadata of the interviews. ", - "Languages": ["German (mainly)", "fra", "eng", "others"], + "Language": ["German (mainly)", "fra", "eng", "others"], "Licence": "Restricted access (account required)", "Size": ["7 interviews related to Ravensbrück"], "Annotation": ["None"], diff --git a/corpora/oral-history-corpora/getuigenverhalen.json b/corpora/oral-history-corpora/getuigenverhalen.json index bbaedf9..55684c2 100644 --- a/corpora/oral-history-corpora/getuigenverhalen.json +++ b/corpora/oral-history-corpora/getuigenverhalen.json @@ -3,7 +3,7 @@ "URL": "http://getuigenverhalen.nl/home", "Family": "Oral history corpora", "Description": "The corpus contains two video interviews with political prisoners: Geert van der Molen (see also metadata) and Tine Boeke Kramer (see also metadata). By clicking on their names you are brought to a page with tabs and extensive metadata. 
To get access to the text files you need to ask an account at DANS. To request a copy of the videos you need to contact dinekestam@cultuurenco.nl", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "Open access", "Size": ["2 interviews related to Ravensbrück"], "Annotation": ["Orthographic"], diff --git a/corpora/oral-history-corpora/us-holocaust-memorial.json b/corpora/oral-history-corpora/us-holocaust-memorial.json index f9b2c5b..c01aa0e 100644 --- a/corpora/oral-history-corpora/us-holocaust-memorial.json +++ b/corpora/oral-history-corpora/us-holocaust-memorial.json @@ -3,7 +3,7 @@ "URL": "https://www.ushmm.org/collections/the-museums-collections/about/oral-history", "Family": "Oral history corpora", "Description": "The corpus contains several video interview involving Ravensbrück survivors (Anna Wirbel, Frieda Greinegger Noga, Genevieve De Gaulle, Irene Miller, Janina Pawlica, Joukje Grandia-Smits, Odette Hallowes, Susan Gerofi, Susan Mahrer). You do not need an account to get access to these video-interviews. To get to the single interviews use the search function of the USHMM oral history website. Copies of some of the video's can be ordered. Basic metadata are available for each survivor: ", - "Languages": ["English (mainly)", "heb", "fra", "deu", "nld"], + "Language": ["English (mainly)", "heb", "fra", "deu", "nld"], "Licence": "Open access", "Size": ["9 interviews related to Ravensbrück"], "Annotation": ["None"], diff --git a/corpora/oral-history-corpora/usc-shoah-foundation.json b/corpora/oral-history-corpora/usc-shoah-foundation.json index aedee8c..7142715 100644 --- a/corpora/oral-history-corpora/usc-shoah-foundation.json +++ b/corpora/oral-history-corpora/usc-shoah-foundation.json @@ -3,7 +3,7 @@ "URL": "https://vhaonline.usc.edu/", "Family": "Oral history corpora", "Description": "The corpus contains 7 interviews with the ex-deportees Aniela Popowiez-Radyno, Antonia Bruha, Elisabeth Jäger, Lore Perl, Renee Scott, Soula Molho, Wanda Lorenc. 
You need to create an account at the Shoah Visual History Archive to get access to this part of the collection that is freely available. To get to the single interviews use the search function in the VHA online database. Below you can find the metadata of the interviews: ", - "Languages": ["English (mainly)", "deu"], + "Language": ["English (mainly)", "deu"], "Licence": "Restricted access (account required)", "Size": ["7 interviews related to Ravensbrück"], "Annotation": [], diff --git a/corpora/oral-history-corpora/vpro-selma-van-der-perre.json b/corpora/oral-history-corpora/vpro-selma-van-der-perre.json index de04024..613e1fe 100644 --- a/corpora/oral-history-corpora/vpro-selma-van-der-perre.json +++ b/corpora/oral-history-corpora/vpro-selma-van-der-perre.json @@ -3,7 +3,7 @@ "URL": "https://www.vpro.nl/programmas/ovt/speel~RBX_VPRO_15812838~het-spoor-terug-mijn-naam-is-selma-deel-1~.html", "Family": "Oral history corpora", "Description": "A single interview with Selma van der Perre, divided in three parts of about 45 min, complementary to her memoirs Mijn naam is Selma. The interview was made by Mathijs Deen for the Dutch broadcast organisation VPRO in the series Het spoor terug (\"The Trace Back\"). 
Extensive metadata is available here.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "Open access", "Size": ["1 interview related to Ravensbrück"], "Annotation": ["None"], diff --git a/corpora/parallel-corpora/accurat.json b/corpora/parallel-corpora/accurat.json index be4461f..9720a5c 100644 --- a/corpora/parallel-corpora/accurat.json +++ b/corpora/parallel-corpora/accurat.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23BD-4", "Family": "Parallel corpora", "Description": "This corpus contains texts in Greek, Slovenian, Romanian, Latvian, Estonian, Croatian, and Lithuanian.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["7 languages"], + "Language": ["7 languages"], "Licence": "CC-BY", "Size": ["4,608 sentences"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/aformes.json b/corpora/parallel-corpora/aformes.json index 1f20a0b..fc5f60a 100644 --- a/corpora/parallel-corpora/aformes.json +++ b/corpora/parallel-corpora/aformes.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-2575-3", "Family": "Parallel corpora", "Description": "This corpus contains articles from a journal of undergraduate creative writing at an English department in Greece.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["English-Greek"], + "Language": ["English-Greek"], "Licence": "CC-BY", "Size": ["376,250 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/bul-tm.json b/corpora/parallel-corpora/bul-tm.json index 6ac18ac..7341dc0 100644 --- a/corpora/parallel-corpora/bul-tm.json +++ b/corpora/parallel-corpora/bul-tm.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E4-7", "Family": "Parallel corpora", "Description": "This corpus contains societal and political texts.\nThe corpus is available for download through the CLARIN:el repository.", - "Languages": ["Greek-Bulgarian"], + "Language": 
["Greek-Bulgarian"], "Licence": "CC-BY", "Size": ["10 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/bulgarian-x.json b/corpora/parallel-corpora/bulgarian-x.json index 2537801..92e9400 100644 --- a/corpora/parallel-corpora/bulgarian-x.json +++ b/corpora/parallel-corpora/bulgarian-x.json @@ -3,7 +3,7 @@ "URL": "http://metashare.elda.org/repository/browse/bulgarian-x-language-parallel-corpus/b8ecf7fe66cd11e281b65cf3fcb88b70394683c3b32549349cf039716e61a92b/", "Family": "Parallel corpora", "Description": "This corpus is a part of the Bulgarian National Corpus.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["50 languages"], + "Language": ["50 languages"], "Licence": "CC-BY", "Size": ["1.2 billion tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/ces-eng-manual-word-align.json b/corpora/parallel-corpora/ces-eng-manual-word-align.json index b80efbd..da848de 100644 --- a/corpora/parallel-corpora/ces-eng-manual-word-align.json +++ b/corpora/parallel-corpora/ces-eng-manual-word-align.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1804", "Family": "Parallel corpora", "Description": "This corpus contains texts from e-books, Reader’s Digest, the Kačenka magazine, Acquis Communautaire, the Project Syndicate and the PCEDT project.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["Czech-English"], + "Language": ["Czech-English"], "Licence": "CC-BY", "Size": ["113,000 tokens", "2500 sentences"], "Annotation": ["tokenised", "word-aligned (manually)"], diff --git a/corpora/parallel-corpora/ces-eng-ufal-abstracts.json b/corpora/parallel-corpora/ces-eng-ufal-abstracts.json index b53b916..cd07213 100644 --- a/corpora/parallel-corpora/ces-eng-ufal-abstracts.json +++ b/corpora/parallel-corpora/ces-eng-ufal-abstracts.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1731", "Family": "Parallel corpora", "Description": 
"This corpus contains abstracts of published by authors from the Institute of Formal and Applied Linguistics, Charles University, as reported in the institute's system Biblio.\nNo filtering was performed, except for removing entries missing the Czech or English abstract, and replacing newline and tabulator characters by spaces.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["Czech-English"], + "Language": ["Czech-English"], "Licence": "CC-BY", "Size": ["1556 entries", "12,000 sentences", "200,000 words"], "Annotation": ["tokenised", "document-aligned"], diff --git a/corpora/parallel-corpora/ces-pol-eng.json b/corpora/parallel-corpora/ces-pol-eng.json index 40b3702..be5768b 100644 --- a/corpora/parallel-corpora/ces-pol-eng.json +++ b/corpora/parallel-corpora/ces-pol-eng.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/manually-aligned-ces-polish-english-parallel-corpus/314b93d26b0011e284b6000423bfd61c36a51e4b609742288e99ba691f07dfdb/", "Family": "Parallel corpora", "Description": "This corpus contains CES reports.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Polish-English"], + "Language": ["Polish-English"], "Licence": "CC-BY", "Size": ["1.4 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/ces-slk-parallel.json b/corpora/parallel-corpora/ces-slk-parallel.json index 763f1e3..86b288c 100644 --- a/corpora/parallel-corpora/ces-slk-parallel.json +++ b/corpora/parallel-corpora/ces-slk-parallel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0006-AADF-0", "Family": "Parallel corpora", "Description": "This corpus contains legal texts (Acquis), parliamentary debates (from the Europarl corpus), articles from the Official Journal of the European Union, and texts from the OPUS corpus.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["Czech-Slovak"], + "Language": 
["Czech-Slovak"], "Licence": "CC-BY", "Size": ["5.7 million sentences"], "Annotation": ["automatic morphological annotation"], diff --git a/corpora/parallel-corpora/civitas-gentium.json b/corpora/parallel-corpora/civitas-gentium.json index df3cd3a..6612af7 100644 --- a/corpora/parallel-corpora/civitas-gentium.json +++ b/corpora/parallel-corpora/civitas-gentium.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-2578-0", "Family": "Parallel corpora", "Description": "This corpus contains scientific papers and book reviews in English, Greek, and French.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "CC-BY", "Size": ["31 articles"], "Annotation": [], diff --git a/corpora/parallel-corpora/compara.json b/corpora/parallel-corpora/compara.json index c9576ba..c10ae2e 100644 --- a/corpora/parallel-corpora/compara.json +++ b/corpora/parallel-corpora/compara.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-866", "Family": "Parallel corpora", "Description": "This corpus contains fictional texts and academic, newspaper and tourist articles.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["Portuguese-English"], + "Language": ["Portuguese-English"], "Licence": "CC-BY", "Size": [], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/crater-2.json b/corpora/parallel-corpora/crater-2.json index 6b600cc..8c4af77 100644 --- a/corpora/parallel-corpora/crater-2.json +++ b/corpora/parallel-corpora/crater-2.json @@ -3,7 +3,7 @@ "URL": "https://catalog.elra.info/en-us/repository/browse/ELRA-W0033/", "Family": "Parallel corpora", "Description": "This corpus contains texts from the telecommunications domain.\nThe corpus is available for download from the ELRA catalogue.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "ELRA END USER/ELRA VAR", "Size": ["4 million tokens"], "Annotation": 
["tokenised", "morphosyntactically tagged"], diff --git a/corpora/parallel-corpora/csenvi-pairwise.json b/corpora/parallel-corpora/csenvi-pairwise.json index 00dc47e..feaeeb2 100644 --- a/corpora/parallel-corpora/csenvi-pairwise.json +++ b/corpora/parallel-corpora/csenvi-pairwise.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1595", "Family": "Parallel corpora", "Description": "This corpus contains TED talks and subtitles from the CLUVI corpus in Vietnamese, Czech, and English.\nThe corpus is available for download from LINDAT.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "CC-BY", "Size": ["31 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/czeng.json b/corpora/parallel-corpora/czeng.json index 9e702f3..1665d73 100644 --- a/corpora/parallel-corpora/czeng.json +++ b/corpora/parallel-corpora/czeng.json @@ -3,7 +3,7 @@ "URL": "http://ufal.mff.cuni.cz/czeng", "Family": "Parallel corpora", "Description": "This corpus is bidirectional, with original texts in English and Czech and accompanying translations. CzEng 2.0 is composed from authentic and synthetic parallel data. 
The authentic part contains filtered CzEng 1.6 and six additional resources: Europarl, Paracrawl, Common Crawl, News Commentary, Tilde MODEL, Wiki Titles, WikiMatrix, which was downloaded from WMT 2020.\nThe corpus is available for download from a dedicated website.", - "Languages": ["Czech-English"], + "Language": ["Czech-English"], "Licence": "CC-BY", "Size": ["702 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/dgt-acquis.json b/corpora/parallel-corpora/dgt-acquis.json index 2fc6ef0..625dc92 100644 --- a/corpora/parallel-corpora/dgt-acquis.json +++ b/corpora/parallel-corpora/dgt-acquis.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2592-1", "Family": "Parallel corpora", "Description": "This corpus contains articles from the Official Journal of the European Union from  2004 to 2011.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["23 languages"], + "Language": ["23 languages"], "Licence": "Open For Reuse With Restrictions", "Size": [], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/dgt-tm-2016.json b/corpora/parallel-corpora/dgt-tm-2016.json index 5a3ddf3..f710c41 100644 --- a/corpora/parallel-corpora/dgt-tm-2016.json +++ b/corpora/parallel-corpora/dgt-tm-2016.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-4300-4", "Family": "Parallel corpora", "Description": "This corpus contains texts from the European Legislation.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["Approx. 30 languages"], + "Language": ["Approx. 
30 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["373 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/dgt-trans-mem.json b/corpora/parallel-corpora/dgt-trans-mem.json index b650e57..6850cff 100644 --- a/corpora/parallel-corpora/dgt-trans-mem.json +++ b/corpora/parallel-corpora/dgt-trans-mem.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-258C-9", "Family": "Parallel corpora", "Description": "This corpus contains legislative texts of the European Legislation.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["Approx. 20 languages"], + "Language": ["Approx. 20 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["10.1 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/dpc.json b/corpora/parallel-corpora/dpc.json index 64cf2bb..8595eb0 100644 --- a/corpora/parallel-corpora/dpc.json +++ b/corpora/parallel-corpora/dpc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-h3", "Family": "Parallel corpora", "Description": "This corpus contains fictional, journalistic, instructive and administrative texts in English, Dutch, and French.\nThe corpus is available for download (after registration) from the Dutch Language Institute.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "CLARIN ACA", "Size": ["10.8 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/eac-trans-mem.json b/corpora/parallel-corpora/eac-trans-mem.json index 0a3bcb4..dc3e0f1 100644 --- a/corpora/parallel-corpora/eac-trans-mem.json +++ b/corpora/parallel-corpora/eac-trans-mem.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-258D-8", "Family": "Parallel corpora", "Description": "This corpus contains law documents and texts related to education and culture.\nThe corpus is available for download through the 
CLARIN:el repository.", - "Languages": ["50 languages"], + "Language": ["50 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["320,000 tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/ecb-parallel.json b/corpora/parallel-corpora/ecb-parallel.json index ee89173..e974415 100644 --- a/corpora/parallel-corpora/ecb-parallel.json +++ b/corpora/parallel-corpora/ecb-parallel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2590-3", "Family": "Parallel corpora", "Description": "This corpus contains texts from the European Central Bank.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["19 languages"], + "Language": ["19 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["757 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/ecdc-trans-mem.json b/corpora/parallel-corpora/ecdc-trans-mem.json index 6f21b5e..4ab11f2 100644 --- a/corpora/parallel-corpora/ecdc-trans-mem.json +++ b/corpora/parallel-corpora/ecdc-trans-mem.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-24DA-2", "Family": "Parallel corpora", "Description": "This corpus contains texts from the public health domain.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["Approx. 20 languages"], + "Language": ["Approx. 
20 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["320,000 tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/elexis-wsd.json b/corpora/parallel-corpora/elexis-wsd.json index 2074fdd..bfe9771 100644 --- a/corpora/parallel-corpora/elexis-wsd.json +++ b/corpora/parallel-corpora/elexis-wsd.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1842", "Family": "Parallel corpora", "Description": "This corpus is a parallel sense-annotated corpus in which content words (nouns, adjectives, verbs, and adverbs) have been assigned senses. Version 1.1 contains sentences for 10 languages: Bulgarian, Danish, English, Spanish, Estonian, Hungarian, Italian, Dutch, Portuguese, and Slovene.\nThe corpus was compiled by automatically extracting a set of sentences from WikiMatrix (Schwenk et al., 2019), a large open-access collection of parallel sentences derived from Wikipedia, using an automatic approach based on multilingual sentence embeddings. The sentences were manually validated according to specific formal, lexical and semantic criteria (e.g. by removing incorrect punctuation, morphological errors, notes in square brackets and etymological information typically provided in Wikipedia pages). To obtain a satisfying semantic coverage, we filtered out sentences with less than 5 words and less than 2 polysemous words were filtered out. Subsequently, in order to obtain datasets in the other nine target languages, for each selected sentence in English, the corresponding WikiMatrix translation into each of the other languages was retrieved. If no translation was available, the English sentence was translated manually. The resulting corpus is comprised of 2,024 sentences for each language.\nThe sentences were tokenized, lemmatized, and tagged with POS tags using UDPipe v2.6. 
Senses were annotated using LexTag,/a>: each content word (noun, verb, adjective, and adverb) was assigned a sense from among the available senses from the sense inventory selected for the language (see below) or BabelNet. Sense inventories were also updated with new senses during annotation.\nThis corpus is available for download from the CLARIN.SI repository.", - "Languages": ["10 langages"], + "Language": ["10 langages"], "Licence": "CC BY-SA 4.0", "Size": ["345,092 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "annotated for senses"], diff --git a/corpora/parallel-corpora/emea.json b/corpora/parallel-corpora/emea.json index 7664296..26751c9 100644 --- a/corpora/parallel-corpora/emea.json +++ b/corpora/parallel-corpora/emea.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25DB-0", "Family": "Parallel corpora", "Description": "This corpus contains documents of the European Medicines Agency.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["Approx. 20 languages"], + "Language": ["Approx. 
20 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["31 million tokens"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/emel.json b/corpora/parallel-corpora/emel.json index 8c47d1b..776755c 100644 --- a/corpora/parallel-corpora/emel.json +++ b/corpora/parallel-corpora/emel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AUTH-0000-0000-2C5A-B", "Family": "Parallel corpora", "Description": "This corpus contains NLP conference papers.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["English-French"], + "Language": ["English-French"], "Licence": "CC-BY", "Size": ["43,000 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/eng-ces-wikipedia.json b/corpora/parallel-corpora/eng-ces-wikipedia.json index d365c66..6ebb5aa 100644 --- a/corpora/parallel-corpora/eng-ces-wikipedia.json +++ b/corpora/parallel-corpora/eng-ces-wikipedia.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1932", "Family": "Parallel corpora", "Description": "This corpus contains Wikipedi articles translated from English into Czech.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["English-Czech"], + "Language": ["English-Czech"], "Licence": "CC-BY", "Size": ["7.5 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/eng-luganda.json b/corpora/parallel-corpora/eng-luganda.json index 27c4eca..8dd4aa7 100644 --- a/corpora/parallel-corpora/eng-luganda.json +++ b/corpora/parallel-corpora/eng-luganda.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-560", "Family": "Parallel corpora", "Description": "This corpus contains Biblical scripture (150 manually annotated sentences from the Gospel of Luke (1:1 to 3:18). 
The English text is King James Bible whereas the Lugandan text is taken from the online Luganda bible.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["English-Luganda"], + "Language": ["English-Luganda"], "Licence": "", "Size": ["150 sentences"], "Annotation": ["word-aligned"], diff --git a/corpora/parallel-corpora/eng-slk-parallel.json b/corpora/parallel-corpora/eng-slk-parallel.json index d77dd21..624d1d0 100644 --- a/corpora/parallel-corpora/eng-slk-parallel.json +++ b/corpora/parallel-corpora/eng-slk-parallel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0006-AAE0-A", "Family": "Parallel corpora", "Description": "This corpus contains legal texts (Acquis), parliamentary debates (from the Europarl corpus), articles from the Official Journal of the European Union, and texts from the OPUS corpus.\nThe corpus is available for download from the LINDAT repository.", - "Languages": ["English-Slovak"], + "Language": ["English-Slovak"], "Licence": "CC-BY NC-SA 3.0", "Size": [], "Annotation": ["automatic morphological annotation"], diff --git a/corpora/parallel-corpora/eng-swe-parallel.json b/corpora/parallel-corpora/eng-swe-parallel.json index 5c92a13..be2baa6 100644 --- a/corpora/parallel-corpora/eng-swe-parallel.json +++ b/corpora/parallel-corpora/eng-swe-parallel.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/en/resources/espc", "Family": "Parallel corpora", "Description": "This corpus contains fictional and non-fictional texts. It is bidirectional. 
The corpus is not available.", - "Languages": ["English-Swedish"], + "Language": ["English-Swedish"], "Licence": "", "Size": ["3.5 million tokens"], "Annotation": ["tokenised", "paragraph aligned"], diff --git a/corpora/parallel-corpora/eng-urdu-rel.json b/corpora/parallel-corpora/eng-urdu-rel.json index 1990269..3a6477f 100644 --- a/corpora/parallel-corpora/eng-urdu-rel.json +++ b/corpora/parallel-corpora/eng-urdu-rel.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11234/1-2582", "Family": "Parallel corpora", "Description": "This corpus contains religious texts (the Bible and the Quran).\nThe corpus is available for download from LINDAT.", - "Languages": ["English-Urdu"], + "Language": ["English-Urdu"], "Licence": "CC-BY", "Size": ["14,371 sentences"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/entam.json b/corpora/parallel-corpora/entam.json index 34d6610..711dd46 100644 --- a/corpora/parallel-corpora/entam.json +++ b/corpora/parallel-corpora/entam.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1454", "Family": "Parallel corpora", "Description": "This corpus contains news articles and texts related to film.\nThe corpus is available for download from LINDAT.", - "Languages": ["English-Tamil"], + "Language": ["English-Tamil"], "Licence": "CC-BY", "Size": ["169,871 sentences"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/epic-uds.json b/corpora/parallel-corpora/epic-uds.json index b204826..c6e5875 100644 --- a/corpora/parallel-corpora/epic-uds.json +++ b/corpora/parallel-corpora/epic-uds.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0008-F519-8", "Family": "Parallel corpora", "Description": "This is a parallel and comparable corpus of speeches held in the European Parliament; the corpus follows the European Parliament Interpreting Corpora tradition of the EPIC and EPICG corpora. 
It contains original speeches from 2008 to 2013 by English, German, and Spanish native speakers and their interpretation (English to and from German; Spanish to English).\nAll transcripts in the corpus are based on videos of the European Parliament Proceedings published by the European Parliament.\nAnnotation includes typical characteristics of spoken language such as false starts, hesitations and truncated words. To obtain better results for source-target alignment as well as sentence parsing the transcripts were segmented using a main clause approach: compound sentences were segmented separately. For the second version of the corpus, the transcripts were processed clause by clause with the spaCy NLP tools; the data is encoded in CoNLL-U and provides universal PoS tags, fine-grained language-specific PoS tags as well as Universal Dependency syntactic relations. All data was enriched with relevant metadata such as source language, name of original speaker, speech timing, mode of delivery and delivery rate.\nThe corpus is available for download from CLARIN-D (Saarland University B-centre).", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "CC BY-NC-SA 4.0", "Size": ["350,000 tokens", "20,000 sentences"], "Annotation": ["tokenised", "PoS-tagged", "syntactically parsed", "speech phenomena"], diff --git a/corpora/parallel-corpora/epic.json b/corpora/parallel-corpora/epic.json index 0472d9c..25888c0 100644 --- a/corpora/parallel-corpora/epic.json +++ b/corpora/parallel-corpora/epic.json @@ -3,7 +3,7 @@ "URL": "https://catalog.elra.info/en-us/repository/browse/ELRA-S0323/", "Family": "Parallel corpora", "Description": "This corpus contains debates of the European Parliament in Italian, English, and Spanish, with translations in all possible combinations.\nThe corpus is available for download from the ELRA catalogue.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "ELRA END USER", "Size": ["177,000 tokens"], 
"Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parallel-corpora/est-eng-parallel.json b/corpora/parallel-corpora/est-eng-parallel.json index 592fa93..d58a9c4 100644 --- a/corpora/parallel-corpora/est-eng-parallel.json +++ b/corpora/parallel-corpora/est-eng-parallel.json @@ -3,7 +3,7 @@ "URL": "https://www.cl.ut.ee/korpused/paralleel/index.php?lang=en", "Family": "Parallel corpora", "Description": "This corpus contains Estonian laws and their translations into English and EU legislation translated into Estonian.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Estonian-English"], + "Language": ["Estonian-English"], "Licence": "CLARIN ACA", "Size": ["307,000 sentences"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/est-open-parallel.json b/corpora/parallel-corpora/est-open-parallel.json index eefbfab..05818f5 100644 --- a/corpora/parallel-corpora/est-open-parallel.json +++ b/corpora/parallel-corpora/est-open-parallel.json @@ -3,7 +3,7 @@ "URL": "http://metashare.tilde.com/repository/browse/estonian-open-parallel-corpus-2012-estonian-english/1ebafd00a96111e5aa3b001dd8b71c66ec3a43cb0e0f4669b64f85347efd43a7/", "Family": "Parallel corpora", "Description": "This corpus contains Biblical and legal texts.\nThe corpus is available for download from META-SHARE.", - "Languages": ["Estonian-English"], + "Language": ["Estonian-English"], "Licence": "CC-BY", "Size": ["2.5 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/eubookshop.json b/corpora/parallel-corpora/eubookshop.json index 4d0545a..bae2eb3 100644 --- a/corpora/parallel-corpora/eubookshop.json +++ b/corpora/parallel-corpora/eubookshop.json @@ -3,7 +3,7 @@ "URL": "http://opus.lingfil.uu.se/EUbookshop.php", "Family": "Parallel corpora", "Description": "This corpus contains texts from EU law books and related publications.\nThe corpus is available for download from the OPUS webpage.", - 
"Languages": ["48 languages"], + "Language": ["48 languages"], "Licence": "", "Size": ["3.5 billion tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/eur-const.json b/corpora/parallel-corpora/eur-const.json index 6670b18..bd12dee 100644 --- a/corpora/parallel-corpora/eur-const.json +++ b/corpora/parallel-corpora/eur-const.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-258B-A", "Family": "Parallel corpora", "Description": "This corpus contains European Constitution documents.\nThe corpus is available for download through the CLARIN:el repository.", - "Languages": ["21 languages"], + "Language": ["21 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["3 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/europarl-ell-eng.json b/corpora/parallel-corpora/europarl-ell-eng.json index d98025b..8111054 100644 --- a/corpora/parallel-corpora/europarl-ell-eng.json +++ b/corpora/parallel-corpora/europarl-ell-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23DE-F", "Family": "Parallel corpora", "Description": "This corpus contains debates of the European Parliament from 1996 to 2011.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["Greek-English"], + "Language": ["Greek-English"], "Licence": "CC-ZERO", "Size": ["1.2 million sentences"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/europarl-qtleap-wsd-ned.json b/corpora/parallel-corpora/europarl-qtleap-wsd-ned.json index 8c32e15..592bca9 100644 --- a/corpora/parallel-corpora/europarl-qtleap-wsd-ned.json +++ b/corpora/parallel-corpora/europarl-qtleap-wsd-ned.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1477", "Family": "Parallel corpora", "Description": "This corpus contains debates of the European Parliament in the following language pairs: Bulgarian-English, Czech-English, 
Portuguese-English, Spanish-English, and Basque-English.\nThe corpus is available for download from LINDAT.", - "Languages": ["6 languages"], + "Language": ["6 languages"], "Licence": "CC-BY", "Size": ["52 million tokens"], "Annotation": ["tokenised", "WSD", "NER", "CR-tagged"], diff --git a/corpora/parallel-corpora/europarl-uds.json b/corpora/parallel-corpora/europarl-uds.json index 9b934c5..2cad298 100644 --- a/corpora/parallel-corpora/europarl-uds.json +++ b/corpora/parallel-corpora/europarl-uds.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0000-D5EE-4", "Family": "Parallel corpora", "Description": "The corpus contains parliamentary debates of the European Parliament. A subset is a parallel corpus for the following language combinations: English-German and English-Spanish.\nThe corpus is available for download from a CLARIN-D repository. ", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "CC-BY-NC-SA 4.0", "Size": [], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/europarl.json b/corpora/parallel-corpora/europarl.json index fe06a62..9dc6b9d 100644 --- a/corpora/parallel-corpora/europarl.json +++ b/corpora/parallel-corpora/europarl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-395", "Family": "Parallel corpora", "Description": "This corpus contains debates of the European Parliament from 1996 to 2011.\nThe corpus is available for download from the corpus webpage.", - "Languages": ["21 languages"], + "Language": ["21 languages"], "Licence": "CC-ZERO", "Size": ["650,000 tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/fienwac.json b/corpora/parallel-corpora/fienwac.json index 2e41928..6426e48 100644 --- a/corpora/parallel-corpora/fienwac.json +++ b/corpora/parallel-corpora/fienwac.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1061", "Family": "Parallel corpora", "Description": "This corpus contains texts crawled from 
top-level Finnish .fi domains.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["Finnish-English"], + "Language": ["Finnish-English"], "Licence": "CLARIN.SI User License for Internet Corpora", "Size": ["2.9 million tokens"], "Annotation": ["tokenised", "sentenced-aligned"], diff --git a/corpora/parallel-corpora/free-trade-agreement.json b/corpora/parallel-corpora/free-trade-agreement.json index 015f7c9..eb88114 100644 --- a/corpora/parallel-corpora/free-trade-agreement.json +++ b/corpora/parallel-corpora/free-trade-agreement.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/DF2E-8C2F-E0AF-8", "Family": "Parallel corpora", "Description": "This corpus contains texts on the Free Trade Agreement.\nThe corpus is available through the concordancer Corpuscle.", - "Languages": ["English-Spanish"], + "Language": ["English-Spanish"], "Licence": "CLARIN ACA", "Size": ["3 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/frel.json b/corpora/parallel-corpora/frel.json index 024aab2..324ffbf 100644 --- a/corpora/parallel-corpora/frel.json +++ b/corpora/parallel-corpora/frel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AUTH-0000-0000-24DF-D", "Family": "Parallel corpora", "Description": "This corpus contains literary texts translated from French to Greek.", - "Languages": ["French-Greek"], + "Language": ["French-Greek"], "Licence": "under negotiation", "Size": ["701,401 tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/glossologia.json b/corpora/parallel-corpora/glossologia.json index b6304d8..ee4c4e6 100644 --- a/corpora/parallel-corpora/glossologia.json +++ b/corpora/parallel-corpora/glossologia.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-257A-E", "Family": "Parallel corpora", "Description": "This corpus contains articles from Glossologia, a journal of general and historical Greek linguistics, in French, Greek, English, and German.\nThe 
corpus is available for download from the CLARIN:el repository.", - "Languages": ["4 languages"], + "Language": ["4 languages"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/hindencorp.json b/corpora/parallel-corpora/hindencorp.json index 3228628..1ded425 100644 --- a/corpora/parallel-corpora/hindencorp.json +++ b/corpora/parallel-corpora/hindencorp.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-625F-0", "Family": "Manually annotated corpora", "Description": "This corpus contains TED talks, news articles, Wikipedia articles, etc.\nThe corpus is available for download from LINDAT and can be queried through KonText.", - "Languages": ["English-Hindi"], + "Language": ["English-Hindi"], "Licence": "CC-BY", "Size": ["132,300 sentences"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/hrenwac.json b/corpora/parallel-corpora/hrenwac.json index 8723b7f..32528fe 100644 --- a/corpora/parallel-corpora/hrenwac.json +++ b/corpora/parallel-corpora/hrenwac.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1058", "Family": "Parallel corpora", "Description": "This corpus contains texts crawled from top-level Croatian .hr domains.\nThe corpus was built with Spidextor, a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. 
The accuracy of the extracted bitext on the segment level is around 80% and on the word level around 84%.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["Croatian-English"], + "Language": ["Croatian-English"], "Licence": "CLARIN.SI User License for Internet Corpora", "Size": ["1.6 million sentences", "55 million words"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/ift-fr-gr.json b/corpora/parallel-corpora/ift-fr-gr.json index 325b112..358bcb2 100644 --- a/corpora/parallel-corpora/ift-fr-gr.json +++ b/corpora/parallel-corpora/ift-fr-gr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AUTH-0000-0000-2557-5", "Family": "Parallel corpora", "Description": "This corpus contains IFT newsletters.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["French-Greek"], + "Language": ["French-Greek"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/intera-ell-eng.json b/corpora/parallel-corpora/intera-ell-eng.json index 382f58d..3cecc50 100644 --- a/corpora/parallel-corpora/intera-ell-eng.json +++ b/corpora/parallel-corpora/intera-ell-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25B2-D", "Family": "Parallel corpora", "Description": "This corpus contains texts from the law, education, environment, tourism and health domains.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["Greek-English"], + "Language": ["Greek-English"], "Licence": "CC-BY", "Size": ["4 million tokens"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/intercorp.json b/corpora/parallel-corpora/intercorp.json index f228eec..69016cb 100644 --- a/corpora/parallel-corpora/intercorp.json +++ b/corpora/parallel-corpora/intercorp.json @@ -3,7 +3,7 @@ "URL": "https://wiki.korpus.cz/doku.php/en:cnk:intercorp", "Family": "Parallel corpora", "Description": "The corpus consists of 
two main parts: manually aligned fiction and a number of collections: political commentaries published by Project Syndicate and VoxEurop, EU legal texts form the Acquis Communautaire corpus, proceedings of the European Parliament from the Europarl corpus, film subtitles from the Open Subtitles database, and the Bible.\nThe corpus is available primarily through the KonText concordancer. For research purposes, tailor-made linguistic data derived from the InterCorp corpus can be provided upon request. The contact e-mail is cnk@korpus.cz.", - "Languages": ["40 languages"], + "Language": ["40 languages"], "Licence": "proprietary", "Size": ["1.5 billion tokens"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/interlingual-perspectives.json b/corpora/parallel-corpora/interlingual-perspectives.json index 761320e..4772bbb 100644 --- a/corpora/parallel-corpora/interlingual-perspectives.json +++ b/corpora/parallel-corpora/interlingual-perspectives.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-2577-1", "Family": "Parallel corpora", "Description": "This corpus contains research articles published from 2010 onwards focusing on the interaction of Greek with other languages through translation.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["English-Greek"], + "Language": ["English-Greek"], "Licence": "CC-BY", "Size": ["18 articles"], "Annotation": [], diff --git a/corpora/parallel-corpora/jrc-acquis.json b/corpora/parallel-corpora/jrc-acquis.json index 0e51b1d..55e554c 100644 --- a/corpora/parallel-corpora/jrc-acquis.json +++ b/corpora/parallel-corpora/jrc-acquis.json @@ -3,7 +3,7 @@ "URL": "https://ec.europa.eu/jrc/en/language-technologies/jrc-acquis", "Family": "Parallel corpora", "Description": "This corpus contains legislative and legal texts from the Acquis Communautaire from various periods beginning in the 1950s.\nThe corpus is available for download from the webpage of the European 
Commission.", - "Languages": ["22 languages"], + "Language": ["22 languages"], "Licence": "Usage Conditions", "Size": ["1 billion tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/kacenka.json b/corpora/parallel-corpora/kacenka.json index 17aed6e..5e70a8d 100644 --- a/corpora/parallel-corpora/kacenka.json +++ b/corpora/parallel-corpora/kacenka.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-891", "Family": "Parallel corpora", "Description": "This corpus contains both fictional and non-fictional texts. Most of the English texts for KACENKA have been retrieved from the Internet resources. The rest (andd nearly all the Czech texts) had to be scanned from fiction books (e.g., Czech translations of The Jungle Book by Rudyard Kipling, Lucky Jim by Kingsley Amis, and Sons and Lovers by D.H. Lawrence, among others) with the use of the OCR programme ProLector 1.2.", - "Languages": ["English-Czech"], + "Language": ["English-Czech"], "Licence": "", "Size": ["3.3 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/kotus-fin-swe.json b/corpora/parallel-corpora/kotus-fin-swe.json index 5b8ed76..c08e6cd 100644 --- a/corpora/parallel-corpora/kotus-fin-swe.json +++ b/corpora/parallel-corpora/kotus-fin-swe.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201406036", "Family": "Parallel corpora", "Description": "This corpus contains corporate press releases, surveys, reports, laws and regulations, as well as governmental proposals from 1993 to 2004.\nThe corpus is available for download from FIN-CLARIN and through the concordancer Korp.", - "Languages": ["Finnish-Swedish"], + "Language": ["Finnish-Swedish"], "Licence": "CC-BY", "Size": ["4.3 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/lila.json b/corpora/parallel-corpora/lila.json index 7ea1830..aeee21a 100644 --- a/corpora/parallel-corpora/lila.json +++ 
b/corpora/parallel-corpora/lila.json @@ -3,7 +3,7 @@ "URL": "http://tekstynas.vdu.lt/page.xhtml?id=parallelLILA", "Family": "Parallel corpora", "Description": "This corpus contains fictional and non-fictional texts from 1991 to 2012. It is bidirectional.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["Lithuanian-Latvian"], + "Language": ["Lithuanian-Latvian"], "Licence": "", "Size": ["8 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/macocu.json b/corpora/parallel-corpora/macocu.json index 0437b04..7d924ae 100644 --- a/corpora/parallel-corpora/macocu.json +++ b/corpora/parallel-corpora/macocu.json @@ -3,7 +3,7 @@ "URL": "https://www.clarin.si/repository/xmlui/handle/11356/1818", "Family": "Parallel corpora", "Description": "These corpora are a collection containing web texts and were built by crawling national internet top-level domains (specified below) and by extending the crawl dynamically to other domains as well. All the crawling process was carried out by the MaCoCu crawler. Websites containing documents in both target languages were identified and processed using the tool Bitextor. Considerable effort was devoted into cleaning the extracted text to provide a high-quality parallel corpus. This was achieved by removing boilerplate and near-duplicated paragraphs and documents that are not in one of the targeted languages. Document and segment alignment as implemented in Bitextor were carried out, and Bifixer and BicleanerAI were used for fixing, cleaning, and deduplicating the final version of the corpus.\nThe corpus is available in three formats: two sentence-level formats, TXT and TMX, and a document-level TXT format. When relevant, in each format, the texts are separated based on the script into two files: a Latin and a Cyrillic subcorpus. TMX is an XML-based format and TXT is a tab-separated format. 
They both consist of pairs of source and target segments (one or several sentences) and additional metadata. The following metadata is included in both sentence-level formats: - source and target document URL; - paragraph ID which includes information on the position of the sentence in the paragraph and in the document (e.g., \"p35:77s1/3\" which means \"paragraph 35 out of 77, sentence 1 out of 3\"); - quality score as provided by the tool Bicleaner AI (a likelihood of a pair of sentences being mutual translations, provided with a score between 0 and 1); - similarity score as provided by the sentence alignment tool Bleualign (value between 0 and 1); - personal information identification (\"biroamer-entities-detected\"): segments containing personal information are flagged, so final users of the corpus can decide whether to use these segments; - translation direction and machine translation identification (\"translation-direction\"): the source segment in each segment pair was identified by using a probabilistic model, which also determines if the translation has been produced by a machine-translation system; - a DSI class (\"dsi\"): information whether the segment is connected to any of Digital Service Infrastructure (DSI) classes (e.g., cybersecurity, e-health, e-justice, open-data-portal), defined by the Connecting Europe Facility; - English language variant: the language variant of English (British or American, using a lexicon-based English variety classifier) was identified on document and domain level. 
Furthermore, the sentence-level TXT format provides additional metadata: - web domain of the text; - source and target document title; - the date when the original file was retrieved; - the original type of the file (e.g., \"html\"), from which the sentence was extracted; - paragraph quality (labels, such as \"short\" or \"good\", assigned based on paragraph length, URL and stopword density via the jusText tool); - information whether the sentence is a heading or not in the original document.\nThe document-level TXT format provides pairs of documents identified to contain parallel data. In addition to the parallel documents (in base64 format), the corpus includes the following metadata: source and target document URL, a DSI category and the English language variant (British or American). As opposed to the previous version in the case of corpora in version 2.0, this version has more accurate metadata on languages of the texts, which was achieved by using Google's Compact Language Detector 2 (CLD2), a high-performance language detector supporting many languages. Other tools, used for web corpora creation and curation, have been updated as well, resulting in an even cleaner corpus. The new version also provides additional metadata, such as the position of the sentence in the paragraph and document, and information whether the sentence is related to a DSI. Moreover, the corpus is now also provided in a document-level format.\nThe document-level TXT format provides pairs of documents identified to contain parallel data. In addition to the parallel documents (in base64 format), the corpus includes the following metadata: source and target document URL, a DSI category and the English language variant (British or American).\nThe ALBANIAN-ENGLISH parallel corpus MaCoCu-sq-en 1.0 was built by crawling the \".al\" internet top-level domain in 2022. 
The BOSNIAN-ENGLISH parallel corpus MaCoCu-bs-en 1.0 was built by crawling the \".ba\" internet top-level domain in 2021 and 2022. The BULGARIAN-ENGLISH parallel corpus MaCoCu-bg-en 2.0 was built by crawling the \".bg\" and \".бг\" internet top-level domains in 2021. The CATALAN-ENGLISH parallel corpus MaCoCu-ca-en 1.0 was built by crawling the \".cat\", \".es\", \".ad\", \".fr\", \".it\" and \".eu\" internet top-level domain in 2022. The CROATIAN-ENGLISH parallel corpus MaCoCu-hr-en 2.0 was built by crawling the \".hr\" internet top-level domain in 2021 and 2022. The GREEK-ENGLISH parallel corpus MaCoCu-el-en 1.0 was built by crawling the \".gr\", \".ελ\", \".cy\" and \".eu\" internet top-level domain in 2023. The ICELANDIC-ENGLISH parallel corpus MaCoCu-is-en 2.0 was built by crawling the \".is\" internet top-level domain in 2021. The MACEDONIAN-ENGLISH parallel corpus MaCoCu-mk-en 2.0 was built by crawling the \".mk\" and \".мкд\" internet top-level domains in 2021. The MALTESE-ENGLISH parallel corpus MaCoCu-mt-en 2.0 was built by crawling the \".mt\" internet top-level domain in 2021. The MONTENEGRIN-ENGLISH parallel corpus MaCoCu-cnr-en 1.0 was built by crawling the \".me\" internet top-level domain in 2021 and 2022. The SERBIAN-ENGLISH parallel corpus MaCoCu-sr-en 1.0 was built by crawling the \".rs\" and \".срб\" internet top-level domains in 2021 and 2022. The SLOVENE-ENGLISH parallel corpus MaCoCu-sl-en 2.0 was built by crawling the \".si\" internet top-level domain in 2021 and 2022. The TURKISH-ENGLISH parallel corpus MaCoCu-tr-en 2.0 was built by crawling the \".tr\" and \".cy\" internet top-level domains in 2021. 
The UKRAINIAN-ENGLISH parallel corpus MaCoCu-uk-en 1.0 was built by crawling the \".ua\" and \".укр\" internet top-level domain in 2022.\nThe corpora are available for download from the Slovenian repository CLARIN.SI.", - "Languages": ["Multilingual"], + "Language": ["Multilingual"], "Licence": "CC0-No Rights Reserved", "Size": [], "Annotation": ["annotated with extensive metadata"], diff --git a/corpora/parallel-corpora/mlcc.json b/corpora/parallel-corpora/mlcc.json index 22877af..ec3748d 100644 --- a/corpora/parallel-corpora/mlcc.json +++ b/corpora/parallel-corpora/mlcc.json @@ -3,7 +3,7 @@ "URL": "https://catalog.elra.info/en-us/repository/browse/ELRA-W0023/", "Family": "Parallel corpora", "Description": "This corpus contains articles from the Official Journal of the European Communities from 1986 to 1994 in the following languages: Danish, Dutch, English, French, German, Greek, Italian, Portuguese, and Spanish.\nThe corpus is available for download from the ELRA catalogue.", - "Languages": ["9 language"], + "Language": ["9 languages"], "Licence": "ELRA END USER", "Size": ["10.2 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/mulcold.json b/corpora/parallel-corpora/mulcold.json index 02cb58d..41cf030 100644 --- a/corpora/parallel-corpora/mulcold.json +++ b/corpora/parallel-corpora/mulcold.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201405278", "Family": "Parallel corpora", "Description": "This corpus contains international conventions and treaties in Russian, English, Swedish, and Finnish.\nThe corpus is available through the concordancer Korp.", - "Languages": ["4 languages"], + "Language": ["4 languages"], "Licence": "CC-BY", "Size": ["1.2 million tokens"], "Annotation": ["tokenised", "paragraph aligned", "PoS-tagged", "lemmatized"], diff --git a/corpora/parallel-corpora/multext-east.json b/corpora/parallel-corpora/multext-east.json index 96a897a..13b7b59 100644 --- a/corpora/parallel-corpora/multext-east.json
+++ b/corpora/parallel-corpora/multext-east.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1043", "Family": "Parallel corpora", "Description": "This corpus contains George Orwell’s 1984 original novel in English and its translations into the following languages: Bulgarian, Czech, Estonian, Hungarian, Macedonian, Persian, Polish, Romanian, Serbian, Slovak, and Slovenian.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["11 languages"], + "Language": ["11 languages"], "Licence": "CC-BY", "Size": ["1.06 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/multijur.json b/corpora/parallel-corpora/multijur.json index 3e39f9e..6bd6d65 100644 --- a/corpora/parallel-corpora/multijur.json +++ b/corpora/parallel-corpora/multijur.json @@ -3,7 +3,7 @@ "URL": "https://kitwiki.csc.fi/twiki/bin/view/KitWiki/FinClarinCorpusResourceMultilingualCorpusOfJuridicalTexts", "Family": "Parallel corpora", "Description": "This corpus contains international conventions and treaties in the following languages: English, Russian, German, Finnish, and Swedish.\nThe corpus is available through the concordancer Korp.", - "Languages": ["5 languages"], + "Language": ["5 languages"], "Licence": "CLARIN PUB", "Size": ["1.2 million tokens"], "Annotation": ["paragraph aligned"], diff --git a/corpora/parallel-corpora/multiun.json b/corpora/parallel-corpora/multiun.json index b612691..b9f18cc 100644 --- a/corpora/parallel-corpora/multiun.json +++ b/corpora/parallel-corpora/multiun.json @@ -3,7 +3,7 @@ "URL": "http://www.euromatrixplus.net/multi-un/", "Family": "Parallel corpora", "Description": "This corpus contains texts from the United Nations website from 2000 to 2009 in the following language pairs: Spanish-Chinese, Chinese-Spanish, French-Chinese, and Chinese-French.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["3 languages"], + "Language": ["3 languages"], 
"Licence": "", "Size": ["1 billion tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/musa.json b/corpora/parallel-corpora/musa.json index 3b535dc..2148be5 100644 --- a/corpora/parallel-corpora/musa.json +++ b/corpora/parallel-corpora/musa.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/musa-multilingual-multimodal-corpus/9f5d29a263c211e29fc5842b2b6a04d7a2d7266c56224f90ae4cb8f4757bf8ed/", "Family": "Parallel corpora", "Description": "This parallel multimodal corpus contains English, Greek, and French.\nThe corpus is distributed by CLARIN:EL.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "Academic", "Size": ["1.2 million words"], "Annotation": ["subtitle alignment"], diff --git a/corpora/parallel-corpora/naacl.json b/corpora/parallel-corpora/naacl.json index f6c873a..7e118c3 100644 --- a/corpora/parallel-corpora/naacl.json +++ b/corpora/parallel-corpora/naacl.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/the-naacl-2003-english-romanian-corpus/32d0c7f8dc7311e5aa0b00237df3e358c04f795d2d29477cbd7e46971ebdead6/", "Family": "Parallel corpora", "Description": "The corpus contains texts from 2003.", - "Languages": ["English-Romanian"], + "Language": ["English-Romanian"], "Licence": "MS-BY-NC-ND", "Size": ["1.6 million tokens"], "Annotation": [], diff --git a/corpora/parallel-corpora/nor-spa-parallel.json b/corpora/parallel-corpora/nor-spa-parallel.json index aa9e8aa..c7f3ef0 100644 --- a/corpora/parallel-corpora/nor-spa-parallel.json +++ b/corpora/parallel-corpora/nor-spa-parallel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11509/73", "Family": "Parallel corpora", "Description": "This corpus contains fictional and non-fictional texts from 2000 to 2009.\nThe corpus is available through the concordancer Corpuscle and for download in the CLARINO repository.", - "Languages": ["Norwegian-Spanish"], + "Language": ["Norwegian-Spanish"], 
"Licence": "CLARIN ACA", "Size": ["6 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/opensubtitles.json b/corpora/parallel-corpora/opensubtitles.json index 081d1e4..54fd844 100644 --- a/corpora/parallel-corpora/opensubtitles.json +++ b/corpora/parallel-corpora/opensubtitles.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23C4-B", "Family": "Parallel corpora", "Description": "This corpus contains subtitles from the OpenSubtitles website.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["54 languages"], + "Language": ["54 languages"], "Licence": "Open For Reuse With Restrictions ", "Size": ["8.31G tokens"], "Annotation": ["tokenised", "sentence and word aligned"], diff --git a/corpora/parallel-corpora/opus-helsinki.json b/corpora/parallel-corpora/opus-helsinki.json index 0e700a5..d06f012 100644 --- a/corpora/parallel-corpora/opus-helsinki.json +++ b/corpora/parallel-corpora/opus-helsinki.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2015102201", "Family": "Parallel corpora", "Description": "This is a multilingual variant of the OPUS corpus that contains texts in the following languages: Czech, Danish, Dutch, English, Estonian, French, German, Greek, Hungarian, Italian, Polish, Portuguese, Russian, Swedish, Spanish, and Turkish.\nThe corpus is available through the concordancer Korp.", - "Languages": ["16 languages"], + "Language": ["16 languages"], "Licence": "CC-BY", "Size": ["2.7 billion tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/opus.json b/corpora/parallel-corpora/opus.json index dd86329..2aff9b4 100644 --- a/corpora/parallel-corpora/opus.json +++ b/corpora/parallel-corpora/opus.json @@ -3,7 +3,7 @@ "URL": "http://opus.lingfil.uu.se/", "Family": "Parallel corpora", "Description": "This corpus contains various subcorpora that compile texts from a great number of domains, such as literary texts, 
political documents, subtitles, UN documents, and the debates of the European Parliament.\nThe corpus is available for download from a dedicated webpage and through a dedicated concordancer.", - "Languages": ["Approx. 100 languages"], + "Language": ["Approx. 100 languages"], "Licence": "CC-BY", "Size": ["A great many subcorpora"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/pages.json b/corpora/parallel-corpora/pages.json index be1b1d4..7a137fd 100644 --- a/corpora/parallel-corpora/pages.json +++ b/corpora/parallel-corpora/pages.json @@ -3,7 +3,7 @@ "URL": "https://www.corpuspages.eu/corpus/about/about", "Family": "Parallel corpora", "Description": "This corpus is comprised of two major parts: the core corpus and the supplements.\nThe core corpus is comprised of original texts in German and Spanish and their respective translations, as well as a small percentage (approx. 6%) of German and Spanish texts translated from a third language. The core corpus includes samples from 178 works of fiction (novels and short stories) as well as samples from non-fiction (essays and popular texts).\nThe text have been manually verified at different levels and the automatic alignment of the bisegments, performed by LF-Aligner, has been manually reviewed. The German texts have been lemmatized and PoS-tagged with Treetagger (part of the PoS taggers and lemmatizers Resource Family) and the Spanish texts with Freeling. 
The tags of both have been mapped to the Universal PoS tags.\nThe supplements include so far: Europarl v7, a corpus that collects the proceedings (Verbatim reports) of the European Parliament from 1996 to 2011 (also part of the Parliamentary Corpora Resource Family); and Ted-Talks (part of this family), a corpus that collects the German and Spanish translations of the transcriptions of Ted-Talks from 2006 to 2020.\nThe corpus is available for online browsing via a dedicated interface.", - "Languages": ["German-Spanish"], + "Language": ["German-Spanish"], "Licence": "Terms of Use", "Size": ["Main part: 38 million tokens; 1.1 million bisegments (alignments). Supplements: 80 million tokens"], "Annotation": ["sentence aligned", "PoS-tagged", "lemmatised"], diff --git a/corpora/parallel-corpora/panacea-eng-fra-eng-ell.json b/corpora/parallel-corpora/panacea-eng-fra-eng-ell.json index da38a53..151a6b1 100644 --- a/corpora/parallel-corpora/panacea-eng-fra-eng-ell.json +++ b/corpora/parallel-corpora/panacea-eng-fra-eng-ell.json @@ -3,7 +3,7 @@ "URL": "https://catalog.elra.info/en-us/repository/browse/ELRA-W0057/", "Family": "Parallel corpora", "Description": "This corpus contains environmental and legislative texts in English and their French and Greek translations.\nThe corpus is available for download from the ELRA catalogue.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "ELRA END USER", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/para-eng-gle.json b/corpora/parallel-corpora/para-eng-gle.json index 5a783c4..2bc6139 100644 --- a/corpora/parallel-corpora/para-eng-gle.json +++ b/corpora/parallel-corpora/para-eng-gle.json @@ -3,7 +3,7 @@ "URL": "http://www.gaois.ie/crp/en/", "Family": "Parallel corpora", "Description": "This corpus contains legal texts.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["English-Irish"], + "Language": ["English-Irish"], "Licence": "", "Size": [], "Annotation": 
["sentence aligned "], diff --git a/corpora/parallel-corpora/para-global-voices.json b/corpora/parallel-corpora/para-global-voices.json index f10053a..202d1b6 100644 --- a/corpora/parallel-corpora/para-global-voices.json +++ b/corpora/parallel-corpora/para-global-voices.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25DD-E", "Family": "Parallel corpora", "Description": "This corpus contains texts crawled from the Global Voices webpage.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Approx. 50 languages"], + "Language": ["Approx. 50 languages"], "Licence": "CC-BY", "Size": ["174,629 documents"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/paracrawl.json b/corpora/parallel-corpora/paracrawl.json index 8e523fc..a1fe25f 100644 --- a/corpora/parallel-corpora/paracrawl.json +++ b/corpora/parallel-corpora/paracrawl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-2610", "Family": "Parallel corpora", "Description": "This corpus contains webcrawled data in the following languages: Czech, Dutch, English, Estonian, Finnish, French, German, Italian, Latvian, Polish, Portuguese, Romanian, Russian, and Spanish.\nThe corpus is available for download from LINDAT. 
Additionally, the 2.0 version of the corpus, which includes six new languages (Irish, Croatian, Maltese, Lithuanian, Hungarian, and Estonian), can be downloaded from the corpus's dedicated website.", - "Languages": ["11 languages"], + "Language": ["11 languages"], "Licence": "CC Zero", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/parallel-bible.json b/corpora/parallel-corpora/parallel-bible.json index c4a24df..07352ec 100644 --- a/corpora/parallel-corpora/parallel-bible.json +++ b/corpora/parallel-corpora/parallel-bible.json @@ -3,7 +3,7 @@ "URL": "http://cts.informatik.uni-leipzig.de/", "Family": "Parallel corpora", "Description": "This corpus contains historical and contemporary translations of the Bible.", - "Languages": ["Approx. 100 languages"], + "Language": ["Approx. 100 languages"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/parallel-kde4.json b/corpora/parallel-corpora/parallel-kde4.json index 7e177b0..f8a0a47 100644 --- a/corpora/parallel-corpora/parallel-kde4.json +++ b/corpora/parallel-corpora/parallel-kde4.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-258F-6", "Family": "Parallel corpora", "Description": "This corpus contains KDE4 localization files.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["92 languages"], + "Language": ["92 languages"], "Licence": "CC-BY", "Size": ["60 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/parallel-wiki.json b/corpora/parallel-corpora/parallel-wiki.json index ffbc703..fa6a1d5 100644 --- a/corpora/parallel-corpora/parallel-wiki.json +++ b/corpora/parallel-corpora/parallel-wiki.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/parallel-wiki-english-german/424504fadc7411e5aa0b00237df3e358380903df1e5642df8582c0a852c36359/", "Family": "Parallel corpora", "Description": "This corpus contains Wikipedia texts in the 
following language pairs: English-German, English-Romanian, and English-Spanish.", - "Languages": ["4 languages"], + "Language": ["4 languages"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/parcor.json b/corpora/parallel-corpora/parcor.json index c1035c1..8f28eda 100644 --- a/corpora/parallel-corpora/parcor.json +++ b/corpora/parallel-corpora/parcor.json @@ -3,7 +3,7 @@ "URL": "http://opus.lingfil.uu.se/ParCor/", "Family": "Parallel corpora", "Description": "This corpus contains TED talks and EU Bookshop publications.\nThe corpus is available for download from the OPUS webpage.", - "Languages": ["English-German"], + "Language": ["English-German"], "Licence": "", "Size": [], "Annotation": ["pronoun coreference"], diff --git a/corpora/parallel-corpora/parfin.json b/corpora/parallel-corpora/parfin.json index 8eb3552..944a420 100644 --- a/corpora/parallel-corpora/parfin.json +++ b/corpora/parallel-corpora/parfin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052710", "Family": "Parallel corpora", "Description": "This corpus contains literary texts from 1990 to 2010.\nThe corpus is available through the concordancer Korp.", - "Languages": ["Finnish-Russian"], + "Language": ["Finnish-Russian"], "Licence": "CLARIN RES", "Size": ["360,000 tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/parice.json b/corpora/parallel-corpora/parice.json index 64e9ee1..b2e4f53 100644 --- a/corpora/parallel-corpora/parice.json +++ b/corpora/parallel-corpora/parice.json @@ -3,7 +3,7 @@ "URL": "https://clarin.is/en/resources/parice/", "Family": "Parallel corpora", "Description": "This corpus contains Icelandic and English texts from 11 different sources.\nThe corpus is available for download from CLARIN-IS and for search through the concordancer Korp.", - "Languages": ["Icelandic-English"], + "Language": ["Icelandic-English"], "Licence": "CC-BY 4.0", "Size": ["3,589,000 sentence pairs"], 
"Annotation": ["tokenised", "PoS-tagged", "sentence-aligned", "word-aligned"], diff --git a/corpora/parallel-corpora/parrus.json b/corpora/parallel-corpora/parrus.json index 5a8f06c..4105d38 100644 --- a/corpora/parallel-corpora/parrus.json +++ b/corpora/parallel-corpora/parrus.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-20140730173", "Family": "Parallel corpora", "Description": "This corpus contains texts from classical and 20th century literature.\nThe corpus is available through the concordancer Korp.", - "Languages": ["Russian-Finnish"], + "Language": ["Russian-Finnish"], "Licence": "CLARIN RES", "Size": ["5.9 million tokens"], "Annotation": ["tokenised", "paragraph-aligned"], diff --git a/corpora/parallel-corpora/pelcra-clarin.json b/corpora/parallel-corpora/pelcra-clarin.json index a306115..d8f65f8 100644 --- a/corpora/parallel-corpora/pelcra-clarin.json +++ b/corpora/parallel-corpora/pelcra-clarin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23C2-D", "Family": "Parallel corpora", "Description": "This corpus contains texts from the CORDIC and RAPID websites, and the press releases of the European Parliament and the European Southern Observatory.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["25 languages"], + "Language": ["25 languages"], "Licence": "CC-BY", "Size": ["143 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/pelcra.json b/corpora/parallel-corpora/pelcra.json index 3ae4b28..53f4251 100644 --- a/corpora/parallel-corpora/pelcra.json +++ b/corpora/parallel-corpora/pelcra.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/pelcra-mutlilingual-parallel-corpora-cc-by/f05f0f1c63f111e2bff4525400d76147f9863da9a70143bebd894e55197705a1/", "Family": "Parallel corpora", "Description": "This corpus contains texts from the CORDIC and RAPID websites, and the press releases of the European Parliament and 
the European Southern Observatory.\nThe corpus is available for download from META-SHARE.", - "Languages": ["25 languages"], + "Language": ["25 languages"], "Licence": "CC-BY", "Size": ["143 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/pol-bul-rus-parallel.json b/corpora/parallel-corpora/pol-bul-rus-parallel.json index 897d50c..bbd623a 100644 --- a/corpora/parallel-corpora/pol-bul-rus-parallel.json +++ b/corpora/parallel-corpora/pol-bul-rus-parallel.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11321/308", "Family": "Parallel corpora", "Description": "This corpus is available for download from the CLARIN PL repository.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "IS PAS corpora license", "Size": ["55 texts"], "Annotation": [], diff --git a/corpora/parallel-corpora/pol-lit-parallel.json b/corpora/parallel-corpora/pol-lit-parallel.json index de5ba07..6b0707e 100644 --- a/corpora/parallel-corpora/pol-lit-parallel.json +++ b/corpora/parallel-corpora/pol-lit-parallel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/309", "Family": "Parallel corpora", "Description": "The corpus is available for download from the CLARIN-PL repository.", - "Languages": ["Polish-Lithuanian"], + "Language": ["Polish-Lithuanian"], "Licence": "IS PAS", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/qtleap-news.json b/corpora/parallel-corpora/qtleap-news.json index 8f85014..45711ac 100644 --- a/corpora/parallel-corpora/qtleap-news.json +++ b/corpora/parallel-corpora/qtleap-news.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/qtleap-news-corpus/834dfa6ca12a11e6a2aa782bcb074135a731473a10da4b04b0523d389854400d/", "Family": "Parallel corpora", "Description": "This corpus contains news articles in the following language pairs: English-Czech, English-German and English-Spanish.", - "Languages": ["4 languages"], + "Language": ["4 languages"], 
"Licence": "CC-BY", "Size": ["1,104 sentences"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/qtleap.json b/corpora/parallel-corpora/qtleap.json index 0ccc7e0..c3968f3 100644 --- a/corpora/parallel-corpora/qtleap.json +++ b/corpora/parallel-corpora/qtleap.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/qtleap-corpus-v12/0176c39ae9cd11e4a2aa782bcb074135ba7d767f645a48dca1d50ee3c9504253/", "Family": "Parallel corpora", "Description": "This corpus contains texts related to computer and IT troubleshooting for the following language pairs: Bulgarian-English, Czech-English, Portuguese-English, Spanish-English, and Basque-English\nThe corpus available for download from META-SHARE under the CC-BY license.", - "Languages": ["5 languages"], + "Language": ["5 languages"], "Licence": "CC-BY", "Size": ["140,000 tokens"], "Annotation": ["sentence-aligned"], diff --git a/corpora/parallel-corpora/qtlp-deu-ell-medical.json b/corpora/parallel-corpora/qtlp-deu-ell-medical.json index 036c5cb..b51e699 100644 --- a/corpora/parallel-corpora/qtlp-deu-ell-medical.json +++ b/corpora/parallel-corpora/qtlp-deu-ell-medical.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2458-5", "Family": "Parallel corpora", "Description": "This corpus contains medical texts. 
Almost all of the acquired documents were acquired from the official site of the European Union.", - "Languages": ["German-Greek"], + "Language": ["German-Greek"], "Licence": "MS-NC-NoReD", "Size": ["2,752 pairs of sentences"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/qtlp-eng-ell-automotive.json b/corpora/parallel-corpora/qtlp-eng-ell-automotive.json index b6ead54..9ef6967 100644 --- a/corpora/parallel-corpora/qtlp-eng-ell-automotive.json +++ b/corpora/parallel-corpora/qtlp-eng-ell-automotive.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2453-A", "Family": "Parallel corpora", "Description": "This corpus contains automatically detected pairs of parallel documents that were acquired from the web (i.e. from multilingual sites which contain content in the targeted languages and domain). The majority of the crawled sites were: i) websites of automobile manufacturers and ii) websites of companies that produce car accessories or car parts.", - "Languages": ["English-Greek"], + "Language": ["English-Greek"], "Licence": "MS-NC-NoReD", "Size": ["2,946 sentence pairs"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/qtlp-eng-ell-medical.json b/corpora/parallel-corpora/qtlp-eng-ell-medical.json index b8b7838..379b467 100644 --- a/corpora/parallel-corpora/qtlp-eng-ell-medical.json +++ b/corpora/parallel-corpora/qtlp-eng-ell-medical.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/ATHENA-0000-0000-2457-6", "Family": "Parallel corpora", "Description": "This corpus contains automatically detected pairs of parallel documents that were acquired from the web (i.e. from multilingual sites which contain content in the targeted languages and domain). The majority of the crawled sites were: i) websites that contain abstracts of scientific papers and ii) websites of organizations from the public or private sector that are related to medical/health services (e.g. 
medical centers, institutes, hospitals, etc.).", - "Languages": ["English-Greek"], + "Language": ["English-Greek"], "Licence": "MS-NC-NoReD", "Size": ["62,452 sentence pairs"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/qtlp-por-ell-automotive.json b/corpora/parallel-corpora/qtlp-por-ell-automotive.json index 8626dd7..9f203a0 100644 --- a/corpora/parallel-corpora/qtlp-por-ell-automotive.json +++ b/corpora/parallel-corpora/qtlp-por-ell-automotive.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2452-B", "Family": "Parallel corpora", "Description": "This corpus contains automatically detected pairs of parallel documents that were acquired from the web (i.e. from multilingual sites which contain content in the targeted languages and domain). The majority of the crawled sites were: i) websites of automobile manufacturers and ii) websites of companies that produce car accessories or car parts. ", - "Languages": ["Portuguese-Greek"], + "Language": ["Portuguese-Greek"], "Licence": "MS-NC-NoReD", "Size": ["59,297 sentence pairs"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/qtlp-por-ell-medical.json b/corpora/parallel-corpora/qtlp-por-ell-medical.json index 632f8d4..d6b77a2 100644 --- a/corpora/parallel-corpora/qtlp-por-ell-medical.json +++ b/corpora/parallel-corpora/qtlp-por-ell-medical.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2455-8", "Family": "Parallel corpora", "Description": "This corpus contains medical texts. 
Almost all of the acquired documents were acquired from the official site of the European Union.", - "Languages": ["Portuguese-Greek"], + "Language": ["Portuguese-Greek"], "Licence": "MS-NC-NoReD", "Size": ["62,608 sentence pairs"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/reveal-this.json b/corpora/parallel-corpora/reveal-this.json index 7cc4d3f..e5c3f95 100644 --- a/corpora/parallel-corpora/reveal-this.json +++ b/corpora/parallel-corpora/reveal-this.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/reveal-this-corpus/3d1c4f0e614a11e289f6842b2b6a04d7c69218b49aab4ab78249dd27add9d4a3/", "Family": "Parallel corpora", "Description": "This is a multilingual corpus of English, French and Greek.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "under negotiation", "Size": ["325,000 words"], "Annotation": [], diff --git a/corpora/parallel-corpora/scielo.json b/corpora/parallel-corpora/scielo.json index 0008b20..711bf23 100644 --- a/corpora/parallel-corpora/scielo.json +++ b/corpora/parallel-corpora/scielo.json @@ -3,7 +3,7 @@ "URL": "http://www.statmt.org/wmt16/biomedical-translation-task.html", "Family": "Parallel corpora", "Description": "This corpus contains scientific articles from the Scielo database in the following language pairs: English-French, English-Spanish, and English-Portuguese.", - "Languages": ["4 languages"], + "Language": ["4 languages"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/setimes-clarin.json b/corpora/parallel-corpora/setimes-clarin.json index 2741345..01a1959 100644 --- a/corpora/parallel-corpora/setimes-clarin.json +++ b/corpora/parallel-corpora/setimes-clarin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2591-2", "Family": "Parallel corpora", "Description": "This corpus contains texts from the setimes.com website.\nThe corpus is available for download from the CLARIN:EL repository.", - 
"Languages": ["9 languages"], + "Language": ["9 languages"], "Licence": "CC-BY", "Size": ["43 million tokens"], "Annotation": ["partially sentence aligned"], diff --git a/corpora/parallel-corpora/setimes.json b/corpora/parallel-corpora/setimes.json index afda94d..00a737f 100644 --- a/corpora/parallel-corpora/setimes.json +++ b/corpora/parallel-corpora/setimes.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/south-east-european-parallel-corpus/d200935e67cc11e28a985ef2e4e6c59ef6e70e681f7745a191deeb0b0537e60a/", "Family": "Parallel corpora", "Description": "This corpus contains texts from the setimes.com website.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["10 languages"], + "Language": ["10 languages"], "Licence": "CC-BY", "Size": ["43 million tokens"], "Annotation": ["partially sentence aligned"], diff --git a/corpora/parallel-corpora/slenwac.json b/corpora/parallel-corpora/slenwac.json index 5caa3be..462b4ea 100644 --- a/corpora/parallel-corpora/slenwac.json +++ b/corpora/parallel-corpora/slenwac.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1061", "Family": "Parallel corpora", "Description": "This corpus contains texts crawled from top-level Slovenian .si domains. The corpus was built with Spidextor, a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. 
The accuracy of the extracted bitext on the segment level is around 67% and on the word level around 68%.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["Slovenian-English"], + "Language": ["Slovenian-English"], "Licence": "CLARIN.SI User License for Internet Corpora", "Size": ["718,315 tokens"], "Annotation": ["tokenised", "sentenced-aligned"], diff --git a/corpora/parallel-corpora/slk-eng-parallel.json b/corpora/parallel-corpora/slk-eng-parallel.json index ee596a3..a47c876 100644 --- a/corpora/parallel-corpora/slk-eng-parallel.json +++ b/corpora/parallel-corpora/slk-eng-parallel.json @@ -3,7 +3,7 @@ "URL": "https://metashare.korpus.sk/repository/browse/slovak-english-parallel-corpus-free/719fe518665b11e2b2e800163e0000784f67c82f54f44ce3b06749baf33bfbd2/", "Family": "Parallel corpora", "Description": "This corpus contains texts from language books. It is bidirectional.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["Slovak-English"], + "Language": ["Slovak-English"], "Licence": "proprietary", "Size": ["556 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/spc.json b/corpora/parallel-corpora/spc.json index 8182ba2..3b2e523 100644 --- a/corpora/parallel-corpora/spc.json +++ b/corpora/parallel-corpora/spc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-25D3-8", "Family": "Parallel corpora", "Description": "This corpus contains legal texts in English, Afrikaans, Chinese, and Greek.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["4 languages"], + "Language": ["4 languages"], "Licence": "Open For Reuse With Restrictions", "Size": ["1.32 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/srenwac.json b/corpora/parallel-corpora/srenwac.json index 8fa4080..2c9ef0d 100644 --- a/corpora/parallel-corpora/srenwac.json +++ 
b/corpora/parallel-corpora/srenwac.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1059", "Family": "Parallel corpora", "Description": "This corpus contains texts crawled from top-level Serbian .rs domains. The corpus was built with Spidextor, a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. The accuracy of the extracted bitext, given the evaluation results on other languages, can be estimated at 74% on the sentence level and 76% on the word level.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["Serbian-English"], + "Language": ["Serbian-English"], "Licence": "CLARIN.SI User License for Internet Corpora", "Size": ["23.1 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/szeged-parallel.json b/corpora/parallel-corpora/szeged-parallel.json index a029b4b..dd086cb 100644 --- a/corpora/parallel-corpora/szeged-parallel.json +++ b/corpora/parallel-corpora/szeged-parallel.json @@ -3,7 +3,7 @@ "URL": "http://rgai.inf.u-szeged.hu/index.php?lang=en&page=corpus_paralell", "Family": "Parallel corpora", "Description": "This corpus contains literary texts and texts on the European Union.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["English-Hungarian"], + "Language": ["English-Hungarian"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/parallel-corpora/tatoeba.json b/corpora/parallel-corpora/tatoeba.json index 0943e49..060364c 100644 --- a/corpora/parallel-corpora/tatoeba.json +++ b/corpora/parallel-corpora/tatoeba.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-2589-C", "Family": "Parallel corpora", "Description": "This corpus contains texts from the Tatoeba website.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["117 languages"], + "Language": ["117 languages"], "Licence": "CC-BY", "Size": ["12 million tokens"], 
"Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/ted-para.json b/corpora/parallel-corpora/ted-para.json index 4cabd7d..06df9dc 100644 --- a/corpora/parallel-corpora/ted-para.json +++ b/corpora/parallel-corpora/ted-para.json @@ -3,7 +3,7 @@ "URL": "https://github.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus", "Family": "Parallel corpora", "Description": "This corpus contains TED talks in English and translations into the following languages: Arabic, Simplified Chinese, Traditional Chinese, Dutch, French, German, Hebrew, Italian, Japanese, Korean, and Russian.\nThe corpus is available for download from GIT-HUB.", - "Languages": ["11 languages"], + "Language": ["11 languages"], "Licence": "", "Size": ["300,000 sentences"], "Annotation": [], diff --git a/corpora/parallel-corpora/tourism-eng-hrv.json b/corpora/parallel-corpora/tourism-eng-hrv.json index ea34edd..17e67c3 100644 --- a/corpora/parallel-corpora/tourism-eng-hrv.json +++ b/corpora/parallel-corpora/tourism-eng-hrv.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1049", "Family": "Parallel corpora", "Description": "This corpus contains automatically crawled texts from 25 tourist websites.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["English-Croatian"], + "Language": ["English-Croatian"], "Licence": "CLARIN.SI User Licence for Internet Corpora", "Size": ["140,000 tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/tris.json b/corpora/parallel-corpora/tris.json index d7d4c86..daa05cb 100644 --- a/corpora/parallel-corpora/tris.json +++ b/corpora/parallel-corpora/tris.json @@ -3,7 +3,7 @@ "URL": "http://clara.b.uib.no/fellows/carla-parra-escartin/tris/", "Family": "Parallel corpora", "Description": "This corpus contains texts from the European Commission from 1997 to 2010.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["German-Spanish"], + 
"Language": ["German-Spanish"], "Licence": "", "Size": ["1.76 million tokens"], "Annotation": ["tokenised", "sentence-aligned"], diff --git a/corpora/parallel-corpora/ufal-nor-levantine.json b/corpora/parallel-corpora/ufal-nor-levantine.json index 62481fb..47b7f4d 100644 --- a/corpora/parallel-corpora/ufal-nor-levantine.json +++ b/corpora/parallel-corpora/ufal-nor-levantine.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-5033", "Family": "Parallel corpora", "Description": "This corpus contains multiparallel sentences in English, French, German, Greek, Spanish, and Standard Arabic.\nThe sentences have been selected from the OpenSubtitles2018 corpus and are manually translated into the North Levantine Arabic language.\nThe corpus is available for download from LINDAT.", - "Languages": ["6 languages"], + "Language": ["6 languages"], "Licence": "CC BY-NC-SA 4.0", "Size": ["844,200 sentences", "6.2 million words"], "Annotation": ["sentence aligned"], diff --git a/corpora/parallel-corpora/umc-ces-rus-eng.json b/corpora/parallel-corpora/umc-ces-rus-eng.json index 3c7e5bf..762ac78 100644 --- a/corpora/parallel-corpora/umc-ces-rus-eng.json +++ b/corpora/parallel-corpora/umc-ces-rus-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-4909-7", "Family": "Parallel corpora", "Description": "This corpus contains news articles and commentaries in Czech, Russian, and English from the Project Syndicate website from 1995 to 2008.\nThe corpus is available for download from LINDAT and through the concordancer Korp.", - "Languages": ["3 languages"], + "Language": ["3 languages"], "Licence": "CC-BY", "Size": ["1.8 million tokens"], "Annotation": ["tokenised", "sentence aligned"], diff --git a/corpora/parallel-corpora/un-parallel.json b/corpora/parallel-corpora/un-parallel.json index 93c418f..a28baf3 100644 --- a/corpora/parallel-corpora/un-parallel.json +++ b/corpora/parallel-corpora/un-parallel.json @@ -3,7 +3,7 @@ "URL": 
"https://conferences.unite.un.org/uncorpus", "Family": "Parallel corpora", "Description": "This corpus contains the official records and other parliamentary documents of the United Nations that are in the public domain in the following languages: English, Russian, Spanish, French, Chinese, and Arabic.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["6 languages"], + "Language": ["6 languages"], "Licence": "", "Size": ["335 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parallel-corpora/up-tap-opennlp.json b/corpora/parallel-corpora/up-tap-opennlp.json index d1489b2..7dae540 100644 --- a/corpora/parallel-corpora/up-tap-opennlp.json +++ b/corpora/parallel-corpora/up-tap-opennlp.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-588D-F", "Family": "Parallel corpora", "Description": "This parallel corpus contains texts extracted from the TAP UP magazine.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["English-Portuguese"], + "Language": ["English-Portuguese"], "Licence": "CC-BY", "Size": ["31,849 sentences"], "Annotation": ["PoS-tagged", "sentence aligned"], diff --git "a/corpora/parallel-corpora/\316\274topia.json" "b/corpora/parallel-corpora/\316\274topia.json" index 40a69fb..9d937db 100644 --- "a/corpora/parallel-corpora/\316\274topia.json" +++ "b/corpora/parallel-corpora/\316\274topia.json" @@ -3,7 +3,7 @@ "URL": "http://www.cs.cmu.edu/~lingwang/microtopia/#overview", "Family": "Parallel corpora", "Description": "This corpus contains tweets and blogposts in the following language pairs: English-Mandarin, English-Arabic, English-Russian, English-Korean, and English-Japanese.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["6 languages"], + "Language": ["6 languages"], "Licence": "", "Size": ["1.5 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parliamentary-corpora/aalto-fin-parla.json 
b/corpora/parliamentary-corpora/aalto-fin-parla.json index ad5fbf3..0a906ee 100644 --- a/corpora/parliamentary-corpora/aalto-fin-parla.json +++ b/corpora/parliamentary-corpora/aalto-fin-parla.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2022052002", "Family": "Parliamentary corpora", "Description": "This corpus, which consists of both audio recordings and transcriptions, is extracted from the Finnish parliamentary plenary session transcripts and videos by the Aalto Speech Recognition group. The original session transcripts and videos are available on the websites of the Parliament of Finland (see here and here). The corpus is split into three parts:\n
          \n
        1. the 2015–2020 set
        2. \n
        3. the 2008–2016 set
        4. \n
        5. development and test sets
        6. \n
        \nThe corpus is available for download from the Language Bank of Finland.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN PUB", "Size": ["119.3 million words", "3,130 hours of recordings"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/archives-parlementaires.json b/corpora/parliamentary-corpora/archives-parlementaires.json index a8159a8..4d22a0d 100644 --- a/corpora/parliamentary-corpora/archives-parlementaires.json +++ b/corpora/parliamentary-corpora/archives-parlementaires.json @@ -3,7 +3,7 @@ "URL": "https://sul-philologic.stanford.edu/philologic/archparl/", "Family": "Parliamentary corpora", "Description": "The Archives parlementaires is a chronologically-ordered edited collection of sources on the French Revolution. It was conceived in the mid 19th century as a project to produce a definitive record of parliamentary deliberations and also includes letters, reports, speeches, and other first-hand accounts from a great variety of published and archival sources. FRDA currently contains the AP volumes covering the years 1787-1794, which can be searched using ARTFL's PhiloLogic 4 open source software platform. The texts have been marked up using TEI so that speakers, places, dates, and terms in the published index can be easily found. Users can see both scanned images of the AP pages or just the texts. ", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/parliamentary-corpora/assemblee-nationale.json b/corpora/parliamentary-corpora/assemblee-nationale.json index dcad3b0..524ecb2 100644 --- a/corpora/parliamentary-corpora/assemblee-nationale.json +++ b/corpora/parliamentary-corpora/assemblee-nationale.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/fr-parl/v1", "Family": "Parliamentary corpora", "Description": "The corpus contains French parliamentary debates from 2002 to 2012. 
The contextual metadata in the corpus concern the dates of the council meetings, the description of the main topic(s) of the European council meeting, the place where the European Council meeting took place; they also correspond to information about the government and the legislative session. The speaker metadata correspond to name, gender, occupation, parliamentary group, political orientation and the opposition and majority division.\nThe corpus is available for download from Ortolang.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": ["137,000 tokens"], "Annotation": ["contextual and speaker metadata"], diff --git a/corpora/parliamentary-corpora/at-parlamentsreden.json b/corpora/parliamentary-corpora/at-parlamentsreden.json index 1d44931..669798c 100644 --- a/corpora/parliamentary-corpora/at-parlamentsreden.json +++ b/corpora/parliamentary-corpora/at-parlamentsreden.json @@ -3,7 +3,7 @@ "URL": "https://homepages.uni-regensburg.de/~sic07430/", "Family": "Parliamentary corpora", "Description": "The corpus contains Austrian parliamentary debates from 2013 to 2015. 
It is annotated with the Stanford Tagger.\nThe corpus currently is not available.", - "Languages": ["German (Austrian)"], + "Language": ["German (Austrian)"], "Licence": "", "Size": ["1.2 million tokens"], "Annotation": ["tokenised", "PoS-tagged"], diff --git a/corpora/parliamentary-corpora/bul-pol-jour-speech.json b/corpora/parliamentary-corpora/bul-pol-jour-speech.json index f7f6de9..ee8b6d7 100644 --- a/corpora/parliamentary-corpora/bul-pol-jour-speech.json +++ b/corpora/parliamentary-corpora/bul-pol-jour-speech.json @@ -3,7 +3,7 @@ "URL": "http://www.political.webclark.org/?locale=bg", "Family": "Parliamentary corpora", "Description": "The corpus contains Bulgarian parliamentary debates from 2006 to 2012.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["bul"], + "Language": ["bul"], "Licence": "", "Size": ["10 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/bundestag-europe.json b/corpora/parliamentary-corpora/bundestag-europe.json index f299798..564cac6 100644 --- a/corpora/parliamentary-corpora/bundestag-europe.json +++ b/corpora/parliamentary-corpora/bundestag-europe.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/de-parl/v1", "Family": "Parliamentary corpora", "Description": "The corpus contains German parliamentary debates from 1998 to 2015. The contextual metadata in the corpus concern the dates of the council meetings, the description of the main topic(s) of the European council meeting, the place where the European Council meeting took place; they also correspond to information about the government and the legislative session. 
The speaker metadata correspond to name, gender, occupation, parliamentary group, political orientation and the opposition and majority division.\nThe corpus is available for download from Ortolang.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY", "Size": ["417,000 tokens"], "Annotation": ["contextual and speaker metadata"], diff --git a/corpora/parliamentary-corpora/cepic.json b/corpora/parliamentary-corpora/cepic.json index e4b8034..25eba8e 100644 --- a/corpora/parliamentary-corpora/cepic.json +++ b/corpora/parliamentary-corpora/cepic.json @@ -3,7 +3,7 @@ "URL": "https://digital.lib.hkbu.edu.hk/cepic/", "Family": "Parliamentary corpora", "Description": "The CEPIC consists of transcripts of speeches delivered by top political figures from Hong Kong, Beijing, Washington DC and London, as well as their translated/interpreted texts.\nThe main speech types of CEPIC include the reading of government reports such as policy addresses and budget speeches, Q&A at press conferences, parliamentary debates, as well as remarks delivered at bilateral meetings.\nThe corpus features a parallel display of up to six versions of the same speech segment, aligned at paragraph level.\nThe corpus is available for online querying through a dedicated concordancer.", - "Languages": ["zho", "eng"], + "Language": ["zho", "eng"], "Licence": "Terms of Use", "Size": ["6.5 million words"], "Annotation": ["PoS-tagged", "prosodic and paralinguistic features"], diff --git a/corpora/parliamentary-corpora/czech-parl-meetings.json b/corpora/parliamentary-corpora/czech-parl-meetings.json index 3dacdda..63d11ab 100644 --- a/corpora/parliamentary-corpora/czech-parl-meetings.json +++ b/corpora/parliamentary-corpora/czech-parl-meetings.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0005-CF9C-4", "Family": "Parliamentary corpora", "Description": "The corpus contains recordings of the parliamentary sessions as well as corresponding transcriptions.\nThe corpus is 
available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC-BY", "Size": ["88 hours", "0.5 million tokens"], "Annotation": ["error correction of transcriptions", "division into speech sections with speaker information"], diff --git a/corpora/parliamentary-corpora/czechparl.json b/corpora/parliamentary-corpora/czechparl.json index 3706153..262fe7c 100644 --- a/corpora/parliamentary-corpora/czechparl.json +++ b/corpora/parliamentary-corpora/czechparl.json @@ -3,7 +3,7 @@ "URL": "https://www.muni.cz/en/research/publications/914268", "Family": "Parliamentary corpora", "Description": "The corpus contains Czech parliamentary debates from 1993 to 2010. It is annotated with ajka.\nThe corpus is available through the Sketch Engine.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "", "Size": ["81.9 million tokens"], "Annotation": ["tokenised", "MSD-tagged and lemmatised"], diff --git a/corpora/parliamentary-corpora/danish-parliament.json b/corpora/parliamentary-corpora/danish-parliament.json index e988e7c..d9ee8ac 100644 --- a/corpora/parliamentary-corpora/danish-parliament.json +++ b/corpora/parliamentary-corpora/danish-parliament.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12115/44", "Family": "Parliamentary corpora", "Description": "The corpus contains Danish parliamentary debates from 2009 to 2017.\nThe corpus is available for download from the DK-CLARIN repository.", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CC-BY", "Size": ["40.6 million words"], "Annotation": ["no linguistic annotation"], diff --git a/corpora/parliamentary-corpora/dutchparl.json b/corpora/parliamentary-corpora/dutchparl.json index cc3f657..ed6c73f 100644 --- a/corpora/parliamentary-corpora/dutchparl.json +++ b/corpora/parliamentary-corpora/dutchparl.json @@ -3,7 +3,7 @@ "URL": "http://search.politicalmashup.nl/about.html", "Family": "Parliamentary corpora", "Description": "The corpus 
contains Dutch parliamentary debates from 1814 to 2014. It is annotated with Frog. See also the information on the schema used.\nThe corpus is available for download (the authors needs to be contacted) and is also accessible online through the Political Mashup environment.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": ["800 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/epic-uds.json b/corpora/parliamentary-corpora/epic-uds.json index 405febb..7931cd9 100644 --- a/corpora/parliamentary-corpora/epic-uds.json +++ b/corpora/parliamentary-corpora/epic-uds.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/21.11119/0000-0008-F519-8", "Family": "Parliamentary corpora", "Description": "This is a parallel and comparable corpus of speeches held in the European Parliament; the corpus follows the European Parliament Interpreting Corpora tradition of the EPIC and EPICG corpora. It contains original speeches from 2008 to 2013 by English, German, and Spanish native speakers and their interpretation (English to and from German; Spanish to English).\nAll transcripts in the corpus are based on videos of the European Parliament Proceedings published by the European Parliament.\nAnnotation includes typical characteristics of spoken language such as false starts, hesitations and truncated words. To obtain better results for source-target alignment as well as sentence parsing the transcripts were segmented using a main clause approach: compound sentences were segmented separately. For the second version of the corpus, the transcripts were processed clause by clause with the spaCy NLP tools; the data is encoded in CoNLL-U and provides universal PoS tags, fine-grained language-specific PoS tags as well as Universal Dependency syntactic relations. 
All data was enriched with relevant metadata such as source language, name of original speaker, speech timing, mode of delivery and delivery rate.\nThe corpus is available for download from CLARIN-D (Saarland University B-centre).", - "Languages": ["eng", "deu", "spa"], + "Language": ["eng", "deu", "spa"], "Licence": "CC BY-NC-SA 4.0", "Size": ["350,000 tokens", "20,000 sentences"], "Annotation": ["tokenised", "PoS-tagged", "syntactically parsed", "speech phenomena"], diff --git a/corpora/parliamentary-corpora/europarl-ell-eng.json b/corpora/parliamentary-corpora/europarl-ell-eng.json index c13dc2f..4629911 100644 --- a/corpora/parliamentary-corpora/europarl-ell-eng.json +++ b/corpora/parliamentary-corpora/europarl-ell-eng.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23DE-F", "Family": "Parliamentary corpora", "Description": "This corpus is a bilingual Greek-English subset of the Europal parallel corpus.\nThe corpus is available for download from the CLARIN:EL repository.", - "Languages": ["Greek-English"], + "Language": ["Greek-English"], "Licence": "CC ZERO", "Size": ["31.9 million words (English)", "1.2 million sentences (Greek)"], "Annotation": ["sentence aligned"], diff --git a/corpora/parliamentary-corpora/europarl.json b/corpora/parliamentary-corpora/europarl.json index 5bc46a9..9b765e3 100644 --- a/corpora/parliamentary-corpora/europarl.json +++ b/corpora/parliamentary-corpora/europarl.json @@ -3,7 +3,7 @@ "URL": "https://www.statmt.org/europarl/", "Family": "Parliamentary corpora", "Description": "This corpus contains parliamentary debates from the European Parliament from 1996 to 2011.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["21 languages"], + "Language": ["21 languages"], "Licence": "CC0", "Size": ["33.7 million tokens"], "Annotation": ["sentence/aligned"], diff --git a/corpora/parliamentary-corpora/german-pol-speeches.json b/corpora/parliamentary-corpora/german-pol-speeches.json index 
a42407d..173f206 100644 --- a/corpora/parliamentary-corpora/german-pol-speeches.json +++ b/corpora/parliamentary-corpora/german-pol-speeches.json @@ -3,7 +3,7 @@ "URL": "https://www.dwds.de/d/korpora/politische_reden", "Family": "Parliamentary corpora", "Description": "The corpus contains speeches by 200 important political figures for the period between 1982 and 2020.\nA large part of the corpus contains speeches by the holders of the four highest German state offices: the Federal President, the Federal Chancellor, the President of the Bundestag and Foreign Ministers with terms of offie between 1982 and 2020.\nThe corpus is available for online browsing through the DWDS platform and a subset encoded in XML with 6,685 speeches until 2019 can be downloaded.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC BY-SA 4.0", "Size": ["15,240 speeches", "27 million texts"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/gerparcor.json b/corpora/parliamentary-corpora/gerparcor.json index 7356683..ee493f9 100644 --- a/corpora/parliamentary-corpora/gerparcor.json +++ b/corpora/parliamentary-corpora/gerparcor.json @@ -3,7 +3,7 @@ "URL": "https://github.com/texttechnologylab/GerParCor", "Family": "Parliamentary corpora", "Description": "This corpus contains (mostly historical) German-language parliamentary proceedings from the 19th, 20th, and 21th centuries, including state and federal-level data. Additionally, the corpus contains conversions of scanned protocols and, in particular, of protocols in Fraktur converted via an OCR process based on Tesseract. All protocols were preprocessed by means of the NLP pipeline spaCy v3 and automatically annotated with metadata regarding their session date. 
The corpus is made available in the XML format of the UIMA project.\nThe corpus is available for download from GitHub.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "AGPL-3.0 Licence", "Size": [], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "sentence segmented", "NER-tagged", "morphology", "dependency parsed"], diff --git a/corpora/parliamentary-corpora/handeset.json b/corpora/parliamentary-corpora/handeset.json index c9dc523..9511519 100644 --- a/corpora/parliamentary-corpora/handeset.json +++ b/corpora/parliamentary-corpora/handeset.json @@ -3,7 +3,7 @@ "URL": "https://data.mendeley.com/datasets/xsvp45cbt4/2", "Family": "Parliamentary corpora", "Description": "This corpus contains English parliamentary debates from 1997 to 2017.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Open Parliament Licence V3.0 and Open Data Commons Open Database License (OdbL)", "Size": ["1251 motion-speech units taken from 129 separate debates"], "Annotation": ["sentiment tags"], diff --git a/corpora/parliamentary-corpora/hansard.json b/corpora/parliamentary-corpora/hansard.json index 084337d..cc853e8 100644 --- a/corpora/parliamentary-corpora/hansard.json +++ b/corpora/parliamentary-corpora/hansard.json @@ -3,7 +3,7 @@ "URL": "http://www.clarin.ac.uk/hansard-corpus", "Family": "Parliamentary corpora", "Description": "The corpus contains British parliamentary debates from 1803 to 2005. 
It is semantically tagged with the USAS semantic tagger and the Historical Thesaurus Semantic Tagger (HTST).\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["1.6 billion tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "semantic tagging"], diff --git a/corpora/parliamentary-corpora/hellenic-parla.json b/corpora/parliamentary-corpora/hellenic-parla.json index 23ec826..d0f23c1 100644 --- a/corpora/parliamentary-corpora/hellenic-parla.json +++ b/corpora/parliamentary-corpora/hellenic-parla.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-57FA-5", "Family": "Parliamentary corpora", "Description": "The corpus contains Greek parliamentary debates for two periods: 1989-1994 and 1997-2018.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC", "Size": ["181 million words"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/house-of-commons-europe.json b/corpora/parliamentary-corpora/house-of-commons-europe.json index b371a37..019c4ed 100644 --- a/corpora/parliamentary-corpora/house-of-commons-europe.json +++ b/corpora/parliamentary-corpora/house-of-commons-europe.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/uk-parl/v1", "Family": "Parliamentary corpora", "Description": "The corpus contains British parliamentary debates from 1998 to 2015. The contextual metadata in the corpus concern the dates of the council meetings, the description of the main topic(s) of the European council meeting, the place where the European Council meeting took place; they also correspond to information about the government and the legislative session. 
The speaker metadata correspond to name, gender, occupation, parliamentary group, political orientation and the opposition and majority division.\nThe corpus is available for download from Ortolang.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC-BY", "Size": ["190,000 tokens"], "Annotation": ["contextual and speaker metadata"], diff --git a/corpora/parliamentary-corpora/icelandic-parla.json b/corpora/parliamentary-corpora/icelandic-parla.json index 805779c..a1deccc 100644 --- a/corpora/parliamentary-corpora/icelandic-parla.json +++ b/corpora/parliamentary-corpora/icelandic-parla.json @@ -3,7 +3,7 @@ "URL": "https://clarin.is/en/resources/parliament/", "Family": "Parliamentary corpora", "Description": "This corpus contains debates in the Icelandic parliament (Alþingi) from 1911 to 2017.\nThe corpus is available for download from CLARIN-IS (as a part of the Icelandic Gigaword Corpus) and for search through the concordancer Korp.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC-BY 4.0", "Size": ["238 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/kranjska.json b/corpora/parliamentary-corpora/kranjska.json index 130939e..28a8835 100644 --- a/corpora/parliamentary-corpora/kranjska.json +++ b/corpora/parliamentary-corpora/kranjska.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1824", "Family": "Parliamentary corpora", "Description": "The corpus contains meeting proceedings of 694 sessions of the Carniolan Provincial Assembly from 1861 to 1913.\nThe source data (scanned and OCR processed pdf documents) originally come from The Digital Library of Slovenia dLib.si and History of Slovenia - SIstory portals. The documents are bilingual, in Slovenian and German, depending on the speaker. 
German was first typeset in the Gothic script and later on in Latin.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Language was detected on the sentence level, roughly 58% sentences are in Slovenian and 42% in German. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using Trankit for Slovenian and German, while Lingua is used for language detection.\nThe documents are in the Parla-CLARIN compliant TEI XML format. Each session in one file.", - "Languages": ["deu", "slv"], + "Language": ["deu", "slv"], "Licence": "CC-BY 4.0", "Size": ["10.9 million words"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/large-czech-parl-hearings.json b/corpora/parliamentary-corpora/large-czech-parl-hearings.json index 1cb4838..f7afc16 100644 --- a/corpora/parliamentary-corpora/large-czech-parl-hearings.json +++ b/corpora/parliamentary-corpora/large-czech-parl-hearings.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11234/1-3126", "Family": "Parliamentary corpora", "Description": "This corpus contains audio recordings of Czech parliamentary sessions along with the corresponding transcriptions. 
The whole corpus has been segmented to short audio snippets making it suitable for both training and evaluation of automatic speech recognition (ASR) systems.\nThe corpus is available for download form the LINDAT reposiory.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY 4.0", "Size": ["444 hours"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/linkedsaeima.json b/corpora/parliamentary-corpora/linkedsaeima.json index b0656cd..aa54b8a 100644 --- a/corpora/parliamentary-corpora/linkedsaeima.json +++ b/corpora/parliamentary-corpora/linkedsaeima.json @@ -3,7 +3,7 @@ "URL": "http://dati.saeima.korpuss.lv/", "Family": "Parliamentary corpora", "Description": "The corpus contains Latvian parliamentary debates from 1993 to 2016.\nThe corpus is available through noSketchEngine.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "", "Size": ["12.5 million tokens"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/parliamentary-corpora/lit-parla-attribution.json b/corpora/parliamentary-corpora/lit-parla-attribution.json index 9a3a592..bb73169 100644 --- a/corpora/parliamentary-corpora/lit-parla-attribution.json +++ b/corpora/parliamentary-corpora/lit-parla-attribution.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/17", "Family": "Parliamentary corpora", "Description": "The corpus contains Lithuanian parliamentary debates from 1990 to 2013. 
It is annotated with Lemuoklis (morphological analyzer for lemmatization) and MaltParser (generation of dependency tags).\nThe corpus is available for download from the repository of CLARIN-LT.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN PUB", "Size": ["23.9 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/nor-parla-speech.json b/corpora/parliamentary-corpora/nor-parla-speech.json index b49b4a0..87654d8 100644 --- a/corpora/parliamentary-corpora/nor-parla-speech.json +++ b/corpora/parliamentary-corpora/nor-parla-speech.json @@ -3,7 +3,7 @@ "URL": "https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-58/", "Family": "Parliamentary corpora", "Description": "This corpus consists of audio recordings of meetings in Stortinget (the Norwegian parliament), and corresponding orthographic transcriptions in either Norwegian Bokmål or Norwegian Nynorsk, as well as various metadata about the speakers. The official proceedings from the meetings are also included in the corpus for reference.\nTranscription was first done automatically; subsequently, the output of the automatic process was manually checked and corrected by trained linguists and philologists. Finally, all transcriptions were proofread to ensure consistency and accuracy. 
The audio files in the corpus contain the speech of entire days of plenary meetings from 2017 and 2018 (or, if a meeting lasts more than six hours, the first six hours of a day).\nThe corpus is available for download from the Norwegian Language Bank.", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CC-ZERO", "Size": ["140 hours", "65,000 sentences", "1.2 million words"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/parlameter-hr9.json b/corpora/parliamentary-corpora/parlameter-hr9.json index 9026fe5..8191987 100644 --- a/corpora/parliamentary-corpora/parlameter-hr9.json +++ b/corpora/parliamentary-corpora/parlameter-hr9.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1209", "Family": "Parliamentary corpora", "Description": "The corpus contains minutes of the National Assembly of the Republic of Croatia and currently covers its VIth mandate from 15 November 2016 to 21 Nomveber 2018. The corpus contains speaker metadata (gender, age, education, party affiliation).\nThe corpus is available for download from the CLARIN.SI repository and through the concordancers KonText and noSketchEngine, as well as through a dedicated webpage.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC-BY", "Size": ["14.1 million tokens"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "named entities"], diff --git a/corpora/parliamentary-corpora/parlameter-sl.json b/corpora/parliamentary-corpora/parlameter-sl.json index 30cafa0..4da94cc 100644 --- a/corpora/parliamentary-corpora/parlameter-sl.json +++ b/corpora/parliamentary-corpora/parlameter-sl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1208", "Family": "Parliamentary corpora", "Description": "The corpus contains minutes of the National Assembly of the Republic of Slovenia and currently covers the VIIth mandate from 1 August 2014 to 22 June 2018. 
The corpus contains speaker metadata (gender, age, education, party affiliation).\nThe corpus is available for download from the CLARIN.SI repository and through the concordancers KonText and noSketchEngine, as well as through a dedicated dedicated webpage.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["41 million tokens"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "named entities"], diff --git a/corpora/parliamentary-corpora/parlamint-ana-30.json b/corpora/parliamentary-corpora/parlamint-ana-30.json index 80479bf..499bbe0 100644 --- a/corpora/parliamentary-corpora/parlamint-ana-30.json +++ b/corpora/parliamentary-corpora/parlamint-ana-30.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1488", "Family": "Parliamentary corpora", "Description": "ParlaMint is a multilingual set of comparable corpora containing parliamentary debates mostly starting at the end of 2015 and extending to mid 2022, with each corpus being between 9 and 125 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (after October 2019), the pre-Covid period or the period after 24 February 2022.\nThe corpora have extensive meta-data about the speakers (name, gender, party affiliation, MP status), are structured into time-stamped terms, sessions and meetings, with each speech being marked by its speaker and their role (chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancer noSketch Engine. 
Note that the version of the corpus without linguistic mark-up is available for download under a separate CLARIN.SI entry.", - "Languages": ["bos", "bul", "cat", "hrv", "ces", "dan", "nld", "eng", "est", "fra", "glg", "deu", "hun", "isl", "ita", "lav", "ell", "nor", "pol", "por", "rus", "srp", "slv", "spa", "swe", "tur", "ukr"], + "Language": ["bos", "bul", "cat", "hrv", "ces", "dan", "nld", "eng", "est", "fra", "glg", "deu", "hun", "isl", "ita", "lav", "ell", "nor", "pol", "por", "rus", "srp", "slv", "spa", "swe", "tur", "ukr"], "Licence": "CC BY 4.0", "Size": ["7.5 million utterances", "1.1 billion words"], "Annotation": ["tokenised", "MSD-tagged (Universal Dependencies)", "syntactically parsed (Universal Dependencies)", "named entities"], diff --git a/corpora/parliamentary-corpora/parlamint-en-ana-30.json b/corpora/parliamentary-corpora/parlamint-en-ana-30.json index d8875f3..768d3b1 100644 --- a/corpora/parliamentary-corpora/parlamint-en-ana-30.json +++ b/corpora/parliamentary-corpora/parlamint-en-ana-30.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1810", "Family": "Parliamentary corpora", "Description": "This corpus comprises linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint.ana 3.0 which were machine translated to English and the translation linguistically annotated.\nExcept for the translation to English, small changes in the metadata and the absence of the British parliament corpus, the corpora included in this entry are in all respects identical to the source language corpora, i.e. the entry comprises the same 26 European parliamentary corpora, together with over 1.1 billion words. The translation to English was done with EasyNMT withOPUS-MT models. Machine translation was done on the sentence level, and includes both speeches and transcriber notes, including headings. The linguistic annotation of the speeches, i.e. 
tokenisation, tagging with UD PoS and morphological features, lemmatisation, and NER annotation was done with Stanza , using the English language model. For NER the conll03 model with 4 NE classes was used.\nThe corpus is available for download from the CLARIN.SI repository and for browsing through concordancers noSketchEngine and KonText.", - "Languages": ["bos", "bul", "cat", "hrv", "ces", "dan", "nld", "eng", "est", "fra", "glg", "deu", "hun", "isl", "ita", "lav", "ell", "nor", "pol", "por", "rus", "srp", "slv", "spa", "swe", "tur", "ukr"], + "Language": ["bos", "bul", "cat", "hrv", "ces", "dan", "nld", "eng", "est", "fra", "glg", "deu", "hun", "isl", "ita", "lav", "ell", "nor", "pol", "por", "rus", "srp", "slv", "spa", "swe", "tur", "ukr"], "Licence": "CC BY 4.0", "Size": ["1.1 billion words"], "Annotation": ["tokenised", "MSD-tagged (Universal Dependencies)", "syntactically parsed (Universal Dependencies)", "named entities"], diff --git a/corpora/parliamentary-corpora/parlasent-bcs.json b/corpora/parliamentary-corpora/parlasent-bcs.json index 1f6648b..01d8adb 100644 --- a/corpora/parliamentary-corpora/parlasent-bcs.json +++ b/corpora/parliamentary-corpora/parlasent-bcs.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1585", "Family": "Parliamentary corpora", "Description": "This corpus consists of mid-length sentences from the Bosnian, Croatian, and Serbian parliamentary proceedings that are annotated with a 6-level sentiment schema. The date of the speech and the speaker name are given as well. 
If the speaker is MP, information on party, gender and year of birth are available as well.\nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["bos", "hrv", "srp"], + "Language": ["bos", "hrv", "srp"], "Licence": "CC BY-SA 4.0", "Size": ["2600 sentences"], "Annotation": ["sentiment analysis"], diff --git a/corpora/parliamentary-corpora/parlat-beta.json b/corpora/parliamentary-corpora/parlat-beta.json index ad4ef6b..7d3f0bb 100644 --- a/corpora/parliamentary-corpora/parlat-beta.json +++ b/corpora/parliamentary-corpora/parlat-beta.json @@ -3,7 +3,7 @@ "URL": "https://www.oeaw.ac.at/acdh/tools/parlat/", "Family": "Parliamentary corpora", "Description": "This corpus contains Austrian parliamentary proceedings from 1996 to 2017.\nCurrently in development, ParlAT is planned to be a monitor corpus with new material added over time.", - "Languages": ["German (Austrian)"], + "Language": ["German (Austrian)"], "Licence": "", "Size": ["75.2 million tokens"], "Annotation": ["tokenised", "linked data (e.g., speaker information)"], diff --git a/corpora/parliamentary-corpora/parlspeech.json b/corpora/parliamentary-corpora/parlspeech.json index 4880661..70c52f6 100644 --- a/corpora/parliamentary-corpora/parlspeech.json +++ b/corpora/parliamentary-corpora/parlspeech.json @@ -3,7 +3,7 @@ "URL": "https://doi.org/10.7910/DVN/L4OAKN", "Family": "Parliamentary corpora", "Description": "The corpus contains complete parliamentary speeches in the key legislative chambers of Austria, the Czech Republic, Germany, Denmark, the Netherlands, New Zealand, Spain, Sweden, and the United Kingdom, covering periods between 21 and 32 years.\nThe corpus is available for download from the Harvard Dataverse repository.", - "Languages": ["deu", "ces", "dan", "nld", "eng", "spa", "swe"], + "Language": ["deu", "ces", "dan", "nld", "eng", "spa", "swe"], "Licence": "CC0", "Size": ["6.3 million parliamentary speeches"], "Annotation": ["date, speaker, party, agenda item 
metadata"], diff --git a/corpora/parliamentary-corpora/plenary-fin-parla.json b/corpora/parliamentary-corpora/plenary-fin-parla.json index 330afaf..2b2352c 100644 --- a/corpora/parliamentary-corpora/plenary-fin-parla.json +++ b/corpora/parliamentary-corpora/plenary-fin-parla.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017020202", "Family": "Parliamentary corpora", "Description": "The corpus contains Finnish parliamentary debates from 2008 to 2016.\nThe corpus is available through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY", "Size": ["22.4 million tokens"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "syntactically parsed"], diff --git a/corpora/parliamentary-corpora/pol-parla.json b/corpora/parliamentary-corpora/pol-parla.json index 60959c2..c6e4cfa 100644 --- a/corpora/parliamentary-corpora/pol-parla.json +++ b/corpora/parliamentary-corpora/pol-parla.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11321/467", "Family": "Parliamentary corpora", "Description": "The corpus contains Polish parliamentary debates from 1991 to 2017. It is annotated with Morfeusz SGJP (morphological analyser), Pantera (disambiguating tagger), Spejd (shallow parser), Nerf (named entity recognizer).\nThe corpus is available for download from a dedicated webpage and through the concordancer NKJP. 
", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["300 million tokens"], "Annotation": ["tokenised, MSD-tagged, named entities, etc."], diff --git a/corpora/parliamentary-corpora/polminer.json b/corpora/parliamentary-corpora/polminer.json index d744b51..2629887 100644 --- a/corpora/parliamentary-corpora/polminer.json +++ b/corpora/parliamentary-corpora/polminer.json @@ -3,7 +3,7 @@ "URL": "https://github.com/PolMine", "Family": "Parliamentary corpora", "Description": "A small sample is available for download from the GitHub webpage of the corpus.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": ["Only a small sample available"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/proceedings-nor-parla.json b/corpora/parliamentary-corpora/proceedings-nor-parla.json index cc23514..45fbaad 100644 --- a/corpora/parliamentary-corpora/proceedings-nor-parla.json +++ b/corpora/parliamentary-corpora/proceedings-nor-parla.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/DA65-D02F-0EB0-9", "Family": "Parliamentary corpora", "Description": "The corpus contains Norwegian parliamentary debates from 2008 to 2015.\nThe corpus is available through the concordancer Corpuscle.", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "NLOD", "Size": ["29 million tokens"], "Annotation": ["tokenised", "sentence segmentation", "speaker metadata (name, party, time, type of utterance)"], diff --git a/corpora/parliamentary-corpora/ptparl.json b/corpora/parliamentary-corpora/ptparl.json index a7f03a9..7be9ae8 100644 --- a/corpora/parliamentary-corpora/ptparl.json +++ b/corpora/parliamentary-corpora/ptparl.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/21.11129/0000-000B-D33C-4", "Family": "Parliamentary corpora", "Description": "The corpus contains Portuguese parliamentary debates from 1970 to 2008. 
It is annotated with LX-Tokenizer, LX-Tagger, MBT, MBLEM (lemmatisation).\nThe corpus is available for download from the CLARIN PORTUGAL repository.", - "Languages": ["por"], + "Language": ["por"], "Licence": "CLARIN RES", "Size": ["1 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/riigikogu.json b/corpora/parliamentary-corpora/riigikogu.json index 940bc35..efac877 100644 --- a/corpora/parliamentary-corpora/riigikogu.json +++ b/corpora/parliamentary-corpora/riigikogu.json @@ -3,7 +3,7 @@ "URL": "http://www.cl.ut.ee/korpused/segakorpus/riigikogu/", "Family": "Parliamentary corpora", "Description": "The corpus contains Estonian parliamentary debates from 1995 to 2001.\nThe corpus is available for download from a dedicated webpage and through a concordancer on the same webpage.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN_ACA", "Size": ["13 million tokens"], "Annotation": ["tokenised"], diff --git a/corpora/parliamentary-corpora/riksdag-open-data.json b/corpora/parliamentary-corpora/riksdag-open-data.json index dd50427..830dc61 100644 --- a/corpora/parliamentary-corpora/riksdag-open-data.json +++ b/corpora/parliamentary-corpora/riksdag-open-data.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/eng/resources", "Family": "Parliamentary corpora", "Description": "The corpus contains Swedish parliamentary debates from 1971 to 2016. 
It is annotated with Sparv.\nThe corpus is available for download from Språkbanken (all entries with \"Riksdag's Open Data\" in the subtitle) and through the concordancer Korp.", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["1.25 billion tokens"], "Annotation": ["tokenised", "lemmatised"], diff --git a/corpora/parliamentary-corpora/saeima.json b/corpora/parliamentary-corpora/saeima.json index fea92f2..f2df73f 100644 --- a/corpora/parliamentary-corpora/saeima.json +++ b/corpora/parliamentary-corpora/saeima.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/20.500.12574/50", "Family": "Parliamentary corpora", "Description": "This corpus contains parliamentary debates from seven parliamentary terms (5th–12th Saeima) covering years 1993–2017. The available metadata for each utterance includes the date and type of the parliamentary session and speakers’ names and affiliations.\nThe corpus is available for online browsing through the noSketch Engine (CLARIN-LV) concordancer.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "", "Size": ["21 million words"], "Annotation": ["tokenised", "msd-tagged", "lemmatised", "syntactically parsed", "named entities"], diff --git a/corpora/parliamentary-corpora/siparl.json b/corpora/parliamentary-corpora/siparl.json index 6445a77..79d31c7 100644 --- a/corpora/parliamentary-corpora/siparl.json +++ b/corpora/parliamentary-corpora/siparl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1748", "Family": "Parliamentary corpora", "Description": "The corpus contains Slovenian parliamnetary debates from 1990 to 2022. 
It differs from the SlovParl 2.0 corpus (listed below) in that it contains only basic meta-data about the speakers, a typology of sessions and structural and editorian annotations.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancers KonText and noSketchEngine.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["213 million words"], "Annotation": ["tokenised", "PoS-tagged|", "lemmatised"], diff --git a/corpora/parliamentary-corpora/slovparl.json b/corpora/parliamentary-corpora/slovparl.json index 686bce0..bf36c90 100644 --- a/corpora/parliamentary-corpora/slovparl.json +++ b/corpora/parliamentary-corpora/slovparl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1167", "Family": "Parliamentary corpora", "Description": "The SlovParl corpus contains minutes of the Assembly of the Republic of Slovenia for the legislative period 1990-1992, i.e. it covers the period before, during, and after Slovenia became an independent country in 1991. The corpus comprises 232 sessions, 58,813 speeches and 10.8 million words. The corpus contains extensive meta-data about the speakers, a typology of sessions etc. 
and structural and editorial annotations.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["3.2 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/speeches-greek-parla.json b/corpora/parliamentary-corpora/speeches-greek-parla.json index fd8a2bd..0d4aa4a 100644 --- a/corpora/parliamentary-corpora/speeches-greek-parla.json +++ b/corpora/parliamentary-corpora/speeches-greek-parla.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-5808-5", "Family": "Parliamentary corpora", "Description": "This corpus contains speeches delivered by 5 members of parliament: Dimitris Anagnostakis, Nikos Tsoukalis, Paros Koukoulopoulos, Niki Founta, and Panayiotis Kammenos.\nThe corpus is available for download from the CLARIN:el repository.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC", "Size": ["258,036 words"], "Annotation": [], diff --git a/corpora/parliamentary-corpora/talk-of-norway.json b/corpora/parliamentary-corpora/talk-of-norway.json index 4131650..d0f1f1f 100644 --- a/corpora/parliamentary-corpora/talk-of-norway.json +++ b/corpora/parliamentary-corpora/talk-of-norway.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11509/123", "Family": "Parliamentary corpora", "Description": "The corpus contains Norwegian parliamentary debates from 1998 to 2016.\nThe corpus is available for download from the CLARINO repository.", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "NLOD", "Size": ["63.8 million tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/parliamentary-corpora/ukparl.json b/corpora/parliamentary-corpora/ukparl.json index 6926adf..c41ae8e 100644 --- a/corpora/parliamentary-corpora/ukparl.json +++ b/corpora/parliamentary-corpora/ukparl.json @@ -3,7 +3,7 @@ "URL": "https://federiconanni.com/%20ukparl/", "Family": "Parliamentary corpora", "Description": "This corpus contains British 
parliamentary debates of the House of Commons from 2013 to 2016.\nThe corpus is available for download from Google Drive.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["354,400 tokens"], "Annotation": ["fine-grained topic annotation", "additional semantic information (entity links)"], diff --git a/corpora/parliamentary-corpora/yu1parl.json b/corpora/parliamentary-corpora/yu1parl.json index b5c8e84..54a84e6 100644 --- a/corpora/parliamentary-corpora/yu1parl.json +++ b/corpora/parliamentary-corpora/yu1parl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1845", "Family": "Parliamentary corpora", "Description": "This historical parliamentary corpus contains meeting proceedings of the National Representation of the Kingdom of Yugoslavia from 191 to 1939. The corpus comprises 714 sessions.\nThe source data (scanned images of printed Stenographic Minutes) come from the History of Slovenia - SIstory portal. The images were OCR processed and the results saved as pdf, docx and txt. The documents are multilingual, in Serbo-Croatian and Slovenian, depending on the speaker. Serbo-Croatian is typeset in the Cyrillic (Serbian) or in the Latin (Croatian) alphabet.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Lingua was used for language detection on the sentence level. Roughly 59% of sentences are in Serbian (Cyrillic script), 38% in Croatian (Latin script) and 3% in Slovenian. Some sentences in German and French were also detected. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using CLASSLA for Serbian, Croatian and Slovenian. 
Words in Serbian (Cyrillic script) have lemmas in Latin script.\n The corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", - "Languages": ["hrv", "srp", "slv"], + "Language": ["hrv", "srp", "slv"], "Licence": "CC BY 4.0", "Size": ["34,542 utterances", "578,958 sentences", "13,271,885 words", "15,403 pages"], "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], diff --git a/corpora/reference-corpora/abnc.json b/corpora/reference-corpora/abnc.json index 642aed4..74c6439 100644 --- a/corpora/reference-corpora/abnc.json +++ b/corpora/reference-corpora/abnc.json @@ -3,7 +3,7 @@ "URL": "https://clarino.uib.no/abnc/page", "Family": "Reference corpora", "Description": "This corpus includes Abkhaz texts published between 1920 and 2016. The corpus is encoded in TEI.\nThe corpus is available for online browsing through the Corpuscle concordancer (CLARINO distribution).", - "Languages": ["abk"], + "Language": ["abk"], "Licence": "CLARIN_PUB-BY-NC-ND", "Size": ["10 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/bnc.json b/corpora/reference-corpora/bnc.json index dbcf47d..86509ac 100644 --- a/corpora/reference-corpora/bnc.json +++ b/corpora/reference-corpora/bnc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2554", "Family": "Reference corpora", "Description": "This corpus includes English texts (fiction, magazines, newspapers, and academic writing) published between 1980 and 1993.\nThe corpus is encoded in TEI. Non-linguistic metadata include contextual and bibliographic information. 
Aside from written materials, the corpus also includes transcriptions of spoken language.\nThe corpus is available for online browsing through a dedicated concordancer and can be downloaded from the Oxford Text Archive (CLARIN-UK).", - "Languages": ["English (British)"], + "Language": ["English (British)"], "Licence": "BNC User Licence (restricted for the downloadable version)", "Size": ["100 million words"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/bnrc.json b/corpora/reference-corpora/bnrc.json index 7d12b76..a451fa2 100644 --- a/corpora/reference-corpora/bnrc.json +++ b/corpora/reference-corpora/bnrc.json @@ -3,7 +3,7 @@ "URL": "http://webclark.org/?locale=en", "Family": "Reference corpora", "Description": "This corpus includes Bulgarian texts taken from news media, literature, and administrative documents between 1997 and 2002.\nThe tokenised corpus is available through WebCLaRK, while the PoS-tagged version is available only upon request.", - "Languages": ["bul"], + "Language": ["bul"], "Licence": "Individual terms of agreement", "Size": ["70 million tokens"], "Annotation": ["tokenized", "PoS-tagged"], diff --git a/corpora/reference-corpora/ccgigafida.json b/corpora/reference-corpora/ccgigafida.json index a940f55..e40a9eb 100644 --- a/corpora/reference-corpora/ccgigafida.json +++ b/corpora/reference-corpora/ccgigafida.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1035", "Family": "Reference corpora", "Description": "This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.\nThis corpus is a downloadable subset of the representative Gigafida corpus (version 1). 
It can be downloaded from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["126.9 million tokens", "103.2 million words", "31,722 texts"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/cckres.json b/corpora/reference-corpora/cckres.json index 884071b..e50fde6 100644 --- a/corpora/reference-corpora/cckres.json +++ b/corpora/reference-corpora/cckres.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1034", "Family": "Reference corpora", "Description": "This corpus includes balanced Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.\nThis corpus is a downloadable subset of the balanced Kres corpus. It can be downloaded from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC-BY", "Size": ["12.2 million tokens", "9.8 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/cnc.json b/corpora/reference-corpora/cnc.json index 5d81415..c6a25a6 100644 --- a/corpora/reference-corpora/cnc.json +++ b/corpora/reference-corpora/cnc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-233", "Family": "Reference corpora", "Description": "This corpus includes Croatian texts taken from newspapers, magazines, popular texts, and fiction.\nThe corpus is available for online browsing through the noSketch Engine.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "", "Size": ["101 million tokens"], "Annotation": "", diff --git a/corpora/reference-corpora/cogreek.json b/corpora/reference-corpora/cogreek.json index 9d64c9b..888a4ea 100644 --- a/corpora/reference-corpora/cogreek.json +++ b/corpora/reference-corpora/cogreek.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.grnet.gr/11500/UOA-0000-0000-2471-8", "Family": "Reference corpora", "Description": "This corpus includes representative Greek texts published between 1990 and 2010. Aside from written materials, the corpus also includes transcriptions of spoken language.\nThe corpus is available for online browsing through a dedicated concordancer.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC, ACA", "Size": ["27.6 million words"], "Annotation": "", diff --git a/corpora/reference-corpora/con-lit.json b/corpora/reference-corpora/con-lit.json index 67123f6..7106baf 100644 --- a/corpora/reference-corpora/con-lit.json +++ b/corpora/reference-corpora/con-lit.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.11821/16", "Family": "Reference corpora", "Description": "This corpus includes Lithuanian texts (mostly newspapers but also fiction, non-fiction, and specialised magazines) published between 1990 and 2008.\nThe corpus is encoded in TEI. Non-linguistic metadata includes bibliographic information. 
Aside from written materials, the corpus also contains transcriptions of spoken language.\nThe corpus is available for online browsing through a dedicated concordancer.", - "Languages": ["lit"], + "Language": ["lit"], "Licence": "CLARIN RES", "Size": ["208.4 million tokens"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/conae.json b/corpora/reference-corpora/conae.json index cd581d4..1a04f12 100644 --- a/corpora/reference-corpora/conae.json +++ b/corpora/reference-corpora/conae.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2019031901", "Family": "Reference corpora", "Description": "This corpus includes American English texts evenly divided into the spoken, fiction, magazine, newspaper, and academic genres (around 88 million words each) published between 1990 and 2012.\nThe corpus is available for download from the Finnish Language Bank as well as for online browsing through the concordancer Korp (FIN-CLARIN distribution).", - "Languages": ["English (American)"], + "Language": ["English (American)"], "Licence": ["CLARIN ACA (online version)", "CLARIN RES (downloadable version)"], "Size": ["440 million words", "190,000 texts"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/corcencc.json b/corpora/reference-corpora/corcencc.json index 455986b..dde7825 100644 --- a/corpora/reference-corpora/corcencc.json +++ b/corpora/reference-corpora/corcencc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2564", "Family": "Reference corpora", "Description": "This corpus contains spoken, written and digital (e-language) Welsh. 
The corpus is accompanied by an online teaching and learning toolkit – Y Tiwtiadur – which draws directly on the data from the corpus to provide resources for Welsh language learning at all ages and levels.\nThe corpus is available for online browsing through a dedicated webpage and by request.", - "Languages": ["cym"], + "Language": ["cym"], "Licence": "CC BY-NC-SA 4.0", "Size": ["11 million words"], "Annotation": "", diff --git a/corpora/reference-corpora/corpol.json b/corpora/reference-corpora/corpol.json index 9d9eb7c..082b0c1 100644 --- a/corpora/reference-corpora/corpol.json +++ b/corpora/reference-corpora/corpol.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-676", "Family": "Reference corpora", "Description": "This is a written and spoken corpus that includes representative Polish texts published between 1945 and 2010.\nThe corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author. Aside from written materials, the corpus also includes transcriptions of spoken language.\nThe corpus is available for online browsing through a dedicated concordancer.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": ["1.8 billion tokens"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/dereko.json b/corpora/reference-corpora/dereko.json index 56a9f72..739cec5 100644 --- a/corpora/reference-corpora/dereko.json +++ b/corpora/reference-corpora/dereko.json @@ -3,7 +3,7 @@ "URL": "http://www1.ids-mannheim.de/kl/projekte/korpora/", "Family": "Reference corpora", "Description": "This corpus includes German texts in a wide variety of genres published from 1947 onwards. 
Non-linguistic metadata include rich bibliographic information and partial layout information.\nPart of the corpus is available for download from a dedicated webpage (CLARIN-D distribution), while the entire corpus can be queried online through the COSMAS II platform.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CC-BY-SA", "Size": ["31.7 billion words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/dia-greek.json b/corpora/reference-corpora/dia-greek.json index c170de6..70198a7 100644 --- a/corpora/reference-corpora/dia-greek.json +++ b/corpora/reference-corpora/dia-greek.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/UOA-0000-0000-2572-6", "Family": "Reference corpora", "Description": "This corpus includes Greek texts published in the 20th century.\nThe corpus is available for download from CLARIN:EL.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "CC-BY-NC", "Size": ["20 million words"], "Annotation": "", diff --git a/corpora/reference-corpora/enc2019.json b/corpora/reference-corpora/enc2019.json index 959c29f..b78b757 100644 --- a/corpora/reference-corpora/enc2019.json +++ b/corpora/reference-corpora/enc2019.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/10.15155/3-00-0000-0000-0000-08489L", "Family": "Reference corpora", "Description": "This corpus includes Estonian texts published between 1990 and 2019. 
Amongst others, this corpus contains the Estonian Reference Corpus as a subcorpus.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY-SA", "Size": ["1.5 billion words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/erc.json b/corpora/reference-corpora/erc.json index 471e051..7b055a9 100644 --- a/corpora/reference-corpora/erc.json +++ b/corpora/reference-corpora/erc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-1068", "Family": "Reference corpora", "Description": "This corpus includes Estonian texts (fiction, PhD theses, newspapers, magazines, parliamentary transcriptions, computer-mediated communication) published between 1990 and 2007. The corpus is encoded in TEI.\nThe corpus is available for online browsing through a dedicated concordancer and is available for download from CELR.", - "Languages": ["est"], + "Language": ["est"], "Licence": "free for non-commercial use", "Size": ["175 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/gigafida.json b/corpora/reference-corpora/gigafida.json index 539c2f7..30893e6 100644 --- a/corpora/reference-corpora/gigafida.json +++ b/corpora/reference-corpora/gigafida.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1320", "Family": "Reference corpora", "Description": "This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2018. The corpus is encoded in TEI. 
Non-linguistic metadata includes information on source, year of publication, text type, title, author.\nThe corpus is available for online browsing through the noSketch Engine concordancer (CLARIN.SI distribution), as well as through a dedicated search engine.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "Individual terms of agreement", "Size": ["1.3 billion tokens", "1.1 billion words", "38,310 texts"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/gos.json b/corpora/reference-corpora/gos.json index 5335ab7..70303b2 100644 --- a/corpora/reference-corpora/gos.json +++ b/corpora/reference-corpora/gos.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1771", "Family": "Reference corpora", "Description": "This corpus contains transcripts from radio and TV shows, school lessons, private conversations, business meetings. It is composed of three different sources: Spoken corpus Gos 1.1 (112 hours, 1 million words), Spoken corpus Gos VideoLectures 4.2 (22 hours, 179,000 words), a selection from the ASR database ARTUR 1.0 (185 hours, 1.2 mllion words).\nThe corpus is available for download from CLARIN.SI as well as through a dedicated webconcordancer.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["1534 texts", "127,604 utterances", "2,462,368 words"], "Annotation": ["PoS-tagged", "lemmatised", "phonetically and orthographically transcribed"], diff --git a/corpora/reference-corpora/helnc.json b/corpora/reference-corpora/helnc.json index 050b38b..9781819 100644 --- a/corpora/reference-corpora/helnc.json +++ b/corpora/reference-corpora/helnc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E2-9", "Family": "Reference corpora", "Description": "This corpus includes Greek texts published from 1990 onwards.\nThe corpus is available for online browsing through a dedicated concordancer.", - "Languages": ["ell"], + "Language": ["ell"], "Licence": "proprietary", 
"Size": ["47 million words"], "Annotation": ["sentence segmented"], diff --git a/corpora/reference-corpora/hunnc.json b/corpora/reference-corpora/hunnc.json index 596582c..e213d29 100644 --- a/corpora/reference-corpora/hunnc.json +++ b/corpora/reference-corpora/hunnc.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11372/LRT-345", "Family": "Reference corpora", "Description": "This corpus includes Hungarian texts (newspapers, literature, scientific articles, official and personal documents).\nThe corpus is available for online browsing through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "free after registration", "Size": ["190 million tokens"], "Annotation": ["PoS-tagged"], diff --git a/corpora/reference-corpora/ice-giga.json b/corpora/reference-corpora/ice-giga.json index 8edfbea..05595fc 100644 --- a/corpora/reference-corpora/ice-giga.json +++ b/corpora/reference-corpora/ice-giga.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/192", "Family": "Reference corpora", "Description": "This corpus includes Icelandic texts (newspapers, parliamentary proceedings, adjudications, fiction and non-fiction) published until 2017.\nThe corpus is encoded in TEI. Non-linguistic metadata include bibliographic information. 
Aside from written materials, the corpus also contains transcriptions of spoken language.\nThe corpus is available for online browsing and download through CLARIN-IS (in two subsets, each with its own licence).", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC-BY and a special user licence", "Size": ["1.9 billion words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/kres.json b/corpora/reference-corpora/kres.json index 0836378..a8eaa19 100644 --- a/corpora/reference-corpora/kres.json +++ b/corpora/reference-corpora/kres.json @@ -3,7 +3,7 @@ "URL": "http://www.korpus-kres.net/", "Family": "Reference corpora", "Description": "This corpus includes balanced Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011.\nThis corpus is a balanced subset of the representative Gigafida corpus (version 1). The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.\nThe corpus is available for online browsing through a dedicated concordancer.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "Individual terms of agreement", "Size": ["99 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/lbk.json b/corpora/reference-corpora/lbk.json index 4d2c947..f8eba8e 100644 --- a/corpora/reference-corpora/lbk.json +++ b/corpora/reference-corpora/lbk.json @@ -3,7 +3,7 @@ "URL": "https://www.hf.uio.no/iln/om/organisasjon/tekstlab/prosjekter/lbk/", "Family": "Reference corpora", "Description": "This corpus includes representative Norwegian (Bokmål) texts (newspapers and periodicals, non-fiction, fiction, TV subtitles, and small print) published between 1985 and 2013.\nThe corpus is available for online browsing through the concordancer Glossa (CLARINO).", - "Languages": ["nob"], + "Language": ["nob"], "Licence": "CLARIN_ACA-NC-LOC-ND", 
"Size": ["100 million tokens"], "Annotation": ["PoS-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/lvk2022.json b/corpora/reference-corpora/lvk2022.json index 22413b6..782f942 100644 --- a/corpora/reference-corpora/lvk2022.json +++ b/corpora/reference-corpora/lvk2022.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12574/84", "Family": "Reference corpora", "Description": "This corpus includes texts from journalism, fiction, science, Wikipedia, legal documents, parliamentary subscripts, and subtitles.\nThe corpus is available for online browsing through the noSketch Engine concordancer.", - "Languages": ["lav"], + "Language": ["lav"], "Licence": "", "Size": ["122.9 million tokens"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/metafida.json b/corpora/reference-corpora/metafida.json index 153dce7..ebc40e2 100644 --- a/corpora/reference-corpora/metafida.json +++ b/corpora/reference-corpora/metafida.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1775", "Family": "Reference corpora", "Description": "This corpus contains a number of existing Slovenian corpora available through the CLARIN.SI concordances and thus provides a unified search across all the included corpora. metaFida contains over 4,7 billion words or 6 billion tokens from 15 million text published 1584 - 2022 from 34 corpora.\nIn the metaFida corpus we keep only information that is common to most of the selected corpora. The structure is nested very shallowly (text and paragraph), as it is then easier to create subcorpora or limit the search to individual text types. 
All metaFida positional attributes (word, normalised form, lemma, MULTEXT-East MSD in Slovenian and English) are considered to have multiple values, separated by a space.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "various", "Size": ["6 billion tokens"], "Annotation": ["MSD-tagged (MULTEXT-East)", "lemmatised", "normalised"], diff --git a/corpora/reference-corpora/nnk.json b/corpora/reference-corpora/nnk.json index c5cf2ca..8af3ff8 100644 --- a/corpora/reference-corpora/nnk.json +++ b/corpora/reference-corpora/nnk.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11495/E1A3-9361-1821-1", "Family": "Reference corpora", "Description": "This corpus includes representative Norwegian (Nynorsk) texts published between 1866 and 2012. The corpus is encoded in XML.\nThe corpus is available for online browsing through the Corpuscle concordancer (CLARINO).", - "Languages": ["nno"], + "Language": ["nno"], "Licence": "CLARIN_RES-NC-DEP", "Size": ["107.8 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/rcgd.json b/corpora/reference-corpora/rcgd.json index aed0030..6d00b09 100644 --- a/corpora/reference-corpora/rcgd.json +++ b/corpora/reference-corpora/rcgd.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12115/36", "Family": "Reference corpora", "Description": "This corpus includes Danish texts published between 2008 and 2011.\nThe corpus is encoded in TEI. 
Non-linguistic metadata includes information on source and year of publication.\nThe corpus is available for download from the CLARIN-DK repository.", - "Languages": ["dan"], + "Language": ["dan"], "Licence": "CLARIN ACA-NC", "Size": ["45.1 million words"], "Annotation": ["PoS-tagged", "sentence and paragraph segmentation", "lemmatized"], diff --git a/corpora/reference-corpora/riznica.json b/corpora/reference-corpora/riznica.json index 7463130..b7595e8 100644 --- a/corpora/reference-corpora/riznica.json +++ b/corpora/reference-corpora/riznica.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1180", "Family": "Reference corpora", "Description": "This corpus includes Croatian texts taken from fiction (28%) and specialised texts (72%).\nThe corpus is available for online browsing via noSketch Engine and KonText and for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC-BY-NC-SA 4.0", "Size": ["101.8 million tokens", "85.3 million words", "4.7 million sentences", "14,781 texts"], "Annotation": ["sentence segmented", "PoS-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/sonar.json b/corpora/reference-corpora/sonar.json index a939303..0439cec 100644 --- a/corpora/reference-corpora/sonar.json +++ b/corpora/reference-corpora/sonar.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-h5", "Family": "Reference corpora", "Description": "This corpus includes representative Dutch texts (fiction, brochures, magazines, legal texts, newspapers, parliamentary proceedings, and computer-mediated communication).\nAside from written materials, the corpus also contains transcriptions of spoken language. 
The corpus is encoded in FoLiA.\nThe corpus is available for online browsing through the OpenSONAR concordancer and can be downloaded from the Dutch Language Institute (CLARIAH-NL).", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "Terms of Agreement", "Size": ["500 million words"], "Annotation": ["PoS-tagged", "lemmatized", "named entities", "coreference annotation and annotation of spatial and temporal relations for the manually annotated SoNaR-1 subset "], diff --git a/corpora/reference-corpora/syn2005.json b/corpora/reference-corpora/syn2005.json index 56eda89..f760dff 100644 --- a/corpora/reference-corpora/syn2005.json +++ b/corpora/reference-corpora/syn2005.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-119E-8", "Family": "Reference corpora", "Description": "This corpus includes Czech texts published between 2000 and 2004. The corpus is encoded in XML.\nThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Czech National Corpus (Shuffled Corpus Data)", "Size": ["100 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/syn2010.json b/corpora/reference-corpora/syn2010.json index 236a08d..7827a91 100644 --- a/corpora/reference-corpora/syn2010.json +++ b/corpora/reference-corpora/syn2010.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-119F-6", "Family": "Reference corpora", "Description": "This corpus includes Czech fiction, professional literature, newspapers etc. published between 2005 and 2009. 
The corpus is encoded in XML.\nThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Czech National Corpus (Shuffled Corpus Data)", "Size": ["100 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/reference-corpora/syn2015.json b/corpora/reference-corpora/syn2015.json index 5c95825..66d2f75 100644 --- a/corpora/reference-corpora/syn2015.json +++ b/corpora/reference-corpora/syn2015.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1593", "Family": "Reference corpora", "Description": "This corpus includes Czech fiction, professional literature, newspapers etc. published between 2010 and 2014. The corpus is encoded in XML.\nThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Czech National Corpus (Shuffled Corpus Data)", "Size": ["100 million words"], "Annotation": ["MSD-tagged", "lemmatized"], diff --git a/corpora/sign-language-resources/adamorobe-lexicon.json b/corpora/sign-language-resources/adamorobe-lexicon.json index 4c9a66b..0377c43 100644 --- a/corpora/sign-language-resources/adamorobe-lexicon.json +++ b/corpora/sign-language-resources/adamorobe-lexicon.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0016-8A57-F", "Family": "Sign language resources", "Description": "This lexicon contains 250 signs in isolation. 
For a subset of the signs, encodings about phonological and iconic features are available.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["Adamorobe Sign Language"], + "Language": ["Adamorobe Sign Language"], "Licence": "Restricted", "Size": ["250 signs"], "Annotation": ["partial (phonology and iconicity)"], diff --git a/corpora/sign-language-resources/adamorobe.json b/corpora/sign-language-resources/adamorobe.json index 0508720..4bf93ee 100644 --- a/corpora/sign-language-resources/adamorobe.json +++ b/corpora/sign-language-resources/adamorobe.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0016-3693-A", "Family": "Sign language resources", "Description": "The Adamorobe Sign Language Corpus contains almost 36 hours of videorecordings of Adamorobe Sign Language, filmed in Adamorobe in Ghana between 2000 and 2004 by Victoria Nyst. The deposit contains recordings of approximately 20 signers. The 39 original tapes were digitized, cut, compressed and converted into MPG1 and MPEG2 digital clips using the standard settings of the MPI in Nijmegen. The total number of clips is 90 MPG1 and 90 MPEG2. There are 27 complete synchronized Elan-transcriptions in English and in Twi, which is the Akwapim variety of Akan, the spoken language in Adamorobe. 
The recordings include spontaneous narratives, personal stories and stories about the history of Adamorobe, elicited data, retellings of cartoons and picture stories.", - "Languages": ["Adamorobe Sign Language"], + "Language": ["Adamorobe Sign Language"], "Licence": "Restricted", "Size": ["90 MPG1 and 90 MPEG2 clips"], "Annotation": ["EAF transcripts"], diff --git a/corpora/sign-language-resources/addictionlink.json b/corpora/sign-language-resources/addictionlink.json index 54aca87..3f8d26f 100644 --- a/corpora/sign-language-resources/addictionlink.json +++ b/corpora/sign-language-resources/addictionlink.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073022", "Family": "Sign language resources", "Description": "This corpus contains written and recorded (audio and video) materials pertaining to alcohol, drugs and addictions, on independent change programs and a self-assessment test on the use of alcohol.", - "Languages": ["Finnish Sign Language (FinSL)"], + "Language": ["Finnish Sign Language (FinSL)"], "Licence": "Under negotiation", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/balines-homesign.json b/corpora/sign-language-resources/balines-homesign.json index 731ee1f..20494ba 100644 --- a/corpora/sign-language-resources/balines-homesign.json +++ b/corpora/sign-language-resources/balines-homesign.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/3d12f73f-e395-42da-a84c-f4ae5d9eb7b0", "Family": "Sign language resources", "Description": "The collection includes sign language data from deaf homesigners in Bali, Indonesia. 
The data was collected between 2021 and 2023.\nThe collection is available for download from the Language Archive.", - "Languages": ["Balinese"], + "Language": ["Balinese"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/becos.json b/corpora/sign-language-resources/becos.json index 7adcb2d..c721644 100644 --- a/corpora/sign-language-resources/becos.json +++ b/corpora/sign-language-resources/becos.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-v5", "Family": "Sign language resources", "Description": "This corpus consists of the entire archive of official press conferences from the Belgian Federal Government concerning the COVID-19 pandemic. The speakers speak mostly Dutch or French and occasionally German, and nearly all speech is accompanied by a deaf signer who interprets live what is said.\nThe corpus is available for download from the Dutch Language Institute.", - "Languages": ["Flemish Sign Lagnuage (VGT)", "The French Belgian Sign Language (LSFB)", "fra", "nld"], + "Language": ["Flemish Sign Language (VGT)", "The French Belgian Sign Language (LSFB)", "fra", "nld"], "Licence": "CC BY", "Size": ["177 hours of speech"], "Annotation": ["speaker diarisation", "ASR and post-ASR", "punctuation prediction", "signer diarisation", "sign language identification", "sign language keypoint recognition"], diff --git a/corpora/sign-language-resources/bible-translations.json b/corpora/sign-language-resources/bible-translations.json index 6680ec6..29639d4 100644 --- a/corpora/sign-language-resources/bible-translations.json +++ b/corpora/sign-language-resources/bible-translations.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073029", "Family": "Sign language resources", "Description": "This is a video corpus of Bible translations (including The Gospels of John and Luke and the Old Testament, Genesis 1:1-4:16, 6:1-9:17), mass and other religious ceremonies, as well as other religious documents.\nThe corpus
is available for online browsing through a dedicated webpage.", - "Languages": ["Finnish Sign Language (FinSL)"], + "Language": ["Finnish Sign Language (FinSL)"], "Licence": "Under negotiation", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/bsl-corpus.json b/corpora/sign-language-resources/bsl-corpus.json index fe32259..cb9f450 100644 --- a/corpora/sign-language-resources/bsl-corpus.json +++ b/corpora/sign-language-resources/bsl-corpus.json @@ -3,7 +3,7 @@ "URL": "https://bslcorpusproject.org/", "Family": "Sign language resources", "Description": "The British Sign Language Corpus is a collection of British Sign Language (BSL) video clips of 249 deaf signers from the UK. The BSL Corpus project is based at the Deafness Cognition and Language Research Centre, University College London, lasted from 2008–2011 and was led by Adam Schembri. A related dataset is the BSL Signbank.", - "Languages": ["British Sign Language (BSL)"], + "Language": ["British Sign Language (BSL)"], "Licence": "", "Size": ["BSL video data from 249 deaf signers of BSL"], "Annotation": ["yes"], diff --git a/corpora/sign-language-resources/bsl-lexicon.json b/corpora/sign-language-resources/bsl-lexicon.json index 4ad4151..3813ab1 100644 --- a/corpora/sign-language-resources/bsl-lexicon.json +++ b/corpora/sign-language-resources/bsl-lexicon.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0008-1768-5", "Family": "Sign language resources", "Description": "This lexicon was derived from the British Sign Language Corpus and is part of the ECHO case study on sign languages.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["British Sign Language (BSL)"], + "Language": ["British Sign Language (BSL)"], "Licence": "Public", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/catteau-2020.json b/corpora/sign-language-resources/catteau-2020.json index ca30e6c..f6988b6 100644 --- 
a/corpora/sign-language-resources/catteau-2020.json +++ b/corpora/sign-language-resources/catteau-2020.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/corpus-catteau-2020/v1", "Family": "Sign language resources", "Description": "This corpus contains eleven poetic works in LSF (French sign language) and their fifty-seven translations into oral French.\nThe corpus is available for download from the Ortolang repository.", - "Languages": ["French Sign Language (LSF)", "fra"], + "Language": ["French Sign Language (LSF)", "fra"], "Licence": "CC BY-NC-ND 3.0", "Size": ["11 poems in LSF and 57 translations in French (several versions for each poem)"], "Annotation": ["multimodal annotation", "prosodic annotation"], diff --git a/corpora/sign-language-resources/consumer-info-fin.json b/corpora/sign-language-resources/consumer-info-fin.json index 6afd5aa..46a8e43 100644 --- a/corpora/sign-language-resources/consumer-info-fin.json +++ b/corpora/sign-language-resources/consumer-info-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073020", "Family": "Sign language resources", "Description": "This corpus contains written and recorded (video) materials pertaining to advice aimed at consumers with regards to e.g. 
product defects, service related complaints, canceling orders and online shopping.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Finnish Sign Language (FinSL)"], + "Language": ["Finnish Sign Language (FinSL)"], "Licence": "Under negotiation", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/content4all.json b/corpora/sign-language-resources/content4all.json index 322097a..5a8d73b 100644 --- a/corpora/sign-language-resources/content4all.json +++ b/corpora/sign-language-resources/content4all.json @@ -3,7 +3,7 @@ "URL": "https://www.cvssp.org/data/c4a-news-corpus/", "Family": "Sign language resources", "Description": "This is a collection of six datasets recorded and created by the Content4All research project. The datasets are hosted by University of Surrey and are password protected. To request download credentials, please contact Richard Bowden.", - "Languages": ["Flemish Sign Language (VGT)", "Swiss-German Sign Language (DSGS)"], + "Language": ["Flemish Sign Language (VGT)", "Swiss-German Sign Language (DSGS)"], "Licence": "CC BY-NC-SA 4.0", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/corlse.json b/corpora/sign-language-resources/corlse.json index 54aa77d..aeec9a6 100644 --- a/corpora/sign-language-resources/corlse.json +++ b/corpora/sign-language-resources/corlse.json @@ -3,7 +3,7 @@ "URL": "https://corpuslse.es/", "Family": "Sign language resources", "Description": "This corpus is intended for the analysis of LSE argument structure, focusing on how signers organize the names (and the forms similar to the names) and the verbs (and other forms that have a predicative function) to communicate who does what, or feels what, or talks about what, etc. It was not intended to create a representative and structured corpus, but rather a set of examples that would allow basing the grammatical description on contextualized uses. Only a part is accessible through the iSignos website. 
The corpus is annotated as follows: there are right-hand and left-hand id-glosses and glosses for classifiers, translation into Spanish and role-shift, PoS, argument structure, locus and animacy (2 hours and 21 minutes). Other part just with glosses, translation into Spanish and role-shift; some recordings (16) also have analysis of the non-manual component", - "Languages": ["Spanish Sign Language (LSE)"], + "Language": ["Spanish Sign Language (LSE)"], "Licence": "", "Size": ["4 hours 52 minutes", "48 recordings"], "Annotation": ["partly annotated"], diff --git a/corpora/sign-language-resources/corpus-dsl-dic.json b/corpora/sign-language-resources/corpus-dsl-dic.json index d917e75..da659ef 100644 --- a/corpora/sign-language-resources/corpus-dsl-dic.json +++ b/corpora/sign-language-resources/corpus-dsl-dic.json @@ -3,7 +3,7 @@ "URL": "https://dsn.dk/dansk-tegnsprog/oplysning-raadgivning-og-dokumentation/tools-and-resources-within-danish-sign-language/", "Family": "Sign language resources", "Description": "This corpus consists of video material from 31 signers of DTS from Denmark. The Corpus is used to build a DTS-Danish Dictionary. The Danish Sign Language Dictionary project building the corpus is based at the Bachelor’s Degree Programme in Danish Sign Language and Speech-to-text Interpreter at the University College Copenhagen and led by Mads Jonathan Pedersen and Thomas Troelsgård. 
The project started 2014 and is still ongoing.", - "Languages": ["Danish Sign Language (DTS)"], + "Language": ["Danish Sign Language (DTS)"], "Licence": "Only for internal use (= the dictionary staff) and guest researchers", "Size": ["4.5 hours"], "Annotation": ["ID-glosses and (ideally) sense indicators"], diff --git a/corpora/sign-language-resources/corpus-fin-sl.json b/corpora/sign-language-resources/corpus-fin-sl.json index ff8aeb2..d73fe9c 100644 --- a/corpora/sign-language-resources/corpus-fin-sl.json +++ b/corpora/sign-language-resources/corpus-fin-sl.json @@ -3,7 +3,7 @@ "URL": "http://r.jyu.fi/AB7", "Family": "Sign language resources", "Description": "The corpus consists of video-recorded conversations and elicited narratives from 21 Finnish Sign Language signers who belong to different age groups and live in different parts of Finland. The signers perform seven fixed tasks which are
        • introductions
        • ,
        • discussing work/hobbies,
        • narrating about short cartoon strips,
        • narrating about a video,
        • narrating a story from the picture book
        • The Snowman or Frog, Where are you?,
        • discussing a topic related to the deaf world, and
        • free discussion (e.g. on travel, sports)
        . All of the video data (14.5 hours by six camera angles) has been annotated for signs and translations. According to the tasks performed by the signers, the corpus has been divided into two subcorpora: one that contains the elicited narratives, and another that contains the conversations.\nThe corpus is available for download from the Meta-Share (FIN-CLARIN Distribution).", - "Languages": ["Finnish Sign Language (FinSL)", "fin"], + "Language": ["Finnish Sign Language (FinSL)", "fin"], "Licence": "CC BY-NC-SA 4.0", "Size": ["14 hours 22 minutes"], "Annotation": ["ID-glosses and translations"], diff --git a/corpora/sign-language-resources/corpus-lsfb.json b/corpora/sign-language-resources/corpus-lsfb.json index 5b5209a..77eacd5 100644 --- a/corpora/sign-language-resources/corpus-lsfb.json +++ b/corpora/sign-language-resources/corpus-lsfb.json @@ -3,7 +3,7 @@ "URL": "https://www.corpus-lsfb.be/accueil.php", "Family": "Sign language resources", "Description": "This is the first large-scale digital corpus that illustrates the current use of French Belgian Sign Language (LSFB) and all its variations.\nIt was first conceived for linguistic research. 
However, this digital library is an unprecedented tool for teachers, students and interpreters, as well as a safeguard of the linguistic and cultural heritage of the Deaf Community.", - "Languages": ["French Belgian Sign Language (LSFB)"], + "Language": ["French Belgian Sign Language (LSFB)"], "Licence": "CC BY-NC-ND 4.0, see also the conditions", "Size": ["10 hours"], "Annotation": ["ID-glosses"], diff --git a/corpora/sign-language-resources/corpus-ngt.json b/corpora/sign-language-resources/corpus-ngt.json index fdf4e09..ab9d64e 100644 --- a/corpora/sign-language-resources/corpus-ngt.json +++ b/corpora/sign-language-resources/corpus-ngt.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0021-2A75-B", "Family": "Sign language resources", "Description": "This corpus contains sessions with linked media files and ELAN annotation files (EAF); about 15% of the sessions are glossed and translated.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "CC-BY-NC-SA 3.0 NL", "Size": ["2375 sessions"], "Annotation": ["ID-Glosses, sentence-level translations, and mouth actions for a subset of the sessions"], diff --git a/corpora/sign-language-resources/creagest-acquisition.json b/corpora/sign-language-resources/creagest-acquisition.json index 987fb65..27b1439 100644 --- a/corpora/sign-language-resources/creagest-acquisition.json +++ b/corpora/sign-language-resources/creagest-acquisition.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/ortolang-000916/v1", "Family": "Sign language resources", "Description": "This is a corpus of children's LSF collected from 65 deaf children and 17 deaf adults (control group), conducted by four deaf interviewers from four different regions of France (4 stimuli, 2 cameras). 
50 hours of recording in total.\nA sample of the corpus (10 extracts of Tom & Jerry cartoon narrative, filmed with two cameras (20 files)) is available for download from the Huma-Num repository.", - "Languages": ["French Sign Language (LSF)"], + "Language": ["French Sign Language (LSF)"], "Licence": "CC BY-NC-ND 4.0", "Size": ["50 hours of recording in total."], "Annotation": ["partially annotated corpus. Elan annotation not accessible online"], diff --git a/corpora/sign-language-resources/creagest-dialogues.json b/corpora/sign-language-resources/creagest-dialogues.json index 2e49151..0b1a62c 100644 --- a/corpora/sign-language-resources/creagest-dialogues.json +++ b/corpora/sign-language-resources/creagest-dialogues.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/ortolang-000926/v1", "Family": "Sign language resources", "Description": "This is a corpus of dialogues between Deaf adults (106 hours of video data): 51 interviews, conducted by four Deaf interviewers from four different regions of France (semi-directive interviews, 3 cameras).\nA sample of the corpus (7 extracts filmed with 3 cameras (21 files)) is available for download from the Huma-num repository.", - "Languages": ["French Sign Language (LSF)"], + "Language": ["French Sign Language (LSF)"], "Licence": "CC BY-NC-ND 4.0", "Size": ["7 extracts filmed with 3 cameras (21 files)."], "Annotation": ["partially annotated corpus. 
Elan annotation not accessible online"], diff --git a/corpora/sign-language-resources/csl-lexicon.json b/corpora/sign-language-resources/csl-lexicon.json index 85266b9..2d260fb 100644 --- a/corpora/sign-language-resources/csl-lexicon.json +++ b/corpora/sign-language-resources/csl-lexicon.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0001-2BF4-3", "Family": "Sign language resources", "Description": "This lexicon demostrates how a Deaf adult signs a story to Deaf children.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["Chinese Sign Language (CSL)"], + "Language": ["Chinese Sign Language (CSL)"], "Licence": "Restricted", "Size": [], "Annotation": ["unannotated"], diff --git a/corpora/sign-language-resources/czech-sl-amateur.json b/corpora/sign-language-resources/czech-sl-amateur.json index 6f79b04..2865adc 100644 --- a/corpora/sign-language-resources/czech-sl-amateur.json +++ b/corpora/sign-language-resources/czech-sl-amateur.json @@ -3,7 +3,7 @@ "URL": "http://catalog.elra.info/en-us/repository/browse/ELRA-S0285/", "Family": "Sign language resources", "Description": "This is an amateur sign-language database comprising 25 signs from Czech sign language. 15 signers (4 women and 11 men) carried out 5 repetitions of each sign and were recorded from 3 different views. The first is a frontal view of the upper part of the body. The second one is similar, but with the camera placed about one meter higher than the first one so as to produce a frontal top-view, and thus allowing to detect 3D information. 
The last view is a frontal-detail view of the speaker's face, thus allowing lip-reading.", - "Languages": ["Czech Sign Language"], + "Language": ["Czech Sign Language"], "Licence": "ELRA", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/czech-sl-prof.json b/corpora/sign-language-resources/czech-sl-prof.json index 8284f97..088f4bd 100644 --- a/corpora/sign-language-resources/czech-sl-prof.json +++ b/corpora/sign-language-resources/czech-sl-prof.json @@ -3,7 +3,7 @@ "URL": "http://catalog.elra.info/en-us/repository/browse/ELRA-S0286/", "Family": "Sign language resources", "Description": "This lexicon comprises signs performed by 4 everyday sign-language users (4 women, 2 of them deaf). 5 repetitions of each sign were recorded from 3 different views. The first is a frontal view of the upper part of the body. The second one is similar, but with the camera placed about one meter higher than the first one so as to produce a frontal top-view and thus allowing to detect 3D information.\nThe last view is a frontal-detail view of the speaker's face, thus allowing lip-reading.", - "Languages": ["Czech Sign Language"], + "Language": ["Czech Sign Language"], "Licence": "ELRA", "Size": ["378 signs"], "Annotation": [], diff --git a/corpora/sign-language-resources/degels1.json b/corpora/sign-language-resources/degels1.json index 9289366..92c7745 100644 --- a/corpora/sign-language-resources/degels1.json +++ b/corpora/sign-language-resources/degels1.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/degels1/v1", "Family": "Sign language resources", "Description": "The theme of the dialogues is the description of routes and places in Marseille and Aix-en-Provence in France. The corpus is composed of 3 dialogues in LSF and 3 dialogues in French. Each dyad is composed of a moderator and a speaker. There is a single moderator for French and two moderators for LSF. 
The recording equipment consisted of 3 cameras and 2 headset microphones for the French spoken part. The corpus is composed of 6 sessions: 1, 2 and 3 for French and 4, 5, 6 for LSF. Each dyad is composed of a speaker located on the right of the overview noted A, and a moderator located on the left of the overview noted B. Thus, for session 1, the speakers are conversing in French, A1 is the speaker located on the right of the overview and B1 is the moderator located on the left of the overview. For each session there are 4 video files (mp4/AVC): 1 for the speaker, 1 for the moderator, 1 which gives a profile view of the two speakers, the overview, and 1 which is a montage of these 3 videos. All the files are synchronised. For the LSF part, there is no sound track in the videos. For the French part, there are 2 sound files (wave) in addition to the video files, 1 per speaker. The first 3 videos do not contain a sound track. Only the editing video contains sound, that of the speaker on the right in the right channel and that of the moderator on the left in the left channel.", - "Languages": ["French Sign Language (LSF)", "Spoken French"], + "Language": ["French Sign Language (LSF)", "Spoken French"], "Licence": "CC BY-NC-ND 4.0", "Size": ["6 sessions x 4 video files"], "Annotation": ["partially annotated corpus"], diff --git a/corpora/sign-language-resources/dgs-corpus.json b/corpora/sign-language-resources/dgs-corpus.json index c788b45..ccc2af5 100644 --- a/corpora/sign-language-resources/dgs-corpus.json +++ b/corpora/sign-language-resources/dgs-corpus.json @@ -3,7 +3,7 @@ "URL": "https://www.sign-lang.uni-hamburg.de/dgs-korpus/index.php/welcome.html", "Family": "Sign language resources", "Description": "The DGS Corpus is a collection of German Sign Language (DGS) data from 330 signers from Germany. The 15-year long-term project is based at the Institute of German Sign Language and Communication of the Deaf at the Universität Hamburg and started in 2009. 
It is led by Thomas Hanke and Annika Herrmann. The DGS Corpus is used to build the DGS-German dictionary DW-DGS", - "Languages": ["German Sign Language (DGS)"], + "Language": ["German Sign Language (DGS)"], "Licence": "Restricted, see here", "Size": ["+50 hours"], "Annotation": [], diff --git a/corpora/sign-language-resources/dicta-sign-lexicon.json b/corpora/sign-language-resources/dicta-sign-lexicon.json index 418831a..ede45c7 100644 --- a/corpora/sign-language-resources/dicta-sign-lexicon.json +++ b/corpora/sign-language-resources/dicta-sign-lexicon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/ATHENA-0000-0000-28C4-6", "Family": "Sign language resources", "Description": "This is a multilingual lexicon in which concepts are linked to graphically represented signs and accompanying videos showcasing the signing process.\nThe videos are annotated with HamNoSys (\"Hamburg Sign Language Notation System\").\nThe lexicon is available for online browsing via a dedicated interface.", - "Languages": ["fra", "ell", "deu", "eng", "Modern Greek Sign Language", "British Sign Language", "German Sign Language", "French Sign Language"], + "Language": ["fra", "ell", "deu", "eng", "Modern Greek Sign Language", "British Sign Language", "German Sign Language", "French Sign Language"], "Licence": "CC BY-NC-ND 4.0", "Size": ["1000 entries per language (video and text)"], "Annotation": ["annotated, see description"], diff --git a/corpora/sign-language-resources/dicta-sign.json b/corpora/sign-language-resources/dicta-sign.json index 86dbd0b..5265819 100644 --- a/corpora/sign-language-resources/dicta-sign.json +++ b/corpora/sign-language-resources/dicta-sign.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/ATHENA-0000-0000-28C5-5", "Family": "Sign language resources", "Description": "Multimedia corpus (video) for four sign languages (english, french, german and greek) of at least 14 informants per language and a session duration of approx. 
2 hours using the same elicitation materials (scripts and tasks) across languages.", - "Languages": ["fra", "ell", "deu", "eng", "Greek Sign Language", "British Sign Language (BSL)", "German Sign Language (DGS)", "French Sign Language (LSF)"], + "Language": ["fra", "ell", "deu", "eng", "Greek Sign Language", "British Sign Language (BSL)", "German Sign Language (DGS)", "French Sign Language (LSF)"], "Licence": "CC BY NC-ND 4.0", "Size": ["Text feature: 10 file, Video feature: 25 hour, Data format: MPEG-4"], "Annotation": ["partially annotated corpus"], diff --git a/corpora/sign-language-resources/dogon.json b/corpora/sign-language-resources/dogon.json index 540d1bf..be2838f 100644 --- a/corpora/sign-language-resources/dogon.json +++ b/corpora/sign-language-resources/dogon.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0016-2E9E-4", "Family": "Sign language resources", "Description": "The Dogon Sign Language Corpus contains 32 hours of video data, recorded in the Dogon area in Mali between 2010 and 2012. These recordings are cut into 341 shorter clips varying lengths, in MPG1 and MPG2 format. The recordings feature the signing of 41 men and 27 women. The average age of all signers was 30 years. Recordings were made in 13 locations. Following approaches developed in earlier sign language corpora, the following he types of data were collected for the the Dogon Sign Language Corpus:
        • personal narratives
        • interviews about personal history
        • signed guided tours by deaf signers around the house, fields and nature
        • elicited lexical data
        • reports by the team members of the data collection
        Metadata are stored in the IMDI sign language format, using the ARBIL editor software. The entire corpus, i.e. the video clips, annotations and metadata, is stored in the DoBeS archive at the Max Planck Institute for Psycholinguistics in Nijmegen.", - "Languages": ["Dogon Sign Language"], + "Language": ["Dogon Sign Language"], "Licence": "Restricted", "Size": ["341 clips in MPG1 and MPG2 format"], "Annotation": ["EAF transcripts"], diff --git a/corpora/sign-language-resources/echo-ngt-lex-f2.json b/corpora/sign-language-resources/echo-ngt-lex-f2.json index 655c84a..55a9c6b 100644 --- a/corpora/sign-language-resources/echo-ngt-lex-f2.json +++ b/corpora/sign-language-resources/echo-ngt-lex-f2.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0008-1755-3", "Family": "Sign language resources", "Description": "This lexicon forms part of the ECHO case study on sign languages.The signer retells the fable The Shepherd Boy and the Wolf. The source of the retelling is a Dutch version of the fables by author Paul Biegel, consisting of approximately 300 words.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "Public", "Size": [], "Annotation": ["unannotated"], diff --git a/corpora/sign-language-resources/echo-ngt-lex-m.json b/corpora/sign-language-resources/echo-ngt-lex-m.json index 84d47d2..c48262e 100644 --- a/corpora/sign-language-resources/echo-ngt-lex-m.json +++ b/corpora/sign-language-resources/echo-ngt-lex-m.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0008-1763-3", "Family": "Sign language resources", "Description": "This lexicon forms part of the ECHO case study on sign languages.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "Public", "Size": ["300 signs"], 
"Annotation": ["ELAN transcriptions"], diff --git a/corpora/sign-language-resources/echo-ngt-lex-m2.json b/corpora/sign-language-resources/echo-ngt-lex-m2.json index 0eeef12..2095bca 100644 --- a/corpora/sign-language-resources/echo-ngt-lex-m2.json +++ b/corpora/sign-language-resources/echo-ngt-lex-m2.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0008-1751-9", "Family": "Sign language resources", "Description": "This lexicon forms part of the ECHO case study on sign languages.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "Public", "Size": ["300 signs"], "Annotation": ["unannotated"], diff --git a/corpora/sign-language-resources/echo-ssl-lex-signer-lm.json b/corpora/sign-language-resources/echo-ssl-lex-signer-lm.json index 4f3937e..a6be3c3 100644 --- a/corpora/sign-language-resources/echo-ssl-lex-signer-lm.json +++ b/corpora/sign-language-resources/echo-ssl-lex-signer-lm.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0008-176C-2", "Family": "Sign language resources", "Description": "This lexicon forms part of the ECHO case study on sign languages.\nThe lexicon is available for download from the MPI Language Archive.", - "Languages": ["Swedish Sign Language (SSL/STS)"], + "Language": ["Swedish Sign Language (SSL/STS)"], "Licence": "Public", "Size": ["300 signs"], "Annotation": ["unannotated"], diff --git a/corpora/sign-language-resources/echo.json b/corpora/sign-language-resources/echo.json index 6f0ebbb..5a1b3e7 100644 --- a/corpora/sign-language-resources/echo.json +++ b/corpora/sign-language-resources/echo.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0001-4892-C", "Family": "Sign language resources", "Description": "The corpus contains recorded sign narrations of five fable stories, a small lexicon, and interviews with the signers for each of the three languages. 
In addition, there is sign language poetry from BSL, NGT and SSL. Finally, the corpus includes two annotated segments of the Gehörlos So! corpus of German Sign Language (DGS) by Jens Heßmann.", - "Languages": ["Dutch Sign Language (NGT)", "British Sign Language (BSL)", "Swedish Sign Language (SSL)", "German Sign Language (DGS)"], + "Language": ["Dutch Sign Language (NGT)", "British Sign Language (BSL)", "Swedish Sign Language (SSL)", "German Sign Language (DGS)"], "Licence": "CC BY-NC-SA 3.0", "Size": ["76 MPEG1 recordings"], "Annotation": ["EAF transcripts"], diff --git a/corpora/sign-language-resources/exhibition-corpus.json b/corpora/sign-language-resources/exhibition-corpus.json index c665e8a..365a0a4 100644 --- a/corpora/sign-language-resources/exhibition-corpus.json +++ b/corpora/sign-language-resources/exhibition-corpus.json @@ -3,7 +3,7 @@ "URL": "https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-30/", "Family": "Sign language resources", "Description": "This corpus contains texts produced during a 2013 exhibition about languages - \"Leve Språket\". The exhibition aimed at showing the linguistic diversity in Norway, and it covered topics such as language conflict, the understanding of neighbouring languages and linguistic humor. The target audience was teenagers in school, and the texts are formulated accordingly. The texts were translated into Norwegian Sign Language and either Norwegian Bokmål or Nynorsk. The texts were also recorded to serve as an audio guide in the exhibition room. 
", - "Languages": ["Norwegian Bokmål", "Norwegian Nynorsk", "Norwegian", "Norwegian Sign Language (NSL)"], + "Language": ["Norwegian Bokmål", "Norwegian Nynorsk", "Norwegian", "Norwegian Sign Language (NSL)"], "Licence": "CC ZERO", "Size": ["23 texts"], "Annotation": [], diff --git a/corpora/sign-language-resources/fadwa-mhimdi.json b/corpora/sign-language-resources/fadwa-mhimdi.json index f8b06c7..c1387ab 100644 --- a/corpora/sign-language-resources/fadwa-mhimdi.json +++ b/corpora/sign-language-resources/fadwa-mhimdi.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/lst/v1", "Family": "Sign language resources", "Description": "This is the first scientific corpus of narrative discourses in Tunisian Sign Language (LST) by Deaf adults. The data were filmed in the Tunis region.\nThe corpus is available for download from the Ortolang repository.", - "Languages": ["Tunisian Sign Language (TSL)"], + "Language": ["Tunisian Sign Language (TSL)"], "Licence": "CC BY-NC-ND 3.0", "Size": ["10 narrative discourses"], "Annotation": ["ELAN annotations"], diff --git a/corpora/sign-language-resources/fin-sl-learning.json b/corpora/sign-language-resources/fin-sl-learning.json index 3bdb385..f895173 100644 --- a/corpora/sign-language-resources/fin-sl-learning.json +++ b/corpora/sign-language-resources/fin-sl-learning.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073017", "Family": "Sign language resources", "Description": "This corpus contains written and recorded (audio and video) materials pertaining to Finnish sign language greetings, names of family members, numbers and telling the time, as well as basic verbs and related words.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Finnish Sign Language (FinSL)"], + "Language": ["Finnish Sign Language (FinSL)"], "Licence": "Under negotiation", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/giving-rec.json 
b/corpora/sign-language-resources/giving-rec.json index 8d3e776..3a13c5b 100644 --- a/corpora/sign-language-resources/giving-rec.json +++ b/corpora/sign-language-resources/giving-rec.json @@ -3,7 +3,7 @@ "URL": "https://www.nwo.nl/en/projects/277-70-013", "Family": "Sign language resources", "Description": "This is a multilingual corpus of Turkish Sign Language (TİD) and Dutch Sign Language (NGT) as well as Turkish and Dutch data. It contains 84 video files of signers and speakers from Istanbul and Nijmegen. The project was based at the Max Planck Institute for Psycholinguistics, Centre for Language Studies.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Turkish Sign Language (TİD)", "Dutch Sign language (NGT)"], + "Language": ["Turkish Sign Language (TİD)", "Dutch Sign language (NGT)"], "Licence": "Restricted, see here", "Size": ["84 videos"], "Annotation": [], diff --git a/corpora/sign-language-resources/hotel-review-dutch.json b/corpora/sign-language-resources/hotel-review-dutch.json index 7a9af33..204883f 100644 --- a/corpora/sign-language-resources/hotel-review-dutch.json +++ b/corpora/sign-language-resources/hotel-review-dutch.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-x5", "Family": "Sign language resources", "Description": "This is a multimodal parallel corpus of hotel reviews that were originally written in Dutch, and subsequently translated into the Dutch Sign Language by 6 professionals, all of whom are deaf translators.\nThe corpus is available for download from the Institute of Dutch Language.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "CC BY-NC 3.0", "Size": ["21,825 words", "3.5 hours"], "Annotation": [], diff --git a/corpora/sign-language-resources/hotel-review-flemish.json b/corpora/sign-language-resources/hotel-review-flemish.json index cb653a6..87cf8a5 100644 --- a/corpora/sign-language-resources/hotel-review-flemish.json +++ 
b/corpora/sign-language-resources/hotel-review-flemish.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-x4 ", "Family": "Sign language resources", "Description": "This is a multimodal parallel corpus of hotel reviews that were originally written in Dutch, and subsequently translated into the Flemish Sign Language by 6 professionals, all of whom are deaf translators.\nThe corpus is available for download from the Institute of Dutch Language.", - "Languages": ["Flemish Sign Language (VGT)", "nld"], + "Language": ["Flemish Sign Language (VGT)", "nld"], "Licence": "CC BY-NC 4.0", "Size": ["21,825 words", "4 hours"], "Annotation": [], diff --git a/corpora/sign-language-resources/hotel-review-spanish.json b/corpora/sign-language-resources/hotel-review-spanish.json index d9b8fc3..46aea2f 100644 --- a/corpora/sign-language-resources/hotel-review-spanish.json +++ b/corpora/sign-language-resources/hotel-review-spanish.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-x6", "Family": "Sign language resources", "Description": "This is a multimodal parallel corpus of hotel reviews that were originally written in Dutch, subsequently translated into Spanish and finally into Spanish Sign Language by 6 professionals, all of whom are deaf translators.\nThe corpus is available for download from the Institute of Dutch Language.", - "Languages": ["Spanish Sign Language (LSE)", "spa"], + "Language": ["Spanish Sign Language (LSE)", "spa"], "Licence": "CC BY-NC 3.0", "Size": ["20,609 words", "3 hours of videos"], "Annotation": [], diff --git a/corpora/sign-language-resources/hun-sl-corpus.json b/corpora/sign-language-resources/hun-sl-corpus.json index b177ed0..aa91c16 100644 --- a/corpora/sign-language-resources/hun-sl-corpus.json +++ b/corpora/sign-language-resources/hun-sl-corpus.json @@ -3,7 +3,7 @@ "URL": "https://www.researchgate.net/project/JelEsely-Projekt-SIGNificant-Chance-Project", "Family": "Sign language resources", "Description": "The Hungarian Sign 
Language Corpus is a collection of Hungarian Sign Language (HSL) video data of 147 signers from Hungarian. Overall, 1,750 hours were recorded. The HSL corpus project ran from 2016 to 2017, was based at the Research Institute for Linguistics at the Hungarian Academy of Sciences and led by Csilla Bartha.", - "Languages": ["Hungarian Sign Language"], + "Language": ["Hungarian Sign Language"], "Licence": "", "Size": ["30 hours (Grammatical Corpus)"], "Annotation": [], diff --git a/corpora/sign-language-resources/iprosla.json b/corpora/sign-language-resources/iprosla.json index 72293c2..52c84f7 100644 --- a/corpora/sign-language-resources/iprosla.json +++ b/corpora/sign-language-resources/iprosla.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-CE31E27F-8853-4A18-80E8-AECAFAD012C0", "Family": "Sign language resources", "Description": "This corpus contains three sets of data. The first is a set of longitudinal data of deaf children from deaf and hearing parents that has been collected at the UvA since the late 1980s. The second is a new collection of longitudinal data collected at the RU from hearing and deaf children of deaf parents (2008–2020). Thirdly, data collected in an educational context by Nini Hoiting at the Kentalis Guyot school. ", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "Restricted end user license for academic use only.", "Size": ["Around 500 hours"], "Annotation": ["Unannotated"], diff --git a/corpora/sign-language-resources/isignos.json b/corpora/sign-language-resources/isignos.json index ffb0ca6..3b3d86e 100644 --- a/corpora/sign-language-resources/isignos.json +++ b/corpora/sign-language-resources/isignos.json @@ -3,7 +3,7 @@ "URL": "http://isignos.uvigo.es/", "Family": "Sign language resources", "Description": "This corpus consists of a set of video recordings of signers who express themselves in LSE, presented together with the glosses of both hands and the Spanish translation. 
In the first stage, a set of videos with their corresponding glosses and translations are available, which will be expanded in successive phases. You can consult the list of recordings and select by genre or theme criteria, and also by the sex or age range of the signers.The resource can be useful for all those people who need this type of linguistic data for their work, for example, for class exercises, interpretation practices, language evaluations, research on LSE, etc.\nThe corpus is available through a dedicated sarch engine that allows you to explore the corpus and observe the context in which the searched glosses appear.", - "Languages": ["Spanish Sign Language (LSE)"], + "Language": ["Spanish Sign Language (LSE)"], "Licence": "", "Size": [], "Annotation": ["annotated for right-hand and left-hand id-glosses and glosses for classifiers and Spanish translations"], diff --git a/corpora/sign-language-resources/italian-sl.json b/corpora/sign-language-resources/italian-sl.json index 970556d..95dd4af 100644 --- a/corpora/sign-language-resources/italian-sl.json +++ b/corpora/sign-language-resources/italian-sl.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-57EA1164-AC96-4541-8B1D-252673D6152A", "Family": "Sign language resources", "Description": "The Italian Sign Language Corpus is a collection of Italian Sign Language (LIS) data from 180 signers from Italy. 
The core part of the project involved three universities: University of Milan-Bicocca, University Ca’Foscari and Sapienza University.\nThe corpus is available for download from MPI's Language Archive (CLARIAH-NL).", - "Languages": ["Italian Sign Language (LIS)"], + "Language": ["Italian Sign Language (LIS)"], "Licence": "Restricted, see here", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/kata-kolok-child.json b/corpora/sign-language-resources/kata-kolok-child.json index 5340138..670ad5a 100644 --- a/corpora/sign-language-resources/kata-kolok-child.json +++ b/corpora/sign-language-resources/kata-kolok-child.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/f47a1a94-2d09-4e19-b721-7e9547cc796c", "Family": "Sign language resources", "Description": "This corpus covers spontaneous child-caregiver interactions focused on five deaf and eight hearing children acquiring Kata Kolok natively. Ages range between 4 months and 8;4 years of age.\nThe corpus is not freely accessible due to the vulnerable target group. Contact person: Hannah Lutzenberger", - "Languages": ["Kata Kolok (Benkala Sign Language)"], + "Language": ["Kata Kolok (Benkala Sign Language)"], "Licence": "Restricted", "Size": ["Data from four focal deaf children accumulates to 95h 24min (Lutzenberger 2022:282)."], "Annotation": ["Translations in Indonesian and English, ID-glosses linked to the Kata Kolok SignBank"], diff --git a/corpora/sign-language-resources/kata-kolok.json b/corpora/sign-language-resources/kata-kolok.json index 7365324..7cc39d4 100644 --- a/corpora/sign-language-resources/kata-kolok.json +++ b/corpora/sign-language-resources/kata-kolok.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/58506aa9-8122-48bf-93b1-f353a2d65ab1", "Family": "Sign language resources", "Description": "This corpus includes a wide range of elicited and spontaneous language materials accumulating to 100 hours of video data from generation III-V of adult deaf and hearing signers. 
Ongoing data collection (anno 2022) is focused on generation III as they are currently among the eldest KK signers.", - "Languages": ["Kata Kolok (Benkala Sign Language)"], + "Language": ["Kata Kolok (Benkala Sign Language)"], "Licence": "CC BY-NC-SA 4.0", "Size": ["63.5; data collection ongoing"], "Annotation": ["63.5 hours of video data, roughly 3:52 hours are translated in English and Indonesian, 3:44 hours are glossed and about 1:45 hours are translated and glossed."], diff --git a/corpora/sign-language-resources/kipo.json b/corpora/sign-language-resources/kipo.json index ffdee35..9e7ef49 100644 --- a/corpora/sign-language-resources/kipo.json +++ b/corpora/sign-language-resources/kipo.json @@ -2,8 +2,8 @@ "Name": "The Kipo Corpus", "URL": "http://urn.fi/urn:nbn:fi:lb-2020112921", "Family": "Sign language resources", - "Description": "This is a video corpus of the language policy program for the National Sign Languages in Finland translated by two people who speak the sign language as their mother tongue.\nThe corpus is available for download from the Finnish Language Bank.", - "Languages": ["Finnish Sign Language (FinSL)"], + "Description": "This is a video corpus of the language policy program for the National Sign Language in Finland translated by two people who speak the sign language as their mother tongue.\nThe corpus is available for download from the Finnish Language Bank.", + "Language": ["Finnish Sign Language (FinSL)"], "Licence": "CC-BY-NC-SA", "Size": ["163 minutes"], "Annotation": [], diff --git a/corpora/sign-language-resources/ls-colin.json b/corpora/sign-language-resources/ls-colin.json index a45edd1..8980aa2 100644 --- a/corpora/sign-language-resources/ls-colin.json +++ b/corpora/sign-language-resources/ls-colin.json @@ -3,7 +3,7 @@ "URL": 
"https://cocoon.huma-num.fr/exist/crdo/search2.xql?page=1&max=500&lang=fr&nonce=OTM3NzMwNg%3D%3D&language=http%3A%2F%2Flexvo.org%2Fid%2Fiso639-3%2Ffsl&contributor=http%3A%2F%2Fviaf.org%2Fviaf%2F197871692&keyword=CUXAC&del_1=&del_2=&keyword2=&limit=10", "Family": "Sign language resources", "Description": "This is a reference corpus for LSF, recorded in January 2002 in Paris, involving 13 Deaf adults (monologues). The corpus is divided in 5 video files of various length. It contains a description in French (some metadata) and a translation in French of narratives and other discourses following the time code. The topics and genres included are: \"Le Récit du Cheval\" (narrative), \"Le Récit des Oiseaux\" (narrative), \"L'Euro\" (argumentative discourse), \"La Recette de Cuisine\" (cooking recipe), \"Le 11 septembre 2001\" (argumentative and narrative discourse) et \"Le Thème Linguistique\" (metalinguistic discourse).\nThe corpus is available for download from the Huma-num repository.", - "Languages": ["French Sign Language (LSF)"], + "Language": ["French Sign Language (LSF)"], "Licence": "CC BY-NC-ND 4.0", "Size": ["2 hours"], "Annotation": ["ELAN annotations"], diff --git a/corpora/sign-language-resources/maurician-sl.json b/corpora/sign-language-resources/maurician-sl.json index 1a2e6e0..369b4d3 100644 --- a/corpora/sign-language-resources/maurician-sl.json +++ b/corpora/sign-language-resources/maurician-sl.json @@ -3,7 +3,7 @@ "URL": "https://cocoon.huma-num.fr/exist/crdo/search2.xql?page=1&max=500&lang=en&nonce=MTcyNTc0MA%3D%3D&keyword=LSM&del_1=&del_2=&keyword2=&limit=10", "Family": "Sign language resources", "Description": "This is a corpus of 19 discourses (narratives and other genres).\nThe corpus is available for download from the Huma-num repository (COCOON).", - "Languages": ["Maurician sign Language (LSM)"], + "Language": ["Maurician sign Language (LSM)"], "Licence": "CC BY-NC-ND 4.0", "Size": ["19 discourses (narratives and other genres)"], "Annotation": 
["partially annotated corpus"], diff --git a/corpora/sign-language-resources/mediapi-skel.json b/corpora/sign-language-resources/mediapi-skel.json index c8ef4cb..2248225 100644 --- a/corpora/sign-language-resources/mediapi-skel.json +++ b/corpora/sign-language-resources/mediapi-skel.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/mediapi-skel/v1", "Family": "Sign language resources", "Description": "This is a 2D-skeleton video corpus of LSF with French subtitles. The corpus consists of 368 subtitled videos produced by Média’Pi4, a media company producing bilingual content with LSF and written French. The corpus was produced at the Laboratoire d’informatique pour la mécanique et les sciences de l’ingénieur (LIMSI).\nFrom the original videos, 25 body keypoints, 2x21 hand keypoints and 70 face keypoints were extracted using OpenPose. 135 keypoints for every person in every frame of the 368 videos were provided, as well as the associated subtitles in French.\nThe corpus is available for download from Ortolang.", - "Languages": ["French Sign Language (LSF)"], + "Language": ["French Sign Language (LSF)"], "Licence": "CC BY-NC 4.0", "Size": ["368 subtitled videos"], "Annotation": [], diff --git a/corpora/sign-language-resources/mocap1.json b/corpora/sign-language-resources/mocap1.json index 0472df7..bc8b1d7 100644 --- a/corpora/sign-language-resources/mocap1.json +++ b/corpora/sign-language-resources/mocap1.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/mocap1/v2", "Family": "Sign language resources", "Description": "This is a corpus of French Sign Language (LSF) captured with a motion capture system and an HD camera. It was designed with the objective of carrying out multidisciplinary studies in Movement Sciences, Linguistics and Computer Science. 
The corpus consists of 5 tasks of different natures: description, explanation, narration and translation, performed by 4 speakers (8 for the description task).\nThe corpus is available for download from the Ortolang repository.", - "Languages": ["French Sign Language (LSF)"], + "Language": ["French Sign Language (LSF)"], "Licence": "CC BY NC-ND 4.0", "Size": [], "Annotation": ["partially annotated corpus"], diff --git a/corpora/sign-language-resources/news-fin-sl.json b/corpora/sign-language-resources/news-fin-sl.json index 0dc6b8c..29c2e46 100644 --- a/corpora/sign-language-resources/news-fin-sl.json +++ b/corpora/sign-language-resources/news-fin-sl.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073012", "Family": "Sign language resources", "Description": "This corpus contains recordings of Finnish news.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["Finnish Sign Language (FinSL)"], + "Language": ["Finnish Sign Language (FinSL)"], "Licence": "Under negotiation", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/ngt-interactive.json b/corpora/sign-language-resources/ngt-interactive.json index 037dbe9..9e8b564 100644 --- a/corpora/sign-language-resources/ngt-interactive.json +++ b/corpora/sign-language-resources/ngt-interactive.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0021-8357-B", "Family": "Sign language resources", "Description": "This corpus contains 15 spontaneous dialogues and multi-participant conversations by deaf signers, 10 of which were recorded in authentic settings like a deaf club and a bar, 5 were recorded in the lab. 
In addition, two informal three-party conversations were filmed where each participant was wearing a mobile eye trackers.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "Restricted access, unspecified", "Size": ["23 hours"], "Annotation": ["Unannotated"], diff --git a/corpora/sign-language-resources/noema-plus.json b/corpora/sign-language-resources/noema-plus.json index 7e45ced..54d088b 100644 --- a/corpora/sign-language-resources/noema-plus.json +++ b/corpora/sign-language-resources/noema-plus.json @@ -3,7 +3,7 @@ "URL": "http://sign.ilsp.gr/signilsp-site/index.php/el/noima/", "Family": "Sign language resources", "Description": "This is an online dictionary of lemmas taken from three previously developed resources, namely (i) the NOEMA DB, from which it incorporates 3,000 revised entries, (ii) the GSL segment of the Dicta Sign Corpus, from which it incorporates 2,000 entries, and the POLYTROPON Parallel Corpus corpus, from which it incorporates 3,616 new entries.\nThe lexicon is available for online browsing through an interface provided by the CLARIN:EL consortium.", - "Languages": ["Greek Sign Language (GSL)"], + "Language": ["Greek Sign Language (GSL)"], "Licence": "Freely accessible", "Size": ["8,616 lemmas"], "Annotation": ["citation forms, GSL synonyms, usage examples in GSL and Greek, concept clarification in the case of homonymity in Greek"], diff --git a/corpora/sign-language-resources/noema.json b/corpora/sign-language-resources/noema.json index 398d1f0..bc94fdf 100644 --- a/corpora/sign-language-resources/noema.json +++ b/corpora/sign-language-resources/noema.json @@ -3,7 +3,7 @@ "URL": "http://archive.ilsp.gr/en/services-products/products/item/1-langtechn/2-noema", "Family": "Sign language resources", "Description": "This dictionary contains video recorded signs paired with Modern Greek translations. 
The dictionary incorporates explanatory remarks that help non-native GSL users understand the meaning of the sign, while at the same time allowing for native GSL signers to enrich their Modern Greek vocabulary. The dictionary allows users to search by lemma, which means either by (i) hand shape, (ii) lemma classification according to syntactic category, or (iii) by the alphabetic ordering of the sign translations in Modern Greek.\nThe dictionary is not available online.", - "Languages": ["Greek Sign Language (GSL)"], + "Language": ["Greek Sign Language (GSL)"], "Licence": "", "Size": ["3,000 video entries"], "Annotation": [], diff --git a/corpora/sign-language-resources/norwegian-sl.json b/corpora/sign-language-resources/norwegian-sl.json index e78a88e..eda0128 100644 --- a/corpora/sign-language-resources/norwegian-sl.json +++ b/corpora/sign-language-resources/norwegian-sl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11509/141", "Family": "Sign language resources", "Description": "This corpus consists of data collected in 2007 for the purposes of a doctoral research project about boundary markers in Norwegian Sign Language. Four signers were filmed: two men and two women, both young and old. They are all deaf with deaf parents, siblings, or other family members. They live in central Eastern Norway, and all have gone to the deaf school in the area. The signers were asked to retell a children’s picture book entitled \"Frog, Where Are You?\" by Mercer Mayer and also to respond to the question \"What happened on 9/11 and what did you do?\" Video recordings of the signers were made in a studio, and sessions were led by a deaf adult man who is an L1 signers of Norwegian Sign Language. 
No other people were present during the recordings.\nThe corpus is available for download from the CLARINO repository.", - "Languages": ["Norwegian Sign Language (NSL)"], + "Language": ["Norwegian Sign Language (NSL)"], "Licence": "CC BY-NC-SA 4.0", "Size": ["8 video clips", "18 minutes"], "Annotation": ["EAF transcripts, ELAN annotations"], diff --git a/corpora/sign-language-resources/phd-fusellier-souza.json b/corpora/sign-language-resources/phd-fusellier-souza.json index 8baf39e..ac2cb55 100644 --- a/corpora/sign-language-resources/phd-fusellier-souza.json +++ b/corpora/sign-language-resources/phd-fusellier-souza.json @@ -3,7 +3,7 @@ "URL": "https://cocoon.huma-num.fr/exist/crdo/search2.xql?lang=fr&keyword=Fusellier", "Family": "Sign language resources", "Description": "This is a corpus containing 10 discourses with 3 Deaf emerging signers in Brasil.\nThe corpus is available for download from the Huma-num repository (COCOON).", - "Languages": ["Emerging Sign Languages (in Brazil)"], + "Language": ["Emerging Sign Languages (in Brazil)"], "Licence": "CC BY-NC-ND 4.0", "Size": ["10 discourses with 3 Deaf emerging signers in Brasil"], "Annotation": ["partially annotated corpus"], diff --git a/corpora/sign-language-resources/phd-martinod.json b/corpora/sign-language-resources/phd-martinod.json index 4fe4628..3581a19 100644 --- a/corpora/sign-language-resources/phd-martinod.json +++ b/corpora/sign-language-resources/phd-martinod.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/corpus-these/v1.1", "Family": "Sign language resources", "Description": "This is a corpus of sign language practiced in Soure, on the island of Marajó (Brazil, Pará). These data were collected between July and August 2015 and in March 2017.\nThis corpus is available for download from the Ortolang repository. The videos made available for download represent part of the total corpus of 8 hours and 27 minutes. 
They consist of elicited stories (9 minutes and 27 seconds) and spontaneous speech (17 minutes and 13 seconds).", - "Languages": ["Marajó Sign Language (Brazil)"], + "Language": ["Marajó Sign Language (Brazil)"], "Licence": "CC BY-NC-ND 4.0", "Size": ["27 minutes"], "Annotation": ["annotated corpus"], diff --git a/corpora/sign-language-resources/pjm-corpus.json b/corpora/sign-language-resources/pjm-corpus.json index 75dc49d..6d566fe 100644 --- a/corpora/sign-language-resources/pjm-corpus.json +++ b/corpora/sign-language-resources/pjm-corpus.json @@ -3,7 +3,7 @@ "URL": "https://www.plm.uw.edu.pl/projekty/korpus-pjm/ ", "Family": "Sign language resources", "Description": "This is a corpus of video data from 150 Deaf native signers of Polish Sign Language (PJM).", - "Languages": ["Polish Sign Language (PJM)"], + "Language": ["Polish Sign Language (PJM)"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/polytropon-para.json b/corpora/sign-language-resources/polytropon-para.json index fc49da2..ce6f3f1 100644 --- a/corpora/sign-language-resources/polytropon-para.json +++ b/corpora/sign-language-resources/polytropon-para.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11500/ATHENA-0000-0000-4C77-6", "Family": "Sign language resources", "Description": "This is a parallel corpus for the language pair Greek Sign Language (GSL) – Greek. The corpus incorporates sentences performed by a single signer in three repetitions each, captured in front view by means of one HD and one kinect camera. Annotation of the corpus has used the iLex annotation environment and provides information for the grammar levels of lexicon, morphology, syntax and semantics, incorporating annotation tiers for gloss, classifier type, shape and semantics, clause type, sentence type and equivalent translation in Greek on sentence level. 
The Corpus consists of 3500 ELAN (.eaf) files.\nThe corpus is available for download from CLARIN:EL, though access requires registration.", - "Languages": ["ell", "Greek Sign Language"], + "Language": ["ell", "Greek Sign Language"], "Licence": "CC BY-NC-SA 4.0", "Size": ["3,600 sentences"], "Annotation": ["lexical", "morphosyntax", "semantics", "glosses"], diff --git a/corpora/sign-language-resources/sign-hub-life-stories.json b/corpora/sign-language-resources/sign-hub-life-stories.json index c3807fa..96d3337 100644 --- a/corpora/sign-language-resources/sign-hub-life-stories.json +++ b/corpora/sign-language-resources/sign-hub-life-stories.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/sign-hub-wp-24/v1", "Family": "Sign language resources", "Description": "This is a collection of datasets connected to the Sign-Hub project. The corpus contains interviews conducted with elderly Deaf signers from five countries on their life experiences as well as a documentary movie based on these interviews. These interviews were conducted in five of the participating countries of the SIGN-HUB project and in six different sign languages: Catalan Sign Language (LSC), German Sign Language (DGS), Italian Sign Language (LIS), Sign Language of the Netherlands (NGT), Spanish Sign Language (LSE), and Turkish Sign Language (TİD). In each country, interviews have been conducted in different geographical areas. The exact number of interviews differs per sign language, but for every sign language, at least 20 interviews have been conducted, with interviewees being between 66 and 97 years of age. 
Interviews followed a pre-defined questionnaire; however, the addition of country-specific questions was encouraged.\nThis collection is available for download from the Ortolang repository.", - "Languages": ["Catalan Sign Language (LSC)", "German Sign Language (DGS)", "Italian Sign Language (LIS)", "Dutch Sign Language (NGT)", "Spanish Sign Language (LSE)" ,"Turkish Sign Language (TİD)."], + "Language": ["Catalan Sign Language (LSC)", "German Sign Language (DGS)", "Italian Sign Language (LIS)", "Dutch Sign Language (NGT)", "Spanish Sign Language (LSE)" ,"Turkish Sign Language (TİD)."], "Licence": "CLARIN PUB", "Size": ["200 hours"], "Annotation": ["not annotated yet"], diff --git a/corpora/sign-language-resources/signes-en-famille.json b/corpora/sign-language-resources/signes-en-famille.json index 413f9e8..ef2819d 100644 --- a/corpora/sign-language-resources/signes-en-famille.json +++ b/corpora/sign-language-resources/signes-en-famille.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/signes-en-famille/v1", "Family": "Sign language resources", "Description": "This is a corpus of spontaneous exhanges between either hearing and deaf children on the one hand and either hearing or deaf parents on the other.\nA sample of the corpus is available for download from the Ortolang repository.", - "Languages": ["French Sign Language (LSF)", "fra"], + "Language": ["French Sign Language (LSF)", "fra"], "Licence": "CC BY-NC-ND 3.0", "Size": ["approx. 
10 samples"], "Annotation": ["partially annotated corpus"], diff --git a/corpora/sign-language-resources/signor-corpus.json b/corpora/sign-language-resources/signor-corpus.json index a493556..23582c3 100644 --- a/corpora/sign-language-resources/signor-corpus.json +++ b/corpora/sign-language-resources/signor-corpus.json @@ -3,7 +3,7 @@ "URL": "http://lojze.lugos.si/signor", "Family": "Sign language resources", "Description": "This corpus is available for querying in its transcribed version providing an avatar demonstration of each sign. The corpus contains interviews with 80 informants. The entire corpus is currently not publishable due to data protection issues; however, permissions for publication are being collected in order to release the recordings too.", - "Languages": ["Slovene Sign Language (SZJ)"], + "Language": ["Slovene Sign Language (SZJ)"], "Licence": "Not freely accessible. contact person: prof. dr. Špela Vintar, University of Ljubljana, spela.vintar at ff.uni-lj.si", "Size": [], "Annotation": ["tokenised", "lemmatised", "gestural annotation", "mouth shape", "ID-gloss"], diff --git a/corpora/sign-language-resources/signs-of-ireland,json b/corpora/sign-language-resources/signs-of-ireland,json index 1674851..2107af8 100644 --- a/corpora/sign-language-resources/signs-of-ireland,json +++ b/corpora/sign-language-resources/signs-of-ireland,json @@ -3,7 +3,7 @@ "URL": "https://www.igi-global.com/chapter/online-delivery-deaf-studies-curricula/46372", "Family": "Sign language resources", "Description": "The Signs of Ireland Corpus is a collection of Irish Sign Language (ISL) video data from 40 signers of Ireland. 
The project was based at the Trinity College Dublin, took place in 2004 and was led by Lorraine Leeson.", - "Languages": ["Irish Sign Language (ISL)"], + "Language": ["Irish Sign Language (ISL)"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/swedish-sl-corpus.json b/corpora/sign-language-resources/swedish-sl-corpus.json index db461c1..ef09fe3 100644 --- a/corpora/sign-language-resources/swedish-sl-corpus.json +++ b/corpora/sign-language-resources/swedish-sl-corpus.json @@ -3,7 +3,7 @@ "URL": "https://www.ling.su.se/teckenspr%C3%A5ksresurser/teckenspr%C3%A5kskorpusar/svensk-teckenspr%C3%A5kskorpus", "Family": "Sign language resources", "Description": "This is a web-based version of the Swedish Sign Language Corpus, consisting of approximately 93,000 annotated sign tokens. Previously, the corpus was only available through the special-purpose video annotation tool ELAN. The aim of this corpus is to provide a picture of what sign language sentences look like, but also contribute new characters and variants to the Swedish Sign Language Dictionary. It can also be used to develop teaching materials.", - "Languages": ["Swedish Sign Language (STS)", "swe"], + "Language": ["Swedish Sign Language (STS)", "swe"], "Licence": "CC BY-NC-SA 2.5", "Size": ["24 hours"], "Annotation": ["ID-glosses", "PoS tags"], diff --git a/corpora/sign-language-resources/tactile-swedish-sl-corpus.json b/corpora/sign-language-resources/tactile-swedish-sl-corpus.json index f1f0755..d46de51 100644 --- a/corpora/sign-language-resources/tactile-swedish-sl-corpus.json +++ b/corpora/sign-language-resources/tactile-swedish-sl-corpus.json @@ -3,7 +3,7 @@ "URL": "https://www.ling.su.se/teckenspr%C3%A5ksresurser/teckenspr%C3%A5kskorpusar/korpus-f%C3%B6r-taktilt-teckenspr%C3%A5k", "Family": "Sign language resources", "Description": "This corpus contains dialogues and elicited narratives with 9 deafblind informants. 
The entire corpus is currently not publishable due to data protection issues; however, some parts are available through the STS-korpus. The project was funded by Mo Gård Research Fund.", - "Languages": ["Swedish Sign Language (STS)", "swe"], + "Language": ["Swedish Sign Language (STS)", "swe"], "Licence": "CC BY-NC-SA 2.5", "Size": ["4.5 hours"], "Annotation": ["partially annotated corpus"], diff --git a/corpora/sign-language-resources/turkish-sl.json b/corpora/sign-language-resources/turkish-sl.json index 51099ee..6b67944 100644 --- a/corpora/sign-language-resources/turkish-sl.json +++ b/corpora/sign-language-resources/turkish-sl.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0008-4252-6", "Family": "Sign language resources", "Description": "This corpus collects Turkish sign language (TID) data. For this project, native, early, and late TID signers were recorded performing different tasks (narratives of short picture stories/cartoon clips) and engaging in free conversation. 
These recordings and their annotations are stored in this corpus.\nThe corpus is available for download from the MIP (CLARIAH-NL distribution).", - "Languages": ["Turkish sign language (TİD)"], + "Language": ["Turkish sign language (TİD)"], "Licence": "Restricted, see here", "Size": [], "Annotation": [], diff --git a/corpora/sign-language-resources/vidi-sign-space.json b/corpora/sign-language-resources/vidi-sign-space.json index ebae6c1..eb5e29a 100644 --- a/corpora/sign-language-resources/vidi-sign-space.json +++ b/corpora/sign-language-resources/vidi-sign-space.json @@ -3,7 +3,7 @@ "URL": "https://www.nwo.nl/en/projects/276-70-009 ", "Family": "Sign language resources", "Description": "This is a corpus of DGS and TİD data collected by the Max Planck Institute for Psycholinguistics under the lead of Asli Özyürek from March 2007 to September 2012.", - "Languages": ["Turkish Sign Language (TİD)", "German Sign Language (DGS)"], + "Language": ["Turkish Sign Language (TİD)", "German Sign Language (DGS)"], "Licence": "Restricted", "Size": [], "Annotation": ["EAF transcripts"], diff --git a/corpora/sign-language-resources/visibase.json b/corpora/sign-language-resources/visibase.json index e3ed2ff..377ee24 100644 --- a/corpora/sign-language-resources/visibase.json +++ b/corpora/sign-language-resources/visibase.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/1839/00-0000-0000-0004-DF8F-4", "Family": "Sign language resources", "Description": "The Visibase corpus is a collection of digitised and described NGT material that was present in the late 1990s at the sign language research groups at the University of Amsterdam and at Leiden University. 
The project lasted from 1996–2001 and was based at Radboud University, University of Amsterdam and Utrecht University.", - "Languages": ["Dutch Sign Language (NGT)"], + "Language": ["Dutch Sign Language (NGT)"], "Licence": "Restricted", "Size": ["32 recordings"], "Annotation": ["yes"], diff --git a/corpora/sign-language-resources/vlaamse-gt.json b/corpora/sign-language-resources/vlaamse-gt.json index 7533a3e..f9f7353 100644 --- a/corpora/sign-language-resources/vlaamse-gt.json +++ b/corpora/sign-language-resources/vlaamse-gt.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-v6 ", "Family": "Sign language resources", "Description": "This is a collection of videos in Flemish Sign Language. 120 deaf people contributed to the Corpus VGT as informants. Age, region and gender were taken into account when selecting the informants. The informats were given a series of themes to talk about in pairs: telling a story, making agreements, discussing a theme, telling about their school days, etc. 
The conversations were recorded on video and edited them for each assignment.\nThe corpus is available for download from the Dutch Language Institute and for browsing through a dedicated website.", - "Languages": ["Flemish Sign Language (VGT)"], + "Language": ["Flemish Sign Language (VGT)"], "Licence": "CC BY.NC 3.0", "Size": ["140 hrs", "5 TB"], "Annotation": ["ID-glosses"], diff --git a/corpora/spoken-corpora/2nd-gen-israel-migrants.json b/corpora/spoken-corpora/2nd-gen-israel-migrants.json index 42b83c1..b76834e 100644 --- a/corpora/spoken-corpora/2nd-gen-israel-migrants.json +++ b/corpora/spoken-corpora/2nd-gen-israel-migrants.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C453-CEDC-B601-2", "Family": "Spoken corpora", "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["125 hours"], "Annotation": ["orthographically transcribed", "code switching"], diff --git a/corpora/spoken-corpora/aalto-dsp.json b/corpora/spoken-corpora/aalto-dsp.json index 32efad6..7b6348f 100644 --- a/corpora/spoken-corpora/aalto-dsp.json +++ b/corpora/spoken-corpora/aalto-dsp.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2017092133", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous conversations.\nThe corpus is available for download from FIN-CLARIN.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN ACA", "Size": ["5200 utterances"], "Annotation": [], diff --git a/corpora/spoken-corpora/absolventinnen.json b/corpora/spoken-corpora/absolventinnen.json index cb2c353..7593706 100644 --- a/corpora/spoken-corpora/absolventinnen.json +++ b/corpora/spoken-corpora/absolventinnen.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EC5D-8", "Family": "Spoken corpora", 
"Description": "This corpus provides data for examining the pronunciation of gender-neutral forms in German. The recordings took place at the IPS in the Munich region. 56 texts were recorded from 40 speakers. The texts came from newspapers, websites, administration offices, social services, etc., and were modified to contain either one of the three gender-neutral forms or the extended form. Each of the speakers read the 56 sentences, with target words, 25 % each, asterisk, underscore, uppercase-I or the feminine plural-form in a counterbalancing measures design. Filler sentences for this study are not a part of the corpus but will be part of further investigations. That means, that there are 56 recordings per session.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["2 hours"], "Annotation": ["orthographically transcribed", "phonetic", "phonemic transcription"], diff --git a/corpora/spoken-corpora/acwme.json b/corpora/spoken-corpora/acwme.json index cb20ea1..f281dbd 100644 --- a/corpora/spoken-corpora/acwme.json +++ b/corpora/spoken-corpora/acwme.json @@ -3,7 +3,7 @@ "URL": "https://researchdata.aston.ac.uk/id/eprint/162/", "Family": "Spoken corpora", "Description": "This corpus contains recordings of performances - comedy, drama, poetry, song and story-telling - and related interviews with performers, members of the audience and local and national celebrities.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/agender.json b/corpora/spoken-corpora/agender.json index d895984..9db1578 100644 --- a/corpora/spoken-corpora/agender.json +++ b/corpora/spoken-corpora/agender.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-1500-7", "Family": "Spoken corpora", "Description": "The speech corpus aGender contains speech sample recordings over public 
telephone lines with read and (semi-)spontaneous speech. Native German speakers called a voice portal from their private phone, and read text + answered some open questions. The purpose of the corpus is the automatic detection of gender and/or age (7 mixed classes ranging from 7 - 80 years). The corpus contains the voices of 945 German speakers (approx. minimum of 100 speakers per class), each delivering 18 speech items in up to six different sessions.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["47 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/air-traffic-ctrl.json b/corpora/spoken-corpora/air-traffic-ctrl.json index e034e99..c42d8bd 100644 --- a/corpora/spoken-corpora/air-traffic-ctrl.json +++ b/corpora/spoken-corpora/air-traffic-ctrl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-CCA1-0", "Family": "Spoken corpora", "Description": "This corpus contains recordings of communication between air traffic controllers and pilots.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CC BY-NC-ND 3.0", "Size": ["20 hours"], "Annotation": ["speaker information"], diff --git a/corpora/spoken-corpora/alcebla.json b/corpora/spoken-corpora/alcebla.json index 1b5b3cf..88aa06a 100644 --- a/corpora/spoken-corpora/alcebla.json +++ b/corpora/spoken-corpora/alcebla.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-50DD-D", "Family": "Spoken corpora", "Description": "This corpus contains Speech tasks performed by bilingual children.", - "Languages": ["deu", "spa"], + "Language": ["deu", "spa"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["72 hours"], "Annotation": ["orthographic and phonetic transcription"], diff --git a/corpora/spoken-corpora/ananas-mt.json b/corpora/spoken-corpora/ananas-mt.json index bc31b97..fb626f2 100644 --- 
a/corpora/spoken-corpora/ananas-mt.json +++ b/corpora/spoken-corpora/ananas-mt.json @@ -3,7 +3,7 @@ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/716-corpus-ananas-multilingue-ananasmt", "Family": "Spoken corpora", "Description": "This corpus contains TV-broadcasts and elicited dialogues.", - "Languages": ["eng", "ita", "spa"], + "Language": ["eng", "ita", "spa"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/spoken-corpora/arabic-speech.json b/corpora/spoken-corpora/arabic-speech.json index 049ba3e..fab2202 100644 --- a/corpora/spoken-corpora/arabic-speech.json +++ b/corpora/spoken-corpora/arabic-speech.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.14106/2561", "Family": "Spoken corpora", "Description": "This corpus is available for download from the Oxford Text Archive.", - "Languages": ["ara"], + "Language": ["ara"], "Licence": "CC BY 4.0", "Size": [], "Annotation": [], diff --git a/corpora/spoken-corpora/asr-artur.json b/corpora/spoken-corpora/asr-artur.json index 588c4cb..338e6f6 100644 --- a/corpora/spoken-corpora/asr-artur.json +++ b/corpora/spoken-corpora/asr-artur.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1772", "Family": "Spoken corpora", "Description": "This corpus was designed for the needs of developing automatic speech recognition for the Slovenian language. The complete database includes 1,067 hours of speech, of which 884 hours are transcribed, while the remaining 183 hours are recordings only.\nThe audio files are available in a separate repository entry. Transcriptions are available in the original TRS format of the Transcriber 1.5.1 tool which was used for making the transcriptions. All transcriptions were made manually or manually corrected.\nThe data are structured as follows:
        1. Artur-B, read speech, 573 hours in total.\nIt includes: (1a) Artur-B-Brani, 485 hours: Readings of sentences which were pre-selected from a 10% increment in the Gigafida 2.0 corpus. The sentences were chosen in such a way that they reflect the natural or the actual distribution of triphones in the words. They were distributed between 1,000 speakers, so that we recorded approx. 30 min in read form from each speaker. The speakers were balanced according to gender, age, region, and a small proportion of speakers were non-native speakers of Slovene. Each sentence is its own audio file and has a corresponding transcription file. (1b) Artur-B-Crkovani, 10 hours: Spellings. Speakers were asked to spell abbreviations and personal names and surnames, all chosen so that all Slovene letters were covered, plus the most common foreign letters. (1c) Artur-B-Studio, 51 hours: Designed for the development of speech synthesis. The sentences were read in a studio by a single speaker. Each sentence is its own audio file and has a corresponding transcription file. (1d) Artur-B-Izloceno, 27 hours: The recordings include different types of errors, typically, incorrect reading of sentences or a noisy environment.
        2. (2) Artur-J, public speech, 62 hours in total.\nIt includes: (2a) Artur-J-Splosni, 62 hours: media recordings, online recordings of conferences, workshops, education videos, etc.
        3. (3) Artur-N, private speech, 74 hours in total.\nIt includes: (3a) Artur-N-Obrazi, 6 hours: Speakers were asked to describe faces on pictures. Designed for a face-description domain-specific speech recognition. (3b) Artur-N-PDom, 7 hours: Speakers were asked to read pre-written sentences, as well as to express instructions for a potential smart-home system freely. Designed for a smart-home domain-specific speech recognition. (3c) Artur-N-Prosti, 61 hours: Monologues and dialogues between two persons, recorded for the purposes of the Artur database creation. Speakers were asked to conversate or explain freely on casual topics.
        4. (4) Artur-P, parliamentary speech, 201 hours in total.\nIt includes: (4a) Artur-P-SejeDZ, 201 hours: Speech from the Slovene National Assembly.
        5. \nThe corpus is available for download from the CLARIN.SI repository.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["884 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/asr-parlaspeech-hr.json b/corpora/spoken-corpora/asr-parlaspeech-hr.json index 736ea3d..5ae8840 100644 --- a/corpora/spoken-corpora/asr-parlaspeech-hr.json +++ b/corpora/spoken-corpora/asr-parlaspeech-hr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1494", "Family": "Spoken corpora", "Description": "This corpus is built from parliamentary proceedings available in the Croatian part of the ParlaMint corpus and the parliamentary recordings available from the Croatian Parliament's YouTube channel. The corpus consists of segments 8-20 seconds in length. There are two transcripts available: the original one, and the one normalised via a simple rule-based normaliser. Each of the transcripts contains word-level alignments to the recordings. Each segment has a reference to the ParlaMint 2.1 corpus via utterance IDs.\nThere is speaker information available for 381,849 segments, i.e., 95% of all segments. Speaker information consists of all the speaker information available from the ParlaMint 2.1 corpus (name, party, gender, age, status, role). There are all together 309 speakers in the dataset.\nThe dataset is divided into a training, a development, and a testing subset. Development data consist of 500 segments coming from the 5 most frequent speakers, with the goal of not losing speaker variety on dev data. Test data consist of 513 segments that come from 3 male (258 segments) and 3 female speakers (255 segments). There are no segments coming from the 6 test speakers in the two remaining subsets. The 22,076 instances not having speaker information are not assigned to any of the three subsets. 
The remaining 380,836 instances form the training set.\nThis corpus is available for download from the CLARIN.SI repository.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "CC BY-SA 4.0", "Size": ["1816 hours", "403925 entries"], "Annotation": ["normalised transcriptions", "speaker metadata", "word-level alignment to the recordings"], diff --git a/corpora/spoken-corpora/australiendeutsch.json b/corpora/spoken-corpora/australiendeutsch.json index 2adc8d8..d1bf20e 100644 --- a/corpora/spoken-corpora/australiendeutsch.json +++ b/corpora/spoken-corpora/australiendeutsch.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-BCF9-BE93-5F01-E", "Family": "Spoken corpora", "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["330,000 words", "65 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/babel.json b/corpora/spoken-corpora/babel.json index 92e392d..c52db3c 100644 --- a/corpora/spoken-corpora/babel.json +++ b/corpora/spoken-corpora/babel.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/hungarian-babel/9c27b9d481b611e2892a000c29bfc0d46a94c6ce19b843b3a452b382e2e64832/", "Family": "Spoken corpora", "Description": "This corpus contains various elicited speech tasks.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": [], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/bas-alcohol.json b/corpora/spoken-corpora/bas-alcohol.json index 8354993..b6fe1e7 100644 --- a/corpora/spoken-corpora/bas-alcohol.json +++ b/corpora/spoken-corpora/bas-alcohol.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-88E5-3", "Family": "Spoken corpora", 
"Description": "This corpus contains recordings of 162 speakers while being sober and intoxicated. Beginning with version 3, this corpus edition also contains an emuR compatible database version of the corpus (with a minor bugfix in the database in version 3.1).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["94 hours"], "Annotation": ["orthographically transcribed", "phonemic", "user state"], diff --git a/corpora/spoken-corpora/bas-regional-juves.json b/corpora/spoken-corpora/bas-regional-juves.json index 207d41e..f131cf1 100644 --- a/corpora/spoken-corpora/bas-regional-juves.json +++ b/corpora/spoken-corpora/bas-regional-juves.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0004-AE1D-9", "Family": "Spoken corpora", "Description": "The corpus contains both read and non-scripted German utterances. It comprises the original RVG prompts (telephone numbers, sentences, commands, digits, etc.) plus spellings, date and time expressions, and free form responses to questions, e.g. \"What are you wearing?\", \"How did you get here?\", etc. The speakers were adolescents between 13 and 20 years of age, recruited in public schools in Munich and the suburbs. More than 95% of the speakers have German as their mother language, and almost all of them attended school in Bavaria; 89 of them were male and 93 female.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["100 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/bas-siemens.json b/corpora/spoken-corpora/bas-siemens.json index 89aeea3..41f7ac2 100644 --- a/corpora/spoken-corpora/bas-siemens.json +++ b/corpora/spoken-corpora/bas-siemens.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0002-1303-5", "Family": "Spoken corpora", "Description": "This is a corpus of spontaneous, relatively casual dialogues in German. 
Each pair of dialogue partners is recorded conversing under real-noise conditions (in a noisy cafeteria and in a car going at different velocities), as well as in a studio at various levels of lombard noise played directly into the subjects' ears.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["24 hours"], "Annotation": ["Turn segmentation"], diff --git a/corpora/spoken-corpora/bas-sl-recog.json b/corpora/spoken-corpora/bas-sl-recog.json index 0ea53c3..b121537 100644 --- a/corpora/spoken-corpora/bas-sl-recog.json +++ b/corpora/spoken-corpora/bas-sl-recog.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-D8A5-2", "Family": "Spoken corpora", "Description": "The contains both isolated and continuous utterances of various signers. Since we use a vision-based approach for sign language recognition the corpus was recorded on video. For quick random access to individual frames, each video clip is stored as a sequence of images. The vocabulary comprises 450 basic signs in German Sign Language (DGS) representing different word types. Based on this vocabulary, overall 780 sentences were constructed.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["55 hours"], "Annotation": ["Sign language"], diff --git a/corpora/spoken-corpora/bas-smartweb-video.json b/corpora/spoken-corpora/bas-smartweb-video.json index 37dd994..8f87282 100644 --- a/corpora/spoken-corpora/bas-smartweb-video.json +++ b/corpora/spoken-corpora/bas-smartweb-video.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C059-C", "Family": "Spoken corpora", "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. 
The recordings include 156 field recordings using a hand-held UMTS device (one person, SmartWeb Handheld Corpus SHC), 99 field recordings with video capture of the primary speaker and a secondary speaker (SmartWeb Video Corpus SVC) as well as 36 mobile recordings performed on a BMW motorbike (one speaker, SmartWeb Motorbike Corpus SMC).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["16.2 hours"], "Annotation": ["orthographically transcribed", "user state"], diff --git a/corpora/spoken-corpora/bas-verbmobil-emo.json b/corpora/spoken-corpora/bas-verbmobil-emo.json index 2b0d553..31ecfc6 100644 --- a/corpora/spoken-corpora/bas-verbmobil-emo.json +++ b/corpora/spoken-corpora/bas-verbmobil-emo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0004-2BCC-7", "Family": "Spoken corpora", "Description": "This database contains speech signals of dialogues in which a subject was recorded during a conversation via a spontaneous speech translation system. The response of the system was designed to invoke emotions (e.g. anger) in the subjects. It is part of the larger Verbmobil 2 speech data collection. Starting from BAS Clarin Respository version 2, the database is also distributed as an emuR comptatible emu database.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["17 hours"], "Annotation": ["orthographically transcribed", "emotions"], diff --git a/corpora/spoken-corpora/bas-ziptel.json b/corpora/spoken-corpora/bas-ziptel.json index 4909caa..7268d70 100644 --- a/corpora/spoken-corpora/bas-ziptel.json +++ b/corpora/spoken-corpora/bas-ziptel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0003-1E02-A", "Family": "Spoken corpora", "Description": "The ZipTel telephone speech database contains recordings of people applying for a SpeechDat prompt sheet via telephone. 
For the SpeechDat data collection, calls for participation were published in \"phone\", the customer magazine of the mobile telephone provider \"e-plus\", and in numerous newspapers all over Germany. In these calls, a telephone number was given where callers could order a SpeechDat prompt sheet. The calls were recorded by an automatic telephone server; callers were asked to provide name, address and telephone number. The ZipTel telephone speech database consists of 1957 recording sessions with a total of 7746 signal files. A recording session corresponds to one phone call, each signal file contains a single recorded utterance from the recording session. ", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["14 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/bcms.json b/corpora/spoken-corpora/bcms.json index 59a3ae9..3de0f46 100644 --- a/corpora/spoken-corpora/bcms.json +++ b/corpora/spoken-corpora/bcms.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1750", "Family": "Spoken corpora", "Description": "This corpus of heritage Bosnian/Croatian/Montenegrin/Serbian (BCMS) consists of elicited conversations (map tasks) by 29 second-generation BCMS speakers originating from different regions of former Yugoslavia and living in German-speaking Switzerland. The corpus is suited for researchers of heritage BCMS, as well as students and teachers of BCMS living in diaspora.\nThe corpus contains 30 turn-aligned transcripts with an average length of 6 minutes. The texts are annotated with the CLASSLA pipeline on the levels lemmatisation, MULTEXT-East Version 6 morphosyntactic descriptions, Universal Dependencies part-of-spech and morphological features. The corpus is enriched with corpus-specific annotations of truncations, elongations, stutter and code-switches. 
It is distributed in source TEI and derived vertical formats.\nThe corpus is available for download from CLARIN.SI as well as through the noSketchEngine and KonText concordancers.", - "Languages": ["bos", "hrv", "cnr", "srp"], + "Language": ["bos", "hrv", "cnr", "srp"], "Licence": "CC BY-NC-SA 4.0", "Size": ["12,988 tokens"], "Annotation": ["PoS-tagged (UD)", "MSD-tagged (UD & MULTEXT-East)", "lemmatised", "annotated with corpus-specific annotations"], diff --git a/corpora/spoken-corpora/bea.json b/corpora/spoken-corpora/bea.json index 5cbdffd..7e1b4f7 100644 --- a/corpora/spoken-corpora/bea.json +++ b/corpora/spoken-corpora/bea.json @@ -3,7 +3,7 @@ "URL": "http://metashare.nytud.hu/repository/browse/bea-hungarian-spontaneous-speech-database/808c4c306ba911e2aa7c68b599c26a062458e40404d44e4087901b5b720d2765/", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous speech.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "restricted", "Size": ["465 recordings"], "Annotation": ["partial transcription"], diff --git a/corpora/spoken-corpora/bel-tv-debates.json b/corpora/spoken-corpora/bel-tv-debates.json index d2564ea..a25ca67 100644 --- a/corpora/spoken-corpora/bel-tv-debates.json +++ b/corpora/spoken-corpora/bel-tv-debates.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-03FA-9CB0-5E33-8E01-8", "Family": "Spoken corpora", "Description": "This corpus contains broadcast TV debates.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["10 hours"], "Annotation": ["orthographically transcribed", "lemmatized"], diff --git a/corpora/spoken-corpora/berliner-wende.json b/corpora/spoken-corpora/berliner-wende.json index 130a70a..7443704 100644 --- a/corpora/spoken-corpora/berliner-wende.json +++ b/corpora/spoken-corpora/berliner-wende.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4", "Family": "Spoken corpora", "Description": "This corpus contains narrative interviews on German reunification.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["260,000 words", "28 hours"], "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/bielefeld-speech-and-gesture.json b/corpora/spoken-corpora/bielefeld-speech-and-gesture.json index f529633..31d93bd 100644 --- a/corpora/spoken-corpora/bielefeld-speech-and-gesture.json +++ b/corpora/spoken-corpora/bielefeld-speech-and-gesture.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-DEC1-C", "Family": "Spoken corpora", "Description": "The corpus is made up of 25 dialogs of interlocutors (50), who engage in a spatial communication task combining direction-giving and sight description. Six of those dialogues with data only from the direction giver are available including audio (*.wav) and video (*.mp4) data. 
There are 1764 isolated gestures in the corpus", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["9881 words"], "Annotation": ["Annotations of gestures and speech-gesture referents"], diff --git a/corpora/spoken-corpora/bigbrother.json b/corpora/spoken-corpora/bigbrother.json index 809a060..2783f4d 100644 --- a/corpora/spoken-corpora/bigbrother.json +++ b/corpora/spoken-corpora/bigbrother.json @@ -3,7 +3,7 @@ "URL": "http://www.tekstlab.uio.no/nota/bigbrother/english.html", "Family": "Spoken corpora", "Description": "This corpus contains recordings and transcripts from the Norwegian Big Brother in 2001.\nThe corpus is available through a Tekstlab concordancer.", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CLARIN ACA", "Size": ["440,300 tokens"], "Annotation": ["orthographically transcribed", "msd-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/bio-reise.json b/corpora/spoken-corpora/bio-reise.json index 2cda460..ef0c452 100644 --- a/corpora/spoken-corpora/bio-reise.json +++ b/corpora/spoken-corpora/bio-reise.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-BD7C-3EF5-0B01-4", "Family": "Spoken corpora", "Description": "This corpus contains narrative and biographic interviews.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["50,000 words", "6 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/bits.json b/corpora/spoken-corpora/bits.json index 0ad82b2..d1e99ac 100644 --- a/corpora/spoken-corpora/bits.json +++ b/corpora/spoken-corpora/bits.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C2C0-4", "Family": "Spoken corpora", "Description": "This is a corpus for speech synthesis using concatenative technique.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", 
"Size": ["16.5 hours"], "Annotation": ["orthographically transcribed", "phonetic", "phonemic", "prosodic"], diff --git a/corpora/spoken-corpora/border-karelia.json b/corpora/spoken-corpora/border-karelia.json index 0e14dc0..83cc9f0 100644 --- a/corpora/spoken-corpora/border-karelia.json +++ b/corpora/spoken-corpora/border-karelia.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073033", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThe corpus is available for download from FIN-CLARIN.", - "Languages": ["fin", "krl"], + "Language": ["fin", "krl"], "Licence": "CC-BY", "Size": ["120 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/boston-u-radio.json b/corpora/spoken-corpora/boston-u-radio.json index 5fde835..a162d79 100644 --- a/corpora/spoken-corpora/boston-u-radio.json +++ b/corpora/spoken-corpora/boston-u-radio.json @@ -3,7 +3,7 @@ "URL": "https://catalog.ldc.upenn.edu/LDC96S36", "Family": "Spoken corpora", "Description": "This corpus contains recordings and texts from radio news.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN RES", "Size": ["7 hours"], "Annotation": ["PoS-tagged", "phonetic alignment", "prosodic markers"], diff --git a/corpora/spoken-corpora/brothers.json b/corpora/spoken-corpora/brothers.json index 00111b7..2da2853 100644 --- a/corpora/spoken-corpora/brothers.json +++ b/corpora/spoken-corpora/brothers.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-55C3-3", "Family": "Spoken corpora", "Description": "This corpus contains recordings of pairs of brothers between the ages of 19 and 31. The native and recorded language is German. Recordings consist of minimal pairs in carrier sentences, a different set of sentences aimed at elicitating the full range of German vowels ('Berliner Sätze'), and a spontaneous dialogue about a TV-series. Recordings were made via a table microphone (studio quality) and via telephone (telephone quality). 
", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["1.5 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/buckeye.json b/corpora/spoken-corpora/buckeye.json index 27a4e9a..ce6ebed 100644 --- a/corpora/spoken-corpora/buckeye.json +++ b/corpora/spoken-corpora/buckeye.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11041/sldr000776", "Family": "Spoken corpora", "Description": "This corpus contains an interview.\nThe corpus is available for download from ORTOLANG.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN RES", "Size": [], "Annotation": ["phonetic labels"], diff --git a/corpora/spoken-corpora/budapest-socioling.json b/corpora/spoken-corpora/budapest-socioling.json index b929a3e..d93e8e6 100644 --- a/corpora/spoken-corpora/budapest-socioling.json +++ b/corpora/spoken-corpora/budapest-socioling.json @@ -3,7 +3,7 @@ "URL": "http://buszi.nytud.hu/", "Family": "Spoken corpora", "Description": "This corpus contains sociolinguistic interviews conducted with 50 individuals.\nThe corpus is available for download and through a dedicated concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "CLARIN RES", "Size": ["270,000 words"], "Annotation": ["MSD-tagged", "spoken language phenomena (hesitation, consonant drops)"], diff --git a/corpora/spoken-corpora/cans.json b/corpora/spoken-corpora/cans.json index b6130cf..da5448a 100644 --- a/corpora/spoken-corpora/cans.json +++ b/corpora/spoken-corpora/cans.json @@ -3,7 +3,7 @@ "URL": "http://tekstlab.uio.no/norskiamerika/english/corpus.html", "Family": "Spoken corpora", "Description": "This corpus contains interviews, conversations. 
Norwegian and Swedish dialects in America.\nThe corpus is available through a Tekstlab concordancer.", - "Languages": ["nor", "swe"], + "Language": ["nor", "swe"], "Licence": "CLARIN ACA", "Size": ["251,000 tokens"], "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/ci-articulation.json b/corpora/spoken-corpora/ci-articulation.json index 4186ba7..fc206df 100644 --- a/corpora/spoken-corpora/ci-articulation.json +++ b/corpora/spoken-corpora/ci-articulation.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-8B63-3", "Family": "Spoken corpora", "Description": "This corpus contains speech recordings of normal hearing speakers and speakers equipped with Cochlear Implants (CI). Speech data were collected with the software SpeechRecorder, for each recording a BPF file was generated (*.par).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["5 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/clapi.json b/corpora/spoken-corpora/clapi.json index 0101907..894ffb8 100644 --- a/corpora/spoken-corpora/clapi.json +++ b/corpora/spoken-corpora/clapi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11403/CLAPI", "Family": "Spoken corpora", "Description": "This is a collection containing around 40 corpora which contain social interactions in different contexts: professional, private, institutional, commercial, medical, and educational situations.\nMost of the corpora can be downloaded and queried through a dedicated concordancer.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC BY-NC-SA 4.0", "Size": ["323,595 words"], "Annotation": [], diff --git a/corpora/spoken-corpora/clips-mt-manual.json b/corpora/spoken-corpora/clips-mt-manual.json index 153bdb3..12e2bc7 100644 --- a/corpora/spoken-corpora/clips-mt-manual.json +++ b/corpora/spoken-corpora/clips-mt-manual.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11022/1009-0000-0000-A9EE-6", "Family": "Spoken corpora", "Description": "This is a sub-corpus of the original Italian CLIPS corpus (Corpora e Lessici dell'Italiano Parlato e Scritto) that is manually annotated and covers only 15 maptask dialogues recorded in 15 locations by local speaker pairs. this corpus contains 3228 inspected and partially repaired WAV signal files, each containing one dialogue turn (*.wav), 3228 corrected original CLIPS annotation files (*.acs, *.phn, *.std, *.wrd), 3228 BAS Partitur files containing the annotation tiers ORT, KAN and SAP (*.par), 3228 EMU database annotation files (*.vot, *.hlb) covering 30 maptask dialogues performed by 30 speakers (each speaker pair performing two different map tasks) recorded in 15 different locations in Italy in 2000-2004.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CLARIN ACA", "Size": ["3 hours"], "Annotation": ["orthographically transcribed", "phonemic", "phonetic"], diff --git a/corpora/spoken-corpora/clips.json b/corpora/spoken-corpora/clips.json index 6667156..14be7db 100644 --- a/corpora/spoken-corpora/clips.json +++ b/corpora/spoken-corpora/clips.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-865", "Family": "Spoken corpora", "Description": "This corpus contains speech from 15 different cities in Italy.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["100 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json b/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json index 172f44c..79e99a4 100644 --- a/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json +++ b/corpora/spoken-corpora/consonant-cochlear-patients-diachronic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A99C-2", "Family": "Spoken corpora", "Description": "This corpus contains diachronic speech recordings from three cochlear implant (CI) users. 
For data used in the corresponding synchronic study, please refer to the CI_2 corpora. This corpus contains recordings used for the analysis of the temporal dynamics of the consonant cluster /ʃtr/.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["14 min"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/consonant-cochlear-patients.json b/corpora/spoken-corpora/consonant-cochlear-patients.json index 11b161f..de8b68a 100644 --- a/corpora/spoken-corpora/consonant-cochlear-patients.json +++ b/corpora/spoken-corpora/consonant-cochlear-patients.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AF40-2", "Family": "Spoken corpora", "Description": "This corpous contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["2 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/contemporary-french.json b/corpora/spoken-corpora/contemporary-french.json index 9a1b46c..f870c2e 100644 --- a/corpora/spoken-corpora/contemporary-french.json +++ b/corpora/spoken-corpora/contemporary-french.json @@ -3,7 +3,7 @@ "URL": "https://hdl.handle.net/11403/cefc-orfeo/v1", "Family": "Spoken corpora", "Description": "This corpus contains debates, classroom interactions, literary and scientific texts, regional and national press, etc.\nThe corpus is available through a dedicated concordancer.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY 4.0", "Size": ["10 million words", "350 hours"], "Annotation": ["orthographically aligned", "PoS-tagged"], diff --git a/corpora/spoken-corpora/corpus-avip-api.json b/corpora/spoken-corpora/corpus-avip-api.json index 241be44..380e1c8 100644 --- a/corpora/spoken-corpora/corpus-avip-api.json +++ b/corpora/spoken-corpora/corpus-avip-api.json @@ -3,7 +3,7 @@ "URL": 
"http://www.parlaritaliano.it/api/", "Family": "Spoken corpora", "Description": "This corpus contains quasi-spontaneous dialogues (a map task).\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": [], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/corpus-lip.json b/corpora/spoken-corpora/corpus-lip.json index b40c58a..4f711c8 100644 --- a/corpora/spoken-corpora/corpus-lip.json +++ b/corpora/spoken-corpora/corpus-lip.json @@ -3,7 +3,7 @@ "URL": "http://badip.uni-graz.at/it/corpus-lip/descrizione", "Family": "Spoken corpora", "Description": "This corpus is available through a dedicated concordancer.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["490,000 words"], "Annotation": [], diff --git a/corpora/spoken-corpora/corpus-lips.json b/corpora/spoken-corpora/corpus-lips.json index 6f5954a..ebedd0f 100644 --- a/corpora/spoken-corpora/corpus-lips.json +++ b/corpora/spoken-corpora/corpus-lips.json @@ -3,7 +3,7 @@ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/653-corpus-lips", "Family": "Spoken corpora", "Description": "This is a L2-learner corpus.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": ["700,000 words", "100 hours"], "Annotation": ["PoS-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/cosi.json b/corpora/spoken-corpora/cosi.json index 0d707e5..4d331a8 100644 --- a/corpora/spoken-corpora/cosi.json +++ b/corpora/spoken-corpora/cosi.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-5225-A", "Family": "Spoken corpora", "Description": "This corpus contains lectures in Portuguese with simultaneous interpretation in English.", - "Languages": ["por", "eng"], + "Language": ["por", "eng"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["6 hours"], "Annotation": [], diff 
--git a/corpora/spoken-corpora/czech-malach.json b/corpora/spoken-corpora/czech-malach.json index 6377ca7..1073465 100644 --- a/corpora/spoken-corpora/czech-malach.json +++ b/corpora/spoken-corpora/czech-malach.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1912", "Family": "Spoken corpora", "Description": "This corpus contains interviews with survivors of the Holocaust.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces", "eng", "fra", "deu", "spa"], + "Language": ["ces", "eng", "fra", "deu", "spa"], "Licence": "CC BY-NC-ND 4.0", "Size": ["592 hours"], "Annotation": ["manual annotations of selected topics and interviews' metadata"], diff --git a/corpora/spoken-corpora/de-hochlautung.json b/corpora/spoken-corpora/de-hochlautung.json index a3440e3..88cdabf 100644 --- a/corpora/spoken-corpora/de-hochlautung.json +++ b/corpora/spoken-corpora/de-hochlautung.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C35C-4849-7B01-7", "Family": "Spoken corpora", "Description": "This corpus contains broadcasts in standard German.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["10,000 words", "2 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/de-koenig.json b/corpora/spoken-corpora/de-koenig.json index af4482e..7504304 100644 --- a/corpora/spoken-corpora/de-koenig.json +++ b/corpora/spoken-corpora/de-koenig.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C489-C64D-6D01-9", "Family": "Spoken corpora", "Description": "This corpus contains interviews and elicited speech in standard German\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": 
["50,000 words", "6 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/de-mundarten-ddr.json b/corpora/spoken-corpora/de-mundarten-ddr.json index 1977d5a..a24aea0 100644 --- a/corpora/spoken-corpora/de-mundarten-ddr.json +++ b/corpora/spoken-corpora/de-mundarten-ddr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-BE28-4317-5D01-B", "Family": "Spoken corpora", "Description": "This corpus contains interviews and elicited speech in German dialects.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["German, (some Sorbian)"], + "Language": ["German, (some Sorbian)"], "Licence": "CLARIN RES", "Size": ["212,000 words", "385 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/de-mundarten-ost.json b/corpora/spoken-corpora/de-mundarten-ost.json index 6369889..4e0db57 100644 --- a/corpora/spoken-corpora/de-mundarten-ost.json +++ b/corpora/spoken-corpora/de-mundarten-ost.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C68C-5D03-EB01-7", "Family": "Spoken corpora", "Description": "This corpus contains interviews and elicited speech in German dialects.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["838,000 words", "461 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/de-mundarten-zwirner.json b/corpora/spoken-corpora/de-mundarten-zwirner.json index 9d081e5..bf27330 100644 --- a/corpora/spoken-corpora/de-mundarten-zwirner.json +++ b/corpora/spoken-corpora/de-mundarten-zwirner.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-D40A-3CEE-B901-4", 
"Family": "Spoken corpora", "Description": "This corpus contains interviews and elicited speech in German dialects.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["German, (some Frisian and Dutch)"], + "Language": ["German, (some Frisian and Dutch)"], "Licence": "CLARIN RES", "Size": ["4 million words", "1076 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/de-pfeffer.json b/corpora/spoken-corpora/de-pfeffer.json index 96574eb..ec57bf1 100644 --- a/corpora/spoken-corpora/de-pfeffer.json +++ b/corpora/spoken-corpora/de-pfeffer.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C9D0-78FE-3C01-2", "Family": "Spoken corpora", "Description": "This corpus contains interviews in regional varieties of German.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["646,000 words", "80 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/dialekt.json b/corpora/spoken-corpora/dialekt.json index 102bba4..62b07fc 100644 --- a/corpora/spoken-corpora/dialekt.json +++ b/corpora/spoken-corpora/dialekt.json @@ -3,7 +3,7 @@ "URL": "https://wiki.korpus.cz/doku.php/en:cnk:dialekt", "Family": "Spoken corpora", "Description": "This corpus contains traditional dialectological material, mostly unprepared monologue-type speech.\nThe corpus is available download (upon request) and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Academic Licence Agreement for Czech National Corpus Data", "Size": ["100,000 words"], "Annotation": ["orthographically and phonetically (dialect features) transcribed", "MSD-tagged", "lemmatised"], diff --git 
a/corpora/spoken-corpora/dialogstrukturen.json b/corpora/spoken-corpora/dialogstrukturen.json index 167f4c7..ea74b49 100644 --- a/corpora/spoken-corpora/dialogstrukturen.json +++ b/corpora/spoken-corpora/dialogstrukturen.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C0BE-562F-C101-E", "Family": "Spoken corpora", "Description": "This corpus contains authentic interaction from various domains.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["140,000 words", "15 hours"], "Annotation": ["orthographically transcribed", "intonation", "lemmatised", "PoS-tagged", "time alignment"], diff --git a/corpora/spoken-corpora/doc-patient-ahus.json b/corpora/spoken-corpora/doc-patient-ahus.json index dbe814b..f4cc3ec 100644 --- a/corpora/spoken-corpora/doc-patient-ahus.json +++ b/corpora/spoken-corpora/doc-patient-ahus.json @@ -3,7 +3,7 @@ "URL": "https://www.hf.uio.no/iln/english/about/organization/text-laboratory/projects/doctor-patient/index.html", "Family": "Spoken corpora", "Description": "This corpus contains doctor-patient conversations.\nThe corpus is available through a Tekstlab concordancer (account needed).", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CLARIN ACA", "Size": ["958,830 tokens"], "Annotation": ["orthographically transcribed", "MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/elfa.json b/corpora/spoken-corpora/elfa.json index 7ce57b6..6f22926 100644 --- a/corpora/spoken-corpora/elfa.json +++ b/corpora/spoken-corpora/elfa.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201403262", "Family": "Spoken corpora", "Description": "This corpus contains recorded lectures and seminars.\nThe corpus is available for download from FIN-CLARIN.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN RES, MS-C-No ReD-ND-FF", "Size": ["13 hours"], "Annotation": 
[], diff --git a/corpora/spoken-corpora/emigranten-israel-wiener.json b/corpora/spoken-corpora/emigranten-israel-wiener.json index 06515ae..2d6f607 100644 --- a/corpora/spoken-corpora/emigranten-israel-wiener.json +++ b/corpora/spoken-corpora/emigranten-israel-wiener.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C42A-423C-2401-D", "Family": "Spoken corpora", "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["225,000 words", "51 hours"], "Annotation": ["PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/emigranten-israel.json b/corpora/spoken-corpora/emigranten-israel.json index b4fea2b..a16a781 100644 --- a/corpora/spoken-corpora/emigranten-israel.json +++ b/corpora/spoken-corpora/emigranten-israel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C3A7-393A-8A01-3", "Family": "Spoken corpora", "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["232,000 words", "285 hours"], "Annotation": ["orthographically transcribed", "lemma", "PoS-tagged", "time alignment"], diff --git a/corpora/spoken-corpora/eslora.json b/corpora/spoken-corpora/eslora.json index b3f6934..9ce1f26 100644 --- a/corpora/spoken-corpora/eslora.json +++ b/corpora/spoken-corpora/eslora.json @@ -3,7 +3,7 @@ "URL": "http://eslora.usc.es/", "Family": "Spoken corpora", "Description": "This corpus consists of spontaneous conversations and semi-structured interviews recorded in Galicia between 2007 and 2015, which were orthographically transcribed and 
manually aligned to the audio files. The transcripts have been morphologically tagged and lemmatized with the statistical PoS-tagger XIADA.\nThe corpus can be browsed via a dedicated search engine. The multiple functions of the search engine are fully described in the User Guide.", - "Languages": ["spa"], + "Language": ["spa"], "Licence": "academic, non-commercial", "Size": ["83 documents", "768,005 words", "898,914 tokens"], "Annotation": ["manual alignment", "orthographic transcription", "PoS-tagging", "lemmatisation"], diff --git a/corpora/spoken-corpora/est-dialect.json b/corpora/spoken-corpora/est-dialect.json index cad524c..95f9c17 100644 --- a/corpora/spoken-corpora/est-dialect.json +++ b/corpora/spoken-corpora/est-dialect.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00076L", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN ACA", "Size": ["1.3 million words"], "Annotation": ["phonetically transcribed", "MSD-tagged", "partly syntactically parsed"], diff --git a/corpora/spoken-corpora/est-emotional-speech.json b/corpora/spoken-corpora/est-emotional-speech.json index 19b8637..a1f8b28 100644 --- a/corpora/spoken-corpora/est-emotional-speech.json +++ b/corpora/spoken-corpora/est-emotional-speech.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/3-00-0000-0000-0000-0001AL", "Family": "Spoken corpora", "Description": "This corpus contains read sentences that express anger, joy and sadness, or are neutral.\nTThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY", "Size": ["1234 sentences"], "Annotation": [], diff --git a/corpora/spoken-corpora/est-spontaneous-speech.json b/corpora/spoken-corpora/est-spontaneous-speech.json index dd86d06..4493d8e 100644 --- 
a/corpora/spoken-corpora/est-spontaneous-speech.json +++ b/corpora/spoken-corpora/est-spontaneous-speech.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00154L", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous speech by speakers with different dialectological and social backgrounds.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CLARIN_RES", "Size": ["635,000 words", "90 hours"], "Annotation": ["orthographically and phonetically transcribed, syllables, prosodic feet, intonation phrases, changes in voice quality"], diff --git a/corpora/spoken-corpora/exmeralda-demo.json b/corpora/spoken-corpora/exmeralda-demo.json index d023603..9f35898 100644 --- a/corpora/spoken-corpora/exmeralda-demo.json +++ b/corpora/spoken-corpora/exmeralda-demo.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-4F70-A", "Family": "Spoken corpora", "Description": "This corpus is a demo of the EXMARaLDA system.\nThe corpus is available for download from a CLARIN-D repository.", - "Languages": ["deu", "eng", "fra", "spa", "tur", "pol", "vie", "swe", "nor", "ita", "rus", "afr", "por"], + "Language": ["deu", "eng", "fra", "spa", "tur", "pol", "vie", "swe", "nor", "ita", "rus", "afr", "por"], "Licence": "HZSK-PUB (public, non-commercial only)", "Size": ["2 hours"], "Annotation": ["suprasegmental information", "accentuation/stress marking"], diff --git a/corpora/spoken-corpora/fadac.json b/corpora/spoken-corpora/fadac.json index feee8b9..d43464e 100644 --- a/corpora/spoken-corpora/fadac.json +++ b/corpora/spoken-corpora/fadac.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-A0D3-C", "Family": "Spoken corpora", "Description": "This corpus contains informal interviews.", - "Languages": ["fao", "dan"], + "Language": ["fao", "dan"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": [], "Annotation": 
["EXMARaLDA"], diff --git a/corpora/spoken-corpora/fin-broadcast.json b/corpora/spoken-corpora/fin-broadcast.json index 79f69aa..8cb88c1 100644 --- a/corpora/spoken-corpora/fin-broadcast.json +++ b/corpora/spoken-corpora/fin-broadcast.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201403265", "Family": "Spoken corpora", "Description": "This corpus contains radio and TV broadcasts.\nThe corpus is available for download from FIN-CLARIN and for online querying through the LAT-platform.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN RES", "Size": ["18 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/fin-dialect-syntax.json b/corpora/spoken-corpora/fin-dialect-syntax.json index 55961dc..e1a7252 100644 --- a/corpora/spoken-corpora/fin-dialect-syntax.json +++ b/corpora/spoken-corpora/fin-dialect-syntax.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014052716", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThe corpus is available for online querying through the LAT platform and through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY-NC-ND", "Size": ["1.2 million words"], "Annotation": ["MSD-tagged"], diff --git a/corpora/spoken-corpora/fin-parliament.json b/corpora/spoken-corpora/fin-parliament.json index 1f3c2c4..fd23bc5 100644 --- a/corpora/spoken-corpora/fin-parliament.json +++ b/corpora/spoken-corpora/fin-parliament.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407305", "Family": "Spoken corpora", "Description": "This corpus contains the proceedings of the Finnish Parliament.\nThe corpus is available through a dedicated webpage and through the concordancer Korp.", - "Languages": ["fin", "swe"], + "Language": ["fin", "swe"], "Licence": "CC-BY-NC-ND", "Size": ["22.5 million words"], "Annotation": [], diff --git a/corpora/spoken-corpora/followup-fin-dialects.json b/corpora/spoken-corpora/followup-fin-dialects.json index 
2ae38c1..e2e590f 100644 --- a/corpora/spoken-corpora/followup-fin-dialects.json +++ b/corpora/spoken-corpora/followup-fin-dialects.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073043", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThis corpus is available for online querying through the LAT-platform.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CLARIN RES", "Size": ["12,200 Hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/formtask.json b/corpora/spoken-corpora/formtask.json index 10966ba..7f632ea 100644 --- a/corpora/spoken-corpora/formtask.json +++ b/corpora/spoken-corpora/formtask.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0005-8535-9", "Family": "Spoken corpora", "Description": "This is a corpus of telephone conversations including prompted descriptions of typical forms (Berlin public transport ticket, invoices, Austrian parking tickets, newsstand receipts, money transfer forms) found in everyday life.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN PUB", "Size": ["24.5 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/forschung-gespr-de.json b/corpora/spoken-corpora/forschung-gespr-de.json index 8ccd416..fbad9e1 100644 --- a/corpora/spoken-corpora/forschung-gespr-de.json +++ b/corpora/spoken-corpora/forschung-gespr-de.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C1B2-A5E3-2A01-D", "Family": "Spoken corpora", "Description": "This corpus contains authentic interactions from various domains.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["2.3 million words", "230 hours"], "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/fra-parisien-2000.json 
b/corpora/spoken-corpora/fra-parisien-2000.json index 39cb6d8..effc7c3 100644 --- a/corpora/spoken-corpora/fra-parisien-2000.json +++ b/corpora/spoken-corpora/fra-parisien-2000.json @@ -3,7 +3,7 @@ "URL": "https://doi.org/10.34847/cocoon.8bc96a4e-9899-30e4-99be-c72d216eb38b", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/spoken-corpora/gamli.json b/corpora/spoken-corpora/gamli.json index 61d6389..da60dd1 100644 --- a/corpora/spoken-corpora/gamli.json +++ b/corpora/spoken-corpora/gamli.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/315", "Family": "Spoken corpora", "Description": "This is an ASR corpus for Icelandic oral histories.\nThe corpus contains 210 unique speakers, 90 women and 120 men (plus the interviewers: 14 men and 1 woman), but the total audio length with each individual speaker varies quite a lot with three men accounting for one third of the entire data. The age ranges from 38 to 99, but most of the speakers are 60+ (94.8%) and the average age of the speakers is 77 years. This ratio is unprecedented in all existing corpora for Icelandic speech (cf. 
4.8% of speakers in Samrómur are 60+) and makes Gamli an important addition to that collection.\nThe corpus is available for download from the CLARIN-IS repository.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["146 hours of transcribed audio"], "Annotation": ["Subset is manually annotated with speaker ID and time alignment"], diff --git a/corpora/spoken-corpora/gender-neutral-de.json b/corpora/spoken-corpora/gender-neutral-de.json index d6512f2..8fbcd23 100644 --- a/corpora/spoken-corpora/gender-neutral-de.json +++ b/corpora/spoken-corpora/gender-neutral-de.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0003-FF39-F", "Family": "Spoken corpora", "Description": "This corpus examines the pronunciation of different genderneutral forms in German. Various source texts were used, like newspaper articles, websites, etc.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN PUB", "Size": ["2 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/gesprochenes-wortkorpus.json b/corpora/spoken-corpora/gesprochenes-wortkorpus.json index ce6caac..c34208e 100644 --- a/corpora/spoken-corpora/gesprochenes-wortkorpus.json +++ b/corpora/spoken-corpora/gesprochenes-wortkorpus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-3D30-F", "Family": "Spoken corpora", "Description": "WaSeP contains recordings of one female and one male speaker, both professional actors, uttering single German nouns and pseudowords in multiple emotional prosodies. 
This edition improves the segmentation of the phonetic annotation, adds Praat TextGrid files and removes a few irregular items.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["3 hours"], "Annotation": ["phonetic"], diff --git a/corpora/spoken-corpora/gewiss.json b/corpora/spoken-corpora/gewiss.json index 029ee2b..c08def6 100644 --- a/corpora/spoken-corpora/gewiss.json +++ b/corpora/spoken-corpora/gewiss.json @@ -3,7 +3,7 @@ "URL": "https://gewiss.uni-leipzig.de/index.php?id=home&L=1", "Family": "Spoken corpora", "Description": "This corpus contains transcripts and audio recordings of spoken academic discourse, primarily talks including discussions and oral exams.", - "Languages": ["German (L2 and L1)", "eng", "pol", "Italian (L1)"], + "Language": ["German (L2 and L1)", "eng", "pol", "Italian (L1)"], "Licence": "", "Size": ["1.4 million tokens", "123 hours"], "Annotation": ["code switching"], diff --git a/corpora/spoken-corpora/gos-video.json b/corpora/spoken-corpora/gos-video.json index bd29549..7045d90 100644 --- a/corpora/spoken-corpora/gos-video.json +++ b/corpora/spoken-corpora/gos-video.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1190", "Family": "Spoken corpora", "Description": "This corpus contains public academic speech.\nThe corpus is available for download from CLARIN.SI and through the concordancer KonText.\nFor the version with audio recordings, click here.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY 4.0", "Size": ["126,000 words"], "Annotation": ["PoS-tagged", "lemmatised", "orthographically and phonetically transcribed"], diff --git a/corpora/spoken-corpora/gos.json b/corpora/spoken-corpora/gos.json index 1ed7c93..ac09d9f 100644 --- a/corpora/spoken-corpora/gos.json +++ b/corpora/spoken-corpora/gos.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1771", "Family": "Spoken corpora", "Description": "This corpus contains transcripts from radio and TV shows, school lessons, 
private conversations, business meetings. It is composed of three different sources: Spoken corpus Gos 1.1 (112 hours, 1 million words), Spoken corpus Gos VideoLectures 4.2 (22 hours, 179,000 words), a selection from the ASR database ARTUR 1.0 (185 hours, 1.2 mllion words).\nThe corpus is available for download from CLARIN.SI as well as through a dedicated webconcordancer.", - "Languages": ["slv"], + "Language": ["slv"], "Licence": "CC BY-SA 4.0", "Size": ["1534 texts", "127,604 utterances", "2,462,368 words"], "Annotation": ["phonetic and orthographic transcription", "PoS tagging", "lemmatisation"], diff --git a/corpora/spoken-corpora/gothenburg-dialogue.json b/corpora/spoken-corpora/gothenburg-dialogue.json index 1c3027b..1e5593f 100644 --- a/corpora/spoken-corpora/gothenburg-dialogue.json +++ b/corpora/spoken-corpora/gothenburg-dialogue.json @@ -3,7 +3,7 @@ "URL": "https://spraakbanken.gu.se/swe/resurs/gdc#tabs=information", "Family": "Spoken corpora", "Description": "This corpus is available through the concordancer Korp (account needed).", - "Languages": ["swe"], + "Language": ["swe"], "Licence": "CC-BY", "Size": ["1,470,000 tokens"], "Annotation": ["MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/griffith-australian.json b/corpora/spoken-corpora/griffith-australian.json index ba5d6f4..6ffd701 100644 --- a/corpora/spoken-corpora/griffith-australian.json +++ b/corpora/spoken-corpora/griffith-australian.json @@ -3,7 +3,7 @@ "URL": "https://www.ausnc.org.au/corpora/gcsause", "Family": "Spoken corpora", "Description": "This corpus is available for download and through the concordancer of the Australian National Corpus.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["32,134 words"], "Annotation": [], diff --git a/corpora/spoken-corpora/grundstrukturen-freiburg.json b/corpora/spoken-corpora/grundstrukturen-freiburg.json index 939a2d9..e23dcce 100644 --- a/corpora/spoken-corpora/grundstrukturen-freiburg.json +++ 
b/corpora/spoken-corpora/grundstrukturen-freiburg.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C29F-AE56-C501-7", "Family": "Spoken corpora", "Description": "This corpus contains authentic interaction from various domains.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["600,000 words", "70 hours"], "Annotation": ["orthographically transcribed", "intonation", "lemmatised", "PoS-tagged", "time alignment"], diff --git a/corpora/spoken-corpora/habla.json b/corpora/spoken-corpora/habla.json index b781000..880f930 100644 --- a/corpora/spoken-corpora/habla.json +++ b/corpora/spoken-corpora/habla.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-5C64-9", "Family": "Spoken corpora", "Description": "This corpus contains interviews.", - "Languages": ["deu", "fra", "ita"], + "Language": ["deu", "fra", "ita"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["79 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/hacaspa.json b/corpora/spoken-corpora/hacaspa.json index f91504f..f465ec6 100644 --- a/corpora/spoken-corpora/hacaspa.json +++ b/corpora/spoken-corpora/hacaspa.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-5F0B-B", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous speech and reading tasks.", - "Languages": ["Spanish (Argentinian)"], + "Language": ["Spanish (Argentinian)"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["19 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/hamburg-modern.json b/corpora/spoken-corpora/hamburg-modern.json index 5af0456..2f2748d 100644 --- a/corpora/spoken-corpora/hamburg-modern.json +++ b/corpora/spoken-corpora/hamburg-modern.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-6973-9", "Family": "Spoken 
corpora", "Description": "This corpus contains task-oriented communcation (e.g., a film retelling) in the context of studying adult L2 acquisition.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "HZSK-ACA (academic, non-commercial only)", "Size": ["3 hours"], "Annotation": ["manual annotation of phonetic phenomena", "accent/stress marking"], diff --git a/corpora/spoken-corpora/hamcopolig.json b/corpora/spoken-corpora/hamcopolig.json index 9b66a2f..c1847b5 100644 --- a/corpora/spoken-corpora/hamcopolig.json +++ b/corpora/spoken-corpora/hamcopolig.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-63CE-9", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous speech and reading tasks.", - "Languages": ["pol"], + "Language": ["pol"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["38 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/hempel.json b/corpora/spoken-corpora/hempel.json index 016417e..007b0d1 100644 --- a/corpora/spoken-corpora/hempel.json +++ b/corpora/spoken-corpora/hempel.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0002-F80E-8", "Family": "Spoken corpora", "Description": "This corpus is a collection of more than 3900 spontaneous speech items recorded as extra material during the German SpeechDat-II project. Speakers were asked to report what they had been doing during the last hour: \"Was haben Sie in der letzten Stunde gemacht?\". This item was recorded as the last item of the recording session. Speakers had become acquainted with the recording procedure and they were quite relaxed because they knew that this item was the last to be recorded. 
This resulted in quite natural, colloquial speech, sometimes with marked regional accent.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["25.5 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/hral.json b/corpora/spoken-corpora/hral.json index 1aa5e93..a487afe 100644 --- a/corpora/spoken-corpora/hral.json +++ b/corpora/spoken-corpora/hral.json @@ -3,7 +3,7 @@ "URL": "http://doi.org/10.21415/T5131S", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous conversations among 617 speakers from all Croatian counties, and it comprises more than 250 000 tokens and more than 100 000 types. Data for the corpus were collected from 2010 to 2012, from 2014 to 2015 and during 2016. Participants were adults who spoke Croatian as their mother tongue and first language. Transcripts were annotated with the ages and genders of the speakers, as well as the location of the conversation. A separate spreadsheet lists the speakers' origin, where they have spent most of their life and their level of education. 
The coverage of metadata for individual samples varies, and is in general more complete for samples collected from 2014 onwards.\nThe corpus is available for download and browsing from a dedicated website.", - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "author attribution required", "Size": ["250,000 tokens"], "Annotation": ["speaker metadata"], diff --git a/corpora/spoken-corpora/hun-broadcast-news.json b/corpora/spoken-corpora/hun-broadcast-news.json index 48cb2d6..8c245cd 100644 --- a/corpora/spoken-corpora/hun-broadcast-news.json +++ b/corpora/spoken-corpora/hun-broadcast-news.json @@ -3,7 +3,7 @@ "URL": "http://metashare.elda.org/repository/browse/hungarian-broadcast-news-database/99bc21d081b611e2892a000c29bfc0d4d3d173ede2e64475b596aa1857a64541/", "Family": "Spoken corpora", "Description": "This corpus is available for download (upon request) from META-SHARE.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "META_SHARE NC-NoReD", "Size": ["25,000 words", "3.5 hours"], "Annotation": ["audio-level annotations"], diff --git a/corpora/spoken-corpora/hun-gigaword-spoken.json b/corpora/spoken-corpora/hun-gigaword-spoken.json index 90e0c0d..faaab45 100644 --- a/corpora/spoken-corpora/hun-gigaword-spoken.json +++ b/corpora/spoken-corpora/hun-gigaword-spoken.json @@ -3,7 +3,7 @@ "URL": "http://hnc.nytud.hu/index_eng.html", "Family": "Spoken corpora", "Description": "This corpus contains radio broadcasts (reading aloud and spontaneous conversation)\nThe corpus is available through the Hungarian Gigaword Corpus concordancer.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "", "Size": ["76 million words"], "Annotation": ["PoS-tagged", "MSD-tagged"], diff --git a/corpora/spoken-corpora/hun-kindergarten.json b/corpora/spoken-corpora/hun-kindergarten.json index 3c6b15d..0a56629 100644 --- a/corpora/spoken-corpora/hun-kindergarten.json +++ b/corpora/spoken-corpora/hun-kindergarten.json @@ -3,7 +3,7 @@ "URL": 
"http://metashare.nytud.hu/repository/browse/hungarian-kindergarten-language-corpus/b572a8106ba711e2aa7c68b599c26a06a4db2e695cf94a1cad6bf6793d747d2a/", "Family": "Spoken corpora", "Description": "This corpus contains elicited speech tasks (picture descriptions) and guided conversation with children.\nThe corpus is available for download through META-SHARE.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "restricted", "Size": ["192,000 words"], "Annotation": ["PoS-tagged", "MSD-tagged"], diff --git a/corpora/spoken-corpora/hun-reference-speech-db.json b/corpora/spoken-corpora/hun-reference-speech-db.json index 9901ae7..2f64e6f 100644 --- a/corpora/spoken-corpora/hun-reference-speech-db.json +++ b/corpora/spoken-corpora/hun-reference-speech-db.json @@ -3,7 +3,7 @@ "URL": "http://metashare.ilsp.gr:8080/repository/browse/hungarian-mrba/92067ce281b611e2892a000c29bfc0d48e6c8e9c745d446a9a64e48ba4c6462d/", "Family": "Spoken corpora", "Description": "This corpus contains reading tasks.\nThe corpus is available for download (upon request) from META-SHARE.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "META-SHARE No-Redistribution Commercial FF", "Size": ["6 hours"], "Annotation": ["partial phonemic-level annotation"], diff --git a/corpora/spoken-corpora/ifa-spoken.json b/corpora/spoken-corpora/ifa-spoken.json index 942fdfd..52d1fcc 100644 --- a/corpora/spoken-corpora/ifa-spoken.json +++ b/corpora/spoken-corpora/ifa-spoken.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-734", "Family": "Spoken corpora", "Description": "The corpus is available for download from an informal webpage.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN PUB", "Size": ["50,000 words (41 minutes/speaker)"], "Annotation": ["Hand-segmented speech"], diff --git a/corpora/spoken-corpora/jasmin.json b/corpora/spoken-corpora/jasmin.json index bc0a117..3ee33a7 100644 --- a/corpora/spoken-corpora/jasmin.json +++ b/corpora/spoken-corpora/jasmin.json @@ 
-3,7 +3,7 @@ "URL": "http://hdl.handle.net/10032/tm-a2-j7", "Family": "Spoken corpora", "Description": "The corpus contains recordings of human-machine interaction and read speech performed by children, non-native speakers and senior people.\nThe corpus is available download from the Dutch Language Institute.", - "Languages": ["nld"], + "Language": ["nld"], "Licence": "CLARIN RES", "Size": ["115 hours"], "Annotation": ["PoS-tagged", "lemmatised", "phonetically transcribed"], diff --git a/corpora/spoken-corpora/juznevesti-sr.json b/corpora/spoken-corpora/juznevesti-sr.json index 4790e7b..e7dc01c 100644 --- a/corpora/spoken-corpora/juznevesti-sr.json +++ b/corpora/spoken-corpora/juznevesti-sr.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11356/1679", "Family": "Spoken corpora", "Description": "This corpus consists of audio recordings and manual transcripts from the Južne Vesti website and its host show called the 15 minuta. The processing of the audio and its alignment to the manual transcripts followed the pipeline of the ParlaSpeech-HR dataset as closely as possible. Segments in this dataset range from 2 to 30 seconds. Train-dev-test split has been performed with 80:10:10 ratio.\nAs with the ParlaSpeech-HR dataset, two transcriptions are provided; one with transcripts in their raw form (with punctuation, capital letters, numerals) and another normalised with the same rule-based normaliser as was used in ParlaSpeech-HR dataset creation, which is lowercased, punctuation is removed and numerals are replaced with words. 
The speaker_info attribute is less abundant due to the fact that compared to parliamentary corpora less data is available in this domain, so it covers only the guest name, guest description, host name, and speaker breakdown (when the host or the guest are speaking).\nThis corpus is available for download from the CLARIN.SI repository.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC BY-SA 4.0", "Size": ["50.55 hours", "10811 entries"], "Annotation": ["normalised transcriptions (lowercased, punctuation removed, numerals spelled out)", "speaker metadata", "word-level alignment to the recordings"], diff --git a/corpora/spoken-corpora/karel-makon.json b/corpora/spoken-corpora/karel-makon.json index 07cc962..18832b1 100644 --- a/corpora/spoken-corpora/karel-makon.json +++ b/corpora/spoken-corpora/karel-makon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-3422", "Family": "Spoken corpora", "Description": "This corpus contains talks on Christian mysticism given by Karel Makoň.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-SA 3.0", "Size": ["1000 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/karl-eberhard.json b/corpora/spoken-corpora/karl-eberhard.json index 199972a..8ba1580 100644 --- a/corpora/spoken-corpora/karl-eberhard.json +++ b/corpora/spoken-corpora/karl-eberhard.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-DADB-D", "Family": "Spoken corpora", "Description": "This corpus contains 79 speakers of Southern German. Two speakers, usually acquainted with each other, had an one hour long conversation in separate booths. 
", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["40 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/kennsluromur.json b/corpora/spoken-corpora/kennsluromur.json index 63d6938..191bdca 100644 --- a/corpora/spoken-corpora/kennsluromur.json +++ b/corpora/spoken-corpora/kennsluromur.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/171", "Family": "Spoken corpora", "Description": "This corpus contains recordings of lectures at Reykjavik University and the University of Iceland. The lectures were donated by the lecturers (172 lectures by 14 lecturers), transcribed with an Icelandic speech recognizer and then manually corrected by human transcribers and finally verified by a proofreader.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["51 hours"], "Annotation": ["sentence-segmented orthographic transcriptions"], diff --git a/corpora/spoken-corpora/konfliktgespraeche.json b/corpora/spoken-corpora/konfliktgespraeche.json index c6b0f0c..7586353 100644 --- a/corpora/spoken-corpora/konfliktgespraeche.json +++ b/corpora/spoken-corpora/konfliktgespraeche.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0332-C11A-46E1-0001-A", "Family": "Spoken corpora", "Description": "This corpus contains elicited conflict interaction.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["160,000 words", "12 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/kontrastiv.json b/corpora/spoken-corpora/kontrastiv.json index fb0a9cf..4576842 100644 --- a/corpora/spoken-corpora/kontrastiv.json +++ b/corpora/spoken-corpora/kontrastiv.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-03BC-7412-E7EA-4101-3", "Family": "Spoken corpora", "Description": "This corpus contains 
academic interaction.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu", "eng", "pol", "bul"], + "Language": ["deu", "eng", "pol", "bul"], "Licence": "CLARIN RES", "Size": ["760,000 words", "92 hours"], "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed", "annotation of discourse phenomena and language mixing"], diff --git a/corpora/spoken-corpora/lang-in-migration.json b/corpora/spoken-corpora/lang-in-migration.json index 286d7fd..80867cb 100644 --- a/corpora/spoken-corpora/lang-in-migration.json +++ b/corpora/spoken-corpora/lang-in-migration.json @@ -1,9 +1,9 @@ { "Name": "Languages in Migration", "URL": "http://hdl.handle.net/11372/LRT-4777", "Family": "Spoken corpora", "Description": "This corpus is a representation of authentic spoken Czech and German.\nIt contains transcriptions of informal speech (private environment, spontaneity, unpreparedness etc.) by Czech-German bilingual speakers born in Czechoslovakia around 1955 and who departed for Germany after becoming 12 years old. 
The corpus is composed of interviews conducted from 2018–2020 with 20 speakers on language biographies and narrated in Czech and German respectively.\nThe corpus is available for download from LINDAT and for online browsing through the KonText concordancer.", - "Languages": ["ces", "deu"], + "Language": ["ces", "deu"], "Licence": "Czech National Corpus (Shuffled Corpus Data)", "Size": [], "Annotation": ["syntactic dependencies"], diff --git a/corpora/spoken-corpora/lecture-speech.json b/corpora/spoken-corpora/lecture-speech.json index 868f327..06f22a5 100644 --- a/corpora/spoken-corpora/lecture-speech.json +++ b/corpora/spoken-corpora/lecture-speech.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00023L", "Family": "Spoken corpora", "Description": "This corpus contains recordings of academic lectures and oral conference presentations.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY-SA", "Size": ["41 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/lia.json b/corpora/spoken-corpora/lia.json index b894b54..e1b2cd5 100644 --- a/corpora/spoken-corpora/lia.json +++ b/corpora/spoken-corpora/lia.json @@ -3,7 +3,7 @@ "URL": "https://www.hf.uio.no/iln/english/research/projects/language-infrastructure-made-accessible/index.html", "Family": "Spoken corpora", "Description": "This corpus contains interviews and conversation in Norwegian dialects.\nThe corpus is available through a Tekstlab concordancer (account needed).", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CLARIN ACA", "Size": ["1.5 million tokens"], "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/lmu-asica.json b/corpora/spoken-corpora/lmu-asica.json index 5c44a5a..78f6460 100644 --- a/corpora/spoken-corpora/lmu-asica.json +++ b/corpora/spoken-corpora/lmu-asica.json 
@@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A531-E", "Family": "Spoken corpora", "Description": "This corpus is a documentation of the South Italian dialect 'Calabrese'. The main objects when building this corpus were the analysis of syntactical structures and their geolinguistic mapping in form of interactive, webbased cartography. The corpus consists of several audio files containing recordings of some sixty speakers of Calabrese one half of which having migration experience in Germany the other half almost always having stayed in Calabria. Furthermore the informants were selected equally balanced regarding gender, age and geographical origin. Of most of the informants there exist at least one recording with spontanous speech and one recording based on stimuli each.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "CLARIN RES", "Size": ["47 hours"], "Annotation": ["phonetic transcription"], diff --git a/corpora/spoken-corpora/long-spoken-fin.json b/corpora/spoken-corpora/long-spoken-fin.json index d1229c5..080e846 100644 --- a/corpora/spoken-corpora/long-spoken-fin.json +++ b/corpora/spoken-corpora/long-spoken-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073041", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThe corpus is available for online querying through the LAT platform and through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "restricted", "Size": ["210 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/medical-speech.json b/corpora/spoken-corpora/medical-speech.json index 0a5ebfa..41f0de4 100644 --- a/corpora/spoken-corpora/medical-speech.json +++ b/corpora/spoken-corpora/medical-speech.json @@ -3,7 +3,7 @@ "URL": "http://metashare.elda.org/repository/browse/hungarian-medical-speech-database/76a0c9f881b611e2892a000c29bfc0d4ed0651f675914bb2805e26819a60167d/", "Family": "Spoken corpora", "Description": "This corpus is 
available for download (upon request) from META-SHARE.", - "Languages": ["hun"], + "Language": ["hun"], "Licence": "META-SHARE C-NoReD-FF", "Size": [], "Annotation": ["phonetic transcription"], diff --git a/corpora/spoken-corpora/mehrsprachige-kinder.json b/corpora/spoken-corpora/mehrsprachige-kinder.json index 6dc57c5..f11ff91 100644 --- a/corpora/spoken-corpora/mehrsprachige-kinder.json +++ b/corpora/spoken-corpora/mehrsprachige-kinder.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-0372-30C6-B67F-ED01-5", "Family": "Spoken corpora", "Description": "This corpus contains elicitation tasks with pre-school children.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["17,000 words", "13 hours"], "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/multichannel-articulatory.json b/corpora/spoken-corpora/multichannel-articulatory.json index 7d397ef..ecfd6ad 100644 --- a/corpora/spoken-corpora/multichannel-articulatory.json +++ b/corpora/spoken-corpora/multichannel-articulatory.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C2B1-5", "Family": "Spoken corpora", "Description": "This coprus features a set of 460 short sentences designed to include the main connected speech processes in English (e.g. assimilations, weak forms ...). All recordings made in the same sound damped studio at the Edinburgh Speech Production Facility based in the department of Speech and Language Sciences, Queen Margaret University College, UK. The database contains audio files, laryngograph waveforms, electromagnetic articulograph (EMA) tracks and electropalatograph (EPG) tracks. 
", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "CLARIN PUB", "Size": ["5 hours"], "Annotation": ["orthographically transcribed", "Electromagnetic Articulography"], diff --git a/corpora/spoken-corpora/natural-media-motion-capture.json b/corpora/spoken-corpora/natural-media-motion-capture.json index 004971e..9a9e96b 100644 --- a/corpora/spoken-corpora/natural-media-motion-capture.json +++ b/corpora/spoken-corpora/natural-media-motion-capture.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C34C-8", "Family": "Spoken corpora", "Description": "The corpus consists of data from 18 participants, whose task was to describe nine objects each to an experimenter, without using everyday vocabulary about forms, sizes or objects. The participants were recorded on audio and several video cameras, and their hand movements were recorded using an optical VICON motion capture system.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["3 hours"], "Annotation": ["orthographically transcribed", "gestures", "motion capture of hands"], diff --git a/corpora/spoken-corpora/nautilus.json b/corpora/spoken-corpora/nautilus.json index d284e12..76b4ceb 100644 --- a/corpora/spoken-corpora/nautilus.json +++ b/corpora/spoken-corpora/nautilus.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-C05F-6", "Family": "Spoken corpora", "Description": "This corpus contains scripted, semi-spontaneous, and spontaneous human-human dialogs. In total, 300 speakers of German without noticeable accent participated and were recorded in an acoustically-isolated room. Interactions between speakers and their interlocutor are provided in separate mono files, accompanied by timestamps and tags that define the speaker's turns. The speech corresponding to one of the semi-spontaneous dialogs was labeled with respect to perceived interpersonal speaker characteristics and naive voice descriptions. 
These labels are found alongside the documentation.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["155 hours"], "Annotation": ["orthographically transcribed", "Turn taking", "perceivend inter-personal speaker characteristics", "voice descriptions"], diff --git a/corpora/spoken-corpora/nordic-dialect.json b/corpora/spoken-corpora/nordic-dialect.json index 6b2a275..ff4f63d 100644 --- a/corpora/spoken-corpora/nordic-dialect.json +++ b/corpora/spoken-corpora/nordic-dialect.json @@ -3,7 +3,7 @@ "URL": "http://www.tekstlab.uio.no/nota/scandiasyn/nsd.html", "Family": "Spoken corpora", "Description": "This corpus consists of pontaneous speech data from dialects of the North Germanic languages across all of the Nordic countries. The linguistic data in the corpus comes from a variety of sources, (see homepage - Data Collection), recorded in 1998 - 2015. The corpus transcribed and linked to audio and video, has a map function, and can be searched in a large variety of ways.#SEPThe corpus can be accessed online via a concordancer provided by the TekstLab (a CLARINO node).", - "Languages": ["nor", "swe", "dan", "fao", "isl", "Övdalian"], + "Language": ["nor", "swe", "dan", "fao", "isl", "Övdalian"], "Licence": "CLARIN ACA", "Size": ["2,754,289 tokens"], "Annotation": ["MSD-tagged", "phonetically transcribed", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/north-wind-sun.json b/corpora/spoken-corpora/north-wind-sun.json index ba525b2..2825f98 100644 --- a/corpora/spoken-corpora/north-wind-sun.json +++ b/corpora/spoken-corpora/north-wind-sun.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-00129L", "Family": "Spoken corpora", "Description": "This corpus contains recordings of the tale \"Põhjatuul ja päike\" (North Wind and the Sun) read by the same speakers who participated in the Phonetic Corpus of Estonian Spontaneous Speech.\nThe corpus is available for download from META-SHARE (CELR 
distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "", "Size": [], "Annotation": ["word segmentation and phonemes in SAMPA"], diff --git a/corpora/spoken-corpora/nota-oslo.json b/corpora/spoken-corpora/nota-oslo.json index 9c047ae..cdf2cf9 100644 --- a/corpora/spoken-corpora/nota-oslo.json +++ b/corpora/spoken-corpora/nota-oslo.json @@ -3,7 +3,7 @@ "URL": "http://www.tekstlab.uio.no/nota/oslo/english.html", "Family": "Spoken corpora", "Description": "This corpus contains interviews and conversations in Oslo sociolects.\nThe corpus is available through a Tekstlab concordancer (account needed).", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CLARIN ACA", "Size": ["1 million tokens"], "Annotation": ["orthographically transcribed", "MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/nslc.json b/corpora/spoken-corpora/nslc.json index 55846f9..ab8edc1 100644 --- a/corpora/spoken-corpora/nslc.json +++ b/corpora/spoken-corpora/nslc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0007-C6F2-8", "Family": "Spoken corpora", "Description": "This second version 0.2 of the corpus is a subcorpus that comprises 177 communications, 136 of which contain an aligned audio recording, with glossed (Toolbox/FLEx) and annotated (EXMARaLDA) transcripts from 57 speakers. All texts have been translated into Russian and English, some also into German. 
The corpus also contains rich metadata on the communications and speakers.", - "Languages": ["nio", "rus"], + "Language": ["nio", "rus"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["32 hours"], "Annotation": ["alignment of transcriptions and audio recordings"], diff --git a/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json b/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json index 55b9b79..d1952be 100644 --- a/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json +++ b/corpora/spoken-corpora/onset-cochlear-patients-diachronic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A9CB-D", "Family": "Spoken corpora", "Description": "This corpus contains diachronic speech recordings from three cochlear implant (CI) users. For data used in the corresponding synchronic study, please refer to the CI_2 corpora. CI_3_Sibilants contains recordings used for the analysis of /s/ and /ʃ/ in the following words: 'Tasse', 'Tasche'. CI_3_VOT contains recordings used for the analysis of voice onset time in /t/ in the word 'teilen'.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["unknown"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/onset-cochlear-patients.json b/corpora/spoken-corpora/onset-cochlear-patients.json index 19d129a..7315107 100644 --- a/corpora/spoken-corpora/onset-cochlear-patients.json +++ b/corpora/spoken-corpora/onset-cochlear-patients.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AE7E-F", "Family": "Spoken corpora", "Description": "This corpous contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG). It contains recordings used for the analysis of voice onset time in /t/ in the word 'teilen'. 
", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["35 min"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/oral2008.json b/corpora/spoken-corpora/oral2008.json index 82b6e71..ce66155 100644 --- a/corpora/spoken-corpora/oral2008.json +++ b/corpora/spoken-corpora/oral2008.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-119D-A", "Family": "Spoken corpora", "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 3.0", "Size": ["1 million tokens"], "Annotation": [], diff --git a/corpora/spoken-corpora/oral2013.json b/corpora/spoken-corpora/oral2013.json index bda12a1..3e2a5d7 100644 --- a/corpora/spoken-corpora/oral2013.json +++ b/corpora/spoken-corpora/oral2013.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-1848", "Family": "Spoken corpora", "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Academic Licence Agreement for Czech National Corpus Data", "Size": ["2.8 million words"], "Annotation": ["recordings and transcripts anonymised"], diff --git a/corpora/spoken-corpora/orleans.json b/corpora/spoken-corpora/orleans.json index 5df3433..c2388c8 100644 --- a/corpora/spoken-corpora/orleans.json +++ b/corpora/spoken-corpora/orleans.json @@ -3,7 +3,7 @@ "URL": "http://purl.org/poi/crdo.vjf.cnrs.fr/cocoon-5569b8dc-b40f-3ccd-95d1-86d20a1a836c", "Family": "Spoken corpora", "Description": "This corpus contains recordings of the everyday speech of Orléans residents between 1969 and 1974.\nThe corpus is available for download from the Huma-num repository.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC BY-NC-SA 3.0", "Size": [], 
"Annotation": [], diff --git a/corpora/spoken-corpora/ortofon-audio.json b/corpora/spoken-corpora/ortofon-audio.json index bafa27d..c2f9330 100644 --- a/corpora/spoken-corpora/ortofon-audio.json +++ b/corpora/spoken-corpora/ortofon-audio.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2579", "Family": "Spoken corpora", "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "Academic Licence Agreement for Czech National Corpus Data", "Size": ["1 million words"], "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/ortofon.json b/corpora/spoken-corpora/ortofon.json index 084a587..16fff16 100644 --- a/corpora/spoken-corpora/ortofon.json +++ b/corpora/spoken-corpora/ortofon.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2580", "Family": "Spoken corpora", "Description": "This corpus contains informal conversations.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC-SA 4.0", "Size": ["1 million tokens"], "Annotation": ["orthographically and phonetically transcribed", "MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/ovm.json b/corpora/spoken-corpora/ovm.json index 448a0b9..051aca9 100644 --- a/corpora/spoken-corpora/ovm.json +++ b/corpora/spoken-corpora/ovm.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-000D-EC98-3", "Family": "Spoken corpora", "Description": "This corpus contains transcribed recordings from the Czech political discussion broadcast “Otázky Václava Moravce“.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC 3.0", "Size": ["35 hours"], "Annotation": 
["word-by-word transcriptions, including the transcription of some non-speech events"], diff --git a/corpora/spoken-corpora/parcorfull.json b/corpora/spoken-corpora/parcorfull.json index 7ead03d..12ddb1e 100644 --- a/corpora/spoken-corpora/parcorfull.json +++ b/corpora/spoken-corpora/parcorfull.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-2614", "Family": "Spoken corpora", "Description": "This corpus contains planned speech and newswire.\nThe corpus is available for download from LINDAT.", - "Languages": ["eng", "deu"], + "Language": ["eng", "deu"], "Licence": "CC BY-NC-ND 4.0", "Size": ["160,000 tokens"], "Annotation": ["coreference (nominal and clausal)"], diff --git a/corpora/spoken-corpora/parlato-telegiornalistico.json b/corpora/spoken-corpora/parlato-telegiornalistico.json index cfa28da..6b10e1d 100644 --- a/corpora/spoken-corpora/parlato-telegiornalistico.json +++ b/corpora/spoken-corpora/parlato-telegiornalistico.json @@ -3,7 +3,7 @@ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/647-selezione-dal-qcorpus-di-parlato-telegiornalistico-anni-sessanta-vs-2005q", "Family": "Spoken corpora", "Description": "This corpus contains news broadcast.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": [], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/pdtsl.json b/corpora/spoken-corpora/pdtsl.json index 1d8424a..1b3ed4d 100644 --- a/corpora/spoken-corpora/pdtsl.json +++ b/corpora/spoken-corpora/pdtsl.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11858/00-097C-0000-0001-4914-D", "Family": "Spoken corpora", "Description": "This corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "ACADEMIC (PDTSL)", "Size": ["120,000 words"], "Annotation": ["syntactic dependencies"], diff --git a/corpora/spoken-corpora/phattsessionz.json 
b/corpora/spoken-corpora/phattsessionz.json index d75da60..060ac4d 100644 --- a/corpora/spoken-corpora/phattsessionz.json +++ b/corpora/spoken-corpora/phattsessionz.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-CC6A-4", "Family": "Spoken corpora", "Description": "This corpus contains recordings of 1019 adolescent speakers of German (age range 12-20). The recordings were performed via the WWW in public schools (Gymnasium) in 45 locations in Germany. The speech material recorded is a superset of the German SpeechDat-II and RVG-I corpora.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["208 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/phon-contemp-fra.json b/corpora/spoken-corpora/phon-contemp-fra.json index 914b413..5923c0e 100644 --- a/corpora/spoken-corpora/phon-contemp-fra.json +++ b/corpora/spoken-corpora/phon-contemp-fra.json @@ -3,7 +3,7 @@ "URL": "http://cocoon.huma-num.fr/exist/crdo/ark:/87895/1.17-794340", "Family": "Spoken corpora", "Description": "This corpus is available for download from a dedicated webpage.", - "Languages": ["fra"], + "Language": ["fra"], "Licence": "CC-BY", "Size": [], "Annotation": [], diff --git a/corpora/spoken-corpora/phoncat.json b/corpora/spoken-corpora/phoncat.json index 063a765..c67beeb 100644 --- a/corpora/spoken-corpora/phoncat.json +++ b/corpora/spoken-corpora/phoncat.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0000-772F-7", "Family": "Spoken corpora", "Description": "This corpus contains read, elicited and spontaneous speech.", - "Languages": ["Spanish (Catalan)"], + "Language": ["Spanish (Catalan)"], "Licence": "HZSK-RES (restricted, non-commercial only)", "Size": ["144 hours"], "Annotation": ["orthographically and phonetically transcribed"], diff --git a/corpora/spoken-corpora/phondat1.json b/corpora/spoken-corpora/phondat1.json index 61c36b3..e70b3a3 100644 --- 
a/corpora/spoken-corpora/phondat1.json +++ b/corpora/spoken-corpora/phondat1.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-D20B-6", "Family": "Spoken corpora", "Description": "The corpus contains read speech of 201 different speakers. Each speaket read a subcorpus of 450 different sentence equivalents (including alphanumericals and two shorter passages of prose text); 8 speakers read the whole sentence corpus; 40 speakers read the subcorpora BR and MR; 112 speakers read 70 utterances of the rest corpus, including alphabet, numbers 0 to 12 and stories. The corpus contains a total of 21587 recorded utterances.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["21.4 hours"], "Annotation": ["orthographically transcribed", "phonemic"], diff --git a/corpora/spoken-corpora/phondat2.json b/corpora/spoken-corpora/phondat2.json index e1c9235..1b53632 100644 --- a/corpora/spoken-corpora/phondat2.json +++ b/corpora/spoken-corpora/phondat2.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-D288-8", "Family": "Spoken corpora", "Description": "The corpus contains read speech of 16 different speakers, 6 women and 10 men. Each speaker reads a corpus of 200 different sentences from a train query task. They were recorded at three different sites in Germany (University of Kiel, University of Bonn, University of Munich). The language is German. 
The corpus contains a total of 3200 recorded utterances.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["4.3 hours"], "Annotation": ["orthographically transcribed", "phonemic", "phonetic"], diff --git a/corpora/spoken-corpora/prague-db.json b/corpora/spoken-corpora/prague-db.json index 87cc916..9cbee35 100644 --- a/corpora/spoken-corpora/prague-db.json +++ b/corpora/spoken-corpora/prague-db.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11234/1-2375", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous dialogue.\nThe corpus is available for download from LINDAT.", - "Languages": ["ces"], + "Language": ["ces"], "Licence": "CC BY-NC SA 4.0", "Size": ["770,000 tokens", "7324 minutes"], "Annotation": ["MSD-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/radio-interviews.json b/corpora/spoken-corpora/radio-interviews.json index ca40204..0e6cd95 100644 --- a/corpora/spoken-corpora/radio-interviews.json +++ b/corpora/spoken-corpora/radio-interviews.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00022L", "Family": "Spoken corpora", "Description": "This corpus contains telephone interviews from different radio programmes.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "CC-BY", "Size": ["36 hours"], "Annotation": ["speech annotation to orthographically transcribed"], diff --git a/corpora/spoken-corpora/radio-news.json b/corpora/spoken-corpora/radio-news.json index 6f0c2bb..9a87173 100644 --- a/corpora/spoken-corpora/radio-news.json +++ b/corpora/spoken-corpora/radio-news.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10.15155/9-00-0000-0000-0000-00021L", "Family": "Spoken corpora", "Description": "This corpus contains public broadcast news.\nThe corpus is available for download from META-SHARE (CELR distribution).", - "Languages": ["est"], + "Language": ["est"], "Licence": "", 
"Size": ["19 hours"], "Annotation": ["speech annotation to orthographically transcribed"], diff --git a/corpora/spoken-corpora/route-to-a-wing.json b/corpora/spoken-corpora/route-to-a-wing.json index 6d33db2..3ec3736 100644 --- a/corpora/spoken-corpora/route-to-a-wing.json +++ b/corpora/spoken-corpora/route-to-a-wing.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014101401", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous conversations.\nThis corpus is available for online querying through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-0", "Size": ["218 tokens"], "Annotation": ["PoS-tagged"], diff --git a/corpora/spoken-corpora/russlanddeutsch.json b/corpora/spoken-corpora/russlanddeutsch.json index 3fbf283..3165a47 100644 --- a/corpora/spoken-corpora/russlanddeutsch.json +++ b/corpora/spoken-corpora/russlanddeutsch.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/10932/00-03FA-9D9C-4EEA-BB01-7", "Family": "Spoken corpora", "Description": "This corpus contains interviews in German extraterritorial varieties.\nThe corpus is available for download and online browsing via the Database of Spoken German (AGD @ IDS Mannheim).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN RES", "Size": ["100,000 words", "10 hours"], "Annotation": ["literal and PoS-tagged", "lemmatised", "time-aligned", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/rvg1_clarin.json b/corpora/spoken-corpora/rvg1_clarin.json index 72a0a20..55a3bff 100644 --- a/corpora/spoken-corpora/rvg1_clarin.json +++ b/corpora/spoken-corpora/rvg1_clarin.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0004-3FF4-3", "Family": "Spoken corpora", "Description": "The corpus is a collection of more than 500 speakers of different dialect regions of Germany. 
The recordings were made using four different microphones (two in low and two in high quality) and consist of single digits, connected digits, phone numbers, phonetically balanced sentences, computer command phrases prompted on a screen, and 1 min spontaneous speech (monologue). The speakers were recorded in normal office environments. The backround noise was limited to the usual noise in office environment, eg. door slam, backround crosstalk, phone ringing, paper rustle, PC noise, etc. ", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["32 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/samples-spoken-fin.json b/corpora/spoken-corpora/samples-spoken-fin.json index 24c100f..84e1bcc 100644 --- a/corpora/spoken-corpora/samples-spoken-fin.json +++ b/corpora/spoken-corpora/samples-spoken-fin.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-201407141", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThis corpus is available for online querying through the LAT platform and through the concordancer Korp.", - "Languages": ["fin"], + "Language": ["fin"], "Licence": "CC-BY", "Size": ["100 hours"], "Annotation": ["syntactically parsed (TDT alpha)", "named entities (FiNER)", "PoS-tagged", "lemmatized", "orthographically transcribed"], diff --git a/corpora/spoken-corpora/samromur.json b/corpora/spoken-corpora/samromur.json index 4261932..a3569d0 100644 --- a/corpora/spoken-corpora/samromur.json +++ b/corpora/spoken-corpora/samromur.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/189", "Family": "Spoken corpora", "Description": "This corpus contains validated speech-recordings and is a result of a crowd-sourcing effort run by the Language and Voice Lab at Reykjavik University in cooperation with Almannarómur, Center for Language Technology.\nThe corpus contains recordings by 8,392 different speakers, with the average recording lenth being 5.2 
seconds. Transcriptions of the read texts are also available.\nThe corpus is available for download from the CLARIN.IS repository.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["145 hours", "100,000 utterances"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/sc1.json b/corpora/spoken-corpora/sc1.json index de0a3e1..808af29 100644 --- a/corpora/spoken-corpora/sc1.json +++ b/corpora/spoken-corpora/sc1.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0002-0B76-E", "Family": "Spoken corpora", "Description": "The corpus contains speech of 88 different speakers, reading the German story 'Der Nordwind und die Sonne'. Subcorpus T contains the recordings of 16 native Germans (L1). The other 72 speakers which were born and educated in other countries (L2) are pooled in subcorpus C. Every speaker has a distinct accent.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["1.5 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/sc10.json b/corpora/spoken-corpora/sc10.json index 3b2cd48..8e76e64 100644 --- a/corpora/spoken-corpora/sc10.json +++ b/corpora/spoken-corpora/sc10.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0002-1129-D", "Family": "Spoken corpora", "Description": "The corpus contains read and non-prompted German and mother tongue speech of 70 different speakers from 17 mother tongues (L1) in a variety of speaking styles e.g. 
reading, retelling, free talk etc.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["10 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/sc2.json b/corpora/spoken-corpora/sc2.json index 89d0796..d4d4596 100644 --- a/corpora/spoken-corpora/sc2.json +++ b/corpora/spoken-corpora/sc2.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0005-0E95-4", "Family": "Spoken corpora", "Description": "The corpus contains read speech of 10 different speakers with screen prompted 'automobil diagnosis phrases' recorded under real conditions in two different car maintenance halls. The language is German. All speakers are male native Germans and have never participated in such a task before. They are all experts in the field of car diagnosis. Each speaker has spoken 800 3-7 word utterances derived from 100 different sentences (see sc2_ort.txt) resulting in a total of 8000 utterances.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["9 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/schweizer-jugend.json b/corpora/spoken-corpora/schweizer-jugend.json index 8fec7a8..2b29512 100644 --- a/corpora/spoken-corpora/schweizer-jugend.json +++ b/corpora/spoken-corpora/schweizer-jugend.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A68A-9", "Family": "Spoken corpora", "Description": "This corpus contains recordings of adolescent pupils in Switzerland.", - "Languages": ["Swiss German"], + "Language": ["Swiss German"], "Licence": "CLARIN RES", "Size": ["92 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/serbian-forms-of-address.json b/corpora/spoken-corpora/serbian-forms-of-address.json index 0cff57a..ef72d83 100644 --- a/corpora/spoken-corpora/serbian-forms-of-address.json +++ b/corpora/spoken-corpora/serbian-forms-of-address.json @@ -3,7 +3,7 @@ "URL": 
"http://hdl.handle.net/11356/1779", "Family": "Spoken corpora", "Description": "This corpus consists of transcripts of audio-recorded biographical interviews with 19 participants. The interviews are about forms of address that speakers use in colloquial and in formal settings, and about their attitudes and evaluations concerning particular forms of address.\nWe provide original transcripts (written according to GAT conventions), as well as transcripts in CoNLL-U and TEI-XML format. The corpus has been normalised, tagged with morphosyntactic and lemma information using the CLASSLA-StanfordNLP tagger, and aligned with the respective turns in the audio files. Time alignments as well as partial annotation corrections are stored in TEI-XML.\nThe corpus is available for download from CLARIN.SI as well as through the noSketchEngine and KonText concordancers.", - "Languages": ["srp"], + "Language": ["srp"], "Licence": "CC BY-NC-SA 4.1", "Size": ["171,546 words"], "Annotation": ["MSD-tagged", "lemmatised", "normalised"], diff --git a/corpora/spoken-corpora/shc.json b/corpora/spoken-corpora/shc.json index 626189a..b3ef1fe 100644 --- a/corpora/spoken-corpora/shc.json +++ b/corpora/spoken-corpora/shc.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-0700-1", "Family": "Spoken corpora", "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. 
The recordings include field recordings using a hand-held UMTS device (one person, SmartWeb Handheld Corpus SHC), field recordings with video capture of the primary speaker and a secondary speaker (SmartWeb Video Corpus SVC) as well as mobile recordings performed on a BMW motorbike (one speaker, SmartWeb Motorboke Corpus SMC).", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["30.6 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/si100.json b/corpora/spoken-corpora/si100.json index 5a4b70e..3a70fdb 100644 --- a/corpora/spoken-corpora/si100.json +++ b/corpora/spoken-corpora/si100.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-E9CF-A", "Family": "Spoken corpora", "Description": "The corpus contains read speech of 101 different speakers (50 female, 50 male, 1 unknown). Each speaker has read approx. 100 sentences from either the SZ subcorpus or the CeBit subcorpus. The language is German. The subcorpus SZ contains 544 sentences from newspaper articles (\"Sueddeutsche Zeitung\"). The subcorpus CeBit contains 483 sentences from newspaper articles about the CeBit 1995. Each subcorpus is divided into 5 parts of approx. 100 utterances each. Every speaker read only one part of one subcorpus (with some exceptions), thus resulting in a total of 10.387 recorded utterances", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["31.5 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/si1000.json b/corpora/spoken-corpora/si1000.json index 8c506ef..0eb9758 100644 --- a/corpora/spoken-corpora/si1000.json +++ b/corpora/spoken-corpora/si1000.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EBFB-6", "Family": "Spoken corpora", "Description": "The corpus contains read speech of 10 different speakers. Each speaker has read approx. 
1000 sentences from a German news paper corpus, thus resulting in a total of approx. 10000 recorded utterances. The recording took place at the Institut fuer Phonetik, University of Munich, Germany in 1994.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["32.8 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json b/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json index d865c79..0d30f92 100644 --- a/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json +++ b/corpora/spoken-corpora/sibilant-cochlear-patients-diachronic.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-A9BB-F", "Family": "Spoken corpora", "Description": "This corpus contains diachronic speech recordings from three cochlear implant (CI) users. For data used in the corresponding synchronic study, please refer to the CI_2 corpora. CI_3_Sibilants contains recordings used for the analysis of /s/ and /ʃ/ in the following words: 'Tasse', 'Tasche'.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["unknown"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/sibilant-cochlear-patients.json b/corpora/spoken-corpora/sibilant-cochlear-patients.json index cabc877..a0f70bc 100644 --- a/corpora/spoken-corpora/sibilant-cochlear-patients.json +++ b/corpora/spoken-corpora/sibilant-cochlear-patients.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AEDF-1", "Family": "Spoken corpora", "Description": "This corpous contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG). 
CI_2_Sibilants contains recordings used for the analysis of /s/ and /ʃ/ in the following words: 'Tasse', 'Tasche'.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["1 hour"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/siebenbuergisch.json b/corpora/spoken-corpora/siebenbuergisch.json index da5b144..b55c24a 100644 --- a/corpora/spoken-corpora/siebenbuergisch.json +++ b/corpora/spoken-corpora/siebenbuergisch.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-27B9-3", "Family": "Spoken corpora", "Description": "This corpus contains 2274 recordings (approx. 360h) of spoken dialectal German (Saxonian) recorded in Transilvania (Romania) in approx. 250 different locations. This up-to-now unpublished material has been collected on analog tape in the 1960s and 70s by different linguists based at the universities of Bukarest, Hermannstadt and Klausenburg.", - "Languages": ["Bavarian", "deu", "ron"], + "Language": ["Bavarian", "deu", "ron"], "Licence": "CLARIN RES", "Size": ["450,000 words"], "Annotation": ["Geomapping", "orthographic/partial phonetic transcription", "semantic labelling"], diff --git a/corpora/spoken-corpora/skolt-saami.json b/corpora/spoken-corpora/skolt-saami.json index a3c5e51..1335d6c 100644 --- a/corpora/spoken-corpora/skolt-saami.json +++ b/corpora/spoken-corpora/skolt-saami.json @@ -3,7 +3,7 @@ "URL": "http://urn.fi/urn:nbn:fi:lb-2014073037", "Family": "Spoken corpora", "Description": "This corpus contains interviews.\nThis corpus is available for online querying through the LAT platform.", - "Languages": ["Skolt Saami"], + "Language": ["Skolt Saami"], "Licence": "CLARIN RES", "Size": ["19 hours"], "Annotation": ["MSD-tagged"], diff --git a/corpora/spoken-corpora/smartkom-home.json b/corpora/spoken-corpora/smartkom-home.json index 4c1268c..b3b3d97 100644 --- a/corpora/spoken-corpora/smartkom-home.json +++ b/corpora/spoken-corpora/smartkom-home.json @@ -3,7 
+3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-ED38-0", "Family": "Spoken corpora", "Description": "This corpus contains multi modal recordings of 65 actors who use the SmartKom system. SmartKom Home should be an intelligent communication assistant for the private environment. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. The instruction was kept to a minimum; in fact the user only knew that the system is able to understand speech, gestures and even mimical expressions and should more or less communicate like a human.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["11 hours"], "Annotation": ["orthographically transcribed", "phonemic", "gestures", "mimic", "emotions"], diff --git a/corpora/spoken-corpora/smartkom-mobil.json b/corpora/spoken-corpora/smartkom-mobil.json index 42d0465..213603d 100644 --- a/corpora/spoken-corpora/smartkom-mobil.json +++ b/corpora/spoken-corpora/smartkom-mobil.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EDBB-C", "Family": "Spoken corpora", "Description": "This corpus contains multi modal recordings of 73 actors who use the SmartKom system. SmartKom Mobil is a portable PDA equipped with a net link and additional intelligent communication devices. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. The instruction was kept to a minimum; in fact the user only knew that the system is able to understand speech, gestures and should more or less communicate like a human. 
Experiments were not performed in the field but rather in a studio-like environment.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["11 hours"], "Annotation": ["orthographically transcribed", "phonemic", "gestures", "mimic", "emotions"], diff --git a/corpora/spoken-corpora/smartkom-public.json b/corpora/spoken-corpora/smartkom-public.json index eb9ed06..29a3624 100644 --- a/corpora/spoken-corpora/smartkom-public.json +++ b/corpora/spoken-corpora/smartkom-public.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-EC8B-3", "Family": "Spoken corpora", "Description": "This corpus contains multi modal recordings of 86 actors who use the SmartKom system. SmartKom Public is comparable to a traditional public phone booth but equipped with additional intelligent communication devices. Naive users were asked to test a 'prototype' for a market study not knowing that the system was in fact controlled by two human operators. They were asked to solve two tasks in a period of 4,5 min while they were left alone with the system. The instruction was kept to a minimum; in fact the user only knew that the system is able to understand speech, gestures and even mimical expressions and should more or less communicate like a human.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["11 hours"], "Annotation": ["orthographically transcribed", "phonemic", "gestures", "mimic", "emotions"], diff --git a/corpora/spoken-corpora/smartweb-motorbike.json b/corpora/spoken-corpora/smartweb-motorbike.json index 4d0ce08..d809921 100644 --- a/corpora/spoken-corpora/smartweb-motorbike.json +++ b/corpora/spoken-corpora/smartweb-motorbike.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0005-C50F-D", "Family": "Spoken corpora", "Description": "The corpus comprises a collection of user queries to a naturally spoken Web interface with the main focus on the soccer world series in 2006. 
The SMC corpus itself contains 36 mobile recordings performed on a BMW motorbike.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["6.3 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/spit-mdb.json b/corpora/spoken-corpora/spit-mdb.json index 8c829a5..d2c1593 100644 --- a/corpora/spoken-corpora/spit-mdb.json +++ b/corpora/spoken-corpora/spit-mdb.json @@ -3,7 +3,7 @@ "URL": "http://www.parlaritaliano.it/index.php/it/corpora-di-parlato/644-spit-mdb-spoken-italian-multilevel-database", "Family": "Spoken corpora", "Description": "This corpus contains spontaneous speech.\nThe corpus is available for download from a dedicated webpage.", - "Languages": ["ita"], + "Language": ["ita"], "Licence": "", "Size": [], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/spjallromur.json b/corpora/spoken-corpora/spjallromur.json index 86531d6..1627bbe 100644 --- a/corpora/spoken-corpora/spjallromur.json +++ b/corpora/spoken-corpora/spjallromur.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/187", "Family": "Spoken corpora", "Description": "This corpus contains recordings of 54 conversations by 102 speakers, recorded between September 2020 and September 2021.\nThe corpus is available for download from the CLARIN.IS repository.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["21 hours"], "Annotation": [], diff --git a/corpora/spoken-corpora/spoken-bnc2014.json b/corpora/spoken-corpora/spoken-bnc2014.json index 5a2c6a5..7c70e67 100644 --- a/corpora/spoken-corpora/spoken-bnc2014.json +++ b/corpora/spoken-corpora/spoken-bnc2014.json @@ -3,7 +3,7 @@ "URL": "http://cass.lancs.ac.uk/cass-projects/spoken-bnc2014/", "Family": "Spoken corpora", "Description": "This corpus contains face-to-face conversations between people who speak British English as their first language.\nThe corpus is available through the CQP concordancer.", - 
"Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": ["10 million words"], "Annotation": [], diff --git a/corpora/spoken-corpora/spoken-estonian.json b/corpora/spoken-corpora/spoken-estonian.json index 78ed387..0dcf2cc 100644 --- a/corpora/spoken-corpora/spoken-estonian.json +++ b/corpora/spoken-corpora/spoken-estonian.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11372/LRT-253", "Family": "Spoken corpora", "Description": "This corpus contains transcripts of recordings from various domains.", - "Languages": ["est"], + "Language": ["est"], "Licence": "", "Size": ["1 million words"], "Annotation": ["unspecified tagging"], diff --git a/corpora/spoken-corpora/spoken-icelandic.json b/corpora/spoken-corpora/spoken-icelandic.json index b2bea93..ade230e 100644 --- a/corpora/spoken-corpora/spoken-icelandic.json +++ b/corpora/spoken-corpora/spoken-icelandic.json @@ -3,7 +3,7 @@ "URL": "https://clarin.is/en/resources/spoken/", "Family": "Spoken corpora", "Description": "This corpus contains four different subcorpora: (1) Spontaneous conversations, from the project ÍSTAL (An Icelandic Spoken Language Bank), (2) Group conversations, from the project MIN (Modern loanwords in the Nordic languages), (3) Parliamentary debates, (4) Conversations of teenagers with other teenagers and adults\nThe corpus is available for download from CLARIN-IS (as a part of the Icelandic Gigaword Corpus) and for search through the concordancer Korp.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC-BY 4.0", "Size": ["536,000 tokens"], "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], diff --git a/corpora/spoken-corpora/spoken-wikipedia.json b/corpora/spoken-corpora/spoken-wikipedia.json index 215ab61..e51eb8e 100644 --- a/corpora/spoken-corpora/spoken-wikipedia.json +++ b/corpora/spoken-corpora/spoken-wikipedia.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/0000-0007-C641-0", "Family": "Spoken corpora", "Description": "This corpus contains transcripts 
of read Wikipedia articles.\nThe corpus is available for download from a CLARIN-D repository.", - "Languages": ["eng", "deu", "nld"], + "Language": ["eng", "deu", "nld"], "Licence": "CC-BY SA 4.0", "Size": ["1005 hours"], "Annotation": ["text segmentation", "normalization", "time-alignment"], diff --git a/corpora/spoken-corpora/talromur-2.json b/corpora/spoken-corpora/talromur-2.json index fb1d435..b0bca8d 100644 --- a/corpora/spoken-corpora/talromur-2.json +++ b/corpora/spoken-corpora/talromur-2.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/20.500.12537/167", "Family": "Spoken corpora", "Description": "This corpus consists of recordings of forty different speakers reading short sentences and is intended for modelling prosody.\nThe corpus is available for download from the CLARIN.IS repository.", - "Languages": ["isl"], + "Language": ["isl"], "Licence": "CC BY 4.0", "Size": ["56,225 utterances"], "Annotation": [], diff --git a/corpora/spoken-corpora/taus.json b/corpora/spoken-corpora/taus.json index 9df7380..da30af2 100644 --- a/corpora/spoken-corpora/taus.json +++ b/corpora/spoken-corpora/taus.json @@ -3,7 +3,7 @@ "URL": "http://www.tekstlab.uio.no/nota/taus/english.html", "Family": "Spoken corpora", "Description": "This corpus contains informal interviews in Oslo sociolects.\nThe corpus is available through a Tekstlab concordancer (account needed).", - "Languages": ["nor"], + "Language": ["nor"], "Licence": "CLARIN ACA", "Size": ["270 000 tokens"], "Annotation": ["MSD-tagged", "lemmatised", "orthographically and partially phonetically transcribed"], diff --git a/corpora/spoken-corpora/uraluid.json b/corpora/spoken-corpora/uraluid.json index bb4e542..0279a5a 100644 --- a/corpora/spoken-corpora/uraluid.json +++ b/corpora/spoken-corpora/uraluid.json @@ -1,9 +1,9 @@ { - "Name": "Uralic Languages under the Influence (UraLUID) database", + "Name": "Uralic Languages under the Influence (UraLUID) database", "URL": "http://www.nytud.hu/depts/tlp/uralic/dbases.html", 
"Family": "Spoken corpora", "Description": "This corpus contains narratives (e.g., folk storites).\nThe corpus is available for download from a dedicated website.", - "Languages": ["Udmurt", "Tundra Nenets", "Synya Khanty", "Surgut Khanty"], + "Language": ["Udmurt", "Tundra Nenets", "Synya Khanty", "Surgut Khanty"], "Licence": "", "Size": ["108,000 tokens", "4 hours"], "Annotation": ["MSD-tagged", "time-alignment", "phonetic and orthographic transcription"], diff --git a/corpora/spoken-corpora/verbmobil-1.json b/corpora/spoken-corpora/verbmobil-1.json index 2300357..5442ba8 100644 --- a/corpora/spoken-corpora/verbmobil-1.json +++ b/corpora/spoken-corpora/verbmobil-1.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-EB31-0", "Family": "Spoken corpora", "Description": "The Verbmobil (VM) dialog database is a collection of German, American and Japanese dialog recordings in the appointment scheduling task. The data were collected during the first phase (1993 - 1996) of the German VM project funded by the German Ministry of Science and Technology (BMBF). Starting with version 3, the corpus is also provided as an emuR comptatible database.", - "Languages": ["deu", "eng", "jpn"], + "Language": ["deu", "eng", "jpn"], "Licence": "CLARIN ACA", "Size": ["77 hours"], "Annotation": ["orthographically transcribed", "phonetic", "phonemic", "prosodic"], diff --git a/corpora/spoken-corpora/verbmobil-2.json b/corpora/spoken-corpora/verbmobil-2.json index e12b816..2f70252 100644 --- a/corpora/spoken-corpora/verbmobil-2.json +++ b/corpora/spoken-corpora/verbmobil-2.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0000-FC55-5", "Family": "Spoken corpora", "Description": "Verbmobil 2 contains the speech of 401 speakers participating in 810 recordings. The emotional tagged recordings are not part of this edition but are collected inthe corpus 'BAS VMEmo'. 
The total VM2 corpus amounts to 17.6GB of data containing 58961 conversational turns distributed on 39 CD-R. VM2 contains dialogs in German, English, Japanese and mixed language pairs (partly with interpreter). The domain is appointment scheduling, travel planing, leisure time planing. Starting from version 3, the corpus is also available in emuR compatible emuDB format (see annotation files ending in *_annot.json).", - "Languages": ["deu", "eng", "jpn"], + "Language": ["deu", "eng", "jpn"], "Licence": "CLARIN ACA", "Size": ["65.8 hours"], "Annotation": ["orthographically transcribed", "phonetic", "phonemic", "prosodic"], diff --git a/corpora/spoken-corpora/vienna-oxford.json b/corpora/spoken-corpora/vienna-oxford.json index 84d4da9..8b71b42 100644 --- a/corpora/spoken-corpora/vienna-oxford.json +++ b/corpora/spoken-corpora/vienna-oxford.json @@ -3,7 +3,7 @@ "URL": "http://voice.univie.ac.at/", "Family": "Spoken corpora", "Description": "This corpus contains naturally occurring, non-scripted face-to-face interactions in English as a lingua franca (ELF).\nThe corpus is available through a dedicated concordancer.", - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/spoken-corpora/vowel-cochlear-patients.json b/corpora/spoken-corpora/vowel-cochlear-patients.json index 98f5769..4ef3a84 100644 --- a/corpora/spoken-corpora/vowel-cochlear-patients.json +++ b/corpora/spoken-corpora/vowel-cochlear-patients.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0001-AFA1-4", "Family": "Spoken corpora", "Description": "This corpous contains German speech recordings of 48 cochlear implant users (CI) and 48 speakers without hearing impairment (control group, KG). 
It contains recordings used for the analysis of sevel long, lexically stressed vowels in the words 'Taten', 'stetig', 'Toter', 'Stute', 'töten', 'Tüte' and 'kriegen'.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["2 hours"], "Annotation": ["orthographically transcribed"], diff --git a/corpora/spoken-corpora/wissenschaftssprache.json b/corpora/spoken-corpora/wissenschaftssprache.json index c84c950..2d00fa7 100644 --- a/corpora/spoken-corpora/wissenschaftssprache.json +++ b/corpora/spoken-corpora/wissenschaftssprache.json @@ -3,7 +3,7 @@ "URL": "", "Family": "Spoken corpora", "Description": "", - "Languages": [], + "Language": [], "Licence": "", "Size": [], "Annotation": [], diff --git a/corpora/spoken-corpora/zurich-tangram-bas.json b/corpora/spoken-corpora/zurich-tangram-bas.json index d3fad1d..4ff28a8 100644 --- a/corpora/spoken-corpora/zurich-tangram-bas.json +++ b/corpora/spoken-corpora/zurich-tangram-bas.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-D89D-5", "Family": "Spoken corpora", "Description": "This corpus contains tasks, where one subject (the instructor) describes different Tangram figures to another subject (the receiver) so that the receiver can recreate the same order of figures that the instructor has in front of them. The subjects initially don't know each other and work together to solve these tasks in three consecutive sessions. 
This edition only features the transcribed segments, not those in between, and uses separate files for the subject.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["48 hours"], "Annotation": ["orthographically transcribed", "word and phonemic segmentation"], diff --git a/corpora/spoken-corpora/zurich-tangram-uzh.json b/corpora/spoken-corpora/zurich-tangram-uzh.json index 0348590..329d388 100644 --- a/corpora/spoken-corpora/zurich-tangram-uzh.json +++ b/corpora/spoken-corpora/zurich-tangram-uzh.json @@ -3,7 +3,7 @@ "URL": "http://hdl.handle.net/11022/1009-0000-0007-D838-7", "Family": "Spoken corpora", "Description": "This corpus contains tasks, where one subject (the instructor) describes different Tangram figures to another subject (the receiver) so that the receiver can recreate the same order of figures that the instructor has in front of them. The subjects initially don't know each other and work together to solve these tasks in three consecutive sessions. This edition features the complete recordings, but lacking phone and word segmentation. Subjects audio tracks are combined into stereo files. If you would like just the transcribed segments with separate files for the subjects or want the word and phone segmentation see corpus ZTC_BAS.", - "Languages": ["deu"], + "Language": ["deu"], "Licence": "CLARIN ACA", "Size": ["48 hours"], "Annotation": ["orthographically transcribed", "turn segmentation"], diff --git a/tools/corpus-query-tools/aconcorde.json b/tools/corpus-query-tools/aconcorde.json index bdfc04d..3d0a07b 100644 --- a/tools/corpus-query-tools/aconcorde.json +++ b/tools/corpus-query-tools/aconcorde.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a multi-lingual concordance tool. 
Originally developed for native Arabic concordance, it posses basic concordance functionality, as well as English and Arabic interfaces.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["Platform-independent (java)"], diff --git a/tools/corpus-query-tools/antconc.json b/tools/corpus-query-tools/antconc.json index a992afc..e1b9e81 100644 --- a/tools/corpus-query-tools/antconc.json +++ b/tools/corpus-query-tools/antconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a freeware corpus analysis toolkit for concordancing and text analysis.\nOnline videos and manuals from the creator and community (Google Group).", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/antpconc.json b/tools/corpus-query-tools/antpconc.json index 77c15d3..72f678d 100644 --- a/tools/corpus-query-tools/antpconc.json +++ b/tools/corpus-query-tools/antpconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a freeware parallel corpus analysis toolkit for concordancing and text analysis using UTF-8 encoded text files.", "Functionality": ["Parallel Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/autosearch.json b/tools/corpus-query-tools/autosearch.json index 57659a1..7decd27 100644 --- a/tools/corpus-query-tools/autosearch.json +++ b/tools/corpus-query-tools/autosearch.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool allows users to upload corpora annotated at the token level for (extended) part of speech, lemma and word form in FoLiA 
or TEI format, after which the corpus can be searched for these properties with a Corpus of Contemporary Dutch-like interface", "Functionality": ["Querying/concordancing", "corpus upload and analysis"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/bncweb-lancaster.json b/tools/corpus-query-tools/bncweb-lancaster.json index 5fa0030..b560866 100644 --- a/tools/corpus-query-tools/bncweb-lancaster.json +++ b/tools/corpus-query-tools/bncweb-lancaster.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is a modified version of CQPweb for the British National Corpus. It allows a number of search options: publication date, text medium, author gender, target audience, genre, author domicile.\nRegistration is required to use the tool.", "Functionality": ["Querying/concordancing"] - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/casualconc.json b/tools/corpus-query-tools/casualconc.json index cc7b16d..a47b919 100644 --- a/tools/corpus-query-tools/casualconc.json +++ b/tools/corpus-query-tools/casualconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a concordance program that runs natively on macOS 11.3 or later.and can generate KWIC concordance lines, word clusters, collocation analysis, and word count.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["MacOS"], diff --git a/tools/corpus-query-tools/catma.json b/tools/corpus-query-tools/catma.json index f66d272..aa0e512 100644 --- a/tools/corpus-query-tools/catma.json +++ b/tools/corpus-query-tools/catma.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "The acronym CATMA stands for Computer Assisted Text Markup and Analysis.\nIt is 
possible to upload one's own corpus with this tool.", "Functionality": ["Querying/concordancing", "corpus upload and analysis"] - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/chn.json b/tools/corpus-query-tools/chn.json index ef00a02..bb98604 100644 --- a/tools/corpus-query-tools/chn.json +++ b/tools/corpus-query-tools/chn.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated query tool, built on BlackLab software, for Corpus Hedendaags Nederlands (Corpus of Contemporary Dutch), a corpus of more than 800,000 texts taken from newspapers, magazines, news broadcasts and legal writings (1814–2013).\nThe corpus is a combination of the 5, 27 and 38 million word corpora and the PAROLE Corpus, supplemented with newspaper texts from NRC and De Standaard (until 2013).\nRegistration is required for using this tool. Shibboleth log-in is supported.", "Functionality": ["Querying/concordancing"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/cintil.json b/tools/corpus-query-tools/cintil.json index 7e957d5..29aae55 100644 --- a/tools/corpus-query-tools/cintil.json +++ b/tools/corpus-query-tools/cintil.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a freely available online concordancing service to support the research usage of the CINTIL Corpus. The CINTIL concordancer allows the use of patterns to specify the occurrences to be retrieved. 
This permits to uncover linguistic structures of high complexity and use this service as a powerful research tool.", "Functionality": ["Querying/concordancing"] - "Languages": ["por"], + "Language": ["por"], "Licence": "Proprietary", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/clan.json b/tools/corpus-query-tools/clan.json index fab0b06..85f6414 100644 --- a/tools/corpus-query-tools/clan.json +++ b/tools/corpus-query-tools/clan.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "The CLAN Programs are downloaded, installed, and used as a single application. Functionally, however, CLAN has two parts. The first part is the CLAN editor which can be used to edit files in either CHAT or CA (Conversation Analysis) format. The editor also provides a wide range of additional functions, such as audio and video playback, linkage to audio and video, fonts for Roman and non-Roman orthographies, data validation, adding codes to files, and shipping data to other programs. The second part of CLAN is the set of data analysis programs. These programs are run from a separate window called the Commands window. 
The results of the analytic programs are sent to the CLAN Output window.\nThe tool is only compatible with TalkBank corpora that have CHAT annotation.\nAn online manual is available.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL2 (source code)", "Size": [], "Platform": ["Windows", "MacOS", "Source code provided for Linux users"], diff --git a/tools/corpus-query-tools/clark.json b/tools/corpus-query-tools/clark.json index 3f73882..ce1b0bd 100644 --- a/tools/corpus-query-tools/clark.json +++ b/tools/corpus-query-tools/clark.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is an XML-based system for corpus linguistics, primarily for corpus construction, but also with functionality for analysing and exploring corpora.\nThe support team is reachable through email. A user manual is also available.", "Functionality": ["Concordancing/querying", "corpus building"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Platform-independent"], diff --git a/tools/corpus-query-tools/clic.json b/tools/corpus-query-tools/clic.json index 5cf060d..3fe2cbf 100644 --- a/tools/corpus-query-tools/clic.json +++ b/tools/corpus-query-tools/clic.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool has been developed as part of the CLiC Dickens project, which demonstrates through corpus stylistics how computer-assisted methods can be used to study literary texts and lead to new insights into how readers perceive fictional characters. 
Further literary texts have been added to the online service.\nTechnical support is offered through email.", "Functionality": ["Querying/concordancing"] - "Languages": ["eng"], + "Language": ["eng"], "Licence": "Use of CLiC follows the University of Birmingham’s legal policy", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/coanzse.json b/tools/corpus-query-tools/coanzse.json index 2f9c38c..d566d98 100644 --- a/tools/corpus-query-tools/coanzse.json +++ b/tools/corpus-query-tools/coanzse.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated concordancer for the Corpus of Australian and New Zealand Spoken English.\nThe corpus contains 195 million words of geolocated automatic speech recognition transcripts of video content from local governments in Australia and New Zealand, created for the study of lexical, grammatical, phonetic, and discourse-pragmatic phenomena of spoken language. Additionally, the corpus contains complete textual content of the corpus, audio files and forced alignments in Praat's TextGrid format for most transcripts.\nThe corpus can be accessed through the CLARIN Service Provider Federation.", "Functionality": ["Querying/concordancing"] - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/collocate.json b/tools/corpus-query-tools/collocate.json index 31ef40e..965ebd6 100644 --- a/tools/corpus-query-tools/collocate.json +++ b/tools/corpus-query-tools/collocate.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is a Windows software program that can be used to find collocations or terms in a corpus. 
It is a commercial tool.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/compleat.json b/tools/corpus-query-tools/compleat.json index be801d0..141c90f 100644 --- a/tools/corpus-query-tools/compleat.json +++ b/tools/corpus-query-tools/compleat.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool includes a concordancer, vocabulary profiler, exercise maker, interactive exercises, and much more.\nIt is possible to upload one's own corpus with this tool (10 MB limit", "Functionality": ["Querying/concordancing", "corpus upload and analysis"] - "Languages": ["eng", "fra"], + "Language": ["eng", "fra"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concgram.json b/tools/corpus-query-tools/concgram.json index f1a2369..3395f01 100644 --- a/tools/corpus-query-tools/concgram.json +++ b/tools/corpus-query-tools/concgram.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is a corpus linguistics software package which is specifically designed to find all the co-occurrences of words in a text or corpus irrespective of variation. 
This is a commercial tool, available for purchase on optical disc.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-espanol.json b/tools/corpus-query-tools/concordancer-espanol.json index d28c55e..df34ea5 100644 --- a/tools/corpus-query-tools/concordancer-espanol.json +++ b/tools/corpus-query-tools/concordancer-espanol.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a querying tool for the corpora from Corpus del Español, which provide billions of words of recent data from 21 Spanish-speaking countries. There are four different corpora in the Corpus del Español.", "Functionality": ["Querying/concordancing"] - "Languages": ["spa"], + "Language": ["spa"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-estonian.json b/tools/corpus-query-tools/concordancer-estonian.json index 1909943..8276f4e 100644 --- a/tools/corpus-query-tools/concordancer-estonian.json +++ b/tools/corpus-query-tools/concordancer-estonian.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool provides a simple interface for a text corpus. The material for the text corpus has been collected haphazardly, 10.4 million word forms. Approximately 80% of the texts come from newspapers, which is why the corpus is not representative. 
The corpus also is not tagged, thus being suited for lexical search mainly.", "Functionality": ["Querying/concordancing"] - "Languages": ["est"], + "Language": ["est"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-gysseling.json b/tools/corpus-query-tools/concordancer-gysseling.json index c38e114..108ca60 100644 --- a/tools/corpus-query-tools/concordancer-gysseling.json +++ b/tools/corpus-query-tools/concordancer-gysseling.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated query tool for the Corpus Gysseling, developed by the Instituut voor de Nederlandse Taal. The backend of the application is the BlackLab Lucene-based search engine developed for corpora with token-based annotation. The web-based frontend is a further development of the corpus-frontend application developed by INT in CLARIN and CLARIAH projects.", "Functionality": ["Querying/concordancing"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-hr-nat-corp.json b/tools/corpus-query-tools/concordancer-hr-nat-corp.json index 5a4e657..8bab559 100644 --- a/tools/corpus-query-tools/concordancer-hr-nat-corp.json +++ b/tools/corpus-query-tools/concordancer-hr-nat-corp.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an implementation of NoSketchEngine for the Croatian National Corpus.", "Functionality": ["Querying/concordancing"] - "Languages": ["hrv"], + "Language": ["hrv"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-italian-heritage.json b/tools/corpus-query-tools/concordancer-italian-heritage.json index f004801..48a0725 100644 --- a/tools/corpus-query-tools/concordancer-italian-heritage.json +++ b/tools/corpus-query-tools/concordancer-italian-heritage.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool allows text and corpora 
querying, supporting both basic information retrieval and advanced search. It allows the customization of the query system functionalities and provides indexing also for morpho-syntactically annotated texts. The system can handle several type of text annotations and make concordances also for parallel bilingual corpora.", "Functionality": ["Querying/concordancing (non-parallel and parallel)"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-middelnederlands.json b/tools/corpus-query-tools/concordancer-middelnederlands.json index 7d8decd..269ddb2 100644 --- a/tools/corpus-query-tools/concordancer-middelnederlands.json +++ b/tools/corpus-query-tools/concordancer-middelnederlands.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated query tool for the Corpus Middelnederlands.", "Functionality": ["Querying/concordancing"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/concordancer-portuguese.json b/tools/corpus-query-tools/concordancer-portuguese.json index 64ef5fc..9e7efbc 100644 --- a/tools/corpus-query-tools/concordancer-portuguese.json +++ b/tools/corpus-query-tools/concordancer-portuguese.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated concordancer for the Corpus of Portuguese developed by Mark Davies.", "Functionality": ["Querying/concordancing"] - "Languages": ["por"], + "Language": ["por"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/coquery.json b/tools/corpus-query-tools/coquery.json index 3c240fb..4ede732 100644 --- a/tools/corpus-query-tools/coquery.json +++ b/tools/corpus-query-tools/coquery.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a free corpus query tool for linguists, lexicographers, translators, and 
anybody who wishes to search and analyse a text corpus. The tool works with any corpus, with installers for a number of widely used ones.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/corpkit.json b/tools/corpus-query-tools/corpkit.json index b9f6b0b..93469a8 100644 --- a/tools/corpus-query-tools/corpkit.json +++ b/tools/corpus-query-tools/corpkit.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a tool for doing corpus linguistics. It enables parsing, concordancing and keywording, including concordance by searching for combinations of lexical and grammatical features, and keywording of lemmas, of subcorpora compared to corpora, or of words in certain positions within clauses. corpkit leverages a number of sophisticated programming libraries, including pandas, matplotlib, scipy, Tkinter, tkintertable and Stanford CoreNLP.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["OSX"], diff --git a/tools/corpus-query-tools/corpus-explorer.json b/tools/corpus-query-tools/corpus-explorer.json index 16e1014..196ea58 100644 --- a/tools/corpus-query-tools/corpus-explorer.json +++ b/tools/corpus-query-tools/corpus-explorer.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is intended for corpus linguistics and for text and data mining.", "Functionality": ["Concordancing/querying", "text and data mining"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/corpus-presenter.json b/tools/corpus-query-tools/corpus-presenter.json index ec1bd09..7bfd48a 100644 --- 
a/tools/corpus-query-tools/corpus-presenter.json +++ b/tools/corpus-query-tools/corpus-presenter.json @@ -5,7 +5,7 @@ "Family": "Corpus query tools", "Description": "This tool can be used to compile text corpora and to carry out retrieval tasks on any corpus or selection of text files, no matter what their source or how they are organised. The tool is designed to have a maximally open architecture and can be used straight away to examine any texts users may have access to.", "Functionality": ["Concordancing/querying", "corpus compilation"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/corpus-workbench.json b/tools/corpus-query-tools/corpus-workbench.json index 9f6b6f2..7d87394 100644 --- a/tools/corpus-query-tools/corpus-workbench.json +++ b/tools/corpus-query-tools/corpus-workbench.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a collection of open-source tools for managing and querying large text corpora (up to 2 billion words) with linguistic annotations. Its central component is the flexible and efficient query processor CQP.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Linux", "MacOS", "VM images (CQPwebinABox)"], diff --git a/tools/corpus-query-tools/corpuscle.json b/tools/corpus-query-tools/corpuscle.json index 6d875bf..138a95e 100644 --- a/tools/corpus-query-tools/corpuscle.json +++ b/tools/corpus-query-tools/corpuscle.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a corpus management and analysis system for annotated corpora, with sophisticated query language. 
It is a reimplementation of Corpuscle featuring an improved user experience and many new features that is now available as a ", "Functionality": ["Querying/concordancing"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/cosmas-ii.json b/tools/corpus-query-tools/cosmas-ii.json index 6d69468..04ae63c 100644 --- a/tools/corpus-query-tools/cosmas-ii.json +++ b/tools/corpus-query-tools/cosmas-ii.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is used for querying the German reference corpus DeReKo, as well as several other historical and non-historical corpora.\nTechnical support is offered through email.", "Functionality": ["Querying/concordancing"] - "Languages": ["deu"], + "Language": ["deu"], "Licence": "DeReKo-EULA", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/couranten.json b/tools/corpus-query-tools/couranten.json index c5d2be8..cc96a86 100644 --- a/tools/corpus-query-tools/couranten.json +++ b/tools/corpus-query-tools/couranten.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated querying tool for the Couranten Corpus, which comprises the seventeenth-century Dutch newspapers, available on Delpher.", "Functionality": ["Querying/concordancing"] - "Languages": ["Dutch (17th Century)"], + "Language": ["Dutch (17th Century)"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/cqpweb-lancaster.json b/tools/corpus-query-tools/cqpweb-lancaster.json index 24c91b0..ad95f54 100644 --- a/tools/corpus-query-tools/cqpweb-lancaster.json +++ b/tools/corpus-query-tools/cqpweb-lancaster.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an online implementation of the CQPweb system with a large number of corpora installed. #SEPIt is possible to upload one's own corpus with this tool. 
Note that CQPweb will be superseded by Ziggurat, which is under development. Registration is required to use this tool.", "Functionality": ["Querying/concordancing"] - "Languages": ["eng", "ara", "fra", "ita", "nor", "pol", "lav"], + "Language": ["eng", "ara", "fra", "ita", "nor", "pol", "lav"], "Licence": "No licence", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/dwds.json b/tools/corpus-query-tools/dwds.json index 75f03b0..16d4758 100644 --- a/tools/corpus-query-tools/dwds.json +++ b/tools/corpus-query-tools/dwds.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a tool for browsing DWDS corpora. The DWDS is part of the Center for Digital Lexicography of the German Language (ZDL), funded by the Federal Ministry of Education and Research. It is based at the Berlin-Brandenburg Academy of Sciences.", "Functionality": ["Querying/concordancing"] - "Languages": ["deu"], + "Language": ["deu"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/english-corpora.json b/tools/corpus-query-tools/english-corpora.json index 8138df3..18f5ccd 100644 --- a/tools/corpus-query-tools/english-corpora.json +++ b/tools/corpus-query-tools/english-corpora.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a tool for browsing the corpora available on english-corpora.org, which are formerly known as the BYU or Brigham Young University copora.", "Functionality": ["Querying/concordancing"] - "Languages": ["eng"], + "Language": ["eng"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/exakt.json b/tools/corpus-query-tools/exakt.json index effd4f3..7160f7f 100644 --- a/tools/corpus-query-tools/exakt.json +++ b/tools/corpus-query-tools/exakt.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "EXAKT (‘EXMARaLDA Analysis- and Concordance Tool’) is the query and analysis tool for EXMARaLDA corpora. 
It can also be used for corpora created with other tools (FOLKER, Transcriber, ELAN).\nSupport is offered via the CLARIN-D Helpdesk. Manuals and how-to guides are available; there have also been training courses for EXAKT. The source code of the program is open source and accessible via GitHub.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Windows", "MacOS", "Linux with a current Java runtime environment."], diff --git a/tools/corpus-query-tools/gate.json b/tools/corpus-query-tools/gate.json index da345da..86d0fe3 100644 --- a/tools/corpus-query-tools/gate.json +++ b/tools/corpus-query-tools/gate.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool allows for text and corpus analysis.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GNU", "Size": [], "Platform": ["Platform-independent (Windows and generic installers available)"], diff --git a/tools/corpus-query-tools/glossa.json b/tools/corpus-query-tools/glossa.json index c5969c8..e619eb7 100644 --- a/tools/corpus-query-tools/glossa.json +++ b/tools/corpus-query-tools/glossa.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "Glossa offers a modern, simple and functional search interface with advanced post-processing possibilities for both written corpora, multilingual corpora and speech corpora.\nGlossa is developed at the Text Laboratory, Department of Linguistics and Scandinavian Studies, University of Oslo with support from the Norwegian contribution to the CLARIN infrastructure, CLARINO. Glossa is also freely available for download from GitHub and is easy to install on one's own server. 
Glossa is search engine agnostic and comes with support for the IMS Corpus Workbench and CLARIN Federated Content Search out of the box.", "Functionality": ["Querying/concordancing and text analysis"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Public license for software. CLARIN-licenses for corpora in Glossa", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/gretel.json b/tools/corpus-query-tools/gretel.json index a86ce27..3dafaa2 100644 --- a/tools/corpus-query-tools/gretel.json +++ b/tools/corpus-query-tools/gretel.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "GrETEL stands for Greedy Extraction of Trees for Empirical Linguistics. It is a user-friendly search engine for the exploitation of syntactically annotated corpora or treebanks.\nIt is possible to upload one's own corpus with this tool.", "Functionality": ["Querying/concordancing (treebanks)"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/i-analyzer.json b/tools/corpus-query-tools/i-analyzer.json index 3d2dec4..4d58e1f 100644 --- a/tools/corpus-query-tools/i-analyzer.json +++ b/tools/corpus-query-tools/i-analyzer.json @@ -15,7 +15,7 @@
        6. Dutch monarchs’ speeches,
        7. Dutch parliamentary debates.
        8. I-analyzer does not have an option to upload your own data yet. But you can still use your data with I-analyzer by self-hosting the tool or contacting the team (contact and self-host instructions).
        ", "Functionality": ["Querying/concordancing", "analysis", "visualizations"] - "Languages": ["Multiple"], + "Language": ["Multiple"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/icecup.json b/tools/corpus-query-tools/icecup.json index 6b9b96c..2c35719 100644 --- a/tools/corpus-query-tools/icecup.json +++ b/tools/corpus-query-tools/icecup.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a state-of-the-art corpus exploration program designed for parsed corpora such as ICE-GB and The Diachronic Corpus of Present-Day Spoken English. This is a commercial tool that works for ICE corpora with proprietary annotation scheme.\nA handbook is available.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/iness.json b/tools/corpus-query-tools/iness.json index 8b94bfd..f35210c 100644 --- a/tools/corpus-query-tools/iness.json +++ b/tools/corpus-query-tools/iness.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "INESS is the Norwegian Infrastructure for the Exploration of Syntax and Semantics. 
INESS offers an open, interactive, language independent platform for building, accessing, searching and visualizing treebanks.\nINESS offers a user guide for querying their treebanks.", "Functionality": ["Querying/concordancing (treebanks)"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/intellitext.json b/tools/corpus-query-tools/intellitext.json index 934ab83..28f2467 100644 --- a/tools/corpus-query-tools/intellitext.json +++ b/tools/corpus-query-tools/intellitext.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "The Intelligent Tools for Creating and Analysing Electronic Text Corpora for Humanities Research (IntelliText) project aims to facilitate corpus use for academics working in various areas of the humanities. The project produced a user-friendly corpus interface with an array of easy-to-use functions that will benefit teaching and research in several academic disciplines.\nIt is possible to upload one's own corpus with this tool. An online guide is available.", "Functionality": ["Querying/concordancing", "corpus upload"] - "Languages": ["ara", "ces", "zho", "eng", "fra", "deu", "ita", "jpn", "kan", "lit", "por", "rus", "spa", "ukr"], + "Language": ["ara", "ces", "zho", "eng", "fra", "deu", "ita", "jpn", "kan", "lit", "por", "rus", "spa", "ukr"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/kontext-clarin-si.json b/tools/corpus-query-tools/kontext-clarin-si.json index 4cd0bc7..1d9ce0c 100644 --- a/tools/corpus-query-tools/kontext-clarin-si.json +++ b/tools/corpus-query-tools/kontext-clarin-si.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is the CLARIN.SI installation of LINDAT's KonText, comprised of the KonText front-end developed by the Czech National Corpus team and the Manatee back-end, developed by Lexical Computing. 
This installation offers over 50 richly annotated corpora in Slovenian and other languages.\nShibboleth log-in is supported.", "Functionality": ["Querying/concordancing"] - "Languages": ["slv", "hrv", "bos", "srp", "cnr", "mkd", "hbs", "bul", "ces", "slk", "pol", "eng", "dan", "nld", "est", "fin", "fra", Gaelic, "deu", "ell", "hun", "isl", "ita", "jpn", "lav", "lit", "por"], + "Language": ["slv", "hrv", "bos", "srp", "cnr", "mkd", "hbs", "bul", "ces", "slk", "pol", "eng", "dan", "nld", "est", "fin", "fra", Gaelic, "deu", "ell", "hun", "isl", "ita", "jpn", "lav", "lit", "por"], "Licence": "No licence", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/kontext-latvian.json b/tools/corpus-query-tools/kontext-latvian.json index 84ceb7e..ad397a6 100644 --- a/tools/corpus-query-tools/kontext-latvian.json +++ b/tools/corpus-query-tools/kontext-latvian.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool corresponds to an implementation of LINDAT's KonText for Latvian resources.\nEight Latvian corpora can be searched with this tool.", "Functionality": ["Querying/concordancing"] - "Languages": ["lav"], + "Language": ["lav"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/kontext-lindat.json b/tools/corpus-query-tools/kontext-lindat.json index aed16d9..bee2151 100644 --- a/tools/corpus-query-tools/kontext-lindat.json +++ b/tools/corpus-query-tools/kontext-lindat.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "KonText is a basic web application for querying corpora available within the LINDAT/CLARIAH-CZ project. It allows evaluation of simple and complex queries, displaying their results as concordance lines, computing frequency distribution, calculating association measures for collocations and further work with language data. 
This LINDAT/CLARIAH-CZ instance is a fork of KonText application developed by the Institute of the Czech National Corpus that has been further extended by the Institute of Formal and Applied Linguistics to suit the needs of LINDAT/CLARIAH-CZ project.\nIt is possible to upload one's own corpus with this tool. KonText is openly developed. Registration is required and Shibboleth log-in is supported.", "Functionality": ["Querying/concordancing"] - "Languages": ["ces"], + "Language": ["ces"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/korap-corola.json b/tools/corpus-query-tools/korap-corola.json index 085a112..0c192c2 100644 --- a/tools/corpus-query-tools/korap-corola.json +++ b/tools/corpus-query-tools/korap-corola.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is used to query the Reference Corpus for Contemporary Romanian Language CoRoLa.", "Functionality": ["Querying/concordancing"] - "Languages": ["ron"], + "Language": ["ron"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/korap-dereko.json b/tools/corpus-query-tools/korap-dereko.json index 78d147d..d7acf21 100644 --- a/tools/corpus-query-tools/korap-dereko.json +++ b/tools/corpus-query-tools/korap-dereko.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a corpus analysis platform that is suited for large, multiply annotated corpora and complex search queries independent of particular research questions.\nRegistration is required only for license restricted corpora.", "Functionality": ["Querying/concordancing"] - "Languages": ["deu"], + "Language": ["deu"], "Licence": "DeReKo-EULA", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/korp-copenhagen.json b/tools/corpus-query-tools/korp-copenhagen.json index 469ee7d..42e877f 100644 --- a/tools/corpus-query-tools/korp-copenhagen.json +++ b/tools/corpus-query-tools/korp-copenhagen.json @@ -4,7 +4,7 @@ "Family": "Corpus 
query tools", "Description": "This is a web-based concordancer that can be used for corpus queries based on morphosyntactic analysis and various other features. It is implemented at the University of Copenhagen.\nRegistration is required.", "Functionality": ["Querying/concordancing"] - "Languages": ["dan"], + "Language": ["dan"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/korp-kielipankki.json b/tools/corpus-query-tools/korp-kielipankki.json index f4b6a0d..8b49ed0 100644 --- a/tools/corpus-query-tools/korp-kielipankki.json +++ b/tools/corpus-query-tools/korp-kielipankki.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a web-based concordance tool that can be used for corpus queries based on morphosyntactic analysis and various other features. A large proportion of the corpora in Kielipankki are offered via Korp.\nUser support is available through email.", "Functionality": ["Querying/concordancing"] - "Languages": ["fin", "swe", "rus", "eng", "and more"], + "Language": ["fin", "swe", "rus", "eng", "and more"], "Licence": "Individual corpora have different licenses (and access conditions)", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/korp-sprakbanken.json b/tools/corpus-query-tools/korp-sprakbanken.json index 06cb346..8c7a0f5 100644 --- a/tools/corpus-query-tools/korp-sprakbanken.json +++ b/tools/corpus-query-tools/korp-sprakbanken.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is Språkbanken's corpus tool for searching in large amounts of texts, including newspapers, novels and social media.", "Functionality": ["Querying/concordancing"] - "Languages": ["swe"], + "Language": ["swe"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/lancsbox.json b/tools/corpus-query-tools/lancsbox.json index 2bd65c3..b04fb60 100644 --- a/tools/corpus-query-tools/lancsbox.json +++ b/tools/corpus-query-tools/lancsbox.json @@ -4,7 +4,7 @@ 
"Family": "Corpus query tools", "Description": "#LancsBox is a new-generation software package for the analysis of language data and corpora developed at Lancaster University. The latest version, #Lancsbox X has increased functionality for XML texts.\nA user guide is available in English, French and Japanese, along with instructional videos. See here. ", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "CC BY-NC-ND 4.0", "Size": [], "Platform": ["Platform-independent (java)"], diff --git a/tools/corpus-query-tools/liwc-22.json b/tools/corpus-query-tools/liwc-22.json index 6a5d1bb..28a8f3d 100644 --- a/tools/corpus-query-tools/liwc-22.json +++ b/tools/corpus-query-tools/liwc-22.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a commercial product for analyzing word use. It can be used to study a single individual, groups of people over time, or all of social media.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["Windows", "MacOS"], diff --git a/tools/corpus-query-tools/lncc.json b/tools/corpus-query-tools/lncc.json index 270c77f..6ed37ac 100644 --- a/tools/corpus-query-tools/lncc.json +++ b/tools/corpus-query-tools/lncc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "Latvian National Corpora Collection (LNCC) is a diverse collection of corpora representing both written and spoken language. LNCC covers various use cases and all the important text types and genres. It is a continuous multi-institutional and multi-project effort, supported by the digital humanities and language technology communities in Latvia.\nCurrently, 34 corpora developed by 13 institutions are available in the LNCC. 
Most of the corpora are annotated with a uniform morpho-syntactic annotation scheme and included in the federated search. The federated search combines multiple corpora from two corpus indexer instances (endpoints) maintained by IMCS UL and NLL. Federated search includes 28 corpora (2.4 billions tokens).", "Functionality": ["Querying/concordancing"] - "Languages": ["lav", "ltg", "lit"], + "Language": ["lav", "ltg", "lit"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/monoconc.json b/tools/corpus-query-tools/monoconc.json index 7b90156..588c55f 100644 --- a/tools/corpus-query-tools/monoconc.json +++ b/tools/corpus-query-tools/monoconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a concordance programme. It is made available on a commercial basis.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/nat-pol-ipi-pan.json b/tools/corpus-query-tools/nat-pol-ipi-pan.json index 2784a3e..fffaeea 100644 --- a/tools/corpus-query-tools/nat-pol-ipi-pan.json +++ b/tools/corpus-query-tools/nat-pol-ipi-pan.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated concordancer for NKJP corpora.", "Functionality": ["Querying/concordancing"] - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/nat-pol-pelcra.json b/tools/corpus-query-tools/nat-pol-pelcra.json index 92a6c24..514f1be 100644 --- a/tools/corpus-query-tools/nat-pol-pelcra.json +++ b/tools/corpus-query-tools/nat-pol-pelcra.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated concordancer for NKJP corpora.", "Functionality": ["Querying/concordancing"] - "Languages": ["pol"], + "Language": ["pol"], "Licence": "", "Size": [], "Platform": [], diff 
--git a/tools/corpus-query-tools/nb-dh-lab.json b/tools/corpus-query-tools/nb-dh-lab.json index c0f3623..56563a2 100644 --- a/tools/corpus-query-tools/nb-dh-lab.json +++ b/tools/corpus-query-tools/nb-dh-lab.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This collection of tools corresponds to a REST API, Python package and web applications allowing a user to build corpora from the vast digital collections of the National Library of Norway (currently ca. 160 billion words). Users get concordances, frequency lists and co-occurrence data.\nUser support is available through email.", "Functionality": ["Querying/concordancing/analysis"] - "Languages": ["nob", "nno", "sme", "smj", "sma"], + "Language": ["nob", "nno", "sme", "smj", "sma"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/nederlab.json b/tools/corpus-query-tools/nederlab.json index d0ad33e..d83a6a5 100644 --- a/tools/corpus-query-tools/nederlab.json +++ b/tools/corpus-query-tools/nederlab.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an online research portal for historical texts in the Dutch language.\nRegistration is required and Shibboleth log-in is supported.", "Functionality": ["Querying/concordancing"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/nooj.json b/tools/corpus-query-tools/nooj.json index 8b47a8e..f0c38df 100644 --- a/tools/corpus-query-tools/nooj.json +++ b/tools/corpus-query-tools/nooj.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is part of a linguistic development environment, which includes functionality for text and corpus analysis.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL Academic - Non-commercial (Java version)", "Size": [], "Platform": ["Windows (cut-down java version also available 
for other OS)"], diff --git a/tools/corpus-query-tools/nosketch-clarin-si.json b/tools/corpus-query-tools/nosketch-clarin-si.json index 64db3bd..9b84afb 100644 --- a/tools/corpus-query-tools/nosketch-clarin-si.json +++ b/tools/corpus-query-tools/nosketch-clarin-si.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an open-source version of the commercial Sketch Engine, produced by Lexical Computing. This installation of noSketch Engine at CLARIN.SI offers over 50 richly annotated corpora in Slovenian and other languages.", "Functionality": ["Querying/concordancing"] - "Languages": ["slv", "hrv", "bos", "srp", "cnr", "mkd", "hbs", "bul", "ces", "slk", "pol", "eng", "dan", "nld", "est", "fin", "fra", Gaelic, "deu", "ell", "hun", "isl", "ita", "jpn", "lav", "lit", "por"], + "Language": ["slv", "hrv", "bos", "srp", "cnr", "mkd", "hbs", "bul", "ces", "slk", "pol", "eng", "dan", "nld", "est", "fin", "fra", "Gaelic", "deu", "ell", "hun", "isl", "ita", "jpn", "lav", "lit", "por"], "Licence": "no", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/nosketch-engine.json b/tools/corpus-query-tools/nosketch-engine.json index 8e9fc17..3e2332c 100644 --- a/tools/corpus-query-tools/nosketch-engine.json +++ b/tools/corpus-query-tools/nosketch-engine.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an open source version of Sketch Engine with certain functionality limitations (for instance, WordSketch is not available).", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Components available under separate licences: GPLv2+, GPLv3", "Size": [], "Platform": ["Linux"], diff --git a/tools/corpus-query-tools/nvivo.json b/tools/corpus-query-tools/nvivo.json index 0f7d562..9890f05 100644 --- a/tools/corpus-query-tools/nvivo.json +++ b/tools/corpus-query-tools/nvivo.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This
is a commercial software application for qualitative text and data analysis. ", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["Windows", "MacOS"], diff --git a/tools/corpus-query-tools/opensonar.json b/tools/corpus-query-tools/opensonar.json index fee235a..9656efa 100644 --- a/tools/corpus-query-tools/opensonar.json +++ b/tools/corpus-query-tools/opensonar.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an online corpus retrieval system that allows for analyzing and searching the SoNaR and CGN corpora.\nRegistration is required and Shibboleth log-in is supported.", "Functionality": ["Querying/concordancing"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/paqu.json b/tools/corpus-query-tools/paqu.json index 3c5fabc..ad2e9dc 100644 --- a/tools/corpus-query-tools/paqu.json +++ b/tools/corpus-query-tools/paqu.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an application for searching in treebanks (i.e. text corpora in which each sentence has been assigned a syntactic structure) and for analysing the search results.\nIt is possible to upload one's own corpus with this tool, for which registration is required.", "Functionality": ["Querying/concordancing (treebanks)"] - "Languages": ["nld"], + "Language": ["nld"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/paraconc.json b/tools/corpus-query-tools/paraconc.json index 96e0da2..971e77a 100644 --- a/tools/corpus-query-tools/paraconc.json +++ b/tools/corpus-query-tools/paraconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "A parallel concordance programme for aligned source and target translation texts. 
This is a commercial tool.", "Functionality": ["Parallel Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "No licence", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/praaline.json b/tools/corpus-query-tools/praaline.json index 76653c9..ccc7a50 100644 --- a/tools/corpus-query-tools/praaline.json +++ b/tools/corpus-query-tools/praaline.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a system for managing, annotating, visualising and analysing spoken language corpora.", "Functionality": ["Concordancing/querying", "corpus building"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/prime-machine.json b/tools/corpus-query-tools/prime-machine.json index c96740d..97cdeb1 100644 --- a/tools/corpus-query-tools/prime-machine.json +++ b/tools/corpus-query-tools/prime-machine.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This a user-friendly corpus tool for English language teaching, linguistic analysis and self-tutoring based on the Lexical Priming theory of language.\nOnline support is available.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["iOS", "MacOS", "Android", "Windows"], diff --git a/tools/corpus-query-tools/pyxmlconc.json b/tools/corpus-query-tools/pyxmlconc.json index 66a613d..49e975f 100644 --- a/tools/corpus-query-tools/pyxmlconc.json +++ b/tools/corpus-query-tools/pyxmlconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a simple concordancer. It is supposed to be used in exploratory analysis of XML-annotated corpora. Its primary feature lies in the automatic detection of XML tags and attributes. 
The search/concordancing function supports regular expressions.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "MIT", "Size": [], "Platform": ["Platform-independent (requires Python)"], diff --git a/tools/corpus-query-tools/qcat.json b/tools/corpus-query-tools/qcat.json index 102f325..71d146a 100644 --- a/tools/corpus-query-tools/qcat.json +++ b/tools/corpus-query-tools/qcat.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "The tools allows for manual linguistic annotation of corpora and advanced queries on top of these annotations.\nThe tool has been used in various annotation campaigns related to the ssj500k reference training corpus of Slovenian, such as named entities, dependency syntax, semantic roles and multi-word expressions, but it can also be used for adding new annotation layers of various types to this or other language corpora. Q-CAT is a .NET application, which runs on Windows operating system.\nThis resource is available for download from the CLARIN.SI repository.", "Functionality": ["Annotating/concordancing/querying/listening to audio recordings"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Apache License 2.0", "Size": [], "Platform": [".NET"], diff --git a/tools/corpus-query-tools/scattertext.json b/tools/corpus-query-tools/scattertext.json index d1cf1b8..efc2359 100644 --- a/tools/corpus-query-tools/scattertext.json +++ b/tools/corpus-query-tools/scattertext.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a tool for finding distinguishing terms in corpora and displaying them in an interactive HTML scatter plot. 
Points corresponding to terms are selectively labelled so that they don't overlap with other labels or points.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/shebanq.json b/tools/corpus-query-tools/shebanq.json index 994eaa1..83c6219 100644 --- a/tools/corpus-query-tools/shebanq.json +++ b/tools/corpus-query-tools/shebanq.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated online environment for querying the Hebrew Bible.", "Functionality": ["Querying/concordancing"] - "Languages": ["heb"], + "Language": ["heb"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/shinyconc.json b/tools/corpus-query-tools/shinyconc.json index 6b9246f..1b672b4 100644 --- a/tools/corpus-query-tools/shinyconc.json +++ b/tools/corpus-query-tools/shinyconc.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a framework for generating custom web-based concordancers. It requires R and Rstudio/Shiny.\nA detailed setup tutorial is available.", "Functionality": ["Concordancing/querying", "corpus building"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Windows", "MacOS", "Linux"], diff --git a/tools/corpus-query-tools/simple-concordancer.json b/tools/corpus-query-tools/simple-concordancer.json index 382bc94..5657056 100644 --- a/tools/corpus-query-tools/simple-concordancer.json +++ b/tools/corpus-query-tools/simple-concordancer.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool allows users to create word lists and search natural language text files for words, phrases, and patterns. The tool is a concordance and word listing program that is able to read texts written in many languages. 
There are built-in alphabets for English, French, German, Polish, Greek and Russian. The tool contains an alphabet editor which you can use to create alphabets for any other language.\nA help document is available.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/simple-corpus-tool.json b/tools/corpus-query-tools/simple-corpus-tool.json index e729ac5..a1156e5 100644 --- a/tools/corpus-query-tools/simple-corpus-tool.json +++ b/tools/corpus-query-tools/simple-corpus-tool.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a combination of an annotation and analysis tool for use with either simple XML files or basic plain-text files.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/skell.json b/tools/corpus-query-tools/skell.json index ca8d8b1..e410e9e 100644 --- a/tools/corpus-query-tools/skell.json +++ b/tools/corpus-query-tools/skell.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a simple tool for students and teachers of English to easily check whether or how a particular phrase or a word is used by real speakers of English.", "Functionality": ["Querying/concordancing"] - "Languages": ["eng", "rus", "deu", "ita", "ces", "est"], + "Language": ["eng", "rus", "deu", "ita", "ces", "est"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/sketchengine.json b/tools/corpus-query-tools/sketchengine.json index d1d48ad..710f419 100644 --- a/tools/corpus-query-tools/sketchengine.json +++ b/tools/corpus-query-tools/sketchengine.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "Sketch Engine is a 
commercial online corpus analysis application, used by linguists, lexicographers, translators, students and teachers. Sketch Engine contains 600 ready-to-use corpora in 90+ languages.\nIt is possible to upload one's own corpus with this tool. Registration is required and Shibboleth log-in is supported. Support is offered via email.", "Functionality": ["Querying/concordancing", "corpus upload and processing"] - "Languages": ["Multiple"], + "Language": ["Multiple"], "Licence": "Proprietary", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/spaadia.json b/tools/corpus-query-tools/spaadia.json index f9e0cfb..269a25e 100644 --- a/tools/corpus-query-tools/spaadia.json +++ b/tools/corpus-query-tools/spaadia.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "The SPAADIA concordancer (32bit Windows version): a concordancer (mainly) for use with the SPAADIA corpus.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Windows", "MacOS"], diff --git a/tools/corpus-query-tools/teitok.json b/tools/corpus-query-tools/teitok.json index 4e2bdd2..c313b47 100644 --- a/tools/corpus-query-tools/teitok.json +++ b/tools/corpus-query-tools/teitok.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a web-based system for viewing, creating, and editing corpora with both rich textual mark-up and linguistic annotation. For visitors, the system provides a graphical user interface in which the annotated document can be visualized in a number of different ways. And for administrators of the corpus, TEITOK uses the same interface to allow easy editing of the underlying XML document, meaning administrators can correct their corpus while they are consulting it.\nRegistration is required and Shibboleth log-in is supported. 
User documentation is available.", "Functionality": ["Querying/concordancing, corpus upload and processing"] - "Languages": ["Multiple"], + "Language": ["Multiple"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/textable.json b/tools/corpus-query-tools/textable.json index ba2cd0d..de4f9be 100644 --- a/tools/corpus-query-tools/textable.json +++ b/tools/corpus-query-tools/textable.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a free open source software application to analyze and process texts visually.\nSupport is available.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/textal.json b/tools/corpus-query-tools/textal.json index a49159e..75d7613 100644 --- a/tools/corpus-query-tools/textal.json +++ b/tools/corpus-query-tools/textal.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a free smartphone app that allows users to analyze websites, tweet streams, and documents, as you explore the relationships between words in the text via an intuitive word cloud interface. It can generate graphs and statics, and share the data and visualizations. ", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["iPhone app"], diff --git a/tools/corpus-query-tools/textstat.json b/tools/corpus-query-tools/textstat.json index dbab0ea..6b2c84a 100644 --- a/tools/corpus-query-tools/textstat.json +++ b/tools/corpus-query-tools/textstat.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a simple programme for the analysis of texts. 
It reads plain text files (in different encodings) and HTML files (directly from the internet) and it produces word frequency lists and concordances from these files. This version includes a web-spider which reads as many pages as the researcher wants from a particular website and puts them in a TextSTAT-corpus. The new news-reader, too, puts news messages in a TextSTAT-readable corpus file.\nA quickstart guide, a user guide and video tutorial are available online.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Versions for Windows and platform-independent Python version"], diff --git a/tools/corpus-query-tools/tsakorpus.json b/tools/corpus-query-tools/tsakorpus.json index 68c59f1..43f452b 100644 --- a/tools/corpus-query-tools/tsakorpus.json +++ b/tools/corpus-query-tools/tsakorpus.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a platform for publishing corpora online.\nThe platform is free and open source, and amongst others supports annotated and parallel corpora, as well as the use of regular expressions.", "Functionality": ["Querying/concordancing"], - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "MIT License", "Size": [], "Platform": ["Ubuntu", "Windows"], diff --git a/tools/corpus-query-tools/txm-online.json b/tools/corpus-query-tools/txm-online.json index 3bc2d04..a1fc49b 100644 --- a/tools/corpus-query-tools/txm-online.json +++ b/tools/corpus-query-tools/txm-online.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool corresponds to a number of different TXM portals running at various sites and with a number of different corpora. TXM offers online analysis tools for querying language corpora. 
The interface is in French.", "Functionality": ["Querying/concordancing"] - "Languages": ["fra", "eng"], + "Language": ["fra", "eng"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/txm.json b/tools/corpus-query-tools/txm.json index 84c2e3f..4f0f746 100644 --- a/tools/corpus-query-tools/txm.json +++ b/tools/corpus-query-tools/txm.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool employs lexicometry (see Scholz 2019) and text statistical analysis. It offers tools and methods tested in multiple branches of the humanities and is statistically well founded.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL2", "Size": [], "Platform": ["Linux", "MacOS", "Windows"], diff --git a/tools/corpus-query-tools/voyant-tools-dk.json b/tools/corpus-query-tools/voyant-tools-dk.json index 600bbd5..fae8f74 100644 --- a/tools/corpus-query-tools/voyant-tools-dk.json +++ b/tools/corpus-query-tools/voyant-tools-dk.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool constitutes a deployment of Voyant Tools at CLARIN-DK.", "Functionality": ["Querying/concordancing", "Stylometry"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/voyant-tools-salidar.json b/tools/corpus-query-tools/voyant-tools-salidar.json index 11e5d57..1570dd0 100644 --- a/tools/corpus-query-tools/voyant-tools-salidar.json +++ b/tools/corpus-query-tools/voyant-tools-salidar.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool constitutes a deployment of Voyant Tools used at SADILAR.", "Functionality": ["Querying/concordancing", "Stylometry"] - "Languages": ["ara", "bos", "hrv", "ces", "eng", "fra", "deu", "heb", "ita", "jpn", "por", "srp", "spa"], + "Language": ["ara", "bos", "hrv", "ces", "eng", "fra", 
"deu", "heb", "ita", "jpn", "por", "srp", "spa"], "Licence": "GPL3 (code)", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/voyant-tools.json b/tools/corpus-query-tools/voyant-tools.json index d6e6368..6cce162 100644 --- a/tools/corpus-query-tools/voyant-tools.json +++ b/tools/corpus-query-tools/voyant-tools.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a web-based text reading and analysis environment. It is a scholarly project that is designed to facilitate reading and interpretive practices for digital humanities students and scholars as well as for the general public.\nIt is possible to upload one's own corpus with this tool.\nThe interface is available in a number of languages. An online user guide is available. ", "Functionality": ["Querying/concordancing", "Stylometry"] - "Languages": ["ara", "bos", "hrv", "ces", "eng", "fra", "deu", "heb", "ita", "jpn", "por", "rus", "srp", "spa"], + "Language": ["ara", "bos", "hrv", "ces", "eng", "fra", "deu", "heb", "ita", "jpn", "por", "rus", "srp", "spa"], "Licence": "GPL3 (code)", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/webclark.json b/tools/corpus-query-tools/webclark.json index 97ad3bf..cbdde09 100644 --- a/tools/corpus-query-tools/webclark.json +++ b/tools/corpus-query-tools/webclark.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated concordancer for the Bulgarian National Reference Corpus.", "Functionality": ["Querying/concordancing"] - "Languages": ["bul"], + "Language": ["bul"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/webcorp-learn.json b/tools/corpus-query-tools/webcorp-learn.json index c3d9a90..f97e561 100644 --- a/tools/corpus-query-tools/webcorp-learn.json +++ b/tools/corpus-query-tools/webcorp-learn.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool gives researchers access to a large collection (corpus) of newspaper articles 
spanning three decades. The tool has been created by linguists to encourage curiosity in language learners. WebCorp Learn promotes playful and context-based inductive learning and enables you to discover language through exploratory experimentation.\nRegistration is required.", "Functionality": ["Querying/concordancing"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/webcorp-lse.json b/tools/corpus-query-tools/webcorp-lse.json index e4388f6..0a8a7b8 100644 --- a/tools/corpus-query-tools/webcorp-lse.json +++ b/tools/corpus-query-tools/webcorp-lse.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated tool for the study of language on the web. The corpora were built by crawling the web and extracting textual content from web pages. Searches can be performed to find words, lemmas or phrases, including pattern matching, wildcards and part-of-speech. Results are given as concordance lines in KWIC format. 
Post-search analyses are possible including time series, collocation tables, sorting and summaries of meta-data from the matched web pages.\nIt is possible to upload one's own corpus with this tool.\nRegistration is required.", "Functionality": ["Querying/concordancing"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/webcorp.json b/tools/corpus-query-tools/webcorp.json index 8ea5204..1dd508f 100644 --- a/tools/corpus-query-tools/webcorp.json +++ b/tools/corpus-query-tools/webcorp.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a dedicated concordancing tool.", "Functionality": ["Querying/concordancing"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/wmatrix.json b/tools/corpus-query-tools/wmatrix.json index ab03b36..35720f5 100644 --- a/tools/corpus-query-tools/wmatrix.json +++ b/tools/corpus-query-tools/wmatrix.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool provides a web interface to the English USAS and CLAWS corpus annotation tools, and standard corpus linguistic methodologies such as frequency lists and concordances. It also extends the keywords method to key grammatical categories and key semantic domains.\nIt is possible to upload one's own corpus with this tool. The tool is free for UK government and academic researchers in countries on the OECD DAC list, £50 per username per year for non commercial research and teaching. 
Technical support is offered here.", "Functionality": ["Querying/concordancing", "corpus upload and processing"] - "Languages": ["eng", "spa"], + "Language": ["eng", "spa"], "Licence": "", "Size": [], "Platform": [], diff --git a/tools/corpus-query-tools/word-cruncher.json b/tools/corpus-query-tools/word-cruncher.json index 3bb6e7c..ca27152 100644 --- a/tools/corpus-query-tools/word-cruncher.json +++ b/tools/corpus-query-tools/word-cruncher.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool offers a wide variety of tools for searching, studying, and analyzing texts.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "", "Size": [], "Platform": ["Windows", "iOS"], diff --git a/tools/corpus-query-tools/wordless.json b/tools/corpus-query-tools/wordless.json index 7726a33..5dbedbe 100644 --- a/tools/corpus-query-tools/wordless.json +++ b/tools/corpus-query-tools/wordless.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is an integrated corpus tool with multilingual support for the study of language, literature, and translation.\nThe latest version (3.2.0) of Wordless supports Windows 7/8/8.1/10/11, macOS 10.11 or later, and Ubuntu 16.04 or later, all 64-bit only. 
Both Intel-based and M1-based Macs are supported.\nThe tool is available for download from GitHub.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Windows", "MacOS", "Linux"], diff --git a/tools/corpus-query-tools/wordsmith-tools.json b/tools/corpus-query-tools/wordsmith-tools.json index 639b40f..d639703 100644 --- a/tools/corpus-query-tools/wordsmith-tools.json +++ b/tools/corpus-query-tools/wordsmith-tools.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This tool is capable of finding word patterns, and has functionalities for concordance, collocation, word lists and keywords. It is a commercial tool.\nThere is a dedicated Google Group for this tool.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "Proprietary", "Size": [], "Platform": ["Windows"], diff --git a/tools/corpus-query-tools/wordstatix.json b/tools/corpus-query-tools/wordstatix.json index e4fbf87..87f3cc4 100644 --- a/tools/corpus-query-tools/wordstatix.json +++ b/tools/corpus-query-tools/wordstatix.json @@ -4,7 +4,7 @@ "Family": "Corpus query tools", "Description": "This is a simple concordancer.", "Functionality": ["Concordancing/querying"] - "Languages": ["Language independent"], + "Language": ["Language independent"], "Licence": "GPL3", "Size": [], "Platform": ["Linux", "Windows"],