From 47fa35ec66c8e3cb1001829b21d317a94ce156bd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:20:46 -0500 Subject: [PATCH] [GH Actions] automatic-add-publications-by-author (#442) Co-authored-by: xhluca --- .../papers/2011-04-01-10.1109-ITNG.2011.67.md | 22 +++++ ...24aa9a70db6cdd57aeed0b7b0e2575231c590cb.md | 22 +++++ .../2016-06-02-10.1142-S1793962316500057.md | 23 +++++ _posts/papers/2019-05-13-1905.05961.md | 24 +++++ _posts/papers/2019-07-22-1907.09177.md | 23 +++++ _posts/papers/2019-12-05-1912.02481.md | 24 +++++ ...a39fcaae3eee1277df9f1191b8bab1b11732c25.md | 23 +++++ _posts/papers/2020-03-18-2003.08272.md | 22 +++++ _posts/papers/2020-03-18-2003.08370.md | 24 +++++ _posts/papers/2020-03-23-2003.10564.md | 23 +++++ _posts/papers/2020-06-19-2006.10919.md | 23 +++++ ...6802054111e37b6d6b517fbe1cddf394cef76c7.md | 22 +++++ _posts/papers/2020-08-07-2008.03101.md | 22 +++++ _posts/papers/2020-08-31-2103.04818.md | 23 +++++ ...2020-09-08-10.1007-978-3-030-58323-1_30.md | 23 +++++ _posts/papers/2020-10-07-2010.03179.md | 24 +++++ .../2020-11-30-10.5281-ZENODO.4297448.md | 23 +++++ ...eb70691b9abebfff63709a6ce06bf6a9cc96a65.md | 24 +++++ _posts/papers/2021-03-15-2103.08647.md | 23 +++++ _posts/papers/2021-03-22-2103.11811.md | 34 +++++++ _posts/papers/2021-04-06-2104.02516.md | 25 +++++ _posts/papers/2021-09-19-2109.09133.md | 23 +++++ .../papers/2022-01-01-10.2139-ssrn.4136717.md | 24 +++++ .../2022-01-01-10.48550-arXiv.2204.06487.md | 22 +++++ _posts/papers/2022-01-01-2022.coling-1.377.md | 22 +++++ _posts/papers/2022-01-01-2022.wmt-1.72.md | 26 ++++++ ...ef570ac8db5c7e5192334f31675cc2fd7b6622a.md | 22 +++++ _posts/papers/2022-01-20-2201.08277.md | 25 +++++ _posts/papers/2022-03-16-2203.08850.md | 24 +++++ _posts/papers/2022-04-13-2204.06487.md | 23 +++++ _posts/papers/2022-04-20-2204.09371.md | 23 +++++ _posts/papers/2022-04-20-2204.09711.md | 22 +++++ _posts/papers/2022-04-22-2204.10931.md | 23 +++++ _posts/papers/2022-05-04-2205.02022.md | 32 +++++++ _posts/papers/2022-06-03-2206.01476.md | 23 +++++ _posts/papers/2022-06-15-2206.07841.md | 24 +++++ _posts/papers/2022-07-07-2207.03546.md | 27 ++++++ _posts/papers/2022-10-22-2210.12391.md | 31 +++++++ _posts/papers/2022-11-09-2211.05100.md | 93 +++++++++++++++++++ _posts/papers/2022-12-19-2212.09535.md | 25 +++++ ...23-01-01-10.18653-v1-2023.emnlp-main.11.md | 23 +++++ .../2023-01-01-10.18653-v1-2023.mrl-1.24.md | 27 ++++++ .../2023-01-01-10.48550-arXiv.2311.09828.md | 34 +++++++ ...94e3de4e59812f824d07f785c1a982cb09bb987.md | 28 ++++++ ...e5aaa09f2c9a08e9343754c81a2310ba2d49ec3.md | 23 +++++ _posts/papers/2023-02-17-2302.08956.md | 27 ++++++ _posts/papers/2023-03-07-2303.03915.md | 32 +++++++ _posts/papers/2023-03-31-2303.17972.md | 23 +++++ _posts/papers/2023-04-08-2304.03952.md | 24 +++++ _posts/papers/2023-04-13-2304.06845.md | 23 +++++ _posts/papers/2023-04-19-2304.09972.md | 35 +++++++ _posts/papers/2023-05-11-2305.06897.md | 32 +++++++ _posts/papers/2023-05-18-2305.10971.md | 23 +++++ _posts/papers/2023-05-19-2305.11938.md | 26 ++++++ _posts/papers/2023-05-23-2305.13989.md | 30 ++++++ _posts/papers/2023-07-03-2307.01163.md | 23 +++++ _posts/papers/2023-07-29-2307.16071.md | 23 +++++ ...2023-08-01-10.1016-j.patter.2023.100820.md | 25 +++++ _posts/papers/2023-08-18-2308.09768.md | 22 +++++ _posts/papers/2023-09-14-2309.07445.md | 24 +++++ _posts/papers/2023-11-14-2311.07978.md | 22 +++++ _posts/papers/2023-11-16-2311.09828.md | 34 +++++++ 
_posts/papers/2024-04-03-2404.02534.md | 23 +++++ _posts/papers/2024-04-28-2404.18180.md | 24 +++++ _posts/papers/2024-04-28-2404.18286.md | 23 +++++ _posts/papers/2024-04-30-2404.19442.md | 23 +++++ .../2024-06-05-10.1038-d41586-024-00964-2.md | 22 +++++ _posts/papers/2024-06-05-2406.03368.md | 28 ++++++ _posts/papers/2024-06-27-2406.19564.md | 23 +++++ _posts/papers/2024-07-14-2407.10152.md | 27 ++++++ _posts/papers/2024-07-23-2407.16470.md | 24 +++++ _posts/papers/2024-12-01-2412.00948.md | 26 ++++++ records/semantic_paper_ids_ignored.json | 72 ++++++++++++++ 73 files changed, 1928 insertions(+) create mode 100644 _posts/papers/2011-04-01-10.1109-ITNG.2011.67.md create mode 100644 _posts/papers/2014-12-07-524aa9a70db6cdd57aeed0b7b0e2575231c590cb.md create mode 100644 _posts/papers/2016-06-02-10.1142-S1793962316500057.md create mode 100644 _posts/papers/2019-05-13-1905.05961.md create mode 100644 _posts/papers/2019-07-22-1907.09177.md create mode 100644 _posts/papers/2019-12-05-1912.02481.md create mode 100644 _posts/papers/2020-01-01-6a39fcaae3eee1277df9f1191b8bab1b11732c25.md create mode 100644 _posts/papers/2020-03-18-2003.08272.md create mode 100644 _posts/papers/2020-03-18-2003.08370.md create mode 100644 _posts/papers/2020-03-23-2003.10564.md create mode 100644 _posts/papers/2020-06-19-2006.10919.md create mode 100644 _posts/papers/2020-06-19-d6802054111e37b6d6b517fbe1cddf394cef76c7.md create mode 100644 _posts/papers/2020-08-07-2008.03101.md create mode 100644 _posts/papers/2020-08-31-2103.04818.md create mode 100644 _posts/papers/2020-09-08-10.1007-978-3-030-58323-1_30.md create mode 100644 _posts/papers/2020-10-07-2010.03179.md create mode 100644 _posts/papers/2020-11-30-10.5281-ZENODO.4297448.md create mode 100644 _posts/papers/2021-01-01-beb70691b9abebfff63709a6ce06bf6a9cc96a65.md create mode 100644 _posts/papers/2021-03-15-2103.08647.md create mode 100644 _posts/papers/2021-03-22-2103.11811.md create mode 100644 _posts/papers/2021-04-06-2104.02516.md create mode 100644 _posts/papers/2021-09-19-2109.09133.md create mode 100644 _posts/papers/2022-01-01-10.2139-ssrn.4136717.md create mode 100644 _posts/papers/2022-01-01-10.48550-arXiv.2204.06487.md create mode 100644 _posts/papers/2022-01-01-2022.coling-1.377.md create mode 100644 _posts/papers/2022-01-01-2022.wmt-1.72.md create mode 100644 _posts/papers/2022-01-01-2ef570ac8db5c7e5192334f31675cc2fd7b6622a.md create mode 100644 _posts/papers/2022-01-20-2201.08277.md create mode 100644 _posts/papers/2022-03-16-2203.08850.md create mode 100644 _posts/papers/2022-04-13-2204.06487.md create mode 100644 _posts/papers/2022-04-20-2204.09371.md create mode 100644 _posts/papers/2022-04-20-2204.09711.md create mode 100644 _posts/papers/2022-04-22-2204.10931.md create mode 100644 _posts/papers/2022-05-04-2205.02022.md create mode 100644 _posts/papers/2022-06-03-2206.01476.md create mode 100644 _posts/papers/2022-06-15-2206.07841.md create mode 100644 _posts/papers/2022-07-07-2207.03546.md create mode 100644 _posts/papers/2022-10-22-2210.12391.md create mode 100644 _posts/papers/2022-11-09-2211.05100.md create mode 100644 _posts/papers/2022-12-19-2212.09535.md create mode 100644 _posts/papers/2023-01-01-10.18653-v1-2023.emnlp-main.11.md create mode 100644 _posts/papers/2023-01-01-10.18653-v1-2023.mrl-1.24.md create mode 100644 _posts/papers/2023-01-01-10.48550-arXiv.2311.09828.md create mode 100644 _posts/papers/2023-01-01-794e3de4e59812f824d07f785c1a982cb09bb987.md create mode 100644 
_posts/papers/2023-01-01-8e5aaa09f2c9a08e9343754c81a2310ba2d49ec3.md create mode 100644 _posts/papers/2023-02-17-2302.08956.md create mode 100644 _posts/papers/2023-03-07-2303.03915.md create mode 100644 _posts/papers/2023-03-31-2303.17972.md create mode 100644 _posts/papers/2023-04-08-2304.03952.md create mode 100644 _posts/papers/2023-04-13-2304.06845.md create mode 100644 _posts/papers/2023-04-19-2304.09972.md create mode 100644 _posts/papers/2023-05-11-2305.06897.md create mode 100644 _posts/papers/2023-05-18-2305.10971.md create mode 100644 _posts/papers/2023-05-19-2305.11938.md create mode 100644 _posts/papers/2023-05-23-2305.13989.md create mode 100644 _posts/papers/2023-07-03-2307.01163.md create mode 100644 _posts/papers/2023-07-29-2307.16071.md create mode 100644 _posts/papers/2023-08-01-10.1016-j.patter.2023.100820.md create mode 100644 _posts/papers/2023-08-18-2308.09768.md create mode 100644 _posts/papers/2023-09-14-2309.07445.md create mode 100644 _posts/papers/2023-11-14-2311.07978.md create mode 100644 _posts/papers/2023-11-16-2311.09828.md create mode 100644 _posts/papers/2024-04-03-2404.02534.md create mode 100644 _posts/papers/2024-04-28-2404.18180.md create mode 100644 _posts/papers/2024-04-28-2404.18286.md create mode 100644 _posts/papers/2024-04-30-2404.19442.md create mode 100644 _posts/papers/2024-06-05-10.1038-d41586-024-00964-2.md create mode 100644 _posts/papers/2024-06-05-2406.03368.md create mode 100644 _posts/papers/2024-06-27-2406.19564.md create mode 100644 _posts/papers/2024-07-14-2407.10152.md create mode 100644 _posts/papers/2024-07-23-2407.16470.md create mode 100644 _posts/papers/2024-12-01-2412.00948.md diff --git a/_posts/papers/2011-04-01-10.1109-ITNG.2011.67.md b/_posts/papers/2011-04-01-10.1109-ITNG.2011.67.md new file mode 100644 index 00000000..cae10c4d --- /dev/null +++ b/_posts/papers/2011-04-01-10.1109-ITNG.2011.67.md @@ -0,0 +1,22 @@ +--- +title: A Secure e-Voting Architecture +venue: '2011 Eighth International Conference on Information Technology: New Generations' +names: A. Sodiya, S. Onashoga, David Ifeoluwa Adelani +tags: +- '2011 Eighth International Conference on Information Technology: New Generations' +link: https://doi.org/10.1109/ITNG.2011.67 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +The constant development in computer technology now gives rise to an efficient way of using computer or electronic medium of voting. However, it is being faced with the problem of non-anonymity, coercion and bribery. In this paper, elliptic curve is combined with ElGamal cryptosystem to enhance the security of e-voting architecture. Several points from (x, y) coordinates from elliptic curve are used instead of using a large integer along with ElGamal encryption that is based on probabilistic encryption (produces several cipher texts) which is used to ensure anonymity, non-coercion and receipt-freeness. A voter can also revote to find an appropriate answer to coercion at another location. With the proposed architecture, e-voting system should be fair. 
\ No newline at end of file diff --git a/_posts/papers/2014-12-07-524aa9a70db6cdd57aeed0b7b0e2575231c590cb.md b/_posts/papers/2014-12-07-524aa9a70db6cdd57aeed0b7b0e2575231c590cb.md new file mode 100644 index 00000000..a9afb478 --- /dev/null +++ b/_posts/papers/2014-12-07-524aa9a70db6cdd57aeed0b7b0e2575231c590cb.md @@ -0,0 +1,22 @@ +--- +title: A Devs-Based Ann Training and Prediction Platform +venue: '' +names: David Ifeoluwa Adelani +tags: +- '' +link: https://www.semanticscholar.org/paper/524aa9a70db6cdd57aeed0b7b0e2575231c590cb +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2016-06-02-10.1142-S1793962316500057.md b/_posts/papers/2016-06-02-10.1142-S1793962316500057.md new file mode 100644 index 00000000..d88bc948 --- /dev/null +++ b/_posts/papers/2016-06-02-10.1142-S1793962316500057.md @@ -0,0 +1,23 @@ +--- +title: Enhancing the reusability and interoperability of artificial neural networks + with DEVS modeling and simulation +venue: Advances in Complex Systems +names: David Ifeoluwa Adelani, M. Traoré +tags: +- Advances in Complex Systems +link: https://doi.org/10.1142/S1793962316500057 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Artificial neural networks (ANNs), a branch of artificial intelligence, has become a very interesting domain since the eighties when back-propagation (BP) learning algorithm for multilayer feed-forward architecture was introduced to solve nonlinear problems. It is used extensively to solve complex nonalgorithmic problems such as prediction, pattern recognition and clustering. However, in the context of a holistic study, there may be a need to integrate ANN with other models developed in various paradigms to solve a problem. In this paper, we suggest discrete event system specification (DEVS) be used as a model of computation (MoC) to make ANN models interoperable with other models (since all discrete event models can be expressed in DEVS, and continuous models can be approximated by DEVS). By combining ANN and DEVS, we can model the complex configuration of ANNs and express its internal workings. Therefore, we are extending the DEVS-based ANN proposed by Toma et al. [A new DEVS-based generic artficial neural network modeling approach, The 23rd European Modeling and Simulation Symp. (Simulation in Industry), Rome, Italy, 2011] for comparing multiple configuration parameters and learning algorithms and also to do prediction. The DEVS models are described using the high level language for system specification (HiLLS), [Maiga et al., A new approach to modeling dynamic structure systems, The 29th European Modeling and Simulation Symp. (Simulation in Industry), Leicester, United Kingdom, 2015] a graphical modeling language for clarity. The developed platform is a tool to transform ANN models into DEVS computational models, making them more reusable and more interoperable in the context of larger multi-perspective modeling and simulation (MAS). 
\ No newline at end of file diff --git a/_posts/papers/2019-05-13-1905.05961.md b/_posts/papers/2019-05-13-1905.05961.md new file mode 100644 index 00000000..34b15564 --- /dev/null +++ b/_posts/papers/2019-05-13-1905.05961.md @@ -0,0 +1,24 @@ +--- +title: Demographic Inference and Representative Population Estimates from Multilingual + Social Media Data +venue: The Web Conference +names: Zijian Wang, Scott A. Hale, David Ifeoluwa Adelani, Przemyslaw A. Grabowicz, + Timo Hartmann, Fabian Flöck, David Jurgens +tags: +- The Web Conference +link: https://arxiv.org/abs/1905.05961 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Social media provide access to behavioural data at an unprecedented scale and granularity. However, using these data to understand phenomena in a broader population is difficult due to their non-representativeness and the bias of statistical inference tools towards dominant languages and groups. While demographic attribute inference could be used to mitigate such bias, current techniques are almost entirely monolingual and fail to work in a global environment. We address these challenges by combining multilingual demographic inference with post-stratification to create a more representative population sample. To learn demographic attributes, we create a new multimodal deep neural architecture for joint classification of age, gender, and organization-status of social media users that operates in 32 languages. This method substantially outperforms current state of the art while also reducing algorithmic bias. To correct for sampling biases, we propose fully interpretable multilevel regression methods that estimate inclusion probabilities from inferred joint population counts and ground-truth population counts. In a large experiment over multilingual heterogeneous European regions, we show that our demographic inference and bias correction together allow for more accurate estimates of populations and make a significant step towards representative social sensing in downstream applications with multilingual social media. \ No newline at end of file diff --git a/_posts/papers/2019-07-22-1907.09177.md b/_posts/papers/2019-07-22-1907.09177.md new file mode 100644 index 00000000..3dbef695 --- /dev/null +++ b/_posts/papers/2019-07-22-1907.09177.md @@ -0,0 +1,23 @@ +--- +title: Generating Sentiment-Preserving Fake Online Reviews Using Neural Language Models + and Their Human- and Machine-based Detection +venue: International Conference on Advanced Information Networking and Applications +names: David Ifeoluwa Adelani, H. Mai, Fuming Fang, H. Nguyen, J. Yamagishi, I. Echizen +tags: +- International Conference on Advanced Information Networking and Applications +link: https://arxiv.org/abs/1907.09177 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2019-12-05-1912.02481.md b/_posts/papers/2019-12-05-1912.02481.md new file mode 100644 index 00000000..419b8830 --- /dev/null +++ b/_posts/papers/2019-12-05-1912.02481.md @@ -0,0 +1,24 @@ +--- +title: 'Massive vs. 
Curated Embeddings for Low-Resourced Languages: the Case of Yorùbá + and Twi' +venue: International Conference on Language Resources and Evaluation +names: Jesujoba Oluwadara Alabi, Kwabena Amponsah-Kaakyire, David Ifeoluwa Adelani, + C. España-Bonet +tags: +- International Conference on Language Resources and Evaluation +link: https://arxiv.org/abs/1912.02481 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +The success of several architectures to learn semantic representations from unannotated text and the availability of these kinds of texts in online multilingual resources such as Wikipedia has facilitated the massive and automatic creation of resources for multiple languages. The evaluation of such resources is usually done for the high-resourced languages, where one has a smorgasbord of tasks and test sets to evaluate on. For low-resourced languages, the evaluation is more difficult and normally ignored, with the hope that the impressive capability of deep learning architectures to learn (multilingual) representations in the high-resourced setting holds in the low-resourced setting too. In this paper, we focus on two African languages, Yorùbá and Twi, and compare the word embeddings obtained in this way, with word embeddings obtained from curated corpora and language-dependent processing. We analyse the noise in the publicly available corpora, collect high quality and noisy data for the two languages and quantify the improvements that depend not only on the amount of data but on the quality too. We also use different architectures that learn word representations both from surface forms and characters to further exploit all the available information, which proved to be important for these languages. For the evaluation, we manually translate the wordsim-353 word pairs dataset from English into Yorùbá and Twi. We extend the analysis to contextual word embeddings and evaluate multilingual BERT on a named entity recognition task. For this, we annotate with named entities the Global Voices corpus for Yorùbá. As output of the work, we provide corpora, embeddings and the test suites for both languages. \ No newline at end of file diff --git a/_posts/papers/2020-01-01-6a39fcaae3eee1277df9f1191b8bab1b11732c25.md b/_posts/papers/2020-01-01-6a39fcaae3eee1277df9f1191b8bab1b11732c25.md new file mode 100644 index 00000000..238f46b4 --- /dev/null +++ b/_posts/papers/2020-01-01-6a39fcaae3eee1277df9f1191b8bab1b11732c25.md @@ -0,0 +1,23 @@ +--- +title: Improving Yorùbá Diacritic Restoration +venue: arXiv.org +names: Iroro Orife, David Ifeoluwa Adelani, Timi E. Fasubaa, Victor Williamson, W.
+ Oyewusi, Olamilekan Wahab, Kólá Túbosún +tags: +- arXiv.org +link: https://www.semanticscholar.org/paper/6a39fcaae3eee1277df9f1191b8bab1b11732c25 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2020-03-18-2003.08272.md b/_posts/papers/2020-03-18-2003.08272.md new file mode 100644 index 00000000..4eaa4778 --- /dev/null +++ b/_posts/papers/2020-03-18-2003.08272.md @@ -0,0 +1,22 @@ +--- +title: Unsupervised Pidgin Text Generation By Pivoting English Data and Self-Training +venue: arXiv.org +names: Ernie Chang, David Ifeoluwa Adelani, Xiaoyu Shen, Vera Demberg +tags: +- arXiv.org +link: https://arxiv.org/abs/2003.08272 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +West African Pidgin English is a language that is significantly spoken in West Africa, consisting of at least 75 million speakers. Nevertheless, proper machine translation systems and relevant NLP datasets for pidgin English are virtually absent. In this work, we develop techniques targeted at bridging the gap between Pidgin English and English in the context of natural language generation. %As a proof of concept, we explore the proposed techniques in the area of data-to-text generation. By building upon the previously released monolingual Pidgin English text and parallel English data-to-text corpus, we hope to build a system that can automatically generate Pidgin English descriptions from structured data. We first train a data-to-English text generation system, before employing techniques in unsupervised neural machine translation and self-training to establish the Pidgin-to-English cross-lingual alignment. The human evaluation performed on the generated Pidgin texts shows that, though still far from being practically usable, the pivoting + self-training technique improves both Pidgin text fluency and relevance. \ No newline at end of file diff --git a/_posts/papers/2020-03-18-2003.08370.md b/_posts/papers/2020-03-18-2003.08370.md new file mode 100644 index 00000000..15a98b9a --- /dev/null +++ b/_posts/papers/2020-03-18-2003.08370.md @@ -0,0 +1,24 @@ +--- +title: 'Distant Supervision and Noisy Label Learning for Low Resource Named Entity + Recognition: A Study on Hausa and Yorùbá' +venue: AfricaNLP +names: David Ifeoluwa Adelani, Michael A. Hedderich, D. Zhu, Esther van den Berg, + D. Klakow +tags: +- AfricaNLP +link: https://arxiv.org/abs/2003.08370 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +The lack of labeled training data has limited the development of natural language processing tools, such as named entity recognition, for many languages spoken in developing countries. Techniques such as distant and weak supervision can be used to create labeled data in a (semi-) automatic way. Additionally, to alleviate some of the negative effects of the errors in automatic annotation, noise-handling methods can be integrated. Pretrained word embeddings are another key component of most neural named entity classifiers. With the advent of more complex contextual word embeddings, an interesting trade-off between model size and performance arises. 
While these techniques have been shown to work well in high-resource settings, we want to study how they perform in low-resource scenarios. In this work, we perform named entity recognition for Hausa and Yorùbá, two languages that are widely spoken in several developing countries. We evaluate different embedding approaches and show that distant supervision can be successfully leveraged in a realistic low-resource scenario where it can more than double a classifier's performance. \ No newline at end of file diff --git a/_posts/papers/2020-03-23-2003.10564.md b/_posts/papers/2020-03-23-2003.10564.md new file mode 100644 index 00000000..b492901b --- /dev/null +++ b/_posts/papers/2020-03-23-2003.10564.md @@ -0,0 +1,23 @@ +--- +title: Improving Yorùbá Diacritic Restoration +venue: '' +names: Iroro Orife, David Ifeoluwa Adelani, Timi E. Fasubaa, Victor Williamson, W. + Oyewusi, Olamilekan Wahab, Kólá Túbosún +tags: +- '' +link: https://arxiv.org/abs/2003.10564 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Yoruba is a widely spoken West African language with a writing system rich in orthographic and tonal diacritics. They provide morphological information, are crucial for lexical disambiguation, pronunciation and are vital for any computational Speech or Natural Language Processing tasks. However, diacritic marks are commonly excluded from electronic texts due to limited device and application support as well as general education on proper usage. We report on recent efforts at dataset cultivation. By aggregating and improving disparate texts from the web and various personal libraries, we were able to significantly grow our clean Yoruba dataset from a majority Biblical text corpora with three sources to millions of tokens from over a dozen sources. We evaluate updated diacritic restoration models on a new, general purpose, public-domain Yoruba evaluation dataset of modern journalistic news text, selected to be multi-purpose and reflecting contemporary usage. All pre-trained models, datasets and source-code have been released as an open-source project to advance efforts on Yoruba language technology. \ No newline at end of file diff --git a/_posts/papers/2020-06-19-2006.10919.md b/_posts/papers/2020-06-19-2006.10919.md new file mode 100644 index 00000000..9144b07d --- /dev/null +++ b/_posts/papers/2020-06-19-2006.10919.md @@ -0,0 +1,23 @@ +--- +title: On the effect of normalization layers on Differentially Private training of + deep Neural networks +venue: '' +names: A. Davody, David Ifeoluwa Adelani, Thomas Kleinbauer, D. Klakow +tags: +- '' +link: https://arxiv.org/abs/2006.10919 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Differentially private stochastic gradient descent (DPSGD) is a variation of stochastic gradient descent based on the Differential Privacy (DP) paradigm, which can mitigate privacy threats that arise from the presence of sensitive information in training data. However, one major drawback of training deep neural networks with DPSGD is a reduction in the model's accuracy. In this paper, we study the effect of normalization layers on the performance of DPSGD.
We demonstrate that normalization layers significantly impact the utility of deep neural networks with noisy parameters and should be considered essential ingredients of training with DPSGD. In particular, we propose a novel method for integrating batch normalization with DPSGD without incurring an additional privacy loss. With our approach, we are able to train deeper networks and achieve a better utility-privacy trade-off. \ No newline at end of file diff --git a/_posts/papers/2020-06-19-d6802054111e37b6d6b517fbe1cddf394cef76c7.md b/_posts/papers/2020-06-19-d6802054111e37b6d6b517fbe1cddf394cef76c7.md new file mode 100644 index 00000000..20e41b0b --- /dev/null +++ b/_posts/papers/2020-06-19-d6802054111e37b6d6b517fbe1cddf394cef76c7.md @@ -0,0 +1,22 @@ +--- +title: Robust Differentially Private Training of Deep Neural Networks +venue: arXiv.org +names: A. Davody, David Ifeoluwa Adelani, Thomas Kleinbauer, D. Klakow +tags: +- arXiv.org +link: https://www.semanticscholar.org/paper/d6802054111e37b6d6b517fbe1cddf394cef76c7 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Differentially private stochastic gradient descent (DPSGD) is a variation of stochastic gradient descent based on the Differential Privacy (DP) paradigm which can mitigate privacy threats arising from the presence of sensitive information in training data. One major drawback of training deep neural networks with DPSGD is a reduction in the model's accuracy. In this paper, we propose an alternative method for preserving data privacy based on introducing noise through learnable probability distributions, which leads to a significant improvement in the utility of the resulting private models. We also demonstrate that normalization layers have a large beneficial impact on the performance of deep neural networks with noisy parameters. In particular, we show that contrary to general belief, a large amount of random noise can be added to the weights of neural networks without harming the performance, once the networks are augmented with normalization layers. We hypothesize that this robustness is a consequence of the scale invariance property of normalization operators. Building on these observations, we propose a new algorithmic technique for training deep neural networks under very low privacy budgets by sampling weights from Gaussian distributions and utilizing batch or layer normalization techniques to prevent performance degradation. Our method outperforms previous approaches, including DPSGD, by a substantial margin on a comprehensive set of experiments on Computer Vision and Natural Language Processing tasks. In particular, we obtain a 20 percent accuracy improvement over DPSGD on the MNIST and CIFAR10 datasets with DP-privacy budgets of $\varepsilon = 0.05$ and $\varepsilon = 2.0$, respectively. Our code is available online: this https URL. \ No newline at end of file diff --git a/_posts/papers/2020-08-07-2008.03101.md b/_posts/papers/2020-08-07-2008.03101.md new file mode 100644 index 00000000..e0be0cb1 --- /dev/null +++ b/_posts/papers/2020-08-07-2008.03101.md @@ -0,0 +1,22 @@ +--- +title: Privacy Guarantees for De-identifying Text Transformations +venue: Interspeech +names: David Ifeoluwa Adelani, A. Davody, Thomas Kleinbauer, D. 
Klakow +tags: +- Interspeech +link: https://arxiv.org/abs/2008.03101 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Machine Learning approaches to Natural Language Processing tasks benefit from a comprehensive collection of real-life user data. At the same time, there is a clear need for protecting the privacy of the users whose data is collected and processed. For text collections, such as, e.g., transcripts of voice interactions or patient records, replacing sensitive parts with benign alternatives can provide de-identification. However, how much privacy is actually guaranteed by such text transformations, and are the resulting texts still useful for machine learning? In this paper, we derive formal privacy guarantees for general text transformation-based de-identification methods on the basis of Differential Privacy. We also measure the effect that different ways of masking private information in dialog transcripts have on a subsequent machine learning task. To this end, we formulate different masking strategies and compare their privacy-utility trade-offs. In particular, we compare a simple redact approach with more sophisticated word-by-word replacement using deep learning models on multiple natural language understanding tasks like named entity recognition, intent detection, and dialog act classification. We find that only word-by-word replacement is robust against performance drops in various tasks. \ No newline at end of file diff --git a/_posts/papers/2020-08-31-2103.04818.md b/_posts/papers/2020-08-31-2103.04818.md new file mode 100644 index 00000000..7a541cab --- /dev/null +++ b/_posts/papers/2020-08-31-2103.04818.md @@ -0,0 +1,23 @@ +--- +title: Estimating community feedback effect on topic choice in social media with predictive + modeling +venue: EPJ Data Science +names: David Ifeoluwa Adelani, Ryota Kobayashi, Ingmar Weber, Przemyslaw A. Grabowicz +tags: +- EPJ Data Science +link: https://arxiv.org/abs/2103.04818 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2020-09-08-10.1007-978-3-030-58323-1_30.md b/_posts/papers/2020-09-08-10.1007-978-3-030-58323-1_30.md new file mode 100644 index 00000000..a87d39c9 --- /dev/null +++ b/_posts/papers/2020-09-08-10.1007-978-3-030-58323-1_30.md @@ -0,0 +1,23 @@ +--- +title: Investigating the Impact of Pre-trained Word Embeddings on Memorization in + Neural Networks +venue: Workshop on Time-Delay Systems +names: A. Thomas, David Ifeoluwa Adelani, A. Davody, Aditya Mogadala, D.
Klakow +tags: +- Workshop on Time-Delay Systems +link: https://doi.org/10.1007/978-3-030-58323-1_30 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2020-10-07-2010.03179.md b/_posts/papers/2020-10-07-2010.03179.md new file mode 100644 index 00000000..3e9cac1d --- /dev/null +++ b/_posts/papers/2020-10-07-2010.03179.md @@ -0,0 +1,24 @@ +--- +title: 'Transfer Learning and Distant Supervision for Multilingual Transformer Models: + A Study on African Languages' +venue: Conference on Empirical Methods in Natural Language Processing +names: Michael A. Hedderich, David Ifeoluwa Adelani, D. Zhu, Jesujoba Oluwadara Alabi, + Udia Markus, D. Klakow +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2010.03179 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Multilingual transformer models like mBERT and XLM-RoBERTa have obtained great improvements for many NLP tasks on a variety of languages. However, recent works also showed that results from high-resource languages could not be easily transferred to realistic, low-resource scenarios. In this work, we study trends in performance for different amounts of available resources for the three African languages Hausa, isiXhosa and Yoruba on both NER and topic classification. We show that in combination with transfer learning or distant supervision, these models can achieve with as little as 10 or 100 labeled sentences the same performance as baselines with much more supervised training data. However, we also find settings where this does not hold. Our discussions and additional experiments on assumptions such as time and hardware restrictions highlight challenges and opportunities in low-resource learning. \ No newline at end of file diff --git a/_posts/papers/2020-11-30-10.5281-ZENODO.4297448.md b/_posts/papers/2020-11-30-10.5281-ZENODO.4297448.md new file mode 100644 index 00000000..59365c03 --- /dev/null +++ b/_posts/papers/2020-11-30-10.5281-ZENODO.4297448.md @@ -0,0 +1,23 @@ +--- +title: 'MENYO-20k: A Multi-domain English - Yorùbá Corpus for Machine Translation' +venue: '' +names: David Ifeoluwa Adelani, Jesujoba Oluwadara Alabi, Damilola Adebonojo, Adesina + Ayeni, Mofetoluwa Adeyemi, Ayodele Awokoya +tags: +- '' +link: https://doi.org/10.5281/ZENODO.4297448 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2021-01-01-beb70691b9abebfff63709a6ce06bf6a9cc96a65.md b/_posts/papers/2021-01-01-beb70691b9abebfff63709a6ce06bf6a9cc96a65.md new file mode 100644 index 00000000..85a7dcfb --- /dev/null +++ b/_posts/papers/2021-01-01-beb70691b9abebfff63709a6ce06bf6a9cc96a65.md @@ -0,0 +1,24 @@ +--- +title: 'MENYO-20k: A Multi-domain English-Yorùbá Corpus for Machine Translation and + Domain Adaptation' +venue: AfricaNLP +names: David Ifeoluwa Adelani, Dana Ruiter, Jesujoba Oluwadara Alabi, Damilola Adebonojo, + Adesina Ayeni, Mofetoluwa Adeyemi, Ayodele Awokoya, C. 
España-Bonet +tags: +- AfricaNLP +link: https://www.semanticscholar.org/paper/beb70691b9abebfff63709a6ce06bf6a9cc96a65 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Massively multilingual machine translation (MT) has shown impressive capabilities, including zero and few-shot translation between low-resource language pairs. However, these models are often evaluated on high-resource languages with the assumption that they generalize to low-resource ones. The difficulty of evaluating MT models on low-resource pairs is often due to the lack of standardized evaluation datasets. In this paper, we present MENYO-20k, the first multi-domain parallel corpus for the low-resource Yorùbá–English (yo–en) language pair with standardized train-test splits for benchmarking. We provide several neural MT (NMT) benchmarks on this dataset and compare to the performance of popular pre-trained (massively multilingual) MT models, showing that, in almost all cases, our simple benchmarks outperform the pre-trained MT models. A major gain of BLEU +9.9 and +8.6 (en2yo) is achieved in comparison to Facebook’s M2M-100 and Google multilingual NMT respectively when we use MENYO-20k to fine-tune generic models. \ No newline at end of file diff --git a/_posts/papers/2021-03-15-2103.08647.md b/_posts/papers/2021-03-15-2103.08647.md new file mode 100644 index 00000000..5239ca97 --- /dev/null +++ b/_posts/papers/2021-03-15-2103.08647.md @@ -0,0 +1,23 @@ +--- +title: The Effect of Domain and Diacritics in Yoruba–English Neural Machine Translation +venue: Machine Translation Summit +names: David Ifeoluwa Adelani, Dana Ruiter, Jesujoba Oluwadara Alabi, Damilola Adebonojo, + Adesina Ayeni, Mofetoluwa Adeyemi, Ayodele Awokoya, C. España-Bonet +tags: +- Machine Translation Summit +link: https://arxiv.org/abs/2103.08647 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Massively multilingual machine translation (MT) has shown impressive capabilities, including zero and few-shot translation between low-resource language pairs. However, these models are often evaluated on high-resource languages with the assumption that they generalize to low-resource ones. The difficulty of evaluating MT models on low-resource pairs is often due to the lack of standardized evaluation datasets. In this paper, we present MENYO-20k, the first multi-domain parallel corpus with an especially curated orthography for Yoruba–English with standardized train-test splits for benchmarking. We provide several neural MT benchmarks and compare them to the performance of popular pre-trained (massively multilingual) MT models both for the heterogeneous test set and its subdomains. Since these pre-trained models use huge amounts of data with uncertain quality, we also analyze the effect of diacritics, a major characteristic of Yoruba, in the training data. We investigate how and when this training condition affects the final quality of a translation and its understandability. Our models outperform massively multilingual models such as Google (+8.7 BLEU) and Facebook M2M (+9.1) when translating to Yoruba, setting a high-quality benchmark for future research.
\ No newline at end of file diff --git a/_posts/papers/2021-03-22-2103.11811.md b/_posts/papers/2021-03-22-2103.11811.md new file mode 100644 index 00000000..3c19f104 --- /dev/null +++ b/_posts/papers/2021-03-22-2103.11811.md @@ -0,0 +1,34 @@ +--- +title: 'MasakhaNER: Named Entity Recognition for African Languages' +venue: Transactions of the Association for Computational Linguistics +names: David Ifeoluwa Adelani, Jade Z. Abbott, Graham Neubig, Daniel D'souza, Julia + Kreutzer, Constantine Lignos, Chester Palen-Michel, Happy Buzaaba, Shruti Rijhwani, + Sebastian Ruder, Stephen Mayhew, Israel Abebe Azime, Shamsuddeen Hassan Muhammad, + Chris C. Emezue, J. Nakatumba‐Nabende, Perez Ogayo, Anuoluwapo Aremu, Catherine + Gitau, Derguene Mbaye, Jesujoba Oluwadara Alabi, Seid Muhie Yimam, T. Gwadabe, I. + Ezeani, Andre Niyongabo Rubungo, Jonathan Mukiibi, V. Otiende, Iroro Orife, Davis + David, Samba Ngom, Tosin P. Adewumi, Paul Rayson, Mofetoluwa Adeyemi, Gerald Muriuki, + E. Anebi, C. Chukwuneke, N. Odu, Eric Peter Wairagala, S. Oyerinde, Clemencia Siro, + Tobius Saul Bateesa, Temilola Oloyede, Yvonne Wambui, Victor Akinode, Deborah Nabagereka, + Maurice Katusiime, Ayodele Awokoya, Mouhamadane Mboup, Dibora Gebreyohannes, Henok + Tilaye, Kelechi Nwaike, Degaga Wolde, A. Faye, Blessing K. Sibanda, Orevaoghene + Ahia, Bonaventure F. P. Dossou, Kelechi Ogueji, T. Diop, A. Diallo, Adewale Akinfaderin, + T. Marengereke, Salomey Osei +tags: +- Transactions of the Association for Computational Linguistics +link: https://arxiv.org/abs/2103.11811 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Abstract We take a step towards addressing the under- representation of the African continent in NLP research by bringing together different stakeholders to create the first large, publicly available, high-quality dataset for named entity recognition (NER) in ten African languages. We detail the characteristics of these languages to help researchers and practitioners better understand the challenges they pose for NER tasks. We analyze our datasets and conduct an extensive empirical evaluation of state- of-the-art methods across both supervised and transfer learning settings. Finally, we release the data, code, and models to inspire future research on African NLP.1 \ No newline at end of file diff --git a/_posts/papers/2021-04-06-2104.02516.md b/_posts/papers/2021-04-06-2104.02516.md new file mode 100644 index 00000000..57dcca38 --- /dev/null +++ b/_posts/papers/2021-04-06-2104.02516.md @@ -0,0 +1,25 @@ +--- +title: AI4D - African Language Program +venue: arXiv.org +names: Kathleen Siminyu, Godson Kalipe, D. Orlic, Jade Z. Abbott, Vukosi Marivate, + Sackey Freshia, Prateek Sibal, B. Neupane, David Ifeoluwa Adelani, Amelia Taylor, + Jamiil Toure Ali, Kevin Degila, Momboladji Balogoun, T. Diop, Davis David, Chayma + Fourati, Hatem Haddad, Malek Naski +tags: +- arXiv.org +link: https://arxiv.org/abs/2104.02516 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Advances in speech and language technologies enable tools such as voice-search, text-to-speech, speech recognition and machine translation. These are however only available for high resource languages like English, French or Chinese. 
Without foundational digital resources for African languages, which are considered low-resource in the digital context, these advanced tools remain out of reach. This work details the AI4D - African Language Program, a 3-part project that 1) incentivised the crowd-sourcing, collection and curation of language datasets through an online quantitative and qualitative challenge, 2) supported research fellows for a period of 3-4 months to create datasets annotated for NLP tasks, and 3) hosted competitive Machine Learning challenges on the basis of these datasets. Key outcomes of the work so far include 1) the creation of 9+ open source, African language datasets annotated for a variety of ML tasks, and 2) the creation of baseline models for these datasets through hosting of competitive ML challenges. \ No newline at end of file diff --git a/_posts/papers/2021-09-19-2109.09133.md b/_posts/papers/2021-09-19-2109.09133.md new file mode 100644 index 00000000..95f2e82a --- /dev/null +++ b/_posts/papers/2021-09-19-2109.09133.md @@ -0,0 +1,23 @@ +--- +title: Preventing Author Profiling through Zero-Shot Multilingual Back-Translation +venue: Conference on Empirical Methods in Natural Language Processing +names: David Ifeoluwa Adelani, Miaoran Zhang, Xiaoyu Shen, A. Davody, Thomas Kleinbauer, + D. Klakow +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2109.09133 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Documents as short as a single sentence may inadvertently reveal sensitive information about their authors, including e.g. their gender or ethnicity. Style transfer is an effective way of transforming texts in order to remove any information that enables author profiling. However, for a number of current state-of-the-art approaches the improved privacy is accompanied by an undesirable drop in the down-stream utility of the transformed data. In this paper, we propose a simple, zero-shot way to effectively lower the risk of author profiling through multilingual back-translation using off-the-shelf translation models. We compare our models with five representative text style transfer models on three datasets across different domains. Results from both an automatic and a human evaluation show that our approach achieves the best overall performance while requiring no training data. We are able to lower the adversarial prediction of gender and race by up to 22% while retaining 95% of the original utility on downstream tasks. \ No newline at end of file diff --git a/_posts/papers/2022-01-01-10.2139-ssrn.4136717.md b/_posts/papers/2022-01-01-10.2139-ssrn.4136717.md new file mode 100644 index 00000000..bcb105cb --- /dev/null +++ b/_posts/papers/2022-01-01-10.2139-ssrn.4136717.md @@ -0,0 +1,24 @@ +--- +title: Building Together - Towards a Roadmap for African Language Technologies +venue: Social Science Research Network +names: Kathleen Siminyu, Jade Z. Abbott, Kólá Túbosún, Aremu Anuoluwapo, Blessing + K. Sibanda, Kofi A. Yeboah, David Ifeoluwa Adelani, Masabata Mokgesi-Selinga, Frederick + R. Apina, Angela Thandizwe Mthembu, A. 
Ramkilowan, Babatunde Oladimeji +tags: +- Social Science Research Network +link: https://doi.org/10.2139/ssrn.4136717 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2022-01-01-10.48550-arXiv.2204.06487.md b/_posts/papers/2022-01-01-10.48550-arXiv.2204.06487.md new file mode 100644 index 00000000..61a853a4 --- /dev/null +++ b/_posts/papers/2022-01-01-10.48550-arXiv.2204.06487.md @@ -0,0 +1,22 @@ +--- +title: 'Multilingual Language Model Adaptive Fine-Tuning: A Study on African Languages' +venue: arXiv.org +names: Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, Marius Mosbach, D. Klakow +tags: +- arXiv.org +link: https://doi.org/10.48550/arXiv.2204.06487 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Our evaluation on two multilingual PLMs (AfriBERTa and XLM-R) and three NLP tasks (NER, news topic classification, and sentiment classification) shows that our approach is competitive to applying LAFT on individual languages while requiring significantly less disk space. Finally, we show that our adapted PLM also improves the zero-shot cross-lingual transfer abilities of parameter efficient fine-tuning methods. \ No newline at end of file diff --git a/_posts/papers/2022-01-01-2022.coling-1.377.md b/_posts/papers/2022-01-01-2022.coling-1.377.md new file mode 100644 index 00000000..6b77838f --- /dev/null +++ b/_posts/papers/2022-01-01-2022.coling-1.377.md @@ -0,0 +1,22 @@ +--- +title: Few-Shot Pidgin Text Adaptation via Contrastive Fine-Tuning +venue: International Conference on Computational Linguistics +names: Ernie Chang, Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, Vera Demberg +tags: +- International Conference on Computational Linguistics +link: https://www.semanticscholar.org/paper/b1f69004b3c7d409d55fea21441b1e3a4e8940dd +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +The surging demand for multilingual dialogue systems often requires a costly labeling process for each language addition. For low resource languages, human annotators are continuously tasked with the adaptation of resource-rich language utterances for each new domain. However, this prohibitive and impractical process can often be a bottleneck for low resource languages that are still without proper translation systems nor parallel corpus. In particular, it is difficult to obtain task-specific low resource language annotations for the English-derived creoles (e.g. Nigerian and Cameroonian Pidgin). To address this issue, we utilize the pretrained language models i.e. BART which has shown great potential in language generation/understanding – we propose to finetune the BART model to generate utterances in Pidgin by leveraging the proximity of the source and target languages, and utilizing positive and negative examples in contrastive training objectives. We collected and released the first parallel Pidgin-English conversation corpus in two dialogue domains and showed that this simple and effective technique is sufficient to yield impressive results for English-to-Pidgin generation, which are two closely-related languages.
\ No newline at end of file diff --git a/_posts/papers/2022-01-01-2022.wmt-1.72.md b/_posts/papers/2022-01-01-2022.wmt-1.72.md new file mode 100644 index 00000000..e8e1307a --- /dev/null +++ b/_posts/papers/2022-01-01-2022.wmt-1.72.md @@ -0,0 +1,26 @@ +--- +title: Findings of the WMT’22 Shared Task on Large-Scale Machine Translation Evaluation + for African Languages +venue: Conference on Machine Translation +names: David Ifeoluwa Adelani, Md Mahfuz Ibn Alam, Antonios Anastasopoulos, Akshita + Bhagia, M. Costa-jussà, Jesse Dodge, FAHIM FAISAL, C. Federmann, Natalia Fedorova, + Francisco Guzmán, Sergey Koshelev, Jean Maillard, Vukosi Marivate, J. Mbuya, Alexandre + Mourachko, Safiyyah Saleem, Holger Schwenk, Guillaume Wenzek +tags: +- Conference on Machine Translation +link: https://www.semanticscholar.org/paper/b5b371295e7450df66aa6431c657baaf735290a1 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +We present the results of the WMT’22 Shared Task on Large-Scale Machine Translation Evaluation for African Languages. The shared task included both a data and a systems track, along with additional innovations, such as a focus on African languages and extensive human evaluation of submitted systems. We received 14 system submissions from 8 teams, as well as 6 data track contributions. We report a large progress in the quality of translation for African languages since the last iteration of this shared task: there is an increase of about 7.5 BLEU points across 72 language pairs, and the average BLEU scores went from 15.09 to 22.60. \ No newline at end of file diff --git a/_posts/papers/2022-01-01-2ef570ac8db5c7e5192334f31675cc2fd7b6622a.md b/_posts/papers/2022-01-01-2ef570ac8db5c7e5192334f31675cc2fd7b6622a.md new file mode 100644 index 00000000..82a59993 --- /dev/null +++ b/_posts/papers/2022-01-01-2ef570ac8db5c7e5192334f31675cc2fd7b6622a.md @@ -0,0 +1,22 @@ +--- +title: Dialogue Pidgin Text Adaptation via Contrastive Fine-Tuning +venue: '' +names: Ernie Chang, Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, Vera Demberg +tags: +- '' +link: https://www.semanticscholar.org/paper/2ef570ac8db5c7e5192334f31675cc2fd7b6622a +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +The surging demand for multilingual dialogue systems often requires a costly labeling process for each language addition. For low resource languages, human annotators are continuously tasked with the adaptation of resource-rich language utterances for each new domain. However, this prohibitive and impractical process can often be a bottleneck for low resource languages that are still without proper translation systems nor parallel corpus. In particular, it is difficult to obtain task-specific low resource language annotations for the English-derived creoles (e.g. Nigerian and Cameroonian Pidgin). To address this issue, we utilize the pretrained language models i.e. BART which has shown great potential in language generation/understanding – we propose to finetune the BART model to generate utterances in Pidgin by leveraging the proximity of the source and target languages, and utilizing positive and negative examples in contrastive training objectives.
We collected and released the first parallel PidginEnglish conversation corpus in two dialogue domains and showed that this simple and effective technique is sufficient to yield impressive results for English-to-Pidgin generation, which are two closely-related languages. \ No newline at end of file diff --git a/_posts/papers/2022-01-20-2201.08277.md b/_posts/papers/2022-01-20-2201.08277.md new file mode 100644 index 00000000..75956f21 --- /dev/null +++ b/_posts/papers/2022-01-20-2201.08277.md @@ -0,0 +1,25 @@ +--- +title: 'NaijaSenti: A Nigerian Twitter Sentiment Corpus for Multilingual Sentiment + Analysis' +venue: International Conference on Language Resources and Evaluation +names: Shamsuddeen Hassan Muhammad, David Ifeoluwa Adelani, I. Ahmad, Idris Abdulmumin, + Bello Shehu Bello, M. Choudhury, Chris C. Emezue, Anuoluwapo Aremu, Saheed Abdul, + P. Brazdil +tags: +- International Conference on Language Resources and Evaluation +link: https://arxiv.org/abs/2201.08277 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Sentiment analysis is one of the most widely studied applications in NLP, but most work focuses on languages with large amounts of data. We introduce the first large-scale human-annotated Twitter sentiment dataset for the four most widely spoken languages in Nigeria—Hausa, Igbo, Nigerian-Pidgin, and Yorùbá—consisting of around 30,000 annotated tweets per language, including a significant fraction of code-mixed tweets. We propose text collection, filtering, processing and labeling methods that enable us to create datasets for these low-resource languages. We evaluate a range of pre-trained models and transfer strategies on the dataset. We find that language-specific models and language-adaptive fine-tuning generally perform best. We release the datasets, trained models, sentiment lexicons, and code to incentivize research on sentiment analysis in under-represented languages. \ No newline at end of file diff --git a/_posts/papers/2022-03-16-2203.08850.md b/_posts/papers/2022-03-16-2203.08850.md new file mode 100644 index 00000000..7a09f0db --- /dev/null +++ b/_posts/papers/2022-03-16-2203.08850.md @@ -0,0 +1,24 @@ +--- +title: 'Pre-Trained Multilingual Sequence-to-Sequence Models: A Hope for Low-Resource + Language Translation?' +venue: Findings +names: E. Lee, Sarubi Thillainathan, Shravan Nayak, Shravan Nayak, Surangika Ranathunga, + David Ifeoluwa Adelani, Ruisi Su, Arya D. McCarthy +tags: +- Findings +link: https://arxiv.org/abs/2203.08850 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +What can pre-trained multilingual sequence-to-sequence models like mBART contribute to translating low-resource languages? We conduct a thorough empirical experiment in 10 languages to ascertain this, considering five factors: (1) the amount of fine-tuning data, (2) the noise in the fine-tuning data, (3) the amount of pre-training data in the model, (4) the impact of domain mismatch, and (5) language typology. In addition to yielding several heuristics, the experiments form a framework for evaluating the data sensitivities of machine translation systems. While mBART is robust to domain differences, its translations for unseen and typologically distant languages remain below 3.0 BLEU. 
In answer to our title’s question, mBART is not a low-resource panacea; we therefore encourage shifting the emphasis from new models to new data. \ No newline at end of file diff --git a/_posts/papers/2022-04-13-2204.06487.md b/_posts/papers/2022-04-13-2204.06487.md new file mode 100644 index 00000000..9e297ff8 --- /dev/null +++ b/_posts/papers/2022-04-13-2204.06487.md @@ -0,0 +1,23 @@ +--- +title: Adapting Pre-trained Language Models to African Languages via Multilingual + Adaptive Fine-Tuning +venue: International Conference on Computational Linguistics +names: Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, Marius Mosbach, D. Klakow +tags: +- International Conference on Computational Linguistics +link: https://arxiv.org/abs/2204.06487 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Multilingual pre-trained language models (PLMs) have demonstrated impressive performance on several downstream tasks for both high-resourced and low-resourced languages. However, there is still a large performance drop for languages unseen during pre-training, especially African languages. One of the most effective approaches to adapt to a new language is language adaptive fine-tuning (LAFT) — fine-tuning a multilingual PLM on monolingual texts of a language using the pre-training objective. However, adapting to each target language individually takes large disk space and limits the cross-lingual transfer abilities of the resulting models because they have been specialized for a single language. In this paper, we perform multilingual adaptive fine-tuning (MAFT) on the 17 most-resourced African languages and three other high-resource languages widely spoken on the African continent to encourage cross-lingual transfer learning. To further specialize the multilingual PLM, we removed vocabulary tokens from the embedding layer that correspond to non-African writing scripts before MAFT, thus reducing the model size by around 50%. Our evaluation on two multilingual PLMs (AfriBERTa and XLM-R) and three NLP tasks (NER, news topic classification, and sentiment classification) shows that our approach is competitive with applying LAFT on individual languages while requiring significantly less disk space. Additionally, we show that our adapted PLM also improves the zero-shot cross-lingual transfer abilities of parameter-efficient fine-tuning methods. \ No newline at end of file diff --git a/_posts/papers/2022-04-20-2204.09371.md b/_posts/papers/2022-04-20-2204.09371.md new file mode 100644 index 00000000..18604cc8 --- /dev/null +++ b/_posts/papers/2022-04-20-2204.09371.md @@ -0,0 +1,23 @@ +--- +title: Is BERT Robust to Label Noise? A Study on Learning with Noisy Labels in Text + Classification +venue: First Workshop on Insights from Negative Results in NLP +names: D. Zhu, Michael A. Hedderich, Fangzhou Zhai, David Ifeoluwa Adelani, D. Klakow +tags: +- First Workshop on Insights from Negative Results in NLP +link: https://arxiv.org/abs/2204.09371 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Incorrect labels in training data occur when human annotators make mistakes or when the data is generated via weak or distant supervision.
It has been shown that complex noise-handling techniques - by modeling, cleaning or filtering the noisy instances - are required to prevent models from fitting this label noise. However, we show in this work that, for text classification tasks with modern NLP models like BERT, over a variety of noise types, existing noise-handling methods do not always improve its performance, and may even degrade it, suggesting the need for further investigation. We also back our observations with a comprehensive analysis. \ No newline at end of file diff --git a/_posts/papers/2022-04-20-2204.09711.md b/_posts/papers/2022-04-20-2204.09711.md new file mode 100644 index 00000000..873b0295 --- /dev/null +++ b/_posts/papers/2022-04-20-2204.09711.md @@ -0,0 +1,22 @@ +--- +title: 'yosm: A new yoruba sentiment corpus for movie reviews' +venue: arXiv.org +names: Iyanuoluwa Shode, David Ifeoluwa Adelani, Anna Feldman +tags: +- arXiv.org +link: https://arxiv.org/abs/2204.09711 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +A movie that is thoroughly enjoyed and recommended by an individual might be hated by another. One characteristic of humans is the ability to have feelings which could be positive or negative. To automatically classify and study human feelings, an aspect of natural language processing, sentiment analysis and opinion mining were designed to understand human feelings regarding several issues which could affect a product, a social media platform, government, or societal discussions or even movies. Several works on sentiment analysis have been done on high-resource languages while low-resource languages like Yoruba have been sidelined. Due to the scarcity of datasets and linguistic architectures that will suit low resource languages, African languages ("low resource languages") have been ignored and not fully explored. For this reason, our attention is placed on Yoruba to explore sentiment analysis on reviews of Nigerian movies. The data comprised 1500 movie reviews that were sourced from IMDB, Rotten Tomatoes, Letterboxd, Cinemapointer and Nollyrated. We develop sentiment classification models using the state-of-the-art pre-trained language models like mBERT and AfriBERTa to classify the movie reviews. \ No newline at end of file diff --git a/_posts/papers/2022-04-22-2204.10931.md b/_posts/papers/2022-04-22-2204.10931.md new file mode 100644 index 00000000..c43ec425 --- /dev/null +++ b/_posts/papers/2022-04-22-2204.10931.md @@ -0,0 +1,23 @@ +--- +title: 'MCSE: Multimodal Contrastive Learning of Sentence Embeddings' +venue: North American Chapter of the Association for Computational Linguistics +names: Miaoran Zhang, Marius Mosbach, David Ifeoluwa Adelani, Michael A. Hedderich, + D. Klakow +tags: +- North American Chapter of the Association for Computational Linguistics +link: https://arxiv.org/abs/2204.10931 +author: Marius Mosbach +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Learning semantically meaningful sentence embeddings is an open problem in natural language processing. In this work, we propose a sentence embedding learning approach that exploits both visual and textual information via a multimodal contrastive objective.
Through experiments on a variety of semantic textual similarity tasks, we demonstrate that our approach consistently improves the performance across various datasets and pre-trained encoders. In particular, combining a small amount of multimodal data with a large text-only corpus, we improve the state-of-the-art average Spearman’s correlation by 1.7%. By analyzing the properties of the textual embedding space, we show that our model excels in aligning semantically similar sentences, providing an explanation for its improved performance. \ No newline at end of file diff --git a/_posts/papers/2022-05-04-2205.02022.md b/_posts/papers/2022-05-04-2205.02022.md new file mode 100644 index 00000000..cd7deb62 --- /dev/null +++ b/_posts/papers/2022-05-04-2205.02022.md @@ -0,0 +1,32 @@ +--- +title: A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for + African News Translation +venue: North American Chapter of the Association for Computational Linguistics +names: David Ifeoluwa Adelani, Jesujoba Oluwadara Alabi, Angela Fan, Julia Kreutzer, + Xiaoyu Shen, Machel Reid, Dana Ruiter, D. Klakow, Peter Nabende, Ernie Chang, T. + Gwadabe, Freshia Sackey, Bonaventure F. P. Dossou, Chris C. Emezue, Colin Leong, + Michael Beukman, Shamsuddeen Hassan Muhammad, Guyo Dub Jarso, Oreen Yousuf, Andre + Niyongabo Rubungo, Gilles Hacheme, Eric Peter Wairagala, Muhammad Umair Nasir, Benjamin + Ayoade Ajibade, T. Ajayi, Yvonne Wambui Gitau, Jade Z. Abbott, Mohamed Ahmed, Millicent + Ochieng, Anuoluwapo Aremu, Perez Ogayo, Jonathan Mukiibi, F. Kabore, Godson Kalipe, + Derguene Mbaye, A. Tapo, V. M. Koagne, Edwin Munkoh-Buabeng, Valencia Wagner, Idris + Abdulmumin, Ayodele Awokoya, Happy Buzaaba, Blessing K. Sibanda, Andiswa Bukula, + Sam Manthalu +tags: +- North American Chapter of the Association for Computational Linguistics +link: https://arxiv.org/abs/2205.02022 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Recent advances in the pre-training of language models leverage large-scale datasets to create multilingual models. However, low-resource languages are mostly left out in these datasets. This is primarily because many widely spoken languages are not well represented on the web and are therefore excluded from the large-scale crawls for datasets. Furthermore, downstream users of these models are restricted to the selection of languages originally chosen for pre-training. This work investigates how to optimally leverage existing pre-trained models to create low-resource translation systems for 16 African languages. We focus on two questions: 1) How can pre-trained models be used for languages not included in the initial pretraining? and 2) How can the resulting translation models effectively transfer to new domains? To answer these questions, we create a novel African news corpus covering 16 languages, of which eight languages are not part of any existing evaluation dataset. We demonstrate that the most effective strategy for transferring both additional languages and additional domains is to leverage small quantities of high-quality translation data to fine-tune large pre-trained models.
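The fine-tuning recipe summarised in the abstract above can be illustrated with a minimal, hedged sketch (not the authors' released code); the checkpoint name, the English-to-Hausa direction, and the data file below are assumptions for illustration only.

```python
# Hedged sketch of the strategy described above: adapt a pre-trained multilingual
# MT model with a few thousand high-quality in-domain sentence pairs.
# "facebook/m2m100_418M", the en->ha direction, and news_parallel.jsonl are
# illustrative assumptions, not artifacts released with the paper.
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)

checkpoint = "facebook/m2m100_418M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer.src_lang, tokenizer.tgt_lang = "en", "ha"   # English -> Hausa as an example

# Hypothetical JSONL file with one {"en": ..., "ha": ...} pair per line.
raw = load_dataset("json", data_files={"train": "news_parallel.jsonl"})

def preprocess(batch):
    # Tokenize source sentences and target translations for seq2seq training.
    features = tokenizer(batch["en"], max_length=128, truncation=True)
    labels = tokenizer(text_target=batch["ha"], max_length=128, truncation=True)
    features["labels"] = labels["input_ids"]
    return features

train_set = raw["train"].map(preprocess, batched=True,
                             remove_columns=raw["train"].column_names)

args = Seq2SeqTrainingArguments(output_dir="m2m100-ha-news", num_train_epochs=3,
                                per_device_train_batch_size=8, learning_rate=5e-5)
trainer = Seq2SeqTrainer(model=model, args=args, train_dataset=train_set,
                         data_collator=DataCollatorForSeq2Seq(tokenizer, model=model))
trainer.train()
```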
\ No newline at end of file diff --git a/_posts/papers/2022-06-03-2206.01476.md b/_posts/papers/2022-06-03-2206.01476.md new file mode 100644 index 00000000..4b07e3d1 --- /dev/null +++ b/_posts/papers/2022-06-03-2206.01476.md @@ -0,0 +1,23 @@ +--- +title: 'Task-Adaptive Pre-Training for Boosting Learning With Noisy Labels: A Study + on Text Classification for African Languages' +venue: arXiv.org +names: D. Zhu, Michael A. Hedderich, Fangzhou Zhai, David Ifeoluwa Adelani, D. Klakow +tags: +- arXiv.org +link: https://arxiv.org/abs/2206.01476 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +For high-resource languages like English, text classification is a well-studied task. The performance of modern NLP models easily achieves an accuracy of more than 90% in many standard datasets for text classification in English (Xie et al., 2019; Yang et al., 2019; Zaheer et al., 2020). However, text classification in low-resource languages is still challenging due to the lack of annotated data. Although methods like weak supervision and crowdsourcing can help ease the annotation bottleneck, the annotations obtained by these methods contain label noise. Models trained with label noise may not generalize well. To this end, a variety of noise-handling techniques have been proposed to alleviate the negative impact caused by the errors in the annotations (for extensive surveys see (Hedderich et al., 2021; Algan&Ulusoy, 2021)). In this work, we experiment with a group of standard noisy-handling methods on text classification tasks with noisy labels. We study both simulated noise and realistic noise induced by weak supervision. Moreover, we find task-adaptive pre-training techniques (Gururangan et al., 2020) are beneficial for learning with noisy labels. \ No newline at end of file diff --git a/_posts/papers/2022-06-15-2206.07841.md b/_posts/papers/2022-06-15-2206.07841.md new file mode 100644 index 00000000..68a8303e --- /dev/null +++ b/_posts/papers/2022-06-15-2206.07841.md @@ -0,0 +1,24 @@ +--- +title: 'TOKEN is a MASK: Few-shot Named Entity Recognition with Pre-trained Language + Models' +venue: International Conference on Text, Speech and Dialogue +names: A. Davody, David Ifeoluwa Adelani, Thomas Kleinbauer, D. Klakow +tags: +- International Conference on Text +- Speech and Dialogue +link: https://arxiv.org/abs/2206.07841 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Transferring knowledge from one domain to another is of practical importance for many tasks in natural language processing, especially when the amount of available data in the target domain is limited. In this work, we propose a novel few-shot approach to domain adaptation in the context of Named Entity Recognition (NER). We propose a two-step approach consisting of a variable base module and a template module that leverages the knowledge captured in pre-trained language models with the help of simple descriptive patterns. Our approach is simple yet versatile and can be applied in few-shot and zero-shot settings. Evaluating our lightweight approach across a number of different datasets shows that it can boost the performance of state-of-the-art baselines by 2-5% F1-score. 
\ No newline at end of file diff --git a/_posts/papers/2022-07-07-2207.03546.md b/_posts/papers/2022-07-07-2207.03546.md new file mode 100644 index 00000000..2fa5d849 --- /dev/null +++ b/_posts/papers/2022-07-07-2207.03546.md @@ -0,0 +1,27 @@ +--- +title: 'BibleTTS: a large, high-fidelity, multilingual, and uniquely African speech + corpus' +venue: Interspeech +names: Josh Meyer, David Ifeoluwa Adelani, Edresson Casanova, A. Oktem, Daniel Whitenack + Julian Weber, Salomon Kabongo KABENAMUALU, Elizabeth Salesky, Iroro Orife, Colin + Leong, Perez Ogayo, Chris C. Emezue, Jonathan Mukiibi, Salomey Osei, Apelete Agbolo, + Victor Akinode, Bernard Opoku, S. Olanrewaju, Jesujoba Oluwadara Alabi, Shamsuddeen + Hassan Muhammad +tags: +- Interspeech +link: https://arxiv.org/abs/2207.03546 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +BibleTTS is a large, high-quality, open speech dataset for ten languages spoken in Sub-Saharan Africa. The corpus contains up to 86 hours of aligned, studio quality 48kHz single speaker recordings per language, enabling the development of high-quality text-to-speech models. The ten languages represented are: Akuapem Twi, Asante Twi, Chichewa, Ewe, Hausa, Kikuyu, Lingala, Luganda, Luo, and Yoruba. This corpus is a derivative work of Bible recordings made and released by the Open.Bible project from Biblica. We have aligned, cleaned, and filtered the original recordings, and additionally hand-checked a subset of the alignments for each language. We present results for text-to-speech models with Coqui TTS. The data is released under a commercial-friendly CC-BY-SA license. \ No newline at end of file diff --git a/_posts/papers/2022-10-22-2210.12391.md b/_posts/papers/2022-10-22-2210.12391.md new file mode 100644 index 00000000..49ba82aa --- /dev/null +++ b/_posts/papers/2022-10-22-2210.12391.md @@ -0,0 +1,31 @@ +--- +title: 'MasakhaNER 2.0: Africa-centric Transfer Learning for Named Entity Recognition' +venue: Conference on Empirical Methods in Natural Language Processing +names: David Ifeoluwa Adelani, Graham Neubig, Sebastian Ruder, Shruti Rijhwani, Michael + Beukman, Chester Palen-Michel, Constantine Lignos, Jesujoba Oluwadara Alabi, Shamsuddeen + Hassan Muhammad, Peter Nabende, Cheikh M. Bamba Dione, Andiswa Bukula, Rooweither + Mabuya, Bonaventure F. P. Dossou, Blessing K. Sibanda, Happy Buzaaba, Jonathan Mukiibi, + Godson Kalipe, Derguene Mbaye, Amelia Taylor, F. Kabore, Chris C. Emezue, Anuoluwapo + Aremu, Perez Ogayo, C. Gitau, Edwin Munkoh-Buabeng, V. M. Koagne, A. Tapo, Tebogo + Macucwa, Vukosi Marivate, Elvis Mboning, T. Gwadabe, Tosin P. Adewumi, Orevaoghene + Ahia, J. Nakatumba‐Nabende, Neo L. Mokono, Ignatius M Ezeani, C. Chukwuneke, Mofetoluwa + Adeyemi, Gilles Hacheme, Idris Abdulmumin, Odunayo Ogundepo, Oreen Yousuf, Tatiana + Moteu Ngoli, D. Klakow +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2210.12391 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +African languages are spoken by over a billion people, but they are under-represented in NLP research and development. 
Multiple challenges exist, including the limited availability of annotated training and evaluation datasets as well as the lack of understanding of which settings, languages, and recently proposed methods like cross-lingual transfer will be effective. In this paper, we aim to move towards solutions for these challenges, focusing on the task of named entity recognition (NER). We present the creation of the largest to-date human-annotated NER dataset for 20 African languages. We study the behaviour of state-of-the-art cross-lingual transfer methods in an Africa-centric setting, empirically demonstrating that the choice of source transfer language significantly affects performance. While much previous work defaults to using English as the source language, our results show that choosing the best transfer language improves zero-shot F1 scores by an average of 14% over 20 languages as compared to using English. \ No newline at end of file diff --git a/_posts/papers/2022-11-09-2211.05100.md b/_posts/papers/2022-11-09-2211.05100.md new file mode 100644 index 00000000..508cb4da --- /dev/null +++ b/_posts/papers/2022-11-09-2211.05100.md @@ -0,0 +1,93 @@ +--- +title: 'BLOOM: A 176B-Parameter Open-Access Multilingual Language Model' +venue: arXiv.org +names: Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili'c, + Daniel Hesslow, Roman Castagn'e, A. Luccioni, François Yvon, Matthias Gallé, J. + Tow, Alexander M. Rush, Stella Biderman, Albert Webson, Pawan Sasanka Ammanamanchi, + Thomas Wang, Benoît Sagot, Niklas Muennighoff, Albert Villanova del Moral, Olatunji + Ruwase, Rachel Bawden, Stas Bekman, Angelina McMillan-Major, Iz Beltagy, Huu Nguyen, + Lucile Saulnier, Samson Tan, Pedro Ortiz Suarez, Victor Sanh, Hugo Laurenccon, Yacine + Jernite, Julien Launay, Margaret Mitchell, Colin Raffel, Aaron Gokaslan, Adi Simhi, + Aitor Soroa Etxabe, Alham Fikri Aji, Amit Alfassy, Anna Rogers, Ariel Kreisberg + Nitzav, Canwen Xu, Chenghao Mou, Chris C. Emezue, Christopher Klamm, Colin Leong, + Daniel Alexander van Strien, David Ifeoluwa Adelani, Dragomir R. Radev, E. G. Ponferrada, + Efrat Levkovizh, Ethan Kim, Eyal Natan, F. Toni, Gérard Dupont, Germán Kruszewski, + Giada Pistilli, Hady ElSahar, Hamza Benyamina, H. Tran, Ian Yu, Idris Abdulmumin, + Isaac Johnson, Itziar Gonzalez-Dios, Javier de la Rosa, Jenny Chim, Jesse Dodge, + Jian Zhu, Jonathan Chang, Jorg Frohberg, Josephine Tobing, J. Bhattacharjee, Khalid + Almubarak, Kimbo Chen, Kyle Lo, Leandro von Werra, Leon Weber, Long Phan, Loubna + Ben Allal, Ludovic Tanguy, Manan Dey, M. Muñoz, Maraim Masoud, María Grandury, Mario + vSavsko, Max Huang, Maximin Coavoux, Mayank Singh, Mike Tian-Jian Jiang, Minh Chien + Vu, M. A. Jauhar, Mustafa Ghaleb, Nishant Subramani, Nora Kassner, Nurulaqilla Khamis, + Olivier Nguyen, Omar Espejel, Ona de Gibert, Paulo Villegas, Peter Henderson, Pierre + Colombo, Priscilla Amuok, Quentin Lhoest, Rheza Harliman, Rishi Bommasani, R. L'opez, + Rui Ribeiro, Salomey Osei, S. Pyysalo, Sebastian Nagel, Shamik Bose, Shamsuddeen + Hassan Muhammad, Shanya Sharma, S. Longpre, Somaieh Nikpoor, S. Silberberg, S. Pai, + S. Zink, Tiago Timponi Torrent, Timo Schick, Tristan Thrush, V. Danchev, Vassilina + Nikoulina, Veronika Laippala, Violette Lepercq, V. Prabhu, Zaid Alyafeai, Zeerak + Talat, Arun Raja, Benjamin Heinzerling, Chenglei Si, Elizabeth Salesky, Sabrina + J. Mielke, Wilson Y. 
Lee, Abheesht Sharma, Andrea Santilli, Antoine Chaffin, Arnaud + Stiegler, Debajyoti Datta, Eliza Szczechla, Gunjan Chhablani, Han Wang, Harshit + Pandey, Hendrik Strobelt, Jason Alan Fries, Jos Rozen, Leo Gao, Lintang Sutawika, + M Saiful Bari, Maged S. Al-Shaibani, Matteo Manica, Nihal V. Nayak, Ryan Teehan, + Samuel Albanie, Sheng Shen, Srulik Ben-David, Stephen H. Bach, Taewoon Kim, T. Bers, + Thibault Févry, Trishala Neeraj, Urmish Thakker, Vikas Raunak, Xiang Tang, Zheng-Xin + Yong, Zhiqing Sun, Shaked Brody, Y. Uri, Hadar Tojarieh, Adam Roberts, Hyung Won + Chung, Jaesung Tae, Jason Phang, Ofir Press, Conglong Li, D. Narayanan, Hatim Bourfoune, + J. Casper, Jeff Rasley, Max Ryabinin, Mayank Mishra, Minjia Zhang, Mohammad Shoeybi, + Myriam Peyrounette, N. Patry, Nouamane Tazi, Omar Sanseviero, Patrick von Platen, + Pierre Cornette, Pierre Franccois Lavall'ee, R. Lacroix, Samyam Rajbhandari, Sanchit + Gandhi, Shaden Smith, S. Requena, Suraj Patil, Tim Dettmers, Ahmed Baruwa, Amanpreet + Singh, Anastasia Cheveleva, Anne-Laure Ligozat, Arjun Subramonian, Aur'elie N'ev'eol, + Charles Lovering, Daniel H Garrette, D. Tunuguntla, Ehud Reiter, Ekaterina Taktasheva, + E. Voloshina, Eli Bogdanov, Genta Indra Winata, Hailey Schoelkopf, Jan-Christoph + Kalo, Jekaterina Novikova, J. Forde, Xiangru Tang, Jungo Kasai, Ken Kawamura, Liam + Hazan, Marine Carpuat, Miruna Clinciu, Najoung Kim, Newton Cheng, O. Serikov, Omer + Antverg, Oskar van der Wal, Rui Zhang, Ruochen Zhang, Sebastian Gehrmann, Shachar + Mirkin, S. Pais, Tatiana Shavrina, Thomas Scialom, Tian Yun, Tomasz Limisiewicz, + Verena Rieser, Vitaly Protasov, V. Mikhailov, Yada Pruksachatkun, Yonatan Belinkov, + Zachary Bamberger, Zdenvek Kasner, Zdeněk Kasner, A. Pestana, A. Feizpour, Ammar + Khan, Amy Faranak, A. Santos, Anthony Hevia, Antigona Unldreaj, Arash Aghagol, Arezoo + Abdollahi, A. Tammour, A. HajiHosseini, Bahareh Behroozi, Benjamin Ayoade Ajibade, + B. Saxena, Carlos Muñoz Ferrandis, Danish Contractor, D. Lansky, Davis David, Douwe + Kiela, D. A. Nguyen, Edward Tan, Emi Baylor, Ezinwanne Ozoani, F. Mirza, Frankline + Ononiwu, Habib Rezanejad, H.A. Jones, Indrani Bhattacharya, Irene Solaiman, Irina + Sedenko, Isar Nejadgholi, J. Passmore, Joshua Seltzer, Julio Bonis Sanz, Karen Fort, + Lívia Dutra, Mairon Samagaio, Maraim Elbadri, Margot Mieskes, Marissa Gerchick, + Martha Akinlolu, Michael McKenna, Mike Qiu, M. Ghauri, Mykola Burynok, Nafis Abrar, + Nazneen Rajani, Nour Elkott, N. Fahmy, Olanrewaju Samuel, Ran An, R. Kromann, Ryan + Hao, S. Alizadeh, Sarmad Shubber, Silas L. Wang, Sourav Roy, S. Viguier, Thanh-Cong + Le, Tobi Oyebade, T. Le, Yoyo Yang, Zach Nguyen, Abhinav Ramesh Kashyap, Alfredo + Palasciano, A. Callahan, Anima Shukla, Antonio Miranda-Escalada, A. Singh, Benjamin + Beilharz, Bo Wang, C. Brito, Chenxi Zhou, Chirag Jain, Chuxin Xu, Clémentine Fourrier, + Daniel Le'on Perin'an, Daniel Molano, Dian Yu, Enrique Manjavacas, Fabio Barth, + Florian Fuhrimann, Gabriel Altay, Giyaseddin Bayrak, Gully Burns, Helena U. Vrabec, + I. Bello, Isha Dash, J. Kang, John Giorgi, Jonas Golde, J. Posada, Karthi Sivaraman, + Lokesh Bulchandani, Lu Liu, Luisa Shinzato, Madeleine Hahn de Bykhovetz, Maiko Takeuchi, + Marc Pàmies, M. A. Castillo, Marianna Nezhurina, Mario Sanger, M. Samwald, Michael + Cullan, Michael Weinberg, M. Wolf, Mina Mihaljcic, Minna Liu, M. Freidank, Myungsun + Kang, Natasha Seelam, N. Dahlberg, N. Broad, N. Muellner, Pascale Fung, Patricia + Haller, Patrick Haller, R. 
Eisenberg, Robert Martin, Rodrigo Canalli, Rosaline Su, + Ruisi Su, Samuel Cahyawijaya, Samuele Garda, Shlok S Deshmukh, Shubhanshu Mishra, + Sid Kiblawi, Simon Ott, Sinee Sang-aroonsiri, Srishti Kumar, Stefan Schweter, S. + Bharati, Tanmay Laud, Théo Gigant, Tomoya Kainuma, Wojciech Kusa, Yanis Labrak, + Yashasvi Bajaj, Y. Venkatraman, Yifan Xu, Ying Xu, Yu Xu, Z. Tan, Zhongli Xie, Zifan + Ye, M. Bras, Younes Belkada, Thomas Wolf +tags: +- arXiv.org +link: https://arxiv.org/abs/2211.05100 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access language model designed and built thanks to a collaboration of hundreds of researchers. BLOOM is a decoder-only Transformer language model that was trained on the ROOTS corpus, a dataset comprising hundreds of sources in 46 natural and 13 programming languages (59 in total). We find that BLOOM achieves competitive performance on a wide variety of benchmarks, with stronger results after undergoing multitask prompted finetuning. To facilitate future research and applications using LLMs, we publicly release our models and code under the Responsible AI License. \ No newline at end of file diff --git a/_posts/papers/2022-12-19-2212.09535.md b/_posts/papers/2022-12-19-2212.09535.md new file mode 100644 index 00000000..283737e4 --- /dev/null +++ b/_posts/papers/2022-12-19-2212.09535.md @@ -0,0 +1,25 @@ +--- +title: 'BLOOM+1: Adding Language Support to BLOOM for Zero-Shot Prompting' +venue: Annual Meeting of the Association for Computational Linguistics +names: Zheng-Xin Yong, Hailey Schoelkopf, Niklas Muennighoff, Alham Fikri Aji, David + Ifeoluwa Adelani, Khalid Almubarak, M Saiful Bari, Lintang Sutawika, Jungo Kasai, + Ahmed Baruwa, Genta Indra Winata, Stella Biderman, Dragomir R. Radev, Vassilina + Nikoulina +tags: +- Annual Meeting of the Association for Computational Linguistics +link: https://arxiv.org/abs/2212.09535 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +The BLOOM model is a large publicly available multilingual language model, but its pretraining was limited to 46 languages. To extend the benefits of BLOOM to other languages without incurring prohibitively large costs, it is desirable to adapt BLOOM to new languages not seen during pretraining. In this work, we apply existing language adaptation strategies to BLOOM and benchmark its zero-shot prompting performance on eight new languages in a resource-constrained setting. We find language adaptation to be effective at improving zero-shot performance in new languages. Surprisingly, we find that adapter-based finetuning is more effective than continued pretraining for large models. In addition, we discover that prompting performance is not significantly affected by language specifics, such as the writing system. It is primarily determined by the size of the language adaptation data. 
We also add new languages to BLOOMZ, which is a multitask fine-tuned version of BLOOM capable of following task instructions zero-shot. We find including a new language in the multitask fine-tuning mixture to be the most effective method to teach BLOOMZ a new language. We conclude that, with sufficient training data, language adaptation can generalize well to diverse languages. Our code is available at https://github.com/bigscience-workshop/multilingual-modeling. \ No newline at end of file diff --git a/_posts/papers/2023-01-01-10.18653-v1-2023.emnlp-main.11.md b/_posts/papers/2023-01-01-10.18653-v1-2023.emnlp-main.11.md new file mode 100644 index 00000000..6430e575 --- /dev/null +++ b/_posts/papers/2023-01-01-10.18653-v1-2023.emnlp-main.11.md @@ -0,0 +1,23 @@ +--- +title: Better Quality Pre-training Data and T5 Models for African Languages +venue: Conference on Empirical Methods in Natural Language Processing +names: Akintunde Oladipo, Mofetoluwa Adeyemi, Orevaoghene Ahia, A. Owodunni, Odunayo + Ogundepo, David Ifeoluwa Adelani, Jimmy Lin +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://doi.org/10.18653/v1/2023.emnlp-main.11 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +In this study, we highlight the importance of enhancing the quality of pretraining data in multilingual language models. Existing web crawls have demonstrated quality issues, particularly in the context of low-resource languages. Consequently, we introduce a new multilingual pretraining corpus for 16 African languages, designed by carefully auditing existing pretraining corpora to understand and rectify prevalent quality issues. To compile this dataset, we undertake a rigorous examination of current data sources for thirteen languages within one of the most extensive multilingual web crawls, mC4, and extract cleaner data through meticulous auditing and improved web crawling strategies. Subsequently, we pretrain a new T5-based model on this dataset and evaluate its performance on multiple downstream tasks. Our model demonstrates better downstream effectiveness than existing pretrained models across four NLP tasks, underscoring the critical role data quality plays in pretraining language models in low-resource scenarios. Specifically, on cross-lingual QA evaluation, our new model is more than twice as effective as multilingual T5. All code, data and model are publicly available at https://github.com/castorini/AfriTeVa-keji. \ No newline at end of file diff --git a/_posts/papers/2023-01-01-10.18653-v1-2023.mrl-1.24.md b/_posts/papers/2023-01-01-10.18653-v1-2023.mrl-1.24.md new file mode 100644 index 00000000..4d0b64d9 --- /dev/null +++ b/_posts/papers/2023-01-01-10.18653-v1-2023.mrl-1.24.md @@ -0,0 +1,27 @@ +--- +title: Findings of the 1st Shared Task on Multi-lingual Multi-task Information Retrieval + at MRL 2023 +venue: MRL +names: Francesco Tinner, David Ifeoluwa Adelani, Chris Emezue, Mammad Hajili, Omer + Goldman, Muhammad Farid Adilazuarda, Muhammad Dehan Al Kautsar, Aziza Mirsaidova, + Muge Kural, Dylan Massey, Chiamaka Chukwuneke, C.
Mbonu, Damilola Oluwaseun Oloyede, + Kayode Olaleye, Jonathan Atala, Benjamin Ayoade Ajibade, Saksham Bassi, Rahul Aralikatte, + Na-joung Kim, Duygu Ataman +tags: +- MRL +link: https://doi.org/10.18653/v1/2023.mrl-1.24 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Large language models (LLMs) excel in language understanding and generation, especially in English which has ample public benchmarks for various natural language processing (NLP) tasks. Nevertheless, their reliability across different languages and domains remains uncertain. Our new shared task introduces a novel benchmark to assess the ability of multilingual LLMs to comprehend and produce language under sparse settings, particularly in scenarios with under-resourced languages, with an emphasis on the ability to capture logical, factual, or causal relationships within lengthy text contexts. The shared task consists of two sub-tasks crucial to information retrieval: Named Entity Recognition (NER) and Reading Comprehension (RC), in 7 data-scarce languages: Azerbaijani, Igbo, Indonesian, Swiss German, Turkish, Uzbek and Yorùbá, which previously lacked annotated resources in information retrieval tasks. Our evaluation of leading LLMs reveals that, despite their competitive performance, they still have notable weaknesses such as producing output in the non-target language or providing counterfactual information that cannot be inferred from the context. As more advanced models emerge, the benchmark will remain essential for supporting fairness and applicability in information retrieval systems. \ No newline at end of file diff --git a/_posts/papers/2023-01-01-10.48550-arXiv.2311.09828.md b/_posts/papers/2023-01-01-10.48550-arXiv.2311.09828.md new file mode 100644 index 00000000..ac6c5602 --- /dev/null +++ b/_posts/papers/2023-01-01-10.48550-arXiv.2311.09828.md @@ -0,0 +1,34 @@ +--- +title: 'AfriMTE and AfriCOMET: Empowering COMET to Embrace Under-resourced African + Languages' +venue: arXiv.org +names: Jiayi Wang, David Ifeoluwa Adelani, Sweta Agrawal, Ricardo Rei, Eleftheria + Briakou, Marine Carpuat, Marek Masiak, Xuanli He, Sofia Bourhim, Andiswa Bukula, + Muhidin A. Mohamed, Temitayo Olatoye, Hamam Mokayede, Christine Mwase, Wangui Kimotho, + Foutse Yuehgoh, Anuoluwapo Aremu, Jessica Ojo, Shamsuddeen Hassan Muhammad, Salomey + Osei, Abdul-Hakeem Omotayo, Chiamaka Chukwuneke, Perez Ogayo, Oumaima Hourrane, + Salma El Anigri, Lolwethu Ndolela, Thabiso Mangwana, Shafie Abdi Mohamed, Ayinde + Hassan, Oluwabusayo Olufunke Awoyomi, Lama Alkhaled, S. Al-Azzawi, Naome A. Etori, + Millicent Ochieng, Clemencia Siro, Samuel Njoroge, Eric Muchiri, Wangari Kimotho, + Lyse Naomi Wamba Momo, D. Abolade, Simbiat Ajao, Tosin P. Adewumi, Iyanuoluwa Shode, + Ricky Macharm, R. Iro, S. S. Abdullahi, Stephen E. Moore, Bernard Opoku, Zainab + Akinjobi, Abeeb Afolabi, Nnaemeka Obiefuna, Onyekachi Raphael Ogbu, Sam Brian, V. + Otiende, C. 
Mbonu, Sakayo Toadoum Sari, Pontus Stenetorp +tags: +- arXiv.org +link: https://doi.org/10.48550/arXiv.2311.09828 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Despite the progress we have recorded in scaling multilingual machine translation (MT) models and evaluation data to several under-resourced African languages, it is difficult to accurately measure the progress we have made on these languages because evaluation is often performed on n-gram matching metrics like BLEU that correlate poorly with human judgments. Embedding-based metrics such as COMET correlate better; however, the lack of evaluation data with human ratings for under-resourced languages, the complexity of annotation guidelines like Multidimensional Quality Metrics (MQM), and the limited language coverage of multilingual encoders have hampered their applicability to African languages. In this paper, we address these challenges by creating high-quality human evaluation data with a simplified MQM guideline for error-span annotation and direct assessment (DA) scoring for 13 typologically diverse African languages. Furthermore, we develop AfriCOMET—a COMET evaluation metric for African languages by leveraging DA training data from high-resource languages and an African-centric multilingual encoder (AfroXLM-Roberta) to create the state-of-the-art evaluation metric for African-language MT with respect to Spearman-rank correlation with human judgments (+0.406). \ No newline at end of file diff --git a/_posts/papers/2023-01-01-794e3de4e59812f824d07f785c1a982cb09bb987.md b/_posts/papers/2023-01-01-794e3de4e59812f824d07f785c1a982cb09bb987.md new file mode 100644 index 00000000..a86f20e2 --- /dev/null +++ b/_posts/papers/2023-01-01-794e3de4e59812f824d07f785c1a982cb09bb987.md @@ -0,0 +1,28 @@ +--- +title: 'AfriSenti: A Benchmark Twitter Sentiment Analysis Dataset for African + Languages' +venue: '' +names: Shamsuddeen Hassan Muhammad, Idris Abdulmumin, A. Ayele, N. Ousidhoum, David + Ifeoluwa Adelani, Seid Muhie Yimam, Meriem Beloucif, Saif M. Mohammad, Sebastian + Ruder, Oumaima Hourrane, P. Brazdil, Felermino M. D. A. Ali, Davis David, Salomey + Osei, Bello Shehu Bello, Falalu Ibrahim, T. Gwadabe, Samuel Rutunda, Tadesse Destaw + Belay, Wendimu Baye Messelle, Hailu Beshada Balcha, S. Chala, Hagos Tesfahun Gebremichael, + Bernard Opoku, Steven Arthur +tags: +- '' +link: https://www.semanticscholar.org/paper/794e3de4e59812f824d07f785c1a982cb09bb987 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and Yoruba) from four language families (Afro-Asiatic, English Creole, Indo European, and Niger-Congo). We describe the data collection methodology, annotation process, and related challenges when curating each of the datasets. We also build different sentiment classification baseline models on the datasets and discuss their usefulness.
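A minimal sketch of the kind of classical sentiment classification baseline mentioned above (illustrative only; the file path and column names are assumptions, not the released data format):

```python
# Illustrative classical baseline: TF-IDF features with logistic regression for
# 3-class tweet sentiment. "hausa_train.tsv" and its columns are assumptions.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

df = pd.read_csv("hausa_train.tsv", sep="\t")          # assumed columns: tweet, label
X_train, X_test, y_train, y_test = train_test_split(
    df["tweet"], df["label"], test_size=0.2, random_state=0)

baseline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), min_df=2),       # word and bigram features
    LogisticRegression(max_iter=1000))
baseline.fit(X_train, y_train)

print("weighted F1:", f1_score(y_test, baseline.predict(X_test), average="weighted"))
```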
\ No newline at end of file diff --git a/_posts/papers/2023-01-01-8e5aaa09f2c9a08e9343754c81a2310ba2d49ec3.md b/_posts/papers/2023-01-01-8e5aaa09f2c9a08e9343754c81a2310ba2d49ec3.md new file mode 100644 index 00000000..0b46a802 --- /dev/null +++ b/_posts/papers/2023-01-01-8e5aaa09f2c9a08e9343754c81a2310ba2d49ec3.md @@ -0,0 +1,23 @@ +--- +title: 'ε kú: Integrating Yorùbá Cultural greetings into Machine Translation' +venue: AfricaNLP +names: Idris Akinade, Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, Clement Odoje, + D. Klakow +tags: +- AfricaNLP +link: https://www.semanticscholar.org/paper/8e5aaa09f2c9a08e9343754c81a2310ba2d49ec3 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2023-02-17-2302.08956.md b/_posts/papers/2023-02-17-2302.08956.md new file mode 100644 index 00000000..2806efce --- /dev/null +++ b/_posts/papers/2023-02-17-2302.08956.md @@ -0,0 +1,27 @@ +--- +title: 'AfriSenti: A Twitter Sentiment Analysis Benchmark for African Languages' +venue: Conference on Empirical Methods in Natural Language Processing +names: Shamsuddeen Hassan Muhammad, Idris Abdulmumin, A. Ayele, N. Ousidhoum, David + Ifeoluwa Adelani, Seid Muhie Yimam, I. Ahmad, Meriem Beloucif, Saif M. Mohammad, + Sebastian Ruder, Oumaima Hourrane, P. Brazdil, Felermino D'ario M'ario Ant'onio + Ali, Davis C. Davis, Salomey Osei, Bello Shehu Bello, Falalu Ibrahim, T. Gwadabe, + Samuel Rutunda, Tadesse Destaw Belay, Wendimu Baye Messelle, Hailu Beshada Balcha, + S. Chala, Hagos Tesfahun Gebremichael, Bernard Opoku, Steven Arthur +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2302.08956 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Africa is home to over 2000 languages from over six language families and has the highest linguistic diversity among all continents. This includes 75 languages with at least one million speakers each. Yet, there is little NLP research conducted on African languages. Crucial in enabling such research is the availability of high-quality annotated datasets. In this paper, we introduce AfriSenti, which consists of 14 sentiment datasets of 110,000+ tweets in 14 African languages (Amharic, Algerian Arabic, Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and Yorùbá) from four language families annotated by native speakers. The data is used in SemEval 2023 Task 12, the first Afro-centric SemEval shared task. We describe the data collection methodology, annotation process, and related challenges when curating each of the datasets. We conduct experiments with different sentiment classification baselines and discuss their usefulness. We hope AfriSenti enables new work on under-represented languages. The dataset is available at https://github.com/afrisenti-semeval/afrisent-semeval-2023 and can also be loaded as a Hugging Face dataset (https://huggingface.co/datasets/shmuhammad/AfriSenti).
\ No newline at end of file diff --git a/_posts/papers/2023-03-07-2303.03915.md b/_posts/papers/2023-03-07-2303.03915.md new file mode 100644 index 00000000..6233de9b --- /dev/null +++ b/_posts/papers/2023-03-07-2303.03915.md @@ -0,0 +1,32 @@ +--- +title: 'The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset' +venue: Neural Information Processing Systems +names: Hugo Laurenccon, Lucile Saulnier, Thomas Wang, Christopher Akiki, Albert Villanova + del Moral, Teven Le Scao, Leandro von Werra, Chenghao Mou, E. G. Ponferrada, Huu + Nguyen, Jorg Frohberg, Mario vSavsko, Quentin Lhoest, Angelina McMillan-Major, Gérard + Dupont, Stella Biderman, Anna Rogers, Loubna Ben Allal, F. Toni, Giada Pistilli, + Olivier Nguyen, Somaieh Nikpoor, Maraim Masoud, Pierre Colombo, Javier de la Rosa, + Paulo Villegas, Tristan Thrush, S. Longpre, Sebastian Nagel, Leon Weber, M. Muñoz, + Jian Zhu, Daniel Alexander van Strien, Zaid Alyafeai, Khalid Almubarak, Minh Chien + Vu, Itziar Gonzalez-Dios, Aitor Soroa Etxabe, Kyle Lo, Manan Dey, Pedro Ortiz Suarez, + Aaron Gokaslan, Shamik Bose, David Ifeoluwa Adelani, Long Phan, H. Tran, I. Yu, + S. Pai, Jenny Chim, Violette Lepercq, Suzana Ilic, Margaret Mitchell, Sasha Luccioni, + Yacine Jernite +tags: +- Neural Information Processing Systems +link: https://arxiv.org/abs/2303.03915 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +As language models grow ever larger, the need for large-scale high-quality text datasets has never been more pressing, especially in multilingual settings. The BigScience workshop, a 1-year international and multidisciplinary initiative, was formed with the goal of researching and training large language models as a values-driven undertaking, putting issues of ethics, harm, and governance in the foreground. This paper documents the data creation and curation efforts undertaken by BigScience to assemble the Responsible Open-science Open-collaboration Text Sources (ROOTS) corpus, a 1.6TB dataset spanning 59 languages that was used to train the 176-billion-parameter BigScience Large Open-science Open-access Multilingual (BLOOM) language model. We further release a large initial subset of the corpus and analyses thereof, and hope to empower large-scale monolingual and multilingual modeling projects with both the data and the processing tools, as well as stimulate research around this large multilingual corpus. \ No newline at end of file diff --git a/_posts/papers/2023-03-31-2303.17972.md b/_posts/papers/2023-03-31-2303.17972.md new file mode 100644 index 00000000..8603893d --- /dev/null +++ b/_posts/papers/2023-03-31-2303.17972.md @@ -0,0 +1,23 @@ +--- +title: 'ε kú mask: Integrating Yorùbá cultural greetings into machine translation' +venue: C3NLP +names: Idris Akinade, Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, Clement Odoje, + D. Klakow +tags: +- C3NLP +link: https://arxiv.org/abs/2303.17972 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +This paper investigates the performance of massively multilingual neural machine translation (NMT) systems in translating Yorùbá greetings (kú mask), which are a big part of Yorùbá language and culture, into English.
To evaluate these models, we present IkiniYorùbá, a Yorùbá-English translation dataset containing some Yorùbá greetings, and sample use cases. We analysed the performance of different multilingual NMT systems including Google and NLLB and show that these models struggle to accurately translate Yorùbá greetings into English. In addition, we trained a Yorùbá-English model by fine-tuning an existing NMT model on the training split of IkiniYorùbá and this achieved better performance than the pre-trained multilingual NMT models, even though the latter were trained on a large volume of data. \ No newline at end of file diff --git a/_posts/papers/2023-04-08-2304.03952.md b/_posts/papers/2023-04-08-2304.03952.md new file mode 100644 index 00000000..12f24f8f --- /dev/null +++ b/_posts/papers/2023-04-08-2304.03952.md @@ -0,0 +1,24 @@ +--- +title: 'MphayaNER: Named Entity Recognition for Tshivenda' +venue: AfricaNLP +names: R. Mbuvha, David Ifeoluwa Adelani, Tendani Mutavhatsindi, Tshimangadzo Rakhuhu, + A. Mauda, Tshifhiwa Joshua Maumela, Andisani Masindi, Seani Rananga, Vukosi Marivate, + T. Marwala +tags: +- AfricaNLP +link: https://arxiv.org/abs/2304.03952 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Named Entity Recognition (NER) plays a vital role in various Natural Language Processing tasks such as information retrieval, text classification, and question answering. However, NER can be challenging, especially in low-resource languages with limited annotated datasets and tools. This paper adds to the effort of addressing these challenges by introducing MphayaNER, the first Tshivenda NER corpus in the news domain. We establish NER baselines by fine-tuning state-of-the-art models on MphayaNER. The study also explores zero-shot transfer between Tshivenda and other related Bantu languages, with chiShona and Kiswahili showing the best results. Augmenting MphayaNER with chiShona data was also found to improve model performance significantly. Both MphayaNER and the baseline models are made publicly available. \ No newline at end of file diff --git a/_posts/papers/2023-04-13-2304.06845.md b/_posts/papers/2023-04-13-2304.06845.md new file mode 100644 index 00000000..0f6341bc --- /dev/null +++ b/_posts/papers/2023-04-13-2304.06845.md @@ -0,0 +1,23 @@ +--- +title: 'SemEval-2023 Task 12: Sentiment Analysis for African Languages (AfriSenti-SemEval)' +venue: International Workshop on Semantic Evaluation +names: Shamsuddeen Hassan Muhammad, Idris Abdulmumin, Seid Muhie Yimam, David Ifeoluwa + Adelani, I. Ahmad, N. Ousidhoum, A. Ayele, Saif M. Mohammad, Meriem Beloucif +tags: +- International Workshop on Semantic Evaluation +link: https://arxiv.org/abs/2304.06845 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +We present the first Africentric SemEval Shared task, Sentiment Analysis for African Languages (AfriSenti-SemEval). The dataset is available at https://github.com/afrisenti-semeval/afrisent-semeval-2023.
AfriSenti-SemEval is a sentiment classification challenge in 14 African languages: Amharic, Algerian Arabic, Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and Yorùbá (Muhammad et al., 2023), using data labeled with 3 sentiment classes. We present three subtasks: (1) Task A: monolingual classification, which received 44 submissions; (2) Task B: multilingual classification, which received 32 submissions; and (3) Task C: zero-shot classification, which received 34 submissions. The best performance for tasks A and B was achieved by the NLNDE team with 71.31 and 75.06 weighted F1, respectively. UCAS-IIE-NLP achieved the best average score for task C with 58.15 weighted F1. We describe the various approaches adopted by the top 10 systems. \ No newline at end of file diff --git a/_posts/papers/2023-04-19-2304.09972.md b/_posts/papers/2023-04-19-2304.09972.md new file mode 100644 index 00000000..a720a8db --- /dev/null +++ b/_posts/papers/2023-04-19-2304.09972.md @@ -0,0 +1,35 @@ +--- +title: 'MasakhaNEWS: News Topic Classification for African languages' +venue: International Joint Conference on Natural Language Processing +names: David Ifeoluwa Adelani, Marek Masiak, Israel Abebe Azime, Jesujoba Oluwadara + Alabi, A. Tonja, Christine Mwase, Odunayo Ogundepo, Bonaventure F. P. Dossou, Akintunde + Oladipo, Doreen Nixdorf, Chris C. Emezue, S. Al-Azzawi, Blessing K. Sibanda, Davis + David, Lolwethu Ndolela, Jonathan Mukiibi, T. Ajayi, Tatiana Moteu Ngoli, B. Odhiambo, + A. Owodunni, Nnaemeka Obiefuna, Shamsuddeen Hassan Muhammad, S. S. Abdullahi, M. + Yigezu, T. Gwadabe, Idris Abdulmumin, Mahlet Taye Bame, Oluwabusayo Olufunke Awoyomi, + Iyanuoluwa Shode, T. Adelani, Habiba Abdulganiy Kailani, Abdul-Hakeem Omotayo, Adetola + Adeeko, Afolabi Abeeb, Anuoluwapo Aremu, Olanrewaju Samuel, Clemencia Siro, Wangari + Kimotho, Onyekachi Raphael Ogbu, C. Mbonu, C. Chukwuneke, Samuel Fanijo, Jessica + Ojo, Oyinkansola F. Awosan, Tadesse Kebede Guge, Sakayo Toadoum Sari, Pamela Nyatsine, + Freedmore Sidume, Oreen Yousuf, Mardiyyah Oduwole, Ussen Kimanuka, Kanda Patrick + Tshinu, Thina Diko, Siyanda Nxakama, Abdulmejid Tuni Johar, Sinodos Gebre, Muhidin + A. Mohamed, Shafie Abdi Mohamed, Fuad Mire Hassan, Moges Ahmed Mehamed, Evrard Ngabire, + Pontus Stenetorp +tags: +- International Joint Conference on Natural Language Processing +link: https://arxiv.org/abs/2304.09972 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +African languages are severely under-represented in NLP research due to a lack of datasets covering several NLP tasks. While there are individual language-specific datasets that are being expanded to different tasks, only a handful of NLP tasks (e.g. named entity recognition and machine translation) have standardized benchmark datasets covering several geographical and typologically-diverse African languages. In this paper, we develop MasakhaNEWS -- a new benchmark dataset for news topic classification covering 16 languages widely spoken in Africa. We provide an evaluation of baseline models by training classical machine learning models and fine-tuning several language models.
Furthermore, we explore several alternatives to full fine-tuning of language models that are better suited for zero-shot and few-shot learning such as cross-lingual parameter-efficient fine-tuning (like MAD-X), pattern-exploiting training (PET), prompting language models (like ChatGPT), and prompt-free sentence transformer fine-tuning (SetFit and Cohere Embedding API). Our evaluation in the zero-shot setting shows the potential of prompting ChatGPT for news topic classification in low-resource African languages, achieving an average performance of 70 F1 points without leveraging additional supervision like MAD-X. In the few-shot setting, we show that with as few as 10 examples per label, we achieved more than 90% (i.e. 86.0 F1 points) of the performance of full supervised training (92.6 F1 points) leveraging the PET approach. \ No newline at end of file diff --git a/_posts/papers/2023-05-11-2305.06897.md b/_posts/papers/2023-05-11-2305.06897.md new file mode 100644 index 00000000..a426c184 --- /dev/null +++ b/_posts/papers/2023-05-11-2305.06897.md @@ -0,0 +1,32 @@ +--- +title: 'AfriQA: Cross-lingual Open-Retrieval Question Answering for African Languages' +venue: Conference on Empirical Methods in Natural Language Processing +names: Odunayo Ogundepo, T. Gwadabe, Clara Rivera, J. Clark, Sebastian Ruder, David + Ifeoluwa Adelani, Bonaventure F. P. Dossou, Abdoulahat Diop, Claytone Sikasote, + Gilles Hacheme, Happy Buzaaba, Ignatius M Ezeani, Rooweither Mabuya, Salomey Osei, + Chris C. Emezue, A. Kahira, Shamsuddeen Hassan Muhammad, Akintunde Oladipo, A. Owodunni, + A. Tonja, Iyanuoluwa Shode, Akari Asai, T. Ajayi, Clemencia Siro, Steven Arthur, + Mofetoluwa Adeyemi, Orevaoghene Ahia, Aremu Anuoluwapo, O. Awosan, C. Chukwuneke, + Bernard Opoku, A. Ayodele, V. Otiende, Christine Mwase, B. Sinkala, Andre Niyongabo + Rubungo, Daniel Ajisafe, Emeka Onwuegbuzia, Habib Mbow, Emile Niyomutabazi, Eunice + Mukonde, F. I. Lawan, I. Ahmad, Jesujoba Oluwadara Alabi, Martin Namukombo, Mbonu + Chinedu, Mofya Phiri, Neo Putini, Ndumiso Mngoma, Priscilla Amuok, R. Iro, Sonia + Adhiambo +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2305.06897 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +African languages have far less in-language content available digitally, making it challenging for question answering systems to satisfy the information needs of users. Cross-lingual open-retrieval question answering (XOR QA) systems -- those that retrieve answer content from other languages while serving people in their native language -- offer a means of filling this gap. To this end, we create AfriQA, the first cross-lingual QA dataset with a focus on African languages. AfriQA includes 12,000+ XOR QA examples across 10 African languages. While previous datasets have focused primarily on languages where cross-lingual QA augments coverage from the target language, AfriQA focuses on languages where cross-lingual answer content is the only high-coverage source of answer content. Because of this, we argue that African languages are one of the most important and realistic use cases for XOR QA. Our experiments demonstrate the poor performance of automatic translation and multilingual retrieval methods. Overall, AfriQA proves challenging for state-of-the-art QA models.
We hope that the dataset enables the development of more equitable QA technology. \ No newline at end of file diff --git a/_posts/papers/2023-05-18-2305.10971.md b/_posts/papers/2023-05-18-2305.10971.md new file mode 100644 index 00000000..e9a01760 --- /dev/null +++ b/_posts/papers/2023-05-18-2305.10971.md @@ -0,0 +1,23 @@ +--- +title: 'NollySenti: Leveraging Transfer Learning and Machine Translation for Nigerian + Movie Sentiment Classification' +venue: Annual Meeting of the Association for Computational Linguistics +names: Iyanuoluwa Shode, David Ifeoluwa Adelani, J. Peng, Anna Feldman +tags: +- Annual Meeting of the Association for Computational Linguistics +link: https://arxiv.org/abs/2305.10971 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Africa has over 2000 indigenous languages but they are under-represented in NLP research due to a lack of datasets. In recent years, there has been progress in developing labelled corpora for African languages. However, they are often available in a single domain and may not generalize to other domains. In this paper, we focus on the task of sentiment classification for cross-domain adaptation. We create a new dataset of Nollywood movie reviews for five languages widely spoken in Nigeria (English, Hausa, Igbo, Nigerian Pidgin, and Yoruba). We provide an extensive empirical evaluation using classical machine learning methods and pre-trained language models. By leveraging transfer learning, we compare the performance of cross-domain adaptation from the Twitter domain, and cross-lingual adaptation from the English language. Our evaluation shows that transfer from English in the same target domain leads to more than 5% improvement in accuracy compared to transfer from Twitter in the same language. To further mitigate the domain difference, we leverage machine translation from English to other Nigerian languages, which leads to a further improvement of 7% over cross-lingual evaluation. While machine translation to low-resource languages is often of low quality, our analysis shows that sentiment-related words are often preserved. \ No newline at end of file diff --git a/_posts/papers/2023-05-19-2305.11938.md b/_posts/papers/2023-05-19-2305.11938.md new file mode 100644 index 00000000..de56ce78 --- /dev/null +++ b/_posts/papers/2023-05-19-2305.11938.md @@ -0,0 +1,26 @@ +--- +title: 'XTREME-UP: A User-Centric Scarce-Data Benchmark for Under-Represented Languages' +venue: Conference on Empirical Methods in Natural Language Processing +names: Sebastian Ruder, J. Clark, Alexander Gutkin, Mihir Kale, Min Ma, M. Nicosia, + Shruti Rijhwani, Parker Riley, J. M. Sarr, Xinyi Wang, J. Wieting, Nitish Gupta, + Anna Katanova, Christo Kirov, Dana L. Dickinson, Brian Roark, Bidisha Samanta, Connie + Tao, David Ifeoluwa Adelani, Vera Axelrod, Isaac Caswell, Colin Cherry, Dan Garrette, + R. Ingle, Melvin Johnson, Dmitry Panteleev, P. Talukdar +tags:
- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2305.11938 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Data scarcity is a crucial issue for the development of highly multilingual NLP systems.
Yet for many under-represented languages (ULs) -- languages for which NLP research is particularly far behind in meeting user needs -- it is feasible to annotate small amounts of data. Motivated by this, we propose XTREME-UP, a benchmark defined by: its focus on the scarce-data scenario rather than zero-shot; its focus on user-centric tasks -- tasks with broad adoption by speakers of high-resource languages; and its focus on under-represented languages where this scarce-data scenario tends to be most realistic. XTREME-UP evaluates the capabilities of language models across 88 under-represented languages over 9 key user-centric technologies including ASR, OCR, MT, and information access tasks that are of general utility. We create new datasets for OCR, autocomplete, semantic parsing, and transliteration, and build on and refine existing datasets for other tasks. XTREME-UP provides methodology for evaluating many modeling scenarios including text-only, multi-modal (vision, audio, and text), supervised parameter tuning, and in-context learning. We evaluate commonly used models on the benchmark. We release all code and scripts to train and evaluate models. \ No newline at end of file diff --git a/_posts/papers/2023-05-23-2305.13989.md b/_posts/papers/2023-05-23-2305.13989.md new file mode 100644 index 00000000..6134d785 --- /dev/null +++ b/_posts/papers/2023-05-23-2305.13989.md @@ -0,0 +1,30 @@ +--- +title: 'MasakhaPOS: Part-of-Speech Tagging for Typologically Diverse African languages' +venue: Annual Meeting of the Association for Computational Linguistics +names: Cheikh M. Bamba Dione, David Ifeoluwa Adelani, Peter Nabende, Jesujoba Oluwadara + Alabi, Thapelo Sindane, Happy Buzaaba, Shamsuddeen Hassan Muhammad, Chris C. Emezue, + Perez Ogayo, Anuoluwapo Aremu, Catherine Gitau, Derguene Mbaye, Jonathan Mukiibi, + Blessing K. Sibanda, Bonaventure F. P. Dossou, Andiswa Bukula, Rooweither Mabuya, + A. Tapo, Edwin Munkoh-Buabeng, V. M. Koagne, F. Kabore, Amelia Taylor, Godson Kalipe, + Tebogo Macucwa, Vukosi Marivate, T. Gwadabe, Mboning Tchiaze Elvis, I. Onyenwe, + G. Atindogbé, T. Adelani, Idris Akinade, Olanrewaju Samuel, M. Nahimana, Théogène + Musabeyezu, Emile Niyomutabazi, Ester Chimhenga, Kudzai Gotosa, Patrick Mizha, Apelete + Agbolo, Seydou T. Traoré, C. Uchechukwu, Aliyu Yusuf, M. Abdullahi, D. Klakow +tags: +- Annual Meeting of the Association for Computational Linguistics +link: https://arxiv.org/abs/2305.13989 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +In this paper, we present AfricaPOS, the largest part-of-speech (POS) dataset for 20 typologically diverse African languages. We discuss the challenges in annotating POS for these languages using the universal dependencies (UD) guidelines. We conducted extensive POS baseline experiments using both conditional random field and several multilingual pre-trained language models. We applied various cross-lingual transfer models trained with data available in the UD. Evaluating on the AfricaPOS dataset, we show that choosing the best transfer language(s) in both single-source and multi-source setups greatly improves the POS tagging performance of the target languages, in particular when combined with parameter-fine-tuning methods.
Crucially, transferring knowledge from a language that matches the language family and morphosyntactic properties seems to be more effective for POS tagging in unseen languages. \ No newline at end of file diff --git a/_posts/papers/2023-07-03-2307.01163.md b/_posts/papers/2023-07-03-2307.01163.md new file mode 100644 index 00000000..256fc351 --- /dev/null +++ b/_posts/papers/2023-07-03-2307.01163.md @@ -0,0 +1,23 @@ +--- +title: Improving Language Plasticity via Pretraining with Active Forgetting +venue: Neural Information Processing Systems +names: Yihong Chen, Kelly Marchisio, Roberta Raileanu, David Ifeoluwa Adelani, Pontus + Stenetorp, Sebastian Riedel, Mikel Artetx +tags: +- Neural Information Processing Systems +link: https://arxiv.org/abs/2307.01163 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Pretrained language models (PLMs) are today the primary model for natural language processing. Despite their impressive downstream performance, it can be difficult to apply PLMs to new languages, a barrier to making their capabilities universally accessible. While prior work has shown it possible to address this issue by learning a new embedding layer for the new language, doing so is both data and compute inefficient. We propose to use an active forgetting mechanism during pretraining, as a simple way of creating PLMs that can quickly adapt to new languages. Concretely, by resetting the embedding layer every K updates during pretraining, we encourage the PLM to improve its ability of learning new embeddings within a limited number of updates, similar to a meta-learning effect. Experiments with RoBERTa show that models pretrained with our forgetting mechanism not only demonstrate faster convergence during language adaptation but also outperform standard ones in a low-data regime, particularly for languages that are distant from English. \ No newline at end of file diff --git a/_posts/papers/2023-07-29-2307.16071.md b/_posts/papers/2023-07-29-2307.16071.md new file mode 100644 index 00000000..9025750b --- /dev/null +++ b/_posts/papers/2023-07-29-2307.16071.md @@ -0,0 +1,23 @@ +--- +title: 'ÌròyìnSpeech: A Multi-purpose Yorùbá Speech Corpus' +venue: International Conference on Language Resources and Evaluation +names: Tolúlopé Ògúnrèmí, Kólá Túbosún, Aremu Anuoluwapo, Iroro Orife, David Ifeoluwa + Adelani +tags: +- International Conference on Language Resources and Evaluation +link: https://arxiv.org/abs/2307.16071 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +We introduce ÌròyìnSpeech corpus—a new dataset influenced by a desire to increase the amount of high quality, freely available, contemporary Yorùbá speech data that can be used for both Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) tasks. We curated about 23,000 text sentences from the news and creative writing domains with an open license i.e., CC-BY-4.0 and asked multiple speakers to record each sentence. To encourage more participatory approach to data creation, we provide 5 000 utterances from the curated sentences to the Mozilla Common Voice platform to crowd-source the recording and validation of Yorùbá speech data. 
In total, we created about 42 hours of speech data recorded by 80 volunteers in-house, and 6 hours of validated recordings on the Mozilla Common Voice platform. Our evaluation on TTS shows that we can create a good-quality, general-domain single-speaker TTS model for Yorùbá with as little as 5 hours of speech by leveraging an end-to-end VITS architecture. Similarly, for ASR, we obtained a WER of 21.5. \ No newline at end of file diff --git a/_posts/papers/2023-08-01-10.1016-j.patter.2023.100820.md b/_posts/papers/2023-08-01-10.1016-j.patter.2023.100820.md new file mode 100644 index 00000000..05969147 --- /dev/null +++ b/_posts/papers/2023-08-01-10.1016-j.patter.2023.100820.md @@ -0,0 +1,25 @@ +--- +title: Consultative engagement of stakeholders toward a roadmap for African language + technologies +venue: Patterns +names: Kathleen Siminyu, Jade Z. Abbott, Kólá Túbosún, Aremu Anuoluwapo, Blessing + K. Sibanda, Kofi A. Yeboah, David Ifeoluwa Adelani, Masabata Mokgesi-Selinga, Frederick + R. Apina, Angela Thandizwe Mthembu, A. Ramkilowan, Babatunde Oladimeji +tags: +- Patterns +link: https://doi.org/10.1016/j.patter.2023.100820 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2023-08-18-2308.09768.md b/_posts/papers/2023-08-18-2308.09768.md new file mode 100644 index 00000000..60228a73 --- /dev/null +++ b/_posts/papers/2023-08-18-2308.09768.md @@ -0,0 +1,22 @@ +--- +title: 'NaijaRC: A Multi-choice Reading Comprehension Dataset for Nigerian Languages' +venue: '' +names: Anuoluwapo Aremu, Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani +tags: +- '' +link: https://arxiv.org/abs/2308.09768 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +In this paper, we create NaijaRC: a new multi-choice Reading Comprehension dataset for three native Nigerian languages, based on high-school reading comprehension examinations. We provide baseline results by performing cross-lingual transfer using the existing English RACE and Belebele training datasets with a pre-trained encoder-only model. Additionally, we provide results by prompting large language models (LLMs) like GPT-4.
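The ÌròyìnSpeech entry above reports ASR quality as a WER of 21.5. For readers unfamiliar with the metric, the following is a minimal, self-contained sketch of how word error rate is conventionally computed (word-level edit distance divided by the number of reference words); it is illustrative only, not code from the paper, and the Yorùbá example strings are invented for the demonstration.

```python
# Minimal sketch of word error rate (WER), the metric reported for the
# ÌròyìnSpeech ASR experiments above. Illustrative only; not from the paper.
# WER = (substitutions + deletions + insertions) / number of reference words.

def word_error_rate(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = word-level edit distance between the first i reference
    # words and the first j hypothesis words.
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution or match
    return dp[len(ref)][len(hyp)] / max(len(ref), 1)

if __name__ == "__main__":
    # Invented example: 1 error over 5 reference words -> WER 0.2, i.e. 20.0 when
    # expressed as a percentage, comparable in form to the 21.5 reported above.
    print(word_error_rate("mo fẹ́ lọ sí ilé", "mo fẹ́ lọ ilé"))
```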
\ No newline at end of file diff --git a/_posts/papers/2023-09-14-2309.07445.md b/_posts/papers/2023-09-14-2309.07445.md new file mode 100644 index 00000000..307cae80 --- /dev/null +++ b/_posts/papers/2023-09-14-2309.07445.md @@ -0,0 +1,24 @@ +--- +title: 'SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic Classification + in 200+ Languages and Dialects' +venue: Conference of the European Chapter of the Association for Computational Linguistics +names: David Ifeoluwa Adelani, Hannah Liu, Xiaoyu Shen, Nikita Vassilyev, Jesujoba + Oluwadara Alabi, Yanke Mao, Haonan Gao, Annie En-Shiun Lee +tags: +- Conference of the European Chapter of the Association for Computational Linguistics +link: https://arxiv.org/abs/2309.07445 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Despite the progress in building multilingual language models, evaluation is often limited to a few languages with available datasets, which excludes a large number of low-resource languages. In this paper, we create SIB-200—a large-scale open-sourced benchmark dataset for topic classification in 205 languages and dialects to address the lack of evaluation datasets for Natural Language Understanding (NLU). For many of the languages covered in SIB-200, this is the first publicly available evaluation dataset for NLU. The dataset is based on the Flores-200 machine translation corpus. We annotated the English portion of the dataset and extended the sentence-level annotation to the remaining 204 languages covered in the corpus. Despite the simplicity of this task, our evaluation in the fully supervised setting, the cross-lingual transfer setting, and the large language model prompting setting shows that there is still a large gap between the performance of high-resource and low-resource languages when multilingual evaluation is scaled to numerous world languages. We found that languages unseen during the pre-training of multilingual language models, languages from under-represented families (like Nilotic and Atlantic-Congo), and languages from the regions of Africa, Americas, Oceania and South East Asia often have the lowest performance on our topic classification dataset. We hope our dataset will encourage a more inclusive evaluation of multilingual language models on a more diverse set of languages. \ No newline at end of file diff --git a/_posts/papers/2023-11-14-2311.07978.md b/_posts/papers/2023-11-14-2311.07978.md new file mode 100644 index 00000000..4473832b --- /dev/null +++ b/_posts/papers/2023-11-14-2311.07978.md @@ -0,0 +1,22 @@ +--- +title: How good are Large Language Models on African Languages? +venue: arXiv.org +names: Jessica Ojo, Kelechi Ogueji, Pontus Stenetorp, David Ifeoluwa Adelani +tags: +- arXiv.org +link: https://arxiv.org/abs/2311.07978 +author: Jessica Ojo +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Recent advancements in natural language processing have led to the proliferation of large language models (LLMs). These models have been shown to yield good performance, using in-context learning, even on tasks and languages they are not trained on. However, their performance on African languages is largely understudied relative to high-resource languages.
We present an analysis of four popular large language models (mT0, Aya, LLaMa 2, and GPT-4) on six tasks (topic classification, sentiment classification, machine translation, summarization, question answering, and named entity recognition) across 60 African languages, spanning different language families and geographical regions. Our results suggest that all LLMs produce lower performance for African languages, and there is a large gap in performance compared to high-resource languages (such as English) for most tasks. We find that GPT-4 has average to good performance on classification tasks, yet its performance on generative tasks such as machine translation and summarization is significantly lacking. Surprisingly, we find that mT0 had the best overall performance for cross-lingual QA, better than the state-of-the-art supervised model (i.e. fine-tuned mT5) and GPT-4 on African languages. Similarly, we find the recent Aya model to have comparable results to mT0 in almost all tasks except for topic classification, where it outperforms mT0. Overall, LLaMa 2 showed the worst performance, which we believe is due to its English and code-centric (around 98%) pre-training corpus. Our findings confirm that performance on African languages remains a hurdle for current LLMs, underscoring the need for additional efforts to close this gap. \ No newline at end of file diff --git a/_posts/papers/2023-11-16-2311.09828.md b/_posts/papers/2023-11-16-2311.09828.md new file mode 100644 index 00000000..95d6977f --- /dev/null +++ b/_posts/papers/2023-11-16-2311.09828.md @@ -0,0 +1,34 @@ +--- +title: 'AfriMTE and AfriCOMET: Enhancing COMET to Embrace Under-resourced African + Languages' +venue: North American Chapter of the Association for Computational Linguistics +names: Jiayi Wang, David Ifeoluwa Adelani, Sweta Agrawal, Marek Masiak, Ricardo Rei, + Eleftheria Briakou, Marine Carpuat, Xuanli He, Sofia Bourhim, Andiswa Bukula, Muhidin + A. Mohamed, Temitayo Olatoye, Tosin P. Adewumi, Hamam Mokayede, Christine Mwase, + Wangui Kimotho, Foutse Yuehgoh, Anuoluwapo Aremu, Jessica Ojo, Shamsuddeen Hassan + Muhammad, Salomey Osei, Abdul-Hakeem Omotayo, Chiamaka Chukwuneke, Perez Ogayo, + Oumaima Hourrane, Salma El Anigri, Lolwethu Ndolela, Thabiso Mangwana, Shafie Abdi + Mohamed, Ayinde Hassan, Oluwabusayo Olufunke Awoyomi, Lama Alkhaled, S. Al-Azzawi, + Naome A. Etori, Millicent Ochieng, Clemencia Siro, Samuel Njoroge, Eric Muchiri, + Wangari Kimotho, Lyse Naomi Wamba Momo, D. Abolade, Simbiat Ajao, Iyanuoluwa Shode, + Ricky Macharm, R. Iro, S. S. Abdullahi, Stephen E. Moore, Bernard Opoku, Zainab + Akinjobi, Abeeb Afolabi, Nnaemeka Obiefuna, Onyekachi Raphael Ogbu, Sam Brian, V. + Otiende, C. Mbonu, Sakayo Toadoum Sari, Yao Lu, Pontus Stenetorp +tags: +- North American Chapter of the Association for Computational Linguistics +link: https://arxiv.org/abs/2311.09828 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Despite the recent progress on scaling multilingual machine translation (MT) to several under-resourced African languages, accurately measuring this progress remains challenging, since evaluation is often performed on n-gram matching metrics such as BLEU, which typically show a weaker correlation with human judgments.
Learned metrics such as COMET have higher correlation; however, the lack of evaluation data with human ratings for under-resourced languages, complexity of annotation guidelines like Multidimensional Quality Metrics (MQM), and limited language coverage of multilingual encoders have hampered their applicability to African languages. In this paper, we address these challenges by creating high-quality human evaluation data with simplified MQM guidelines for error detection and direct assessment (DA) scoring for 13 typologically diverse African languages. Furthermore, we develop AfriCOMET: COMET evaluation metrics for African languages by leveraging DA data from well-resourced languages and an African-centric multilingual encoder (AfroXLM-R) to create the state-of-the-art MT evaluation metrics for African languages with respect to Spearman-rank correlation with human judgments (0.441). \ No newline at end of file diff --git a/_posts/papers/2024-04-03-2404.02534.md b/_posts/papers/2024-04-03-2404.02534.md new file mode 100644 index 00000000..3371cb6e --- /dev/null +++ b/_posts/papers/2024-04-03-2404.02534.md @@ -0,0 +1,23 @@ +--- +title: 'ANGOFA: Leveraging OFA Embedding Initialization and Synthetic Data for Angolan + Language Model' +venue: arXiv.org +names: Osvaldo Luamba Quinjica, David Ifeoluwa Adelani +tags: +- arXiv.org +link: https://arxiv.org/abs/2404.02534 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +In recent years, the development of pre-trained language models (PLMs) has gained momentum, showcasing their capacity to transcend linguistic barriers and facilitate knowledge transfer across diverse languages. However, this progress has predominantly bypassed the inclusion of very-low resource languages, creating a notable void in the multilingual landscape. This paper addresses this gap by introducing four tailored PLMs specifically finetuned for Angolan languages, employing a Multilingual Adaptive Fine-tuning (MAFT) approach. In this paper, we survey the role of informed embedding initialization and synthetic data in enhancing the performance of MAFT models in downstream tasks. We improve baseline over SOTA AfroXLMR-base (developed through MAFT) and OFA (an effective embedding initialization) by 12.3 and 3.8 points respectively. \ No newline at end of file diff --git a/_posts/papers/2024-04-28-2404.18180.md b/_posts/papers/2024-04-28-2404.18180.md new file mode 100644 index 00000000..f06320bb --- /dev/null +++ b/_posts/papers/2024-04-28-2404.18180.md @@ -0,0 +1,24 @@ +--- +title: 'EkoHate: Abusive Language and Hate Speech Detection for Code-switched Political + Discussions on Nigerian Twitter' +venue: WOAH +names: Comfort Eseohen Ilevbare, Jesujoba Oluwadara Alabi, David Ifeoluwa Adelani, + Firdous Damilola Bakare, O. B. Abiola, O. Adeyemo +tags: +- WOAH +link: https://arxiv.org/abs/2404.18180 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Nigerians have a notable online presence and actively discuss political and topical matters. This was particularly evident throughout the 2023 general election, where Twitter was used for campaigning, fact-checking and verification, and even positive and negative discourse. 
However, little or no work has been done on the detection of abusive language and hate speech in Nigeria. In this paper, we curated code-switched Twitter data directed at the three musketeers of the governorship election in the most populous and economically vibrant state in Nigeria, Lagos State, with the aim of detecting offensive speech in political discussions. We developed EkoHate—an abusive language and hate speech dataset for political discussions between the three candidates and their followers using a binary (normal vs offensive) and fine-grained four-label annotation scheme. We analysed our dataset and provided an empirical evaluation of state-of-the-art methods across both supervised and cross-lingual transfer learning settings. In the supervised setting, our evaluation results in both binary and four-label annotation schemes show that we can achieve 95.1 and 70.3 F1 points respectively. Furthermore, we show that our dataset transfers very well to three publicly available offensive datasets (OLID, HateUS2020, and FountaHate), generalizing to political discussions in other regions like the US. \ No newline at end of file diff --git a/_posts/papers/2024-04-28-2404.18286.md b/_posts/papers/2024-04-28-2404.18286.md new file mode 100644 index 00000000..0fa4bdd8 --- /dev/null +++ b/_posts/papers/2024-04-28-2404.18286.md @@ -0,0 +1,23 @@ +--- +title: Comparing LLM prompting with Cross-lingual transfer performance on Indigenous + and Low-resource Brazilian Languages +venue: AMERICASNLP +names: David Ifeoluwa Adelani, A. S. Dougruoz, André Coneglian, Atul Kr. Ojha +tags: +- AMERICASNLP +link: https://arxiv.org/abs/2404.18286 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Large Language Models are transforming NLP for many tasks. However, how LLMs perform NLP tasks for LRLs is less explored. In alliance with the theme track of the NAACL’24, we focus on 12 low-resource languages (LRLs) from Brazil, 2 LRLs from Africa and 2 high-resource languages (HRLs) (e.g., English and Brazilian Portuguese). Our results indicate that the LLMs perform worse for the labeling of LRLs in comparison to HRLs in general. We explain the reasons behind this failure and provide an error analysis through examples from 2 Brazilian LRLs. \ No newline at end of file diff --git a/_posts/papers/2024-04-30-2404.19442.md b/_posts/papers/2024-04-30-2404.19442.md new file mode 100644 index 00000000..07bd93f2 --- /dev/null +++ b/_posts/papers/2024-04-30-2404.19442.md @@ -0,0 +1,23 @@ +--- +title: 'Does Generative AI speak Nigerian-Pidgin?: Issues about Representativeness + and Bias for Multilingualism in LLMs' +venue: '' +names: David Ifeoluwa Adelani, A. S. Dougruoz, Iyanuoluwa Shode, Anuoluwapo Aremu +tags: +- '' +link: https://arxiv.org/abs/2404.19442 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Nigeria is a multilingual country with 500+ languages. Naija is a Nigerian-Pidgin spoken by approx. 120M speakers in Nigeria and it is a mixed language (e.g., English, Portuguese, Yoruba, Hausa and Igbo). Although it has mainly been a spoken language until recently, there are now various platforms publishing exclusively in Naija such as Naija Wikipedia.
However, it is hard for non-native speakers to distinguish Naija from a larger pidgin language spoken across West Africa known as West African Pidgin English (WAPE) -- which is more simplified and understandable by a wider audience in Ghana, Nigeria, and Cameroon. The BBC news platform publishes exclusively in WAPE to cater for several countries in West Africa. In our paper, we show through statistical analyses and Machine Translation experiments that these two creole varieties do not represent each other (i.e., there are linguistic differences in word order and vocabulary) and Generative AI operates only based on WAPE. In other words, Naija is under-represented in Generative AI, and it is hard to teach LLMs with few examples. \ No newline at end of file diff --git a/_posts/papers/2024-06-05-10.1038-d41586-024-00964-2.md b/_posts/papers/2024-06-05-10.1038-d41586-024-00964-2.md new file mode 100644 index 00000000..e09072b7 --- /dev/null +++ b/_posts/papers/2024-06-05-10.1038-d41586-024-00964-2.md @@ -0,0 +1,22 @@ +--- +title: Meta's AI translation model embraces overlooked languages. +venue: Nature +names: David Ifeoluwa Adelani +tags: +- Nature +link: https://doi.org/10.1038/d41586-024-00964-2 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +None \ No newline at end of file diff --git a/_posts/papers/2024-06-05-2406.03368.md b/_posts/papers/2024-06-05-2406.03368.md new file mode 100644 index 00000000..f56119c5 --- /dev/null +++ b/_posts/papers/2024-06-05-2406.03368.md @@ -0,0 +1,28 @@ +--- +title: 'IrokoBench: A New Benchmark for African Languages in the Age of Large Language + Models' +venue: arXiv.org +names: David Ifeoluwa Adelani, Jessica Ojo, Israel Abebe Azime, Zhuang Yun Jian, Jesujoba + Oluwadara Alabi, Xuanli He, Millicent Ochieng, Sara Hooker, Andiswa Bukula, En-Shiun + Annie Lee, Chiamaka Chukwuneke, Happy Buzaaba, Blessing K. Sibanda, Godson Kalipe, + Jonathan Mukiibi, Salomon Kabongo KABENAMUALU, Foutse Yuehgoh, M. Setaka, Lolwethu + Ndolela, N. Odu, Rooweither Mabuya, Shamsuddeen Hassan Muhammad, Salomey Osei, Sokhar + Samb, Tadesse Kebede Guge, Pontus Stenetorp +tags: +- arXiv.org +link: https://arxiv.org/abs/2406.03368 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Despite the widespread adoption of Large language models (LLMs), their remarkable capabilities remain limited to a few high-resource languages. Additionally, many low-resource languages (e.g. African languages) are often evaluated only on basic text classification tasks due to the lack of appropriate or comprehensive benchmarks outside of high-resource languages. In this paper, we introduce IrokoBench -- a human-translated benchmark dataset for 16 typologically-diverse low-resource African languages covering three tasks: natural language inference (AfriXNLI), mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU). We use IrokoBench to evaluate zero-shot, few-shot, and translate-test settings (where test sets are translated into English) across 10 open and four proprietary LLMs. Our evaluation reveals a significant performance gap between high-resource languages (such as English and French) and low-resource African languages.
We observe a significant performance gap between open and proprietary models, with the highest-performing open model, Aya-101, reaching only 58% of the performance of the best-performing proprietary model, GPT-4o. Machine translating the test set to English before evaluation helped to close the gap for larger models that are English-centric, like LLaMa 3 70B. These findings suggest that more efforts are needed to develop and adapt LLMs for African languages. \ No newline at end of file diff --git a/_posts/papers/2024-06-27-2406.19564.md b/_posts/papers/2024-06-27-2406.19564.md new file mode 100644 index 00000000..a55f2295 --- /dev/null +++ b/_posts/papers/2024-06-27-2406.19564.md @@ -0,0 +1,23 @@ +--- +title: 'Voices Unheard: NLP Resources and Models for Yorùbá Regional Dialects' +venue: Conference on Empirical Methods in Natural Language Processing +names: Orevaoghene Ahia, Anuoluwapo Aremu, Diana Abagyan, Hila Gonen, David Ifeoluwa + Adelani, D. Abolade, Noah A. Smith, Yulia Tsvetkov +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2406.19564 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Yoruba—an African language with roughly 47 million speakers—encompasses a continuum with several dialects. Recent efforts to develop NLP technologies for African languages have focused on their standard dialects, resulting in disparities for dialects and varieties for which there are little to no resources or tools. We take steps towards bridging this gap by introducing a new high-quality parallel text and speech corpus, YORULECT, across three domains and four regional Yoruba dialects. To develop this corpus, we engaged native speakers, traveling to communities where these dialects are spoken, to collect text and speech data. Using our newly created corpus, we conducted extensive experiments on (text) machine translation, automatic speech recognition, and speech-to-text translation. Our results reveal substantial performance disparities between standard Yoruba and the other dialects across all tasks. However, we also show that with dialect-adaptive finetuning, we are able to narrow this gap. We believe our dataset and experimental analysis will contribute greatly to developing NLP tools for Yoruba and its dialects, and potentially for other African languages, by improving our understanding of existing challenges and offering a high-quality dataset for further development. We will release the YORULECT dataset and models publicly under an open license. \ No newline at end of file diff --git a/_posts/papers/2024-07-14-2407.10152.md b/_posts/papers/2024-07-14-2407.10152.md new file mode 100644 index 00000000..9ed00d14 --- /dev/null +++ b/_posts/papers/2024-07-14-2407.10152.md @@ -0,0 +1,27 @@ +--- +title: 'Mitigating Translationese in Low-resource Languages: The Storyboard Approach' +venue: International Conference on Language Resources and Evaluation +names: Garry Kuwanto, E. Urua, Priscilla Amuok, Shamsuddeen Hassan Muhammad, Anuoluwapo + Aremu, V. Otiende, Loice Emma Nanyanga, T. Nyoike, A. D. Akpan, Nsima Ab Udouboh, + Idongesit Udeme Archibong, Idara Effiong Moses, Ifeoluwatayo A. Ige, Benjamin Ayoade + Ajibade, Olumide Benjamin Awokoya, Idris Abdulmumin, Saminu Mohammad Aliyu, R. + Iro, I.
Ahmad, Deontae Smith, Praise-EL Michaels, David Ifeoluwa Adelani, Derry Tanti + Wijaya, Anietie U Andy +tags: +- International Conference on Language Resources and Evaluation +link: https://arxiv.org/abs/2407.10152 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Low-resource languages often face challenges in acquiring high-quality language data due to the reliance on translation-based methods, which can introduce the translationese effect. This phenomenon results in translated sentences that lack fluency and naturalness in the target language. In this paper, we propose a novel approach for data collection by leveraging storyboards to elicit more fluent and natural sentences. Our method involves presenting native speakers with visual stimuli in the form of storyboards and collecting their descriptions without direct exposure to the source text. We conducted a comprehensive evaluation comparing our storyboard-based approach with traditional text translation-based methods in terms of accuracy and fluency. Human annotators and quantitative metrics were used to assess translation quality. The results indicate a preference for text translation in terms of accuracy, while our method demonstrates worse accuracy but better fluency in the focus language. \ No newline at end of file diff --git a/_posts/papers/2024-07-23-2407.16470.md b/_posts/papers/2024-07-23-2407.16470.md new file mode 100644 index 00000000..58f69492 --- /dev/null +++ b/_posts/papers/2024-07-23-2407.16470.md @@ -0,0 +1,24 @@ +--- +title: Machine Translation Hallucination Detection for Low and High Resource Languages + using Large Language Models +venue: Conference on Empirical Methods in Natural Language Processing +names: Kenza Benkirane, Laura Gongas, Shahar Pelles, Naomi Fuchs, Joshua Darmon, Pontus + Stenetorp, David Ifeoluwa Adelani, Eduardo Sánchez, Meta +tags: +- Conference on Empirical Methods in Natural Language Processing +link: https://arxiv.org/abs/2407.16470 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Recent advancements in massively multilingual machine translation systems have significantly enhanced translation accuracy; however, even the best-performing systems still generate hallucinations, severely impacting user trust. Detecting hallucinations in Machine Translation (MT) remains a critical challenge, particularly since existing methods excel with High-Resource Languages (HRLs) but exhibit substantial limitations when applied to Low-Resource Languages (LRLs). This paper evaluates sentence-level hallucination detection approaches using Large Language Models (LLMs) and semantic similarity within massively multilingual embeddings. Our study spans 16 language directions, covering HRLs and LRLs with diverse scripts. We find that the choice of model is essential for performance. On average, for HRLs, Llama3-70B outperforms the previous state of the art by as much as 0.16 MCC (Matthews Correlation Coefficient). However, for LRLs we observe that Claude Sonnet outperforms other LLMs on average by 0.03 MCC. The key takeaway from our study is that LLMs can achieve performance comparable to or even better than previously proposed models, despite not being explicitly trained for any machine translation task.
However, their advantage is less significant for LRLs. \ No newline at end of file diff --git a/_posts/papers/2024-12-01-2412.00948.md b/_posts/papers/2024-12-01-2412.00948.md new file mode 100644 index 00000000..99a09f76 --- /dev/null +++ b/_posts/papers/2024-12-01-2412.00948.md @@ -0,0 +1,26 @@ +--- +title: 'Uhura: A Benchmark for Evaluating Scientific Question Answering and Truthfulness + in Low-Resource African Languages' +venue: '' +names: Edward Bayes, Israel Abebe Azime, Jesujoba Oluwadara Alabi, Jonas Kgomo, Tyna + Eloundou, Elizabeth Proehl, Kai Chen, Imaan Khadir, Naome A. Etori, Shamsuddeen + Hassan Muhammad, Choice Mpanza, Igneciah Pocia Thete, D. Klakow, David Ifeoluwa + Adelani +tags: +- '' +link: https://arxiv.org/abs/2412.00948 +author: David Adelani +categories: Publications +layout: paper + +--- + +*{{ page.names }}* + +**{{ page.venue }}** + +{% include display-publication-links.html pub=page %} + +## Abstract + +Evaluations of Large Language Models (LLMs) on knowledge-intensive tasks and factual accuracy often focus on high-resource languages primarily because datasets for low-resource languages (LRLs) are scarce. In this paper, we present Uhura -- a new benchmark that focuses on two tasks in six typologically-diverse African languages, created via human translation of existing English benchmarks. The first dataset, Uhura-ARC-Easy, is composed of multiple-choice science questions. The second, Uhura-TruthfulQA, is a safety benchmark testing the truthfulness of models on topics including health, law, finance, and politics. We highlight the challenges of creating benchmarks with highly technical content for LRLs and outline mitigation strategies. Our evaluation reveals a significant performance gap between proprietary models (such as GPT-4o, o1-preview, and the Claude models) and open-source models (like Meta's LLaMA and Google's Gemma). Additionally, all models perform better in English than in African languages. These results indicate that LMs struggle with answering scientific questions and are more prone to generating false claims in low-resource African languages. Our findings underscore the necessity for continuous improvement of multilingual LM capabilities in LRL settings to ensure safe and reliable use in real-world contexts. We open-source the Uhura Benchmark and Uhura Platform to foster further research and development in NLP for LRLs.
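The hallucination-detection entry above reports gains of 0.16 and 0.03 MCC (Matthews Correlation Coefficient). As context, the sketch below shows how MCC is computed from the confusion matrix of binary labels; it is a generic illustration under the assumption of binary hallucination labels (1 = hallucinated, 0 = faithful) with invented example data, and is not taken from the paper's evaluation code.

```python
# Minimal sketch of the Matthews Correlation Coefficient (MCC) for binary
# hallucination labels (assumed convention: 1 = hallucinated, 0 = faithful).
# MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), in [-1, 1].
import math

def mcc(y_true, y_pred):
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    tn = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 0)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return 0.0 if denom == 0 else (tp * tn - fp * fn) / denom

if __name__ == "__main__":
    gold      = [1, 0, 1, 1, 0, 0, 1, 0]  # invented gold labels
    predicted = [1, 0, 0, 1, 0, 1, 1, 0]  # invented detector output
    print(round(mcc(gold, predicted), 3))  # 0.5
```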
\ No newline at end of file diff --git a/records/semantic_paper_ids_ignored.json b/records/semantic_paper_ids_ignored.json index d3c0aee4..c3a96524 100644 --- a/records/semantic_paper_ids_ignored.json +++ b/records/semantic_paper_ids_ignored.json @@ -1,8 +1,13 @@ [ "00c2ba51a53da5c340c3217eabab935a67abafa0", "00d7f652c4147163dbfd3aa60c97471e7afa4861", + "02dae5f4d413f4764a240616b0eba0bf124e3b3e", "04cea2d855c5bbc5e7ddeed47a3ad8ecc54d4f5c", + "05c8f6bd9eb30faeb2a61d61e8dc524dd3f83417", + "063d9fa4861356500219b7e81d5a654aa921da6f", "09a34aad92e6f416c81f47e60de0809616b49cce", + "09bbcaf6cc1d3102359061ae7321b583be21a5b5", + "0abc3c8485fcee93a358d8bb2a2a4e8470d0126b", "0b483b550b21ec42d693fc04a372dbb10dd07019", "0c1987de3b65d1b082edf13fbb41717e497966be", "0c3c87e7d106fc1a6a32e146b0c6ffe0fec43c9a", @@ -11,9 +16,13 @@ "0e8ab0fd2c619e32d7b6608f3f0d80fe417e088d", "0f394eb0cdf4a8b3802f96008b2dfaf692e0bf1d", "100474bfb6ce0d23e7db9e4e905c074eaec31b50", + "13c6b2da558f8f77af78298ffa18eaf769262c6d", + "14023dc6bb8ebf7319013aa31d01bbe6a9c7cafe", "1442f16cf4e1e7b7914c67b56bf9455d79faeb08", + "1541bc9e588bfcd4bf365c868fa2f11461896980", "15f071386522099eaf39434452621d8750e8f5a8", "16992445d3114d27fb64d2d00cd35b421bef7930", + "16c64f74ce0e6a59b0709c0d8e66596a5bc08ed6", "17c6d7e79b566279afb869fee262467e6370f43a", "184e817b4bc3f17c7a6dc95a1818dc085ced0025", "1854e1a9588ac2756a5c49168d992aa831cafa6c", @@ -23,6 +32,8 @@ "1949df8dc876e2e1919640cd03242a832b3bfcb2", "19ec04f56a6580fa7f358ee9cf3af540d95aa6a4", "19fce084bb8086e1c5150c74ce6ce03d369b7058", + "1a2b7cd531e85450b83c595e85115139152c6741", + "1a92c78f09c55498d2377b4d6ccebafc36b38a4c", "1a9be52f8d728ff70cdfc67c42d83abf1551f63b", "1af678b040ce638aedf8b582212937f0921ccc1d", "1bc3c2b305d0b508caa2a39f4663c6e79402c9e1", @@ -32,21 +43,27 @@ "20a3ab365ad179dcb0dc19c5dbcdede772a2bcb8", "222f289cb96ac4dfef7849cd068af6af02233c52", "24182dd104351de9c66848b68b3f6336256f3f89", + "24ac368d08765dfad920ceefb79fba7bfe81d83c", "259cf65eeae13861031f44cf906d43b155192b10", "2677f411aae496be93ee70bcbf0eb0e949c13e0c", + "278fe1c4ccffc1a06e818a2d6154a37687545995", "27fd24efc03fc9ec548d9f32ba93542addb7f26b", "28761feb82f380379538ac108b3bb7515d90b042", + "2ac19d63e1adba20473a6d1122c598f81efc3c58", "2b2090eab4abe27e6e5e4ca94afaf82e511b63bd", "2cc134293669b20dce3d55a67d08fea665745e7b", + "2ef570ac8db5c7e5192334f31675cc2fd7b6622a", "2f05cb3dd8194276aa26c4e71841a86edb51914f", "2f0d94cec0e46493d2b877cdacd12154ee5ed5db", "2f545cbddeeded0afd605da46183df664d136993", "2fbdb720c934bfc3bde0d27b3c94726844bac5cc", "302a691914b1e000ba260f88e6859d1b0ae35557", "3155733e8e281a09ac9246cdc9b0f0e9b67d78d7", + "3172eb17c6984273f26b4b16d179b0fc13e64732", "32580b1814416e7dfaf6f569302441046c1ac39e", "340b59e6ee93d30c055b5e89a7cfbc88874c9958", "34503c0b6a615124eaf82cb0e4a1dab2866e8980", + "34c2939d3147946b2ac218e7857e1bc4c8902679", "34e3f34d95a895891a8ae628eb3d913c034bed32", "36b1ce08de25a44c056ffe4605772e8a2167d9a9", "36cd9d81c66dea2b1ec46ae0b7c1481df2f85ff0", @@ -54,13 +71,17 @@ "37e06f3622c17dc6194b547c944462b2a513b878", "389596027e577fb28ea5e1cc313b4f3d610cffd3", "3937b11717c22f62ab0b48dfd89e5dab75cedf40", + "39bca01efce8765f0a5d3a8981bc30d56f196b96", "3a23eb4c2f2cebc8685bd9d01b3d7afed110b42a", "3c6d41ee2c3c78509b6a47e0d097087b99006ca6", "408cc1103ab953a26c7071ffd9ce808469f77a01", "417d9c76b6f7c17990e35fb502ca61983e552d4d", + "432eb11c275a4891e06c144a9854bb0db63973c1", + "442aea74b7727babbd303a737d9b29093ced0bdd", "4488b383c3b733141bbc98af6c6e6d63391e15e9", "4711b43cc86d9a10fe930bd8f626df2c1cd42ad0", 
"47f7b4b8a3f8c2b96091e4762b091f1d9e79cac6", + "481d29cc4cd7d5d41faed2c3a84bbb19ebc5b027", "48c2a3bac779443e37c72285d71d11d9cea64e42", "48fac0d1f5a47941d8b85bfb90e9b32433c273a9", "49ebaefd64b48d071bfb0b0c5b1ec4df306f1a35", @@ -70,11 +91,14 @@ "4dc2cfc7c3cb74254126c9895152035a4dad913a", "4dd1b575a6ac74bfe9fab2e40c2cf1567c82fc6e", "4e6c7cf8eba4a7d24939ff77afd902f59b5d8e82", + "4e96e6d2abc54bc55d5489a9e5a666f273b69410", "4ebb884f9596c3e2b53108ed02169f1574bd1643", "4f13ce21dba20b3cd7e20ab15f2b6b3355c80147", "4fcfe83c05402b5c5fb6e853082e74af6379d7f9", "511f44611a812b59184598767ea63c87afaddbe8", + "524aa9a70db6cdd57aeed0b7b0e2575231c590cb", "5340b98af4349567189333209a95aedae988f6bc", + "53cb8e303ea3ccdcf9afbd73c3011e4bb403fa76", "5411d9be13f73954715a56f1e25a686bcf296806", "544e342f3f263e49ad09d792f411410581ebd0f2", "55210ad2c71839d85653e2237e13403960c35b15", @@ -94,13 +118,18 @@ "65446173fb539694b320cfc81edb03d9ee65d81f", "659c2eaf437f7750eede5798db87e156c9017963", "66a3ec99a9eef1d14c54b3e39b83a94a523a428a", + "673094a4ee36d3e8fc4f739fc2006756e24ce0ef", + "694495885a25ec5e39b280c93f7b3bb151f190ac", "6946cb4d7027904b98a12285ef5b06b08b5a8cd1", + "6a39fcaae3eee1277df9f1191b8bab1b11732c25", "6bfd167ae276c6206cbe8f4726e843f32eaeabba", "6c3b1efa290f67048055b1d8e5ea803e4b7b588e", "6d4a9f1c41b078846901362ba0dce8295dd6a2a8", "6df1af429151da5f67173409ba564298fc551e60", "6e6983417939dbcb04a50a46a489ce6bbfe8aa9d", "6f6e2e0311589a9af045f6acd00b7dee6d19fce4", + "71cf75ea133313897a272841809c3ef6c17c72bf", + "7259ca612de44763678a30114b02d37c00280c86", "7269d4721ca2b6b555bee86aad97f562fa5cd9ac", "72d862256f707613a3c16cc79e490a69151d73bf", "732020be199519fb197a1f2839c6f91ef0583ca7", @@ -109,33 +138,47 @@ "7717ef7ae58f1969c3758b5ff4dc2ffce17088d1", "780356ea2a4b7758f0e53173fa44357ab2ccb592", "781e9f69c290e9c487f5e331a87b089120fc401c", + "794e3de4e59812f824d07f785c1a982cb09bb987", "7b33745e9025c08e51fa45238d73374d6f7f92e5", + "7edbe97556bcc2bbf36b47844d46c141b2217840", "7f13e66231c96f34f8de2b091e5b5dafb5db5327", "8008348e87d3904842a2dd230c14b83112e8bf48", + "80a2db0459ba26bd8422b43078a6280a19835d1f", "80b583413b6e4ce0ccef908612634ef9e0d06531", + "81adffb171f3d7d3869830812058cdaba5665801", "822cbd971bd4a9b31c41d48c3a400109392096e9", "826ff1912a768386662a446176aad7ed424b9086", "82d60ef4c9439ff24a4ebdd1b6eab59396a6a2ce", "86288e07187978b99f7dec4a40d0cd80c07cf212", "86f22c32e6ea59659732b33aba1a786125e6f585", + "86fd9ffb2a3e2cf267deb8c5a7dfcb089c1db44a", "87c3018f20adf046386d97975a501046df06f42d", "88325569e99652ce8c7feb60d78669da9a9d1ac8", "888f7af7173853131c672d728d248577cf23e25e", "898b14509593d235414df054527b7702e35c3099", "89cb62dc83c1b1895267bd28639fbf5bb7ed21a4", "8a0e8e3a495400220a34b3e0a78fe1534c1fbd5b", + "8a930572177545e7394ba5cd03e9342142da564e", + "8ab1f774befd453364f65272f0b8ac5689cc4a96", "8cc3d39f61d705331ecb4e66bda9b38e424c8f72", "8df1226c9e4ae891f1759983a3fb9603f1519ee7", + "8e5aaa09f2c9a08e9343754c81a2310ba2d49ec3", "8e7f0ab1570b4d5b96b7813c1f571bbfeec6f098", + "8edbd09453dc78e485fc75c6030adcac1c318dca", "8ef7812c465d99da6cd33fd007295102dce0ceb7", "8f76a0312dcbed23773e3e42bcf0b5ed534f6f5a", "90a35136bfdb158370cf5e7e6bc70fdb77fb5c8b", "91d422ddb9be5a1b70c36f20e18e164973c8a069", + "91de6f4bca502193caf98527fbcc6d0563eed319", + "92e9acc55013a7408559d5e203eb913378563377", "9408d007064bd887f1e87e71c75f7e78ac5efb09", + "94f452403107ec67f19ccb0fa06cbe28b400864e", + "964bd39b546f0f6625ff3b9ef1083f797807ef2e", "97833e2aa0da5240e62436373b58af988a4ab6ab", "990a7b4eceedb6e053e6386269481bdfc42a1094", 
"99704b4fbe133f556e0c1a887b7e416bacfa5f94", "99ceef1e4a7e4fbe994166b022045b5a899aa22f", + "9a2f47777b99a92effb4e998b7082e1e92ae13bc", "9a57d9234eeb5570255910b29c187b9ce43d64e1", "9c7d0596c2e5da6b7923aaa55fb3ddab6324b9c4", "9c848575f94aa8c6d03e095367e0abde80622c28", @@ -147,21 +190,28 @@ "a3b7d461b2e834ae64ec7747adc24cf4fdd660ae", "a44fd7e741fb68e8c2b341f3760cf1cbbc92241d", "a48abad56acb085fe180c76a40d361aacd0dc049", + "a517575328ca3b8289fa95bd9f71669e1cf7127a", "a60073ac37698d08db15735a5a1a8f3218cb4937", "a6d45461b69cfd420708c16c18212dc68820eb7a", "a7a75721c4ad21ac1ac1118de89690ebe2f99f0d", "a85c6a003450ef1e6caed8a6494301ad581957ee", "a91c36cc1477d06637d637e33b18da90f40b4215", + "a924a2207906fb192a3ebd6ad0208a5bae684335", "aa2256899552328b09ad3c3f27b4f6dfc67a8a26", + "aa931eaa5d4ff2d7a11a668a4b234e976691ca5e", "ab158a20229999d9d437c23fb194e05deb2801c1", "ab3ddd3f33baa1086a95de49463e6e393c65342c", "ab5c71514254a1b2b3e2df31123dbc0257f4364d", + "ac81c7325ae4c9aa72eaefb2a4a978f5ba60a107", "ad9232f48dc90540b28bbf3f3598dbc97d90b54f", "ae6b281e5732876ce72648c1440f29c96319facf", "af9d67bad068a77d165e145368e98bf7bd7cce72", "afac807436c5bf90861ae46294d25b7c9360f60c", "b181a887f4977a58de209edd694ed72237e5f640", + "b1f69004b3c7d409d55fea21441b1e3a4e8940dd", "b26d1b138d096f8d201239dd2cdac060ff18667b", + "b57da3ccf214e8dad49116c8db9590c2c89629f5", + "b5b371295e7450df66aa6431c657baaf735290a1", "b5baedd5b7c270903e6861bebbfda81b10d59419", "b680558fdb76ba37c7337ce5fe3c929a7e23a677", "b7ec5ff2ebe38c2d0191ad0a728e1726d200f645", @@ -169,29 +219,39 @@ "b86d71ceca26fbb747093aeb5669e3df9a9fbee2", "b9a21c2bf389ba693cd4692a028c7f2821b1804e", "babeda48b10a4d638252118f2238d05a06f4ec55", + "bbcf7f15c4f470bc765ac5e8520a99b42e193166", "bc1122aaef8d7fbc5714ab78448a01ceb3779f1c", + "bc342e5c42f3a7640eea65616adbba8673d6c555", "bcb651d73447d96be58db5fac6fb13324842b351", + "beb70691b9abebfff63709a6ce06bf6a9cc96a65", + "c1e2455d56b9475bdb242f88ffc0edcd67dc9487", "c25245af4128a115a1056f7aa82d1cd0f883652f", "c2fe18041e08ba360f21240e17a15f7b140660e9", "c3a45b0fc4154a5a47579c2b1d4b449c9e1aef88", + "c3cfcd42b76ec6a18376f3975198c76e964b664f", "c4b817e2d8570153657d4a0d91188f1a9fb05f54", "c50143bbe3a913950f9c07c30f58fa4782e775c4", "c5b9209508a36b869ba75336e6a02b147030412e", "c5db886ea95aa06b83e78bbb1e7cdd0e3f580cd5", "c72cdb5ce7e0911c7f442ab503652d6fdeef35e0", "c8206c0450c6928614577899b92fa389365c423d", + "c9bc6026f5f7ebd055d5c67a75e6f32b1e1d5c78", "cd02e0a094953077217e2e62f3557b36a365acff", "ce9d13d236dea4399cca0cce4cee828205f9cec5", + "cf2b6b14c93602132bb6e902335e03b62afe339f", "cf54b66a05bb6eb0ebe93d9ef48da956cb0beea6", "cfea03a2245c0a53ac708772a654ec6cf80c8ac4", "d0336e5be72e97e0493b1ba77ef8ec3c349d496a", "d11d13867594d7b7f2b7b97053e57e8418b2dbf7", "d1b480b7c13f340583e4506442d47bb3125c2d26", "d2ed783705fa0ad3ceec2a22fb1592b8d2b6cb38", + "d31e10ed07f899c4c8c0dae8503aab452e4636d7", "d4550863c9b4102472a2326ab994aafdb13de7b9", "d4f578d99d5e0356f5336d99284681da85006e7d", + "d6802054111e37b6d6b517fbe1cddf394cef76c7", "d68255e8210843118d641175105e69686ad5b40f", "d838f425c2e5e7b0fabb4ac108fc3f57bb4a85c0", + "d847ab7f4109d0a4c640d5ee34b510a76002fddb", "d9bcd2f3227859a2cc68157da3d69c01b067ee47", "da5fcb26c830663b79c9aa1c550ae62e7725fcad", "dab79db635f83640a528c36b854094acad1afdc1", @@ -199,20 +259,29 @@ "dd5d28bf383eed44f758ae16cce46f2deceae41b", "de17bdef43b2a7ac75447c494b9f791d951a6b27", "de6807676d8171472ed6cf421c4e4ed3cbb47699", + "de9fcbc454d619cde5d89fb54eb16d44e6a0c2c7", + "dfa52ddc7277eb55a140be51cc9d34094fea4cca", 
"e1385746eb8ea97c768f639de67ada2592c371fd", "e15ab67b1c84c69f41de2aac9f2da93078712074", + "e1fb0eb6fe6de9cbc23242baef475fb9b0b70f62", "e3919e94c811fd85f5038926fa354619861674f9", + "e3d93be97c8f8a1ecc251c51b421bd31437f4148", "e468a3b84707e9828bac47aef37815d5b4818a19", "e507aa05192ca5565d9f9ab2c50710aed01a7652", "e66b88e4dc9a6d5896748f9f40f7bbb5e67b0645", + "e6a18267b5a50f4fbee72ce364a7983b8d1a0ff9", "e7e7445967768a168ba135e403477a25383f4d7c", "e874932b6a06171e35363dc9fe1d77936226028e", "e903e9ef82d1d6629d31ab90a78c5f3e0de244a0", + "e9c965749933a12dd4b3bc275ce28cd9fb03328f", "e9f28c98a00766e484810598886cf48b0de66cfa", + "ecd9168526d1a82ac2348c8de52bff6323322da9", "ecfcd1c4b4b2f1dff05bcbd472a377dfa1b7f0e8", "ed5483d0669ae3f7146d432119f6540e461914e8", + "ee79b4da7556cdd69ca188af6d6e883c9e94fa87", "eea16dfc29f0521dd547e67a84af4ff95a9c5529", "eea70d18ff7a4f7e7788d48bf07ec9d40460868f", + "ef2c8f016c5af400bbb0406c4c2d44b2505621ae", "efd8f6ed610ac94b46161a00c23a4ea8a2370052", "f01ed9336b6d98a51055a0b33800aa9a8bcc40a6", "f0695d518edf3abee3b38ee90ee09205e2ad7c10", @@ -226,7 +295,10 @@ "fa7cf61a0a2b79633b275a4a7cff739a34607895", "faedf875dbec7b2fc9b31a8f2aaa64fb7b8d1dec", "faee0c81a1170402b149500f1b91c51ccaf24027", + "fb1d5cbeb9f52b02b977e141f7eb02fbfb6c00a4", + "fb5273032ab997dddd1b2917ed41808fd66fb909", "fcf25e1affc2f8ee5bb49d156f174e9769234deb", + "fdc0d45e08319a3078775f92048543d16f4f6e8a", "fdef97eefdd979c9cbb55ed7d0325c8e56613e1e", "fe6faa5b0fd267c7c2b99d688aa8ec40e6d47060", "feef26f7932bf16f86a043aeb0651ed45e0ece09"