@inproceedings{talafha-etal-2020-multi,
title = "Multi-dialect {A}rabic {BERT} for Country-level Dialect Identification",
author = "Talafha, Bashar and
Ali, Mohammad and
Za{'}ter, Muhy Eddin and
Seelawi, Haitham and
Tuffaha, Ibraheem and
Samir, Mostafa and
Farhan, Wael and
Al-Natsheh, Hussein",
editor = "Zitouni, Imed and
Abdul-Mageed, Muhammad and
Bouamor, Houda and
Bougares, Fethi and
El-Haj, Mahmoud and
Tomeh, Nadi and
Zaghouani, Wajdi",
booktitle = "Proceedings of the Fifth Arabic Natural Language Processing Workshop",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wanlp-1.10",
pages = "111--118",
abstract = "Arabic dialect identification is a complex problem for a number of inherent properties of the language itself. In this paper, we present the experiments conducted, and the models developed by our competing team, Mawdoo3 AI, along the way to achieving our winning solution to subtask 1 of the Nuanced Arabic Dialect Identification (NADI) shared task. The dialect identification subtask provides 21,000 country-level labeled tweets covering all 21 Arab countries. An unlabeled corpus of 10M tweets from the same domain is also presented by the competition organizers for optional use. Our winning solution itself came in the form of an ensemble of different training iterations of our pre-trained BERT model, which achieved a micro-averaged F1-score of 26.78{\%} on the subtask at hand. We publicly release the pre-trained language model component of our winning solution under the name of Multi-dialect-Arabic-BERT model, for any interested researcher out there.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="talafha-etal-2020-multi">
<titleInfo>
<title>Multi-dialect Arabic BERT for Country-level Dialect Identification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Talafha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhy</namePart>
<namePart type="given">Eddin</namePart>
<namePart type="family">Za’ter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haitham</namePart>
<namePart type="family">Seelawi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibraheem</namePart>
<namePart type="family">Tuffaha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mostafa</namePart>
<namePart type="family">Samir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wael</namePart>
<namePart type="family">Farhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hussein</namePart>
<namePart type="family">Al-Natsheh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Arabic Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abdul-Mageed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fethi</namePart>
<namePart type="family">Bougares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahmoud</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabic dialect identification is a complex problem for a number of inherent properties of the language itself. In this paper, we present the experiments conducted, and the models developed by our competing team, Mawdoo3 AI, along the way to achieving our winning solution to subtask 1 of the Nuanced Arabic Dialect Identification (NADI) shared task. The dialect identification subtask provides 21,000 country-level labeled tweets covering all 21 Arab countries. An unlabeled corpus of 10M tweets from the same domain is also presented by the competition organizers for optional use. Our winning solution itself came in the form of an ensemble of different training iterations of our pre-trained BERT model, which achieved a micro-averaged F1-score of 26.78% on the subtask at hand. We publicly release the pre-trained language model component of our winning solution under the name of Multi-dialect-Arabic-BERT model, for any interested researcher out there.</abstract>
<identifier type="citekey">talafha-etal-2020-multi</identifier>
<location>
<url>https://aclanthology.org/2020.wanlp-1.10</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>111</start>
<end>118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-dialect Arabic BERT for Country-level Dialect Identification
%A Talafha, Bashar
%A Ali, Mohammad
%A Za’ter, Muhy Eddin
%A Seelawi, Haitham
%A Tuffaha, Ibraheem
%A Samir, Mostafa
%A Farhan, Wael
%A Al-Natsheh, Hussein
%Y Zitouni, Imed
%Y Abdul-Mageed, Muhammad
%Y Bouamor, Houda
%Y Bougares, Fethi
%Y El-Haj, Mahmoud
%Y Tomeh, Nadi
%Y Zaghouani, Wajdi
%S Proceedings of the Fifth Arabic Natural Language Processing Workshop
%D 2020
%8 December
%I Association for Computational Linguistics
%C Barcelona, Spain (Online)
%F talafha-etal-2020-multi
%X Arabic dialect identification is a complex problem for a number of inherent properties of the language itself. In this paper, we present the experiments conducted, and the models developed by our competing team, Mawdoo3 AI, along the way to achieving our winning solution to subtask 1 of the Nuanced Arabic Dialect Identification (NADI) shared task. The dialect identification subtask provides 21,000 country-level labeled tweets covering all 21 Arab countries. An unlabeled corpus of 10M tweets from the same domain is also presented by the competition organizers for optional use. Our winning solution itself came in the form of an ensemble of different training iterations of our pre-trained BERT model, which achieved a micro-averaged F1-score of 26.78% on the subtask at hand. We publicly release the pre-trained language model component of our winning solution under the name of Multi-dialect-Arabic-BERT model, for any interested researcher out there.
%U https://aclanthology.org/2020.wanlp-1.10
%P 111-118
Markdown (Informal)
[Multi-dialect Arabic BERT for Country-level Dialect Identification](https://aclanthology.org/2020.wanlp-1.10) (Talafha et al., WANLP 2020)
ACL
- Bashar Talafha, Mohammad Ali, Muhy Eddin Za’ter, Haitham Seelawi, Ibraheem Tuffaha, Mostafa Samir, Wael Farhan, and Hussein Al-Natsheh. 2020. Multi-dialect Arabic BERT for Country-level Dialect Identification. In Proceedings of the Fifth Arabic Natural Language Processing Workshop, pages 111–118, Barcelona, Spain (Online). Association for Computational Linguistics.