@inproceedings{sadraeijavaheri-etal-2024-transformers,
title = "Transformers for Bridging {P}ersian Dialects: Transliteration Model for Tajiki and {I}ranian Scripts",
author = "SadraeiJavaheri, MohammadAli and
Asgari, Ehsaneddin and
Rabiee, Hamid Reza",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1459",
pages = "16770--16775",
abstract = "In this study, we address the linguistic challenges posed by Tajiki Persian, a distinct variant of the Persian language that utilizes the Cyrillic script due to historical {``}Russification{''}. This distinguishes it from other Persian dialects that adopt the Arabic script. Despite its profound linguistic and cultural significance, Tajiki Persian remains a low-resource language with scant digitized datasets for computational applications. To address this deficiency, we created a parallel corpus using Shahnameh, a seminal Persian epic poem. Employing optical character recognition, we extracted Tajiki Persian verses from primary sources and applied a heuristic method to align them with their Iranian Persian counterparts. We then trained and assessed transliteration models using two prominent sequence-to-sequence architectures: GRU with attention and transformer. Our results underscore the enhanced performance of our models, particularly in contrast to pre-trained large multilingual models like GPT-3.5, emphasizing the value of dedicated datasets in advancing computational approaches for underrepresented languages. With the publication of this work, we are disseminating, for the first time, a vast collection of Persian poetry spanning 1000 years, transcribed in Tajiki scripts for the benefit of the Tajiki-speaking communities. The dataset, along with the model{'}s code and checkpoints, is accessible at https://github.com/language-ml/Tajiki-Shahname, marking a significant contribution to computational linguistic resources for Tajiki Persian.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sadraeijavaheri-etal-2024-transformers">
<titleInfo>
<title>Transformers for Bridging Persian Dialects: Transliteration Model for Tajiki and Iranian Scripts</title>
</titleInfo>
<name type="personal">
<namePart type="given">MohammadAli</namePart>
<namePart type="family">SadraeiJavaheri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsaneddin</namePart>
<namePart type="family">Asgari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamid</namePart>
<namePart type="given">Reza</namePart>
<namePart type="family">Rabiee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this study, we address the linguistic challenges posed by Tajiki Persian, a distinct variant of the Persian language that utilizes the Cyrillic script due to historical “Russification”. This distinguishes it from other Persian dialects that adopt the Arabic script. Despite its profound linguistic and cultural significance, Tajiki Persian remains a low-resource language with scant digitized datasets for computational applications. To address this deficiency, we created a parallel corpus using Shahnameh, a seminal Persian epic poem. Employing optical character recognition, we extracted Tajiki Persian verses from primary sources and applied a heuristic method to align them with their Iranian Persian counterparts. We then trained and assessed transliteration models using two prominent sequence-to-sequence architectures: GRU with attention and transformer. Our results underscore the enhanced performance of our models, particularly in contrast to pre-trained large multilingual models like GPT-3.5, emphasizing the value of dedicated datasets in advancing computational approaches for underrepresented languages. With the publication of this work, we are disseminating, for the first time, a vast collection of Persian poetry spanning 1000 years, transcribed in Tajiki scripts for the benefit of the Tajiki-speaking communities. The dataset, along with the model’s code and checkpoints, is accessible at https://github.com/language-ml/Tajiki-Shahname, marking a significant contribution to computational linguistic resources for Tajiki Persian.</abstract>
<identifier type="citekey">sadraeijavaheri-etal-2024-transformers</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1459</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>16770</start>
<end>16775</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Transformers for Bridging Persian Dialects: Transliteration Model for Tajiki and Iranian Scripts
%A SadraeiJavaheri, MohammadAli
%A Asgari, Ehsaneddin
%A Rabiee, Hamid Reza
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F sadraeijavaheri-etal-2024-transformers
%X In this study, we address the linguistic challenges posed by Tajiki Persian, a distinct variant of the Persian language that utilizes the Cyrillic script due to historical “Russification”. This distinguishes it from other Persian dialects that adopt the Arabic script. Despite its profound linguistic and cultural significance, Tajiki Persian remains a low-resource language with scant digitized datasets for computational applications. To address this deficiency, we created a parallel corpus using Shahnameh, a seminal Persian epic poem. Employing optical character recognition, we extracted Tajiki Persian verses from primary sources and applied a heuristic method to align them with their Iranian Persian counterparts. We then trained and assessed transliteration models using two prominent sequence-to-sequence architectures: GRU with attention and transformer. Our results underscore the enhanced performance of our models, particularly in contrast to pre-trained large multilingual models like GPT-3.5, emphasizing the value of dedicated datasets in advancing computational approaches for underrepresented languages. With the publication of this work, we are disseminating, for the first time, a vast collection of Persian poetry spanning 1000 years, transcribed in Tajiki scripts for the benefit of the Tajiki-speaking communities. The dataset, along with the model’s code and checkpoints, is accessible at https://github.com/language-ml/Tajiki-Shahname, marking a significant contribution to computational linguistic resources for Tajiki Persian.
%U https://aclanthology.org/2024.lrec-main.1459
%P 16770-16775
Markdown (Informal)
[Transformers for Bridging Persian Dialects: Transliteration Model for Tajiki and Iranian Scripts](https://aclanthology.org/2024.lrec-main.1459) (SadraeiJavaheri et al., LREC-COLING 2024)
ACL