% Conference-paper record for the IndicVoices dataset paper (Findings of ACL 2024).
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing styles keep "IndicVoices" and "Indian" capitalised.
% NOTE(review): the address field holds "Bangkok, Thailand", which looks like the
% conference venue rather than the publisher's city -- confirm against the target style.
@inproceedings{javed-etal-2024-indicvoices,
  title     = {{IndicVoices}: Towards building an Inclusive Multilingual Speech Dataset for {Indian} Languages},
  author    = {Javed, Tahir and
               Nawale, Janki and
               George, Eldho and
               Joshi, Sakshi and
               Bhogale, Kaushal and
               Mehendale, Deovrat and
               Sethi, Ishvinder and
               Ananthanarayanan, Aparna and
               Faquih, Hafsah and
               Palit, Pratiti and
               Ravishankar, Sneha and
               Sukumaran, Saranya and
               Panchagnula, Tripura and
               Murali, Sunjay and
               Gandhi, Kunal and
               R, Ambujavalli and
               M, Manickam and
               Vaijayanthi, C and
               Karunganni, Krishnan and
               Kumar, Pratyush and
               Khapra, Mitesh},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Findings of the {Association for Computational Linguistics}: {ACL} 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.639},
  doi       = {10.18653/v1/2024.findings-acl.639},
  pages     = {10740--10782},
  abstract  = {We present INDICVOICES, a dataset of natural and spontaneous speech containing a total of 7348 hours of read (9{\%}), extempore (74{\%}) and conversational (17{\%}) audio from 16237 speakers covering 145 Indian districts and 22 languages. Of these 7348 hours, 1639 hours have already been transcribed, with a median of 73 hours per language. Through this paper, we share our journey of capturing the cultural, linguistic and demographic diversity of India to create a one-of-its-kind inclusive and representative dataset. More specifically, we share an open-source blueprint for data collection at scale comprising of standardised protocols, centralised tools, a repository of engaging questions, prompts and conversation scenarios spanning multiple domains and topics of interest, quality control mechanisms, comprehensive transcription guidelines and transcription tools. We hope that this open source blueprint will serve as a comprehensive starter kit for data collection efforts in other multilingual regions of the world. Using INDICVOICES, we build IndicASR, the first ASR model to support all the 22 languages listed in the 8th schedule of the Constitution of India.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record (ID javed-etal-2024-indicvoices). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="javed-etal-2024-indicvoices">

    <!-- Title of the paper -->
    <titleInfo>
      <title>IndicVoices: Towards building an Inclusive Multilingual Speech Dataset for Indian Languages</title>
    </titleInfo>

    <!-- Authors: one personal-name element per author, each with role "author" -->
    <name type="personal">
      <namePart type="given">Tahir</namePart>
      <namePart type="family">Javed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Janki</namePart>
      <namePart type="family">Nawale</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eldho</namePart>
      <namePart type="family">George</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sakshi</namePart>
      <namePart type="family">Joshi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kaushal</namePart>
      <namePart type="family">Bhogale</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Deovrat</namePart>
      <namePart type="family">Mehendale</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ishvinder</namePart>
      <namePart type="family">Sethi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aparna</namePart>
      <namePart type="family">Ananthanarayanan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hafsah</namePart>
      <namePart type="family">Faquih</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pratiti</namePart>
      <namePart type="family">Palit</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sneha</namePart>
      <namePart type="family">Ravishankar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saranya</namePart>
      <namePart type="family">Sukumaran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tripura</namePart>
      <namePart type="family">Panchagnula</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sunjay</namePart>
      <namePart type="family">Murali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kunal</namePart>
      <namePart type="family">Gandhi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ambujavalli</namePart>
      <namePart type="family">R</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Manickam</namePart>
      <namePart type="family">M</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">C</namePart>
      <namePart type="family">Vaijayanthi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Krishnan</namePart>
      <namePart type="family">Karunganni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pratyush</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mitesh</namePart>
      <namePart type="family">Khapra</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month) and resource type -->
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <!-- Abstract text -->
    <abstract>We present INDICVOICES, a dataset of natural and spontaneous speech containing a total of 7348 hours of read (9%), extempore (74%) and conversational (17%) audio from 16237 speakers covering 145 Indian districts and 22 languages. Of these 7348 hours, 1639 hours have already been transcribed, with a median of 73 hours per language. Through this paper, we share our journey of capturing the cultural, linguistic and demographic diversity of India to create a one-of-its-kind inclusive and representative dataset. More specifically, we share an open-source blueprint for data collection at scale comprising of standardised protocols, centralised tools, a repository of engaging questions, prompts and conversation scenarios spanning multiple domains and topics of interest, quality control mechanisms, comprehensive transcription guidelines and transcription tools. We hope that this open source blueprint will serve as a comprehensive starter kit for data collection efforts in other multilingual regions of the world. Using INDICVOICES, we build IndicASR, the first ASR model to support all the 22 languages listed in the 8th schedule of the Constitution of India.</abstract>

    <!-- Identifiers and online location -->
    <identifier type="citekey">javed-etal-2024-indicvoices</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-acl.639</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-acl.639</url>
    </location>

    <!-- Date and page extent of this part -->
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>10740</start>
        <end>10782</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T IndicVoices: Towards building an Inclusive Multilingual Speech Dataset for Indian Languages
%A Javed, Tahir
%A Nawale, Janki
%A George, Eldho
%A Joshi, Sakshi
%A Bhogale, Kaushal
%A Mehendale, Deovrat
%A Sethi, Ishvinder
%A Ananthanarayanan, Aparna
%A Faquih, Hafsah
%A Palit, Pratiti
%A Ravishankar, Sneha
%A Sukumaran, Saranya
%A Panchagnula, Tripura
%A Murali, Sunjay
%A Gandhi, Kunal
%A R, Ambujavalli
%A M, Manickam
%A Vaijayanthi, C.
%A Karunganni, Krishnan
%A Kumar, Pratyush
%A Khapra, Mitesh
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F javed-etal-2024-indicvoices
%X We present INDICVOICES, a dataset of natural and spontaneous speech containing a total of 7348 hours of read (9%), extempore (74%) and conversational (17%) audio from 16237 speakers covering 145 Indian districts and 22 languages. Of these 7348 hours, 1639 hours have already been transcribed, with a median of 73 hours per language. Through this paper, we share our journey of capturing the cultural, linguistic and demographic diversity of India to create a one-of-its-kind inclusive and representative dataset. More specifically, we share an open-source blueprint for data collection at scale comprising of standardised protocols, centralised tools, a repository of engaging questions, prompts and conversation scenarios spanning multiple domains and topics of interest, quality control mechanisms, comprehensive transcription guidelines and transcription tools. We hope that this open source blueprint will serve as a comprehensive starter kit for data collection efforts in other multilingual regions of the world. Using INDICVOICES, we build IndicASR, the first ASR model to support all the 22 languages listed in the 8th schedule of the Constitution of India.
%R 10.18653/v1/2024.findings-acl.639
%U https://aclanthology.org/2024.findings-acl.639
%U https://doi.org/10.18653/v1/2024.findings-acl.639
%P 10740-10782
Markdown (Informal)
[IndicVoices: Towards building an Inclusive Multilingual Speech Dataset for Indian Languages](https://aclanthology.org/2024.findings-acl.639) (Javed et al., Findings 2024)
ACL
- Tahir Javed, Janki Nawale, Eldho George, Sakshi Joshi, Kaushal Bhogale, Deovrat Mehendale, Ishvinder Sethi, Aparna Ananthanarayanan, Hafsah Faquih, Pratiti Palit, Sneha Ravishankar, Saranya Sukumaran, Tripura Panchagnula, Sunjay Murali, Kunal Gandhi, Ambujavalli R, Manickam M, C Vaijayanthi, Krishnan Karunganni, et al. 2024. IndicVoices: Towards building an Inclusive Multilingual Speech Dataset for Indian Languages. In Findings of the Association for Computational Linguistics: ACL 2024, pages 10740–10782, Bangkok, Thailand. Association for Computational Linguistics.