@inproceedings{madhani-etal-2023-bhasa,
title = "Bhasha-Abhijnaanam: Native-script and romanized Language Identification for 22 {I}ndic languages",
author = "Madhani, Yash and
Khapra, Mitesh M. and
Kunchukuttan, Anoop",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-short.71",
doi = "10.18653/v1/2023.acl-short.71",
pages = "816--826",
abstract = "We create publicly available language identification (LID) datasets and models in all 22 Indian languages listed in the Indian constitution in both native-script and romanized text. First, we create Bhasha-Abhijnaanam, a language identification test set for native-script as well as romanized text which spans all 22 Indic languages. We also train IndicLID, a language identifier for all the above-mentioned languages in both native and romanized script. For native-script text, it has better language coverage than existing LIDs and is competitive with or better than other LIDs. IndicLID is the first LID for romanized text in Indian languages. Two major challenges for romanized text LID are the lack of training data and low LID performance when languages are similar. We provide simple and effective solutions to these problems. In general, there has been limited work on romanized text in any language, and our findings are relevant to other languages that need romanized language identification. Our models are publicly available at \url{https://github.com/AI4Bharat/IndicLID} under open-source licenses. Our training and test sets are also publicly available at \url{https://huggingface.co/datasets/ai4bharat/Bhasha-Abhijnaanam} under open-source licenses.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="madhani-etal-2023-bhasa">
<titleInfo>
<title>Bhasha-Abhijnaanam: Native-script and romanized Language Identification for 22 Indic languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yash</namePart>
<namePart type="family">Madhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mitesh</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Khapra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kunchukuttan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We create publicly available language identification (LID) datasets and models in all 22 Indian languages listed in the Indian constitution in both native-script and romanized text. First, we create Bhasha-Abhijnaanam, a language identification test set for native-script as well as romanized text which spans all 22 Indic languages. We also train IndicLID, a language identifier for all the above-mentioned languages in both native and romanized script. For native-script text, it has better language coverage than existing LIDs and is competitive with or better than other LIDs. IndicLID is the first LID for romanized text in Indian languages. Two major challenges for romanized text LID are the lack of training data and low LID performance when languages are similar. We provide simple and effective solutions to these problems. In general, there has been limited work on romanized text in any language, and our findings are relevant to other languages that need romanized language identification. Our models are publicly available at https://github.com/AI4Bharat/IndicLID under open-source licenses. Our training and test sets are also publicly available at https://huggingface.co/datasets/ai4bharat/Bhasha-Abhijnaanam under open-source licenses.</abstract>
<identifier type="citekey">madhani-etal-2023-bhasa</identifier>
<identifier type="doi">10.18653/v1/2023.acl-short.71</identifier>
<location>
<url>https://aclanthology.org/2023.acl-short.71</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>816</start>
<end>826</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bhasha-Abhijnaanam: Native-script and romanized Language Identification for 22 Indic languages
%A Madhani, Yash
%A Khapra, Mitesh M.
%A Kunchukuttan, Anoop
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F madhani-etal-2023-bhasa
%X We create publicly available language identification (LID) datasets and models in all 22 Indian languages listed in the Indian constitution in both native-script and romanized text. First, we create Bhasha-Abhijnaanam, a language identification test set for native-script as well as romanized text which spans all 22 Indic languages. We also train IndicLID, a language identifier for all the above-mentioned languages in both native and romanized script. For native-script text, it has better language coverage than existing LIDs and is competitive with or better than other LIDs. IndicLID is the first LID for romanized text in Indian languages. Two major challenges for romanized text LID are the lack of training data and low LID performance when languages are similar. We provide simple and effective solutions to these problems. In general, there has been limited work on romanized text in any language, and our findings are relevant to other languages that need romanized language identification. Our models are publicly available at https://github.com/AI4Bharat/IndicLID under open-source licenses. Our training and test sets are also publicly available at https://huggingface.co/datasets/ai4bharat/Bhasha-Abhijnaanam under open-source licenses.
%R 10.18653/v1/2023.acl-short.71
%U https://aclanthology.org/2023.acl-short.71
%U https://doi.org/10.18653/v1/2023.acl-short.71
%P 816-826
Markdown (Informal)
[Bhasha-Abhijnaanam: Native-script and romanized Language Identification for 22 Indic languages](https://aclanthology.org/2023.acl-short.71) (Madhani et al., ACL 2023)
ACL