@inproceedings{oncevay-etal-2022-revisiting,
title = "Revisiting Syllables in Language Modelling and Their Application on Low-Resource Machine Translation",
author = "Oncevay, Arturo and
Rivas Rojas, Kervy Dante and
Chavez Sanchez, Liz Karen and
Zariquiey, Roberto",
editor = "Calzolari, Nicoletta and
Huang, Chu-Ren and
Kim, Hansaem and
Pustejovsky, James and
Wanner, Leo and
Choi, Key-Sun and
Ryu, Pum-Mo and
Chen, Hsin-Hsi and
Donatelli, Lucia and
Ji, Heng and
Kurohashi, Sadao and
Paggio, Patrizia and
Xue, Nianwen and
Kim, Seokhwan and
Hahm, Younggyun and
He, Zhong and
Lee, Tony Kyungil and
Santus, Enrico and
Bond, Francis and
Na, Seung-Hoon",
booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2022.coling-1.374",
pages = "4258--4267",
abstract = "Language modelling and machine translation tasks mostly use subword or character inputs, but syllables are seldom used. Syllables provide shorter sequences than characters, require less-specialised extracting rules than morphemes, and their segmentation is not impacted by the corpus size. In this study, we first explore the potential of syllables for open-vocabulary language modelling in 21 languages. We use rule-based syllabification methods for six languages and address the rest with hyphenation, which works as a syllabification proxy. With a comparable perplexity, we show that syllables outperform characters and other subwords. Moreover, we study the importance of syllables on neural machine translation for a non-related and low-resource language-pair (Spanish{--}Shipibo-Konibo). In pairwise and multilingual systems, syllables outperform unsupervised subwords, and further morphological segmentation methods, when translating into a highly synthetic language with a transparent orthography (Shipibo-Konibo). Finally, we perform some human evaluation, and discuss limitations and opportunities.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oncevay-etal-2022-revisiting">
<titleInfo>
<title>Revisiting Syllables in Language Modelling and Their Application on Low-Resource Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kervy</namePart>
<namePart type="given">Dante</namePart>
<namePart type="family">Rivas Rojas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liz</namePart>
<namePart type="given">Karen</namePart>
<namePart type="family">Chavez Sanchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Zariquiey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language modelling and machine translation tasks mostly use subword or character inputs, but syllables are seldom used. Syllables provide shorter sequences than characters, require less-specialised extracting rules than morphemes, and their segmentation is not impacted by the corpus size. In this study, we first explore the potential of syllables for open-vocabulary language modelling in 21 languages. We use rule-based syllabification methods for six languages and address the rest with hyphenation, which works as a syllabification proxy. With a comparable perplexity, we show that syllables outperform characters and other subwords. Moreover, we study the importance of syllables on neural machine translation for a non-related and low-resource language-pair (Spanish–Shipibo-Konibo). In pairwise and multilingual systems, syllables outperform unsupervised subwords, and further morphological segmentation methods, when translating into a highly synthetic language with a transparent orthography (Shipibo-Konibo). Finally, we perform some human evaluation, and discuss limitations and opportunities.</abstract>
<identifier type="citekey">oncevay-etal-2022-revisiting</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.374</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>4258</start>
<end>4267</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Revisiting Syllables in Language Modelling and Their Application on Low-Resource Machine Translation
%A Oncevay, Arturo
%A Rivas Rojas, Kervy Dante
%A Chavez Sanchez, Liz Karen
%A Zariquiey, Roberto
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F oncevay-etal-2022-revisiting
%X Language modelling and machine translation tasks mostly use subword or character inputs, but syllables are seldom used. Syllables provide shorter sequences than characters, require less-specialised extracting rules than morphemes, and their segmentation is not impacted by the corpus size. In this study, we first explore the potential of syllables for open-vocabulary language modelling in 21 languages. We use rule-based syllabification methods for six languages and address the rest with hyphenation, which works as a syllabification proxy. With a comparable perplexity, we show that syllables outperform characters and other subwords. Moreover, we study the importance of syllables on neural machine translation for a non-related and low-resource language-pair (Spanish–Shipibo-Konibo). In pairwise and multilingual systems, syllables outperform unsupervised subwords, and further morphological segmentation methods, when translating into a highly synthetic language with a transparent orthography (Shipibo-Konibo). Finally, we perform some human evaluation, and discuss limitations and opportunities.
%U https://aclanthology.org/2022.coling-1.374
%P 4258-4267
Markdown (Informal)
[Revisiting Syllables in Language Modelling and Their Application on Low-Resource Machine Translation](https://aclanthology.org/2022.coling-1.374) (Oncevay et al., COLING 2022)
ACL