@inproceedings{ferrando-etal-2023-automating,
title = "Automating Behavioral Testing in Machine Translation",
author = "Ferrando, Javier and
Sperber, Matthias and
Setiawan, Hendra and
Telaar, Dominic and
Hasan, Sa{\v{s}}a",
editor = "Koehn, Philipp and
Haddow, Barry and
Kocmi, Tom and
Monz, Christof",
booktitle = "Proceedings of the Eighth Conference on Machine Translation",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.wmt-1.97",
doi = "10.18653/v1/2023.wmt-1.97",
pages = "1014--1030",
abstract = "Behavioral testing in NLP allows fine-grained evaluation of systems by examining their linguistic capabilities through the analysis of input-output behavior. Unfortunately, existing work on behavioral testing in Machine Translation (MT) is currently restricted to largely handcrafted tests covering a limited range of capabilities and languages. To address this limitation, we propose to use Large Language Models (LLMs) to generate a diverse set of source sentences tailored to test the behavior of MT models in a range of situations. We can then verify whether the MT model exhibits the expected behavior through matching candidate sets that are also generated using LLMs. Our approach aims to make behavioral testing of MT systems practical while requiring only minimal human effort. In our experiments, we apply our proposed evaluation framework to assess multiple available MT systems, revealing that while in general pass-rates follow the trends observable from traditional accuracy-based metrics, our method was able to uncover several important differences and potential bugs that go unnoticed when relying only on accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ferrando-etal-2023-automating">
<titleInfo>
<title>Automating Behavioral Testing in Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Javier</namePart>
<namePart type="family">Ferrando</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Sperber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hendra</namePart>
<namePart type="family">Setiawan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dominic</namePart>
<namePart type="family">Telaar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saša</namePart>
<namePart type="family">Hasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Behavioral testing in NLP allows fine-grained evaluation of systems by examining their linguistic capabilities through the analysis of input-output behavior. Unfortunately, existing work on behavioral testing in Machine Translation (MT) is currently restricted to largely handcrafted tests covering a limited range of capabilities and languages. To address this limitation, we propose to use Large Language Models (LLMs) to generate a diverse set of source sentences tailored to test the behavior of MT models in a range of situations. We can then verify whether the MT model exhibits the expected behavior through matching candidate sets that are also generated using LLMs. Our approach aims to make behavioral testing of MT systems practical while requiring only minimal human effort. In our experiments, we apply our proposed evaluation framework to assess multiple available MT systems, revealing that while in general pass-rates follow the trends observable from traditional accuracy-based metrics, our method was able to uncover several important differences and potential bugs that go unnoticed when relying only on accuracy.</abstract>
<identifier type="citekey">ferrando-etal-2023-automating</identifier>
<identifier type="doi">10.18653/v1/2023.wmt-1.97</identifier>
<location>
<url>https://aclanthology.org/2023.wmt-1.97</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>1014</start>
<end>1030</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automating Behavioral Testing in Machine Translation
%A Ferrando, Javier
%A Sperber, Matthias
%A Setiawan, Hendra
%A Telaar, Dominic
%A Hasan, Saša
%Y Koehn, Philipp
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Monz, Christof
%S Proceedings of the Eighth Conference on Machine Translation
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F ferrando-etal-2023-automating
%X Behavioral testing in NLP allows fine-grained evaluation of systems by examining their linguistic capabilities through the analysis of input-output behavior. Unfortunately, existing work on behavioral testing in Machine Translation (MT) is currently restricted to largely handcrafted tests covering a limited range of capabilities and languages. To address this limitation, we propose to use Large Language Models (LLMs) to generate a diverse set of source sentences tailored to test the behavior of MT models in a range of situations. We can then verify whether the MT model exhibits the expected behavior through matching candidate sets that are also generated using LLMs. Our approach aims to make behavioral testing of MT systems practical while requiring only minimal human effort. In our experiments, we apply our proposed evaluation framework to assess multiple available MT systems, revealing that while in general pass-rates follow the trends observable from traditional accuracy-based metrics, our method was able to uncover several important differences and potential bugs that go unnoticed when relying only on accuracy.
%R 10.18653/v1/2023.wmt-1.97
%U https://aclanthology.org/2023.wmt-1.97
%U https://doi.org/10.18653/v1/2023.wmt-1.97
%P 1014-1030
Markdown (Informal)
[Automating Behavioral Testing in Machine Translation](https://aclanthology.org/2023.wmt-1.97) (Ferrando et al., WMT 2023)
ACL
- Javier Ferrando, Matthias Sperber, Hendra Setiawan, Dominic Telaar, and Saša Hasan. 2023. Automating Behavioral Testing in Machine Translation. In Proceedings of the Eighth Conference on Machine Translation, pages 1014–1030, Singapore. Association for Computational Linguistics.