@inproceedings{littell-etal-2018-measuring,
title = "Measuring sentence parallelism using Mahalanobis distances: The {NRC} unsupervised submissions to the {WMT}18 Parallel Corpus Filtering shared task",
author = "Littell, Patrick and
Larkin, Samuel and
Stewart, Darlene and
Simard, Michel and
Goutte, Cyril and
Lo, Chi-kiu",
editor = "Bojar, Ond{\v{r}}ej and
Chatterjee, Rajen and
Federmann, Christian and
Fishel, Mark and
Graham, Yvette and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Monz, Christof and
Negri, Matteo and
N{\'e}v{\'e}ol, Aur{\'e}lie and
Neves, Mariana and
Post, Matt and
Specia, Lucia and
Turchi, Marco and
Verspoor, Karin",
booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-6480",
doi = "10.18653/v1/W18-6480",
pages = "900--907",
abstract = "The WMT18 shared task on parallel corpus filtering (Koehn et al., 2018b) challenged teams to score sentence pairs from a large high-recall, low-precision web-scraped parallel corpus (Koehn et al., 2018a). Participants could use existing sample corpora (e.g. past WMT data) as a supervisory signal to learn what a {``}clean{''} corpus looks like. However, in lower-resource situations it often happens that the target corpus of the language is the \textit{only} sample of parallel text in that language. We therefore made several unsupervised entries, setting ourselves an additional constraint that we not utilize the additional clean parallel corpora. One such entry fairly consistently scored in the top ten systems in the 100M-word conditions, and for one task{---}translating the European Medicines Agency corpus (Tiedemann, 2009){---}scored among the best systems even in the 10M-word conditions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="littell-etal-2018-measuring">
<titleInfo>
<title>Measuring sentence parallelism using Mahalanobis distances: The NRC unsupervised submissions to the WMT18 Parallel Corpus Filtering shared task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Littell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Larkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Darlene</namePart>
<namePart type="family">Stewart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michel</namePart>
<namePart type="family">Simard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Goutte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chi-kiu</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Conference on Machine Translation: Shared Task Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Bojar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajen</namePart>
<namePart type="family">Chatterjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Federmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Fishel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Huck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="given">Jimeno</namePart>
<namePart type="family">Yepes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matteo</namePart>
<namePart type="family">Negri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurélie</namePart>
<namePart type="family">Névéol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Neves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matt</namePart>
<namePart type="family">Post</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Turchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karin</namePart>
<namePart type="family">Verspoor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Belgium, Brussels</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The WMT18 shared task on parallel corpus filtering (Koehn et al., 2018b) challenged teams to score sentence pairs from a large high-recall, low-precision web-scraped parallel corpus (Koehn et al., 2018a). Participants could use existing sample corpora (e.g. past WMT data) as a supervisory signal to learn what a “clean” corpus looks like. However, in lower-resource situations it often happens that the target corpus of the language is the only sample of parallel text in that language. We therefore made several unsupervised entries, setting ourselves an additional constraint that we not utilize the additional clean parallel corpora. One such entry fairly consistently scored in the top ten systems in the 100M-word conditions, and for one task—translating the European Medicines Agency corpus (Tiedemann, 2009)—scored among the best systems even in the 10M-word conditions.</abstract>
<identifier type="citekey">littell-etal-2018-measuring</identifier>
<identifier type="doi">10.18653/v1/W18-6480</identifier>
<location>
<url>https://aclanthology.org/W18-6480</url>
</location>
<part>
<date>2018-10</date>
<extent unit="page">
<start>900</start>
<end>907</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Measuring sentence parallelism using Mahalanobis distances: The NRC unsupervised submissions to the WMT18 Parallel Corpus Filtering shared task
%A Littell, Patrick
%A Larkin, Samuel
%A Stewart, Darlene
%A Simard, Michel
%A Goutte, Cyril
%A Lo, Chi-kiu
%Y Bojar, Ondřej
%Y Chatterjee, Rajen
%Y Federmann, Christian
%Y Fishel, Mark
%Y Graham, Yvette
%Y Haddow, Barry
%Y Huck, Matthias
%Y Yepes, Antonio Jimeno
%Y Koehn, Philipp
%Y Monz, Christof
%Y Negri, Matteo
%Y Névéol, Aurélie
%Y Neves, Mariana
%Y Post, Matt
%Y Specia, Lucia
%Y Turchi, Marco
%Y Verspoor, Karin
%S Proceedings of the Third Conference on Machine Translation: Shared Task Papers
%D 2018
%8 October
%I Association for Computational Linguistics
%C Belgium, Brussels
%F littell-etal-2018-measuring
%X The WMT18 shared task on parallel corpus filtering (Koehn et al., 2018b) challenged teams to score sentence pairs from a large high-recall, low-precision web-scraped parallel corpus (Koehn et al., 2018a). Participants could use existing sample corpora (e.g. past WMT data) as a supervisory signal to learn what a “clean” corpus looks like. However, in lower-resource situations it often happens that the target corpus of the language is the only sample of parallel text in that language. We therefore made several unsupervised entries, setting ourselves an additional constraint that we not utilize the additional clean parallel corpora. One such entry fairly consistently scored in the top ten systems in the 100M-word conditions, and for one task—translating the European Medicines Agency corpus (Tiedemann, 2009)—scored among the best systems even in the 10M-word conditions.
%R 10.18653/v1/W18-6480
%U https://aclanthology.org/W18-6480
%U https://doi.org/10.18653/v1/W18-6480
%P 900-907
Markdown (Informal)
[Measuring sentence parallelism using Mahalanobis distances: The NRC unsupervised submissions to the WMT18 Parallel Corpus Filtering shared task](https://aclanthology.org/W18-6480) (Littell et al., WMT 2018)
ACL