@inproceedings{blodgett-etal-2017-dataset,
title = "A Dataset and Classifier for Recognizing Social Media {E}nglish",
author = "Blodgett, Su Lin and
Wei, Johnny and
O{'}Connor, Brendan",
editor = "Derczynski, Leon and
Xu, Wei and
Ritter, Alan and
Baldwin, Tim",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-4408",
doi = "10.18653/v1/W17-4408",
pages = "56--61",
abstract = "While language identification works well on standard texts, it performs much worse on social media language, in particular dialectal language{---}even for English. First, to support work on English language identification, we contribute a new dataset of tweets annotated for English versus non-English, with attention to ambiguity, code-switching, and automatic generation issues. It is randomly sampled from all public messages, avoiding biases towards pre-existing language classifiers. Second, we find that a demographic language model{---}which identifies messages with language similar to that used by several U.S. ethnic populations on Twitter{---}can be used to improve English language identification performance when combined with a traditional supervised language identifier. It increases recall with almost no loss of precision, including, surprisingly, for English messages written by non-U.S. authors. Our dataset and identifier ensemble are available online.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="blodgett-etal-2017-dataset">
<titleInfo>
<title>A Dataset and Classifier for Recognizing Social Media English</title>
</titleInfo>
<name type="personal">
<namePart type="given">Su</namePart>
<namePart type="given">Lin</namePart>
<namePart type="family">Blodgett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johnny</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brendan</namePart>
<namePart type="family">O’Connor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Noisy User-generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leon</namePart>
<namePart type="family">Derczynski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Copenhagen, Denmark</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While language identification works well on standard texts, it performs much worse on social media language, in particular dialectal language—even for English. First, to support work on English language identification, we contribute a new dataset of tweets annotated for English versus non-English, with attention to ambiguity, code-switching, and automatic generation issues. It is randomly sampled from all public messages, avoiding biases towards pre-existing language classifiers. Second, we find that a demographic language model—which identifies messages with language similar to that used by several U.S. ethnic populations on Twitter—can be used to improve English language identification performance when combined with a traditional supervised language identifier. It increases recall with almost no loss of precision, including, surprisingly, for English messages written by non-U.S. authors. Our dataset and identifier ensemble are available online.</abstract>
<identifier type="citekey">blodgett-etal-2017-dataset</identifier>
<identifier type="doi">10.18653/v1/W17-4408</identifier>
<location>
<url>https://aclanthology.org/W17-4408</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>56</start>
<end>61</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Dataset and Classifier for Recognizing Social Media English
%A Blodgett, Su Lin
%A Wei, Johnny
%A O’Connor, Brendan
%Y Derczynski, Leon
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%S Proceedings of the 3rd Workshop on Noisy User-generated Text
%D 2017
%8 September
%I Association for Computational Linguistics
%C Copenhagen, Denmark
%F blodgett-etal-2017-dataset
%X While language identification works well on standard texts, it performs much worse on social media language, in particular dialectal language—even for English. First, to support work on English language identification, we contribute a new dataset of tweets annotated for English versus non-English, with attention to ambiguity, code-switching, and automatic generation issues. It is randomly sampled from all public messages, avoiding biases towards pre-existing language classifiers. Second, we find that a demographic language model—which identifies messages with language similar to that used by several U.S. ethnic populations on Twitter—can be used to improve English language identification performance when combined with a traditional supervised language identifier. It increases recall with almost no loss of precision, including, surprisingly, for English messages written by non-U.S. authors. Our dataset and identifier ensemble are available online.
%R 10.18653/v1/W17-4408
%U https://aclanthology.org/W17-4408
%U https://doi.org/10.18653/v1/W17-4408
%P 56-61
Markdown (Informal)
[A Dataset and Classifier for Recognizing Social Media English](https://aclanthology.org/W17-4408) (Blodgett et al., WNUT 2017)
ACL