@inproceedings{mekki-etal-2021-tremolo-tweets,
title = "{TREM}o{L}o-Tweets: A Multi-Label Corpus of {F}rench Tweets for Language Register Characterization",
author = "Mekki, Jade and
Lecorv{\'e}, Gw{\'e}nol{\'e} and
Battistelli, Delphine and
B{\'e}chet, Nicolas",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)",
month = sep,
year = "2021",
address = "Held Online",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.ranlp-1.108",
pages = "950--958",
abstract = "The casual, neutral, and formal language registers are highly perceptible in discourse productions. However, they are still poorly studied in Natural Language Processing (NLP), especially outside English, and for new textual types like tweets. To stimulate research, this paper introduces a large corpus of 228,505 French tweets (6M words) annotated in language registers. Labels are provided by a multi-label CamemBERT classifier trained and checked on a manually annotated subset of the corpus, while the tweets are selected to avoid undesired biases. Based on the corpus, an initial analysis of linguistic traits from either human annotators or automatic extractions is provided to describe the corpus and pave the way for various NLP tasks. The corpus, annotation guide and classifier are available on \url{http://tremolo.irisa.fr}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mekki-etal-2021-tremolo-tweets">
<titleInfo>
<title>TREMoLo-Tweets: A Multi-Label Corpus of French Tweets for Language Register Characterization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jade</namePart>
<namePart type="family">Mekki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gwénolé</namePart>
<namePart type="family">Lecorvé</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Delphine</namePart>
<namePart type="family">Battistelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Held Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The casual, neutral, and formal language registers are highly perceptible in discourse productions. However, they are still poorly studied in Natural Language Processing (NLP), especially outside English, and for new textual types like tweets. To stimulate research, this paper introduces a large corpus of 228,505 French tweets (6M words) annotated in language registers. Labels are provided by a multi-label CamemBERT classifier trained and checked on a manually annotated subset of the corpus, while the tweets are selected to avoid undesired biases. Based on the corpus, an initial analysis of linguistic traits from either human annotators or automatic extractions is provided to describe the corpus and pave the way for various NLP tasks. The corpus, annotation guide and classifier are available on http://tremolo.irisa.fr.</abstract>
<identifier type="citekey">mekki-etal-2021-tremolo-tweets</identifier>
<location>
<url>https://aclanthology.org/2021.ranlp-1.108</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>950</start>
<end>958</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TREMoLo-Tweets: A Multi-Label Corpus of French Tweets for Language Register Characterization
%A Mekki, Jade
%A Lecorvé, Gwénolé
%A Battistelli, Delphine
%A Béchet, Nicolas
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F mekki-etal-2021-tremolo-tweets
%X The casual, neutral, and formal language registers are highly perceptible in discourse productions. However, they are still poorly studied in Natural Language Processing (NLP), especially outside English, and for new textual types like tweets. To stimulate research, this paper introduces a large corpus of 228,505 French tweets (6M words) annotated in language registers. Labels are provided by a multi-label CamemBERT classifier trained and checked on a manually annotated subset of the corpus, while the tweets are selected to avoid undesired biases. Based on the corpus, an initial analysis of linguistic traits from either human annotators or automatic extractions is provided to describe the corpus and pave the way for various NLP tasks. The corpus, annotation guide and classifier are available on http://tremolo.irisa.fr.
%U https://aclanthology.org/2021.ranlp-1.108
%P 950-958
Markdown (Informal)
[TREMoLo-Tweets: A Multi-Label Corpus of French Tweets for Language Register Characterization](https://aclanthology.org/2021.ranlp-1.108) (Mekki et al., RANLP 2021)
ACL