@inproceedings{fetahu-etal-2023-multiconer,
title = "{M}ulti{C}o{NER} v2: a Large Multilingual dataset for Fine-grained and Noisy Named Entity Recognition",
author = "Fetahu, Besnik and
Chen, Zhiyu and
Kar, Sudipta and
Rokhlenko, Oleg and
Malmasi, Shervin",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.134",
doi = "10.18653/v1/2023.findings-emnlp.134",
pages = "2027--2051",
abstract = "We present MULTICONER V2, a dataset for fine-grained Named Entity Recognition covering 33 entity classes across 12 languages, in both monolingual and multilingual settings. This dataset aims to tackle the following practical challenges in NER: (i) effective handling of fine-grained classes that include complex entities like movie titles, and (ii) performance degradation due to noise generated from typing mistakes or OCR errors. The dataset is compiled from open resources like Wikipedia and Wikidata, and is publicly available. Evaluation based on the XLM-RoBERTa baseline highlights the unique challenges posed by MULTICONER V2: (i) the fine-grained taxonomy is challenging, where the scores are low with macro-F1=0.63 (across all languages), and (ii) the corruption strategy significantly impairs performance, with entity corruption resulting in 9{\%} lower performance relative to non-entity corruptions across all languages. This highlights the greater impact of entity noise in contrast to context noise.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fetahu-etal-2023-multiconer">
<titleInfo>
<title>MultiCoNER v2: a Large Multilingual dataset for Fine-grained and Noisy Named Entity Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Besnik</namePart>
<namePart type="family">Fetahu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudipta</namePart>
<namePart type="family">Kar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Rokhlenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shervin</namePart>
<namePart type="family">Malmasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present MULTICONER V2, a dataset for fine-grained Named Entity Recognition covering 33 entity classes across 12 languages, in both monolingual and multilingual settings. This dataset aims to tackle the following practical challenges in NER: (i) effective handling of fine-grained classes that include complex entities like movie titles, and (ii) performance degradation due to noise generated from typing mistakes or OCR errors. The dataset is compiled from open resources like Wikipedia and Wikidata, and is publicly available. Evaluation based on the XLM-RoBERTa baseline highlights the unique challenges posed by MULTICONER V2: (i) the fine-grained taxonomy is challenging, where the scores are low with macro-F1=0.63 (across all languages), and (ii) the corruption strategy significantly impairs performance, with entity corruption resulting in 9% lower performance relative to non-entity corruptions across all languages. This highlights the greater impact of entity noise in contrast to context noise.</abstract>
<identifier type="citekey">fetahu-etal-2023-multiconer</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.134</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.134</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>2027</start>
<end>2051</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MultiCoNER v2: a Large Multilingual dataset for Fine-grained and Noisy Named Entity Recognition
%A Fetahu, Besnik
%A Chen, Zhiyu
%A Kar, Sudipta
%A Rokhlenko, Oleg
%A Malmasi, Shervin
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F fetahu-etal-2023-multiconer
%X We present MULTICONER V2, a dataset for fine-grained Named Entity Recognition covering 33 entity classes across 12 languages, in both monolingual and multilingual settings. This dataset aims to tackle the following practical challenges in NER: (i) effective handling of fine-grained classes that include complex entities like movie titles, and (ii) performance degradation due to noise generated from typing mistakes or OCR errors. The dataset is compiled from open resources like Wikipedia and Wikidata, and is publicly available. Evaluation based on the XLM-RoBERTa baseline highlights the unique challenges posed by MULTICONER V2: (i) the fine-grained taxonomy is challenging, where the scores are low with macro-F1=0.63 (across all languages), and (ii) the corruption strategy significantly impairs performance, with entity corruption resulting in 9% lower performance relative to non-entity corruptions across all languages. This highlights the greater impact of entity noise in contrast to context noise.
%R 10.18653/v1/2023.findings-emnlp.134
%U https://aclanthology.org/2023.findings-emnlp.134
%U https://doi.org/10.18653/v1/2023.findings-emnlp.134
%P 2027-2051
Markdown (Informal)
[MultiCoNER v2: a Large Multilingual dataset for Fine-grained and Noisy Named Entity Recognition](https://aclanthology.org/2023.findings-emnlp.134) (Fetahu et al., Findings 2023)
ACL