@inproceedings{gessler-etal-2022-midas,
title = "{M}idas Loop: A Prioritized Human-in-the-Loop Annotation for Large Scale Multilayer Data",
author = "Gessler, Luke and
Levine, Lauren and
Zeldes, Amir",
editor = "Pradhan, Sameer and
Kuebler, Sandra",
booktitle = "Proceedings of the 16th Linguistic Annotation Workshop (LAW-XVI) within LREC2022",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.law-1.13",
pages = "103--110",
abstract = "Large scale annotation of rich multilayer corpus data is expensive and time consuming, motivating approaches that integrate high quality automatic tools with active learning in order to prioritize human labeling of hard cases. A related challenge in such scenarios is the concurrent management of automatically annotated data and human annotated data, particularly where different subsets of the data have been corrected for different types of annotation and with different levels of confidence. In this paper we present [REDACTED], a collaborative, version-controlled online annotation environment for multilayer corpus data which includes integrated provenance and confidence metadata for each piece of information at the document, sentence, token and annotation level. We present a case study on improving annotation quality in an existing multilayer parse bank of English called AMALGUM, focusing on active learning in corpus preprocessing, at the surprisingly challenging level of sentence segmentation. Our results show improvements to state-of-the-art sentence segmentation and a promising workflow for getting {``}silver{''} data to approach gold standard quality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gessler-etal-2022-midas">
<titleInfo>
<title>Midas Loop: A Prioritized Human-in-the-Loop Annotation for Large Scale Multilayer Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Gessler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lauren</namePart>
<namePart type="family">Levine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zeldes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Linguistic Annotation Workshop (LAW-XVI) within LREC2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sameer</namePart>
<namePart type="family">Pradhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandra</namePart>
<namePart type="family">Kuebler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large scale annotation of rich multilayer corpus data is expensive and time consuming, motivating approaches that integrate high quality automatic tools with active learning in order to prioritize human labeling of hard cases. A related challenge in such scenarios is the concurrent management of automatically annotated data and human annotated data, particularly where different subsets of the data have been corrected for different types of annotation and with different levels of confidence. In this paper we present [REDACTED], a collaborative, version-controlled online annotation environment for multilayer corpus data which includes integrated provenance and confidence metadata for each piece of information at the document, sentence, token and annotation level. We present a case study on improving annotation quality in an existing multilayer parse bank of English called AMALGUM, focusing on active learning in corpus preprocessing, at the surprisingly challenging level of sentence segmentation. Our results show improvements to state-of-the-art sentence segmentation and a promising workflow for getting “silver” data to approach gold standard quality.</abstract>
<identifier type="citekey">gessler-etal-2022-midas</identifier>
<location>
<url>https://aclanthology.org/2022.law-1.13</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>103</start>
<end>110</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Midas Loop: A Prioritized Human-in-the-Loop Annotation for Large Scale Multilayer Data
%A Gessler, Luke
%A Levine, Lauren
%A Zeldes, Amir
%Y Pradhan, Sameer
%Y Kuebler, Sandra
%S Proceedings of the 16th Linguistic Annotation Workshop (LAW-XVI) within LREC2022
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F gessler-etal-2022-midas
%X Large scale annotation of rich multilayer corpus data is expensive and time consuming, motivating approaches that integrate high quality automatic tools with active learning in order to prioritize human labeling of hard cases. A related challenge in such scenarios is the concurrent management of automatically annotated data and human annotated data, particularly where different subsets of the data have been corrected for different types of annotation and with different levels of confidence. In this paper we present [REDACTED], a collaborative, version-controlled online annotation environment for multilayer corpus data which includes integrated provenance and confidence metadata for each piece of information at the document, sentence, token and annotation level. We present a case study on improving annotation quality in an existing multilayer parse bank of English called AMALGUM, focusing on active learning in corpus preprocessing, at the surprisingly challenging level of sentence segmentation. Our results show improvements to state-of-the-art sentence segmentation and a promising workflow for getting “silver” data to approach gold standard quality.
%U https://aclanthology.org/2022.law-1.13
%P 103-110
Markdown (Informal)
[Midas Loop: A Prioritized Human-in-the-Loop Annotation for Large Scale Multilayer Data](https://aclanthology.org/2022.law-1.13) (Gessler et al., LAW 2022)
ACL