@inproceedings{neubig-mori-2010-word,
title = "Word-based Partial Annotation for Efficient Corpus Construction",
author = "Neubig, Graham and
Mori, Shinsuke",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/408_Paper.pdf",
abstract = "In order to utilize the corpus-based techniques that have proven effective in natural language processing in recent years, costly and time-consuming manual creation of linguistic resources is often necessary. Traditionally these resources are created on the document or sentence-level. In this paper, we examine the benefit of annotating only particular words with high information content, as opposed to the entire sentence or document. Using the task of Japanese pronunciation estimation as an example, we devise a machine learning method that can be trained on data annotated word-by-word. This is done by dividing the estimation process into two steps (word segmentation and word-based pronunciation estimation), and introducing a point-wise estimator that is able to make each decision independent of the other decisions made for a particular sentence. In an evaluation, the proposed strategy is shown to provide greater increases in accuracy using a smaller number of annotated words than traditional sentence-based annotation techniques.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="neubig-mori-2010-word">
<titleInfo>
<title>Word-based Partial Annotation for Efficient Corpus Construction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Graham</namePart>
<namePart type="family">Neubig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shinsuke</namePart>
<namePart type="family">Mori</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In order to utilize the corpus-based techniques that have proven effective in natural language processing in recent years, costly and time-consuming manual creation of linguistic resources is often necessary. Traditionally these resources are created on the document or sentence-level. In this paper, we examine the benefit of annotating only particular words with high information content, as opposed to the entire sentence or document. Using the task of Japanese pronunciation estimation as an example, we devise a machine learning method that can be trained on data annotated word-by-word. This is done by dividing the estimation process into two steps (word segmentation and word-based pronunciation estimation), and introducing a point-wise estimator that is able to make each decision independent of the other decisions made for a particular sentence. In an evaluation, the proposed strategy is shown to provide greater increases in accuracy using a smaller number of annotated words than traditional sentence-based annotation techniques.</abstract>
<identifier type="citekey">neubig-mori-2010-word</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/408_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Word-based Partial Annotation for Efficient Corpus Construction
%A Neubig, Graham
%A Mori, Shinsuke
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F neubig-mori-2010-word
%X In order to utilize the corpus-based techniques that have proven effective in natural language processing in recent years, costly and time-consuming manual creation of linguistic resources is often necessary. Traditionally these resources are created on the document or sentence-level. In this paper, we examine the benefit of annotating only particular words with high information content, as opposed to the entire sentence or document. Using the task of Japanese pronunciation estimation as an example, we devise a machine learning method that can be trained on data annotated word-by-word. This is done by dividing the estimation process into two steps (word segmentation and word-based pronunciation estimation), and introducing a point-wise estimator that is able to make each decision independent of the other decisions made for a particular sentence. In an evaluation, the proposed strategy is shown to provide greater increases in accuracy using a smaller number of annotated words than traditional sentence-based annotation techniques.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/408_Paper.pdf
Markdown (Informal)
[Word-based Partial Annotation for Efficient Corpus Construction](http://www.lrec-conf.org/proceedings/lrec2010/pdf/408_Paper.pdf) (Neubig & Mori, LREC 2010)
ACL