@inproceedings{nair-etal-2023-neural,
title = "A Neural {CRF}-based Hierarchical Approach for Linear Text Segmentation",
author = "Nair, Inderjeet and
Garimella, Aparna and
Srinivasan, Balaji Vasan and
Modani, Natwar and
Chhaya, Niyati and
Karanam, Srikrishna and
Shekhar, Sumit",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.65",
doi = "10.18653/v1/2023.findings-eacl.65",
pages = "883--893",
abstract = "We consider the problem of segmenting unformatted text and transcripts linearly based on their topical structure. While prior approaches explicitly train to predict segment boundaries, our proposed approach solves this task by inferring the hierarchical segmentation structure associated with the input text fragment. Given the lack of a large annotated dataset for this task, we propose a data curation strategy and create a corpus of over 700K Wikipedia articles with their hierarchical structures. We then propose the first supervised approach to generating hierarchical segmentation structures based on these annotations. Our method, in particular, is based on a neural conditional random field (CRF), which explicitly models the statistical dependency between a node and its constituent child nodes. We introduce a new data augmentation scheme as part of our model training strategy, which involves sampling a variety of node aggregations, permutations, and removals, all of which help capture fine-grained and coarse topical shifts in the data and improve model performance. Extensive experiments show that our model outperforms or achieves competitive performance when compared to previous state-of-the-art algorithms in the following settings: rich-resource, cross-domain transferability, few-shot supervision, and segmentation when topic label annotations are provided.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nair-etal-2023-neural">
<titleInfo>
<title>A Neural CRF-based Hierarchical Approach for Linear Text Segmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Inderjeet</namePart>
<namePart type="family">Nair</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aparna</namePart>
<namePart type="family">Garimella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Balaji</namePart>
<namePart type="given">Vasan</namePart>
<namePart type="family">Srinivasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natwar</namePart>
<namePart type="family">Modani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niyati</namePart>
<namePart type="family">Chhaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Srikrishna</namePart>
<namePart type="family">Karanam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sumit</namePart>
<namePart type="family">Shekhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We consider the problem of segmenting unformatted text and transcripts linearly based on their topical structure. While prior approaches explicitly train to predict segment boundaries, our proposed approach solves this task by inferring the hierarchical segmentation structure associated with the input text fragment. Given the lack of a large annotated dataset for this task, we propose a data curation strategy and create a corpus of over 700K Wikipedia articles with their hierarchical structures. We then propose the first supervised approach to generating hierarchical segmentation structures based on these annotations. Our method, in particular, is based on a neural conditional random field (CRF), which explicitly models the statistical dependency between a node and its constituent child nodes. We introduce a new data augmentation scheme as part of our model training strategy, which involves sampling a variety of node aggregations, permutations, and removals, all of which help capture fine-grained and coarse topical shifts in the data and improve model performance. Extensive experiments show that our model outperforms or achieves competitive performance when compared to previous state-of-the-art algorithms in the following settings: rich-resource, cross-domain transferability, few-shot supervision, and segmentation when topic label annotations are provided.</abstract>
<identifier type="citekey">nair-etal-2023-neural</identifier>
<identifier type="doi">10.18653/v1/2023.findings-eacl.65</identifier>
<location>
<url>https://aclanthology.org/2023.findings-eacl.65</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>883</start>
<end>893</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Neural CRF-based Hierarchical Approach for Linear Text Segmentation
%A Nair, Inderjeet
%A Garimella, Aparna
%A Srinivasan, Balaji Vasan
%A Modani, Natwar
%A Chhaya, Niyati
%A Karanam, Srikrishna
%A Shekhar, Sumit
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F nair-etal-2023-neural
%X We consider the problem of segmenting unformatted text and transcripts linearly based on their topical structure. While prior approaches explicitly train to predict segment boundaries, our proposed approach solves this task by inferring the hierarchical segmentation structure associated with the input text fragment. Given the lack of a large annotated dataset for this task, we propose a data curation strategy and create a corpus of over 700K Wikipedia articles with their hierarchical structures. We then propose the first supervised approach to generating hierarchical segmentation structures based on these annotations. Our method, in particular, is based on a neural conditional random field (CRF), which explicitly models the statistical dependency between a node and its constituent child nodes. We introduce a new data augmentation scheme as part of our model training strategy, which involves sampling a variety of node aggregations, permutations, and removals, all of which help capture fine-grained and coarse topical shifts in the data and improve model performance. Extensive experiments show that our model outperforms or achieves competitive performance when compared to previous state-of-the-art algorithms in the following settings: rich-resource, cross-domain transferability, few-shot supervision, and segmentation when topic label annotations are provided.
%R 10.18653/v1/2023.findings-eacl.65
%U https://aclanthology.org/2023.findings-eacl.65
%U https://doi.org/10.18653/v1/2023.findings-eacl.65
%P 883-893
Markdown (Informal)
[A Neural CRF-based Hierarchical Approach for Linear Text Segmentation](https://aclanthology.org/2023.findings-eacl.65) (Nair et al., Findings 2023)
ACL
- Inderjeet Nair, Aparna Garimella, Balaji Vasan Srinivasan, Natwar Modani, Niyati Chhaya, Srikrishna Karanam, and Sumit Shekhar. 2023. A Neural CRF-based Hierarchical Approach for Linear Text Segmentation. In Findings of the Association for Computational Linguistics: EACL 2023, pages 883–893, Dubrovnik, Croatia. Association for Computational Linguistics.