@article{guan-etal-2022-lot,
title = "{LOT}: A Story-Centric Benchmark for Evaluating {C}hinese Long Text Understanding and Generation",
author = "Guan, Jian and
Feng, Zhuoer and
Chen, Yamei and
He, Ruilin and
Mao, Xiaoxi and
Fan, Changjie and
Huang, Minlie",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.tacl-1.25",
doi = "10.1162/tacl_a_00469",
pages = "434--451",
abstract = "Standard multi-task benchmarks are essential for developing pretraining models that can generalize to various downstream tasks. Existing benchmarks for natural language processing (NLP) usually focus only on understanding or generating short texts. However, long text modeling requires many distinct abilities in contrast to short texts, such as the modeling of long-range discourse and commonsense relations, and the coherence and controllability of generation. The lack of standardized benchmarks makes it difficult to assess these abilities of a model and fairly compare different models, especially Chinese models. Therefore, we propose a story-centric benchmark named LOT for evaluating Chinese long text modeling, which aggregates two understanding tasks and two generation tasks. We construct new datasets for these tasks based on human-written Chinese stories with hundreds of words. Furthermore, we release an encoder-decoder-based Chinese long text pretraining model named LongLM with up to 1 billion parameters. We pretrain LongLM on 120G Chinese novels with two generative tasks including text infilling and conditional continuation. Extensive experiments show that LongLM outperforms similar-sized pretraining models substantially on both the understanding and generation tasks in LOT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guan-etal-2022-lot">
<titleInfo>
<title>LOT: A Story-Centric Benchmark for Evaluating Chinese Long Text Understanding and Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Guan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuoer</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yamei</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruilin</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoxi</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changjie</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minlie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Standard multi-task benchmarks are essential for developing pretraining models that can generalize to various downstream tasks. Existing benchmarks for natural language processing (NLP) usually focus only on understanding or generating short texts. However, long text modeling requires many distinct abilities in contrast to short texts, such as the modeling of long-range discourse and commonsense relations, and the coherence and controllability of generation. The lack of standardized benchmarks makes it difficult to assess these abilities of a model and fairly compare different models, especially Chinese models. Therefore, we propose a story-centric benchmark named LOT for evaluating Chinese long text modeling, which aggregates two understanding tasks and two generation tasks. We construct new datasets for these tasks based on human-written Chinese stories with hundreds of words. Furthermore, we release an encoder-decoder-based Chinese long text pretraining model named LongLM with up to 1 billion parameters. We pretrain LongLM on 120G Chinese novels with two generative tasks including text infilling and conditional continuation. Extensive experiments show that LongLM outperforms similar-sized pretraining models substantially on both the understanding and generation tasks in LOT.</abstract>
<identifier type="citekey">guan-etal-2022-lot</identifier>
<identifier type="doi">10.1162/tacl_a_00469</identifier>
<location>
<url>https://aclanthology.org/2022.tacl-1.25</url>
</location>
<part>
<date>2022</date>
<detail type="volume"><number>10</number></detail>
<extent unit="page">
<start>434</start>
<end>451</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T LOT: A Story-Centric Benchmark for Evaluating Chinese Long Text Understanding and Generation
%A Guan, Jian
%A Feng, Zhuoer
%A Chen, Yamei
%A He, Ruilin
%A Mao, Xiaoxi
%A Fan, Changjie
%A Huang, Minlie
%J Transactions of the Association for Computational Linguistics
%D 2022
%V 10
%I MIT Press
%C Cambridge, MA
%F guan-etal-2022-lot
%X Standard multi-task benchmarks are essential for developing pretraining models that can generalize to various downstream tasks. Existing benchmarks for natural language processing (NLP) usually focus only on understanding or generating short texts. However, long text modeling requires many distinct abilities in contrast to short texts, such as the modeling of long-range discourse and commonsense relations, and the coherence and controllability of generation. The lack of standardized benchmarks makes it difficult to assess these abilities of a model and fairly compare different models, especially Chinese models. Therefore, we propose a story-centric benchmark named LOT for evaluating Chinese long text modeling, which aggregates two understanding tasks and two generation tasks. We construct new datasets for these tasks based on human-written Chinese stories with hundreds of words. Furthermore, we release an encoder-decoder-based Chinese long text pretraining model named LongLM with up to 1 billion parameters. We pretrain LongLM on 120G Chinese novels with two generative tasks including text infilling and conditional continuation. Extensive experiments show that LongLM outperforms similar-sized pretraining models substantially on both the understanding and generation tasks in LOT.
%R 10.1162/tacl_a_00469
%U https://aclanthology.org/2022.tacl-1.25
%U https://doi.org/10.1162/tacl_a_00469
%P 434-451
Markdown (Informal)
[LOT: A Story-Centric Benchmark for Evaluating Chinese Long Text Understanding and Generation](https://aclanthology.org/2022.tacl-1.25) (Guan et al., TACL 2022)
ACL