@inproceedings{wang-etal-2020-maven,
title = "{MAVEN}: {A} {M}assive {G}eneral {D}omain {E}vent {D}etection {D}ataset",
author = "Wang, Xiaozhi and
Wang, Ziqi and
Han, Xu and
Jiang, Wangyi and
Han, Rong and
Liu, Zhiyuan and
Li, Juanzi and
Li, Peng and
Lin, Yankai and
Zhou, Jie",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.129",
doi = "10.18653/v1/2020.emnlp-main.129",
pages = "1652--1671",
abstract = "Event detection (ED), which means identifying event trigger words and classifying event types, is the first and most fundamental step for extracting event knowledge from plain text. Most existing datasets exhibit the following issues that limit further development of ED: (1) Data scarcity. Existing small-scale datasets are not sufficient for training and stably benchmarking increasingly sophisticated modern neural methods. (2) Low coverage. Limited event types of existing datasets cannot well cover general-domain events, which restricts the applications of ED models. To alleviate these problems, we present a MAssive eVENt detection dataset (MAVEN), which contains 4,480 Wikipedia documents, 118,732 event mention instances, and 168 event types. MAVEN alleviates the data scarcity problem and covers much more general event types. We reproduce the recent state-of-the-art ED models and conduct a thorough evaluation on MAVEN. The experimental results show that existing ED methods cannot achieve promising results on MAVEN as on the small datasets, which suggests that ED in the real world remains a challenging task and requires further research efforts. We also discuss further directions for general domain ED with empirical analyses. The source code and dataset can be obtained from \url{https://github.com/THU-KEG/MAVEN-dataset}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2020-maven">
<titleInfo>
<title>MAVEN: A Massive General Domain Event Detection Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaozhi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziqi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wangyi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rong</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juanzi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yankai</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bonnie</namePart>
<namePart type="family">Webber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Event detection (ED), which means identifying event trigger words and classifying event types, is the first and most fundamental step for extracting event knowledge from plain text. Most existing datasets exhibit the following issues that limit further development of ED: (1) Data scarcity. Existing small-scale datasets are not sufficient for training and stably benchmarking increasingly sophisticated modern neural methods. (2) Low coverage. Limited event types of existing datasets cannot well cover general-domain events, which restricts the applications of ED models. To alleviate these problems, we present a MAssive eVENt detection dataset (MAVEN), which contains 4,480 Wikipedia documents, 118,732 event mention instances, and 168 event types. MAVEN alleviates the data scarcity problem and covers much more general event types. We reproduce the recent state-of-the-art ED models and conduct a thorough evaluation on MAVEN. The experimental results show that existing ED methods cannot achieve promising results on MAVEN as on the small datasets, which suggests that ED in the real world remains a challenging task and requires further research efforts. We also discuss further directions for general domain ED with empirical analyses. The source code and dataset can be obtained from https://github.com/THU-KEG/MAVEN-dataset.</abstract>
<identifier type="citekey">wang-etal-2020-maven</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.129</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.129</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>1652</start>
<end>1671</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MAVEN: A Massive General Domain Event Detection Dataset
%A Wang, Xiaozhi
%A Wang, Ziqi
%A Han, Xu
%A Jiang, Wangyi
%A Han, Rong
%A Liu, Zhiyuan
%A Li, Juanzi
%A Li, Peng
%A Lin, Yankai
%A Zhou, Jie
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F wang-etal-2020-maven
%X Event detection (ED), which means identifying event trigger words and classifying event types, is the first and most fundamental step for extracting event knowledge from plain text. Most existing datasets exhibit the following issues that limit further development of ED: (1) Data scarcity. Existing small-scale datasets are not sufficient for training and stably benchmarking increasingly sophisticated modern neural methods. (2) Low coverage. Limited event types of existing datasets cannot well cover general-domain events, which restricts the applications of ED models. To alleviate these problems, we present a MAssive eVENt detection dataset (MAVEN), which contains 4,480 Wikipedia documents, 118,732 event mention instances, and 168 event types. MAVEN alleviates the data scarcity problem and covers much more general event types. We reproduce the recent state-of-the-art ED models and conduct a thorough evaluation on MAVEN. The experimental results show that existing ED methods cannot achieve promising results on MAVEN as on the small datasets, which suggests that ED in the real world remains a challenging task and requires further research efforts. We also discuss further directions for general domain ED with empirical analyses. The source code and dataset can be obtained from https://github.com/THU-KEG/MAVEN-dataset.
%R 10.18653/v1/2020.emnlp-main.129
%U https://aclanthology.org/2020.emnlp-main.129
%U https://doi.org/10.18653/v1/2020.emnlp-main.129
%P 1652-1671
Markdown (Informal)
[MAVEN: A Massive General Domain Event Detection Dataset](https://aclanthology.org/2020.emnlp-main.129) (Wang et al., EMNLP 2020)
ACL
- Xiaozhi Wang, Ziqi Wang, Xu Han, Wangyi Jiang, Rong Han, Zhiyuan Liu, Juanzi Li, Peng Li, Yankai Lin, and Jie Zhou. 2020. MAVEN: A Massive General Domain Event Detection Dataset. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1652–1671, Online. Association for Computational Linguistics.