@inproceedings{paetzold-specia-2016-collecting,
title = "Collecting and Exploring Everyday Language for Predicting Psycholinguistic Properties of Words",
author = "Paetzold, Gustavo and
Specia, Lucia",
editor = "Matsumoto, Yuji and
Prasad, Rashmi",
booktitle = "Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/C16-1157",
pages = "1669--1679",
abstract = "Exploring language usage through frequency analysis in large corpora is a defining feature in most recent work in corpus and computational linguistics. From a psycholinguistic perspective, however, the corpora used in these contributions are often not representative of language usage: they are either domain-specific, limited in size, or extracted from unreliable sources. In an effort to address this limitation, we introduce SubIMDB, a corpus of everyday language spoken text we created which contains over 225 million words. The corpus was extracted from 38,102 subtitles of family, comedy and children movies and series, and is the first sizeable structured corpus of subtitles made available. Our experiments show that word frequency norms extracted from this corpus are more effective than those from well-known norms such as Kucera-Francis, HAL and SUBTLEXus in predicting various psycholinguistic properties of words, such as lexical decision times, familiarity, age of acquisition and simplicity. We also provide evidence that contradict the long-standing assumption that the ideal size for a corpus can be determined solely based on how well its word frequencies correlate with lexical decision times.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="paetzold-specia-2016-collecting">
<titleInfo>
<title>Collecting and Exploring Everyday Language for Predicting Psycholinguistic Properties of Words</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gustavo</namePart>
<namePart type="family">Paetzold</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuji</namePart>
<namePart type="family">Matsumoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rashmi</namePart>
<namePart type="family">Prasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Exploring language usage through frequency analysis in large corpora is a defining feature in most recent work in corpus and computational linguistics. From a psycholinguistic perspective, however, the corpora used in these contributions are often not representative of language usage: they are either domain-specific, limited in size, or extracted from unreliable sources. In an effort to address this limitation, we introduce SubIMDB, a corpus of everyday language spoken text we created which contains over 225 million words. The corpus was extracted from 38,102 subtitles of family, comedy and children movies and series, and is the first sizeable structured corpus of subtitles made available. Our experiments show that word frequency norms extracted from this corpus are more effective than those from well-known norms such as Kucera-Francis, HAL and SUBTLEXus in predicting various psycholinguistic properties of words, such as lexical decision times, familiarity, age of acquisition and simplicity. We also provide evidence that contradict the long-standing assumption that the ideal size for a corpus can be determined solely based on how well its word frequencies correlate with lexical decision times.</abstract>
<identifier type="citekey">paetzold-specia-2016-collecting</identifier>
<location>
<url>https://aclanthology.org/C16-1157</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>1669</start>
<end>1679</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Collecting and Exploring Everyday Language for Predicting Psycholinguistic Properties of Words
%A Paetzold, Gustavo
%A Specia, Lucia
%Y Matsumoto, Yuji
%Y Prasad, Rashmi
%S Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F paetzold-specia-2016-collecting
%X Exploring language usage through frequency analysis in large corpora is a defining feature in most recent work in corpus and computational linguistics. From a psycholinguistic perspective, however, the corpora used in these contributions are often not representative of language usage: they are either domain-specific, limited in size, or extracted from unreliable sources. In an effort to address this limitation, we introduce SubIMDB, a corpus of everyday language spoken text we created which contains over 225 million words. The corpus was extracted from 38,102 subtitles of family, comedy and children movies and series, and is the first sizeable structured corpus of subtitles made available. Our experiments show that word frequency norms extracted from this corpus are more effective than those from well-known norms such as Kucera-Francis, HAL and SUBTLEXus in predicting various psycholinguistic properties of words, such as lexical decision times, familiarity, age of acquisition and simplicity. We also provide evidence that contradict the long-standing assumption that the ideal size for a corpus can be determined solely based on how well its word frequencies correlate with lexical decision times.
%U https://aclanthology.org/C16-1157
%P 1669-1679
Markdown (Informal)
[Collecting and Exploring Everyday Language for Predicting Psycholinguistic Properties of Words](https://aclanthology.org/C16-1157) (Paetzold & Specia, COLING 2016)
ACL