@inproceedings{yeganova-etal-2021-measuring,
title = "Measuring the relative importance of full text sections for information retrieval from scientific literature.",
author = "Yeganova, Lana and
Kim, Won Gyu and
Comeau, Donald and
Wilbur, W John and
Lu, Zhiyong",
editor = "Demner-Fushman, Dina and
Cohen, Kevin Bretonnel and
Ananiadou, Sophia and
Tsujii, Junichi",
booktitle = "Proceedings of the 20th Workshop on Biomedical Language Processing",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.bionlp-1.27",
doi = "10.18653/v1/2021.bionlp-1.27",
pages = "247--256",
abstract = {With the growing availability of full-text articles, integrating abstracts and full texts of documents into a unified representation is essential for comprehensive search of scientific literature. However, previous studies have shown that na{\"\i}vely merging abstracts with full texts of articles does not consistently yield better performance. Balancing the contribution of query terms appearing in the abstract and in sections of different importance in full text articles remains a challenge both with traditional bag-of-words IR approaches and for neural retrieval methods. In this work we establish the connection between the BM25 score of a query term appearing in a section of a full text document and the probability of that document being clicked or identified as relevant. Probability is computed using Pool Adjacent Violators (PAV), an isotonic regression algorithm, providing a maximum likelihood estimate based on the observed data. Using this probabilistic transformation of BM25 scores we show an improved performance on the PubMed Click dataset developed and presented in this study, as well as the 2007 TREC Genomics collection.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yeganova-etal-2021-measuring">
<titleInfo>
<title>Measuring the relative importance of full text sections for information retrieval from scientific literature.</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lana</namePart>
<namePart type="family">Yeganova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Won</namePart>
<namePart type="given">Gyu</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Donald</namePart>
<namePart type="family">Comeau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">W</namePart>
<namePart type="given">John</namePart>
<namePart type="family">Wilbur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyong</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Workshop on Biomedical Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="given">Bretonnel</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the growing availability of full-text articles, integrating abstracts and full texts of documents into a unified representation is essential for comprehensive search of scientific literature. However, previous studies have shown that naïvely merging abstracts with full texts of articles does not consistently yield better performance. Balancing the contribution of query terms appearing in the abstract and in sections of different importance in full text articles remains a challenge both with traditional bag-of-words IR approaches and for neural retrieval methods. In this work we establish the connection between the BM25 score of a query term appearing in a section of a full text document and the probability of that document being clicked or identified as relevant. Probability is computed using Pool Adjacent Violators (PAV), an isotonic regression algorithm, providing a maximum likelihood estimate based on the observed data. Using this probabilistic transformation of BM25 scores we show an improved performance on the PubMed Click dataset developed and presented in this study, as well as the 2007 TREC Genomics collection.</abstract>
<identifier type="citekey">yeganova-etal-2021-measuring</identifier>
<identifier type="doi">10.18653/v1/2021.bionlp-1.27</identifier>
<location>
<url>https://aclanthology.org/2021.bionlp-1.27</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>247</start>
<end>256</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Measuring the relative importance of full text sections for information retrieval from scientific literature.
%A Yeganova, Lana
%A Kim, Won Gyu
%A Comeau, Donald
%A Wilbur, W. John
%A Lu, Zhiyong
%Y Demner-Fushman, Dina
%Y Cohen, Kevin Bretonnel
%Y Ananiadou, Sophia
%Y Tsujii, Junichi
%S Proceedings of the 20th Workshop on Biomedical Language Processing
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F yeganova-etal-2021-measuring
%X With the growing availability of full-text articles, integrating abstracts and full texts of documents into a unified representation is essential for comprehensive search of scientific literature. However, previous studies have shown that naïvely merging abstracts with full texts of articles does not consistently yield better performance. Balancing the contribution of query terms appearing in the abstract and in sections of different importance in full text articles remains a challenge both with traditional bag-of-words IR approaches and for neural retrieval methods. In this work we establish the connection between the BM25 score of a query term appearing in a section of a full text document and the probability of that document being clicked or identified as relevant. Probability is computed using Pool Adjacent Violators (PAV), an isotonic regression algorithm, providing a maximum likelihood estimate based on the observed data. Using this probabilistic transformation of BM25 scores we show an improved performance on the PubMed Click dataset developed and presented in this study, as well as the 2007 TREC Genomics collection.
%R 10.18653/v1/2021.bionlp-1.27
%U https://aclanthology.org/2021.bionlp-1.27
%U https://doi.org/10.18653/v1/2021.bionlp-1.27
%P 247-256
Markdown (Informal)
[Measuring the relative importance of full text sections for information retrieval from scientific literature.](https://aclanthology.org/2021.bionlp-1.27) (Yeganova et al., BioNLP 2021)
ACL