BibTeX
@inproceedings{selvam-etal-2023-tail,
title = "The Tail Wagging the Dog: Dataset Construction Biases of Social Bias Benchmarks",
author = "Selvam, Nikil and
Dev, Sunipa and
Khashabi, Daniel and
Khot, Tushar and
Chang, Kai-Wei",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-short.118",
doi = "10.18653/v1/2023.acl-short.118",
pages = "1373--1386",
abstract = "How reliably can we trust the scores obtained from social bias benchmarks as faithful indicators of problematic social biases in a given model? In this work, we study this question by contrasting social biases with non-social biases that stem from choices made during dataset construction (which might not even be discernible to the human eye). To do so, we empirically simulate various alternative constructions for a given benchmark based on seemingly innocuous modifications (such as paraphrasing or random-sampling) that maintain the essence of their social bias. On two well-known social bias benchmarks (Winogender and BiasNLI), we observe that these shallow modifications have a surprising effect on the resulting degree of bias across various models and consequently the relative ordering of these models when ranked by measured bias. We hope these troubling observations motivate more robust measures of social biases.",
}
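
For intuition about the procedure the abstract describes, here is a minimal Python sketch of the ranking-stability check: simulate alternative constructions of a benchmark (here via random subsampling, one of the paper's "seemingly innocuous modifications"; a paraphrasing transform would slot in the same way), rank a set of models by measured bias on each construction, and measure how often the ordering survives. This is an illustrative sketch only, not the authors' code; bias_score and the callable models are hypothetical placeholders.

# Illustrative sketch of the stability check described in the abstract.
import random
from itertools import combinations

def kendall_tau(order_a, order_b):
    # Rank correlation between two orderings of the same items (no ties):
    # +1.0 means identical order, -1.0 means fully reversed.
    pairs = list(combinations(order_a, 2))
    concordant = sum(order_b.index(x) < order_b.index(y) for x, y in pairs)
    return 2.0 * concordant / len(pairs) - 1.0

def bias_score(model, dataset):
    # Hypothetical placeholder metric: average of per-example scores. A real
    # benchmark would instead compare behaviour on contrasting items
    # (e.g. pronoun-swapped pairs in Winogender).
    return sum(model(example) for example in dataset) / len(dataset)

def bias_ranking(models, dataset):
    # Order models from least to most measured bias on this construction.
    return sorted(models, key=lambda m: bias_score(m, dataset))

def ranking_stability(models, benchmark, n_trials=100, frac=0.8, seed=0):
    # Simulate alternative benchmark constructions by random subsampling,
    # then compare each induced ranking to the full-benchmark ranking.
    rng = random.Random(seed)
    base = bias_ranking(models, benchmark)
    taus = []
    for _ in range(n_trials):
        subset = rng.sample(benchmark, int(frac * len(benchmark)))
        taus.append(kendall_tau(base, bias_ranking(models, subset)))
    return sum(taus) / len(taus)  # 1.0 only if the ordering never changes

# Toy usage: three "models" scoring numeric stand-in examples.
models = [lambda x, b=b: min(1.0, x * b) for b in (0.5, 1.0, 1.5)]
benchmark = [random.random() for _ in range(200)]
print(ranking_stability(models, benchmark))

A stability value well below 1.0 would mirror the paper's observation: shallow construction choices alone can reshuffle which model looks "least biased".
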
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="selvam-etal-2023-tail">
<titleInfo>
<title>The Tail Wagging the Dog: Dataset Construction Biases of Social Bias Benchmarks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikil</namePart>
<namePart type="family">Selvam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sunipa</namePart>
<namePart type="family">Dev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Khashabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tushar</namePart>
<namePart type="family">Khot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>How reliably can we trust the scores obtained from social bias benchmarks as faithful indicators of problematic social biases in a given model? In this work, we study this question by contrasting social biases with non-social biases that stem from choices made during dataset construction (which might not even be discernible to the human eye). To do so, we empirically simulate various alternative constructions for a given benchmark based on seemingly innocuous modifications (such as paraphrasing or random-sampling) that maintain the essence of their social bias. On two well-known social bias benchmarks (Winogender and BiasNLI), we observe that these shallow modifications have a surprising effect on the resulting degree of bias across various models and consequently the relative ordering of these models when ranked by measured bias. We hope these troubling observations motivate more robust measures of social biases.</abstract>
<identifier type="citekey">selvam-etal-2023-tail</identifier>
<identifier type="doi">10.18653/v1/2023.acl-short.118</identifier>
<location>
<url>https://aclanthology.org/2023.acl-short.118</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>1373</start>
<end>1386</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T The Tail Wagging the Dog: Dataset Construction Biases of Social Bias Benchmarks
%A Selvam, Nikil
%A Dev, Sunipa
%A Khashabi, Daniel
%A Khot, Tushar
%A Chang, Kai-Wei
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F selvam-etal-2023-tail
%X How reliably can we trust the scores obtained from social bias benchmarks as faithful indicators of problematic social biases in a given model? In this work, we study this question by contrasting social biases with non-social biases that stem from choices made during dataset construction (which might not even be discernible to the human eye). To do so, we empirically simulate various alternative constructions for a given benchmark based on seemingly innocuous modifications (such as paraphrasing or random-sampling) that maintain the essence of their social bias. On two well-known social bias benchmarks (Winogender and BiasNLI), we observe that these shallow modifications have a surprising effect on the resulting degree of bias across various models and consequently the relative ordering of these models when ranked by measured bias. We hope these troubling observations motivate more robust measures of social biases.
%R 10.18653/v1/2023.acl-short.118
%U https://aclanthology.org/2023.acl-short.118
%U https://doi.org/10.18653/v1/2023.acl-short.118
%P 1373-1386
Markdown (Informal)
[The Tail Wagging the Dog: Dataset Construction Biases of Social Bias Benchmarks](https://aclanthology.org/2023.acl-short.118) (Selvam et al., ACL 2023)
ACL
Nikil Selvam, Sunipa Dev, Daniel Khashabi, Tushar Khot, and Kai-Wei Chang. 2023. The Tail Wagging the Dog: Dataset Construction Biases of Social Bias Benchmarks. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 1373–1386, Toronto, Canada. Association for Computational Linguistics.