@inproceedings{zhao-etal-2022-explainable,
    title = "An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models",
    author = "Zhao, Tiancheng and
      Zhang, Tianqi and
      Zhu, Mingwei and
      Shen, Haozhan and
      Lee, Kyusong and
      Lu, Xiaopeng and
      Yin, Jianwei",
    editor = "Che, Wanxiang and
      Shutova, Ekaterina",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-demos.4",
    doi = "10.18653/v1/2022.emnlp-demos.4",
    pages = "30--37",
    abstract = "We introduce VL-CheckList, a toolbox for evaluating Vision-Language Pretraining (VLP) models, including the preliminary datasets that deepen the image-texting ability of a VLP model. Most existing VLP works evaluated their systems by comparing the fine-tuned downstream task performance. However, only average downstream task accuracy provides little information about the pros and cons of each VLP method. In this paper, we demonstrate how minor input changes in language and vision will affect the prediction outputs. Then, we describe the detailed user guidelines to utilize and contribute to the community. We show new findings on one of the representative VLP models to provide an example analysis. The data/code is available at \url{https://github.com/om-ai-lab/VL-CheckList}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2022-explainable">
<titleInfo>
<title>An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tiancheng</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianqi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingwei</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haozhan</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyusong</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaopeng</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianwei</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce VL-CheckList, a toolbox for evaluating Vision-Language Pretraining (VLP) models, including the preliminary datasets that deepen the image-texting ability of a VLP model. Most existing VLP works evaluated their systems by comparing the fine-tuned downstream task performance. However, only average downstream task accuracy provides little information about the pros and cons of each VLP method. In this paper, we demonstrate how minor input changes in language and vision will affect the prediction outputs. Then, we describe the detailed user guidelines to utilize and contribute to the community. We show new findings on one of the representative VLP models to provide an example analysis. The data/code is available at https://github.com/om-ai-lab/VL-CheckList</abstract>
<identifier type="citekey">zhao-etal-2022-explainable</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-demos.4</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-demos.4</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>30</start>
<end>37</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models
%A Zhao, Tiancheng
%A Zhang, Tianqi
%A Zhu, Mingwei
%A Shen, Haozhan
%A Lee, Kyusong
%A Lu, Xiaopeng
%A Yin, Jianwei
%Y Che, Wanxiang
%Y Shutova, Ekaterina
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhao-etal-2022-explainable
%X We introduce VL-CheckList, a toolbox for evaluating Vision-Language Pretraining (VLP) models, including the preliminary datasets that deepen the image-texting ability of a VLP model. Most existing VLP works evaluated their systems by comparing the fine-tuned downstream task performance. However, only average downstream task accuracy provides little information about the pros and cons of each VLP method. In this paper, we demonstrate how minor input changes in language and vision will affect the prediction outputs. Then, we describe the detailed user guidelines to utilize and contribute to the community. We show new findings on one of the representative VLP models to provide an example analysis. The data/code is available at https://github.com/om-ai-lab/VL-CheckList
%R 10.18653/v1/2022.emnlp-demos.4
%U https://aclanthology.org/2022.emnlp-demos.4
%U https://doi.org/10.18653/v1/2022.emnlp-demos.4
%P 30-37

Markdown (Informal)
[An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models](https://aclanthology.org/2022.emnlp-demos.4) (Zhao et al., EMNLP 2022)

ACL
Tiancheng Zhao, Tianqi Zhang, Mingwei Zhu, Haozhan Shen, Kyusong Lee, Xiaopeng Lu, and Jianwei Yin. 2022. An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 30–37, Abu Dhabi, UAE. Association for Computational Linguistics.