@inproceedings{shen-silberer-2023-combining,
title = "Combining Tradition with Modernness: Exploring Event Representations in Vision-and-Language Models for Visual Goal-Step Inference",
author = "Shen, Chong and
Silberer, Carina",
editor = "Padmakumar, Vishakh and
Vallejo, Gisela and
Fu, Yao",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-srw.36",
doi = "10.18653/v1/2023.acl-srw.36",
pages = "254--265",
abstract = "Procedural knowledge understanding (PKU) underlies the ability to infer goal-step relations. The task of Visual Goal{--}Step Inference addresses this ability in the multimodal domain. It requires to identify images that represent the steps towards achieving a textually expressed goal. The best existing methods encode texts and images either with independent encoders, or with object-level multimodal encoders using blackbox transformers. This stands in contrast to early, linguistically inspired methods for event representations, which focus on capturing the most crucial information, namely actions and the participants, to learn stereotypical event sequences and hence procedural knowledge. In this work, we study various methods and their effects on PKU of injecting the early shallow event representations to nowadays multimodal deep learning-based models. We find that the early, linguistically inspired methods for representing event knowledge does contribute to understand procedures in combination with modern vision-and-language models. In the future, we are going to explore more complex structure of events and study how to exploit it on top of large language models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shen-silberer-2023-combining">
<titleInfo>
<title>Combining Tradition with Modernness: Exploring Event Representations in Vision-and-Language Models for Visual Goal-Step Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chong</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carina</namePart>
<namePart type="family">Silberer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vishakh</namePart>
<namePart type="family">Padmakumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gisela</namePart>
<namePart type="family">Vallejo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yao</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Procedural knowledge understanding (PKU) underlies the ability to infer goal-step relations. The task of Visual Goal–Step Inference addresses this ability in the multimodal domain. It requires identifying images that represent the steps towards achieving a textually expressed goal. The best existing methods encode texts and images either with independent encoders or with object-level multimodal encoders using black-box transformers. This stands in contrast to early, linguistically inspired methods for event representation, which focus on capturing the most crucial information, namely the actions and the participants, to learn stereotypical event sequences and hence procedural knowledge. In this work, we study various methods of injecting these early, shallow event representations into modern multimodal deep learning-based models, and their effects on PKU. We find that the early, linguistically inspired methods for representing event knowledge do contribute to procedure understanding in combination with modern vision-and-language models. In future work, we plan to explore more complex event structures and study how to exploit them on top of large language models.</abstract>
<identifier type="citekey">shen-silberer-2023-combining</identifier>
<identifier type="doi">10.18653/v1/2023.acl-srw.36</identifier>
<location>
<url>https://aclanthology.org/2023.acl-srw.36</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>254</start>
<end>265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combining Tradition with Modernness: Exploring Event Representations in Vision-and-Language Models for Visual Goal-Step Inference
%A Shen, Chong
%A Silberer, Carina
%Y Padmakumar, Vishakh
%Y Vallejo, Gisela
%Y Fu, Yao
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F shen-silberer-2023-combining
%X Procedural knowledge understanding (PKU) underlies the ability to infer goal-step relations. The task of Visual Goal–Step Inference addresses this ability in the multimodal domain. It requires identifying images that represent the steps towards achieving a textually expressed goal. The best existing methods encode texts and images either with independent encoders or with object-level multimodal encoders using black-box transformers. This stands in contrast to early, linguistically inspired methods for event representation, which focus on capturing the most crucial information, namely the actions and the participants, to learn stereotypical event sequences and hence procedural knowledge. In this work, we study various methods of injecting these early, shallow event representations into modern multimodal deep learning-based models, and their effects on PKU. We find that the early, linguistically inspired methods for representing event knowledge do contribute to procedure understanding in combination with modern vision-and-language models. In future work, we plan to explore more complex event structures and study how to exploit them on top of large language models.
%R 10.18653/v1/2023.acl-srw.36
%U https://aclanthology.org/2023.acl-srw.36
%U https://doi.org/10.18653/v1/2023.acl-srw.36
%P 254-265
Markdown (Informal)
[Combining Tradition with Modernness: Exploring Event Representations in Vision-and-Language Models for Visual Goal-Step Inference](https://aclanthology.org/2023.acl-srw.36) (Shen & Silberer, ACL 2023)