@inproceedings{liu-etal-2024-infimm,
title = "{I}nfi{MM}: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model",
author = "Liu, Haogeng and
You, Quanzeng and
Wang, Yiqi and
Han, Xiaotian and
Zhai, Bohan and
Liu, Yongfei and
Chen, Wentao and
Jian, Yiren and
Tao, Yunzhe and
Yuan, Jianbo and
He, Ran and
Yang, Hongxia",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.27/",
doi = "10.18653/v1/2024.findings-acl.27",
pages = "485--492",
abstract = "In this work, we present InfiMM, an advanced Multimodal Large Language Model that adapts to intricate vision-language tasks. InfiMM, inspired by the Flamingo architecture, distinguishes itself through the utilization of large-scale training data, comprehensive training strategies, and diverse large language models. This approach ensures the preservation of Flamingo`s foundational strengths while simultaneously introducing augmented capabilities. Empirical evaluations across a variety of benchmarks underscore InfiMM`s remarkable capability in multimodal understanding. The code can be found at: https://anonymous.4open.science/r/infimm-zephyr-F60C/."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2024-infimm">
<titleInfo>
<title>InfiMM: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haogeng</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quanzeng</namePart>
<namePart type="family">You</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiqi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaotian</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bohan</namePart>
<namePart type="family">Zhai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongfei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wentao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiren</namePart>
<namePart type="family">Jian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunzhe</namePart>
<namePart type="family">Tao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianbo</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ran</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongxia</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we present InfiMM, an advanced Multimodal Large Language Model that adapts to intricate vision-language tasks. InfiMM, inspired by the Flamingo architecture, distinguishes itself through the utilization of large-scale training data, comprehensive training strategies, and diverse large language models. This approach ensures the preservation of Flamingo's foundational strengths while simultaneously introducing augmented capabilities. Empirical evaluations across a variety of benchmarks underscore InfiMM's remarkable capability in multimodal understanding. The code can be found at: https://anonymous.4open.science/r/infimm-zephyr-F60C/.</abstract>
<identifier type="citekey">liu-etal-2024-infimm</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.27</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.27/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>485</start>
<end>492</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T InfiMM: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model
%A Liu, Haogeng
%A You, Quanzeng
%A Wang, Yiqi
%A Han, Xiaotian
%A Zhai, Bohan
%A Liu, Yongfei
%A Chen, Wentao
%A Jian, Yiren
%A Tao, Yunzhe
%A Yuan, Jianbo
%A He, Ran
%A Yang, Hongxia
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F liu-etal-2024-infimm
%X In this work, we present InfiMM, an advanced Multimodal Large Language Model that adapts to intricate vision-language tasks. InfiMM, inspired by the Flamingo architecture, distinguishes itself through the utilization of large-scale training data, comprehensive training strategies, and diverse large language models. This approach ensures the preservation of Flamingo's foundational strengths while simultaneously introducing augmented capabilities. Empirical evaluations across a variety of benchmarks underscore InfiMM's remarkable capability in multimodal understanding. The code can be found at: https://anonymous.4open.science/r/infimm-zephyr-F60C/.
%R 10.18653/v1/2024.findings-acl.27
%U https://aclanthology.org/2024.findings-acl.27/
%U https://doi.org/10.18653/v1/2024.findings-acl.27
%P 485-492
Markdown (Informal)
[InfiMM: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model](https://aclanthology.org/2024.findings-acl.27/) (Liu et al., Findings 2024)

ACL
Haogeng Liu, Quanzeng You, Yiqi Wang, Xiaotian Han, Bohan Zhai, Yongfei Liu, Wentao Chen, Yiren Jian, Yunzhe Tao, Jianbo Yuan, Ran He, and Hongxia Yang. 2024. InfiMM: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model. In Findings of the Association for Computational Linguistics: ACL 2024, pages 485–492, Bangkok, Thailand. Association for Computational Linguistics.