@inproceedings{hayet-etal-2022-invernet,
title = "Invernet: An Inversion Attack Framework to Infer Fine-Tuning Datasets through Word Embeddings",
author = "Hayet, Ishrak and
Yao, Zijun and
Luo, Bo",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.368",
doi = "10.18653/v1/2022.findings-emnlp.368",
pages = "5009--5018",
abstract = "Word embedding aims to learn the dense representation of words and has become a regular input preparation in many NLP tasks. Due to the data and computation intensive nature of learning embeddings from scratch, a more affordable way is to borrow the pretrained embedding available in public and fine-tune the embedding through a domain specific downstream dataset. A privacy concern can arise if a malicious owner of the pretrained embedding gets access to the fine-tuned embedding and tries to infer the critical information from the downstream datasets. In this study, we propose a novel embedding inversion framework called Invernet that materializes the privacy concern by inferring the context distribution in the downstream dataset, which can lead to key information breach. With extensive experimental studies on two real-world news datasets: Antonio Gulli{'}s News and New York Times, we validate the feasibility of proposed privacy attack and demonstrate the effectiveness of Invernet on inferring downstream datasets based on multiple word embedding methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hayet-etal-2022-invernet">
<titleInfo>
<title>Invernet: An Inversion Attack Framework to Infer Fine-Tuning Datasets through Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ishrak</namePart>
<namePart type="family">Hayet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zijun</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word embedding aims to learn the dense representation of words and has become a regular input preparation in many NLP tasks. Due to the data and computation intensive nature of learning embeddings from scratch, a more affordable way is to borrow the pretrained embedding available in public and fine-tune the embedding through a domain specific downstream dataset. A privacy concern can arise if a malicious owner of the pretrained embedding gets access to the fine-tuned embedding and tries to infer the critical information from the downstream datasets. In this study, we propose a novel embedding inversion framework called Invernet that materializes the privacy concern by inferring the context distribution in the downstream dataset, which can lead to key information breach. With extensive experimental studies on two real-world news datasets: Antonio Gulli’s News and New York Times, we validate the feasibility of proposed privacy attack and demonstrate the effectiveness of Invernet on inferring downstream datasets based on multiple word embedding methods.</abstract>
<identifier type="citekey">hayet-etal-2022-invernet</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.368</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.368</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>5009</start>
<end>5018</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Invernet: An Inversion Attack Framework to Infer Fine-Tuning Datasets through Word Embeddings
%A Hayet, Ishrak
%A Yao, Zijun
%A Luo, Bo
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F hayet-etal-2022-invernet
%X Word embedding aims to learn the dense representation of words and has become a regular input preparation in many NLP tasks. Due to the data and computation intensive nature of learning embeddings from scratch, a more affordable way is to borrow the pretrained embedding available in public and fine-tune the embedding through a domain specific downstream dataset. A privacy concern can arise if a malicious owner of the pretrained embedding gets access to the fine-tuned embedding and tries to infer the critical information from the downstream datasets. In this study, we propose a novel embedding inversion framework called Invernet that materializes the privacy concern by inferring the context distribution in the downstream dataset, which can lead to key information breach. With extensive experimental studies on two real-world news datasets: Antonio Gulli’s News and New York Times, we validate the feasibility of proposed privacy attack and demonstrate the effectiveness of Invernet on inferring downstream datasets based on multiple word embedding methods.
%R 10.18653/v1/2022.findings-emnlp.368
%U https://aclanthology.org/2022.findings-emnlp.368
%U https://doi.org/10.18653/v1/2022.findings-emnlp.368
%P 5009-5018
Markdown (Informal)
[Invernet: An Inversion Attack Framework to Infer Fine-Tuning Datasets through Word Embeddings](https://aclanthology.org/2022.findings-emnlp.368) (Hayet et al., Findings 2022)
ACL