@inproceedings{berhe-etal-2023-alibert,
title = "{A}li{BERT}: A Pre-trained Language Model for {F}rench Biomedical Text",
author = "Berhe, Aman and
Draznieks, Guillaume and
Martenot, Vincent and
Masdeu, Valentin and
Davy, Lucas and
Zucker, Jean-Daniel",
editor = "Demner-fushman, Dina and
Ananiadou, Sophia and
Cohen, Kevin",
booktitle = "The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.bionlp-1.19",
doi = "10.18653/v1/2023.bionlp-1.19",
pages = "223--236",
abstract = "Over the past few years, domain specific pretrained language models have been investigated and have shown remarkable achievements in different downstream tasks, especially in biomedical domain. These achievements stem on the well known BERT architecture which uses an attention based self-supervision for context learning of textual documents. However, these domain specific biomedical pretrained language models mainly use English corpora. Therefore, non-English, domain-specific pretrained models remain quite rare, both of these requirements being hard to achieve. In this work, we proposed AliBERT, a biomedical pretrained language model for French and investigated different learning strategies. AliBERT is trained using regularized Unigram based tokenizer trained for this purpose. AliBERT has achieved state of the art F1 and accuracy scores in different down-stream biomedical tasks. Our pretrained model manages to outperform some French non domain-specific models such as CamemBERT and FlauBERT on diverse down-stream tasks, with less pretraining and training time and with much smaller corpora.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="berhe-etal-2023-alibert">
<titleInfo>
<title>AliBERT: A Pre-trained Language Model for French Biomedical Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Berhe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guillaume</namePart>
<namePart type="family">Draznieks</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vincent</namePart>
<namePart type="family">Martenot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Masdeu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="family">Davy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean-Daniel</namePart>
<namePart type="family">Zucker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Over the past few years, domain-specific pretrained language models have been investigated and have shown remarkable achievements in different downstream tasks, especially in the biomedical domain. These achievements stem from the well-known BERT architecture, which uses attention-based self-supervision to learn contextual representations of textual documents. However, these domain-specific biomedical pretrained language models mainly use English corpora, so non-English, domain-specific pretrained models remain quite rare, as meeting both requirements is hard. In this work, we propose AliBERT, a biomedical pretrained language model for French, and investigate different learning strategies. AliBERT is trained using a regularized Unigram-based tokenizer trained for this purpose. AliBERT achieves state-of-the-art F1 and accuracy scores on different downstream biomedical tasks. Our pretrained model outperforms some French non-domain-specific models, such as CamemBERT and FlauBERT, on diverse downstream tasks, with less pretraining and training time and with much smaller corpora.</abstract>
<identifier type="citekey">berhe-etal-2023-alibert</identifier>
<identifier type="doi">10.18653/v1/2023.bionlp-1.19</identifier>
<location>
<url>https://aclanthology.org/2023.bionlp-1.19</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>223</start>
<end>236</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AliBERT: A Pre-trained Language Model for French Biomedical Text
%A Berhe, Aman
%A Draznieks, Guillaume
%A Martenot, Vincent
%A Masdeu, Valentin
%A Davy, Lucas
%A Zucker, Jean-Daniel
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Cohen, Kevin
%S The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F berhe-etal-2023-alibert
%X Over the past few years, domain-specific pretrained language models have been investigated and have shown remarkable achievements in different downstream tasks, especially in the biomedical domain. These achievements stem from the well-known BERT architecture, which uses attention-based self-supervision to learn contextual representations of textual documents. However, these domain-specific biomedical pretrained language models mainly use English corpora, so non-English, domain-specific pretrained models remain quite rare, as meeting both requirements is hard. In this work, we propose AliBERT, a biomedical pretrained language model for French, and investigate different learning strategies. AliBERT is trained using a regularized Unigram-based tokenizer trained for this purpose. AliBERT achieves state-of-the-art F1 and accuracy scores on different downstream biomedical tasks. Our pretrained model outperforms some French non-domain-specific models, such as CamemBERT and FlauBERT, on diverse downstream tasks, with less pretraining and training time and with much smaller corpora.
%R 10.18653/v1/2023.bionlp-1.19
%U https://aclanthology.org/2023.bionlp-1.19
%U https://doi.org/10.18653/v1/2023.bionlp-1.19
%P 223-236
Markdown (Informal)
[AliBERT: A Pre-trained Language Model for French Biomedical Text](https://aclanthology.org/2023.bionlp-1.19) (Berhe et al., BioNLP 2023)
ACL
- Aman Berhe, Guillaume Draznieks, Vincent Martenot, Valentin Masdeu, Lucas Davy, and Jean-Daniel Zucker. 2023. AliBERT: A Pre-trained Language Model for French Biomedical Text. In The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks, pages 223–236, Toronto, Canada. Association for Computational Linguistics.
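
The abstract's main methodological detail is the "regularized Unigram-based tokenizer". As a rough illustration of that general technique only (not the authors' actual training pipeline), the sketch below trains a Unigram model with the SentencePiece library and enables subword-regularization sampling at encoding time; the corpus filename, output prefix, and vocabulary size are all assumptions.

```python
import sentencepiece as spm

# Train a Unigram tokenizer. The corpus file and vocab size are
# hypothetical placeholders, not taken from the paper.
spm.SentencePieceTrainer.train(
    input="french_biomedical_corpus.txt",  # assumed corpus file
    model_prefix="unigram_fr_biomed",      # assumed output prefix
    vocab_size=32000,                      # assumed vocabulary size
    model_type="unigram",
)

sp = spm.SentencePieceProcessor(model_file="unigram_fr_biomed.model")

# Subword regularization: sample a segmentation from the Unigram
# lattice instead of always taking the single best one, so the same
# sentence can tokenize differently across training epochs.
print(sp.encode(
    "insuffisance cardiaque chronique",
    out_type=str,
    enable_sampling=True,
    alpha=0.1,        # smoothing of the sampling distribution
    nbest_size=-1,    # sample over all candidate segmentations
))
```

Sampling segmentations during pretraining acts as a data-augmentation-style regularizer for the subword vocabulary, which is one plausible reading of "regularized Unigram-based tokenizer" in the abstract.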