BibTeX
@inproceedings{donmez-etal-2024-please,
title = "Please note that {I}{'}m just an {AI}: Analysis of Behavior Patterns of {LLM}s in (Non-)offensive Speech Identification",
author = {D{\"o}nmez, Esra and
Vu, Thang and
Falenska, Agnieszka},
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1019",
doi = "10.18653/v1/2024.emnlp-main.1019",
pages = "18340--18357",
abstract = "Offensive speech is highly prevalent on online platforms. Being trained on online data, Large Language Models (LLMs) display undesirable behaviors, such as generating harmful text or failing to recognize it. Despite these shortcomings, the models are becoming a part of our everyday lives by being used as tools for information search, content creation, writing assistance, and many more. Furthermore, the research explores using LLMs in applications with immense social risk, such as late-life companions and online content moderators. Despite the potential harms from LLMs in such applications, whether LLMs can reliably identify offensive speech and how they behave when they fail are open questions. This work addresses these questions by probing sixteen widely used LLMs and showing that most fail to identify (non-)offensive online language. Our experiments reveal undesirable behavior patterns in the context of offensive speech detection, such as erroneous response generation, over-reliance on profanity, and failure to recognize stereotypes. Our work highlights the need for extensive documentation of model reliability, particularly in terms of the ability to detect offensive language.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="donmez-etal-2024-please">
    <titleInfo>
      <title>Please note that I’m just an AI: Analysis of Behavior Patterns of LLMs in (Non-)offensive Speech Identification</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Esra</namePart>
      <namePart type="family">Dönmez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thang</namePart>
      <namePart type="family">Vu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Agnieszka</namePart>
      <namePart type="family">Falenska</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Offensive speech is highly prevalent on online platforms. Being trained on online data, Large Language Models (LLMs) display undesirable behaviors, such as generating harmful text or failing to recognize it. Despite these shortcomings, the models are becoming part of our everyday lives as tools for information search, content creation, writing assistance, and more. Furthermore, research explores the use of LLMs in applications with immense social risk, such as late-life companions and online content moderators. Despite the potential harms LLMs pose in such applications, whether they can reliably identify offensive speech and how they behave when they fail remain open questions. This work addresses these questions by probing sixteen widely used LLMs and showing that most fail to identify (non-)offensive online language. Our experiments reveal undesirable behavior patterns in the context of offensive speech detection, such as erroneous response generation, over-reliance on profanity, and failure to recognize stereotypes. Our work highlights the need for extensive documentation of model reliability, particularly in terms of the ability to detect offensive language.</abstract>
<identifier type="citekey">donmez-etal-2024-please</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1019</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1019</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>18340</start>
<end>18357</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Please note that I’m just an AI: Analysis of Behavior Patterns of LLMs in (Non-)offensive Speech Identification
%A Dönmez, Esra
%A Vu, Thang
%A Falenska, Agnieszka
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F donmez-etal-2024-please
%X Offensive speech is highly prevalent on online platforms. Being trained on online data, Large Language Models (LLMs) display undesirable behaviors, such as generating harmful text or failing to recognize it. Despite these shortcomings, the models are becoming part of our everyday lives as tools for information search, content creation, writing assistance, and more. Furthermore, research explores the use of LLMs in applications with immense social risk, such as late-life companions and online content moderators. Despite the potential harms LLMs pose in such applications, whether they can reliably identify offensive speech and how they behave when they fail remain open questions. This work addresses these questions by probing sixteen widely used LLMs and showing that most fail to identify (non-)offensive online language. Our experiments reveal undesirable behavior patterns in the context of offensive speech detection, such as erroneous response generation, over-reliance on profanity, and failure to recognize stereotypes. Our work highlights the need for extensive documentation of model reliability, particularly in terms of the ability to detect offensive language.
%R 10.18653/v1/2024.emnlp-main.1019
%U https://aclanthology.org/2024.emnlp-main.1019
%U https://doi.org/10.18653/v1/2024.emnlp-main.1019
%P 18340-18357
Markdown (Informal)
[Please note that I’m just an AI: Analysis of Behavior Patterns of LLMs in (Non-)offensive Speech Identification](https://aclanthology.org/2024.emnlp-main.1019) (Dönmez et al., EMNLP 2024)
ACL
Esra Dönmez, Thang Vu, and Agnieszka Falenska. 2024. Please note that I’m just an AI: Analysis of Behavior Patterns of LLMs in (Non-)offensive Speech Identification. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 18340–18357, Miami, Florida, USA. Association for Computational Linguistics.