@inproceedings{tan-2023-causal,
title = "Causal Abstraction for Chain-of-Thought Reasoning in Arithmetic Word Problems",
author = "Tan, Juanhe (TJ)",
editor = "Belinkov, Yonatan and
Hao, Sophie and
Jumelet, Jaap and
Kim, Najoung and
McCarthy, Arya and
Mohebbi, Hosein",
booktitle = "Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.blackboxnlp-1.12",
doi = "10.18653/v1/2023.blackboxnlp-1.12",
pages = "155--168",
abstract = "Recent work suggests that large language models (LLMs) achieve higher accuracy on multi-step reasoning tasks when prompted to generate intermediate reasoning steps, or a chain of thought (CoT), before their final answer. However, it is unclear how exactly CoTs improve LLMs{'} accuracy, and in particular, if LLMs use their CoTs to reason to their final answers. This paper tries to answer this question with respect to arithmetic word problems, by (i) evaluating the correctness of LLMs{'} CoTs, and (ii) using causal abstraction to assess if the intermediate tokens produced as part of a CoT causally impact LLMs{'} final answers, in line with the reasoning described by the CoT. We find that for CoT-prompted LLMs, correct answers to arithmetic problems are highly correlated with correct CoTs, and that when LLMs produce correct CoTs, they realize to a fairly large extent the causal models suggested by their CoTs. Higher degrees of realization also seem associated with better overall accuracy on the arithmetic problems. These findings suggest that some CoT-prompted LLMs may do better on multi-step arithmetic reasoning at least partly because they use their CoTs to reason to their final answers. However, for some LLMs, other internal processes may also be involved.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tan-2023-causal">
<titleInfo>
<title>Causal Abstraction for Chain-of-Thought Reasoning in Arithmetic Word Problems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juanhe</namePart>
<namePart type="given">(TJ)</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophie</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arya</namePart>
<namePart type="family">McCarthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosein</namePart>
<namePart type="family">Mohebbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent work suggests that large language models (LLMs) achieve higher accuracy on multi-step reasoning tasks when prompted to generate intermediate reasoning steps, or a chain of thought (CoT), before their final answer. However, it is unclear how exactly CoTs improve LLMs’ accuracy, and in particular, if LLMs use their CoTs to reason to their final answers. This paper tries to answer this question with respect to arithmetic word problems, by (i) evaluating the correctness of LLMs’ CoTs, and (ii) using causal abstraction to assess if the intermediate tokens produced as part of a CoT causally impact LLMs’ final answers, in line with the reasoning described by the CoT. We find that for CoT-prompted LLMs, correct answers to arithmetic problems are highly correlated with correct CoTs, and that when LLMs produce correct CoTs, they realize to a fairly large extent the causal models suggested by their CoTs. Higher degrees of realization also seem associated with better overall accuracy on the arithmetic problems. These findings suggest that some CoT-prompted LLMs may do better on multi-step arithmetic reasoning at least partly because they use their CoTs to reason to their final answers. However, for some LLMs, other internal processes may also be involved.</abstract>
<identifier type="citekey">tan-2023-causal</identifier>
<identifier type="doi">10.18653/v1/2023.blackboxnlp-1.12</identifier>
<location>
<url>https://aclanthology.org/2023.blackboxnlp-1.12</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>155</start>
<end>168</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Causal Abstraction for Chain-of-Thought Reasoning in Arithmetic Word Problems
%A Tan, Juanhe (TJ)
%Y Belinkov, Yonatan
%Y Hao, Sophie
%Y Jumelet, Jaap
%Y Kim, Najoung
%Y McCarthy, Arya
%Y Mohebbi, Hosein
%S Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F tan-2023-causal
%X Recent work suggests that large language models (LLMs) achieve higher accuracy on multi-step reasoning tasks when prompted to generate intermediate reasoning steps, or a chain of thought (CoT), before their final answer. However, it is unclear how exactly CoTs improve LLMs’ accuracy, and in particular, if LLMs use their CoTs to reason to their final answers. This paper tries to answer this question with respect to arithmetic word problems, by (i) evaluating the correctness of LLMs’ CoTs, and (ii) using causal abstraction to assess if the intermediate tokens produced as part of a CoT causally impact LLMs’ final answers, in line with the reasoning described by the CoT. We find that for CoT-prompted LLMs, correct answers to arithmetic problems are highly correlated with correct CoTs, and that when LLMs produce correct CoTs, they realize to a fairly large extent the causal models suggested by their CoTs. Higher degrees of realization also seem associated with better overall accuracy on the arithmetic problems. These findings suggest that some CoT-prompted LLMs may do better on multi-step arithmetic reasoning at least partly because they use their CoTs to reason to their final answers. However, for some LLMs, other internal processes may also be involved.
%R 10.18653/v1/2023.blackboxnlp-1.12
%U https://aclanthology.org/2023.blackboxnlp-1.12
%U https://doi.org/10.18653/v1/2023.blackboxnlp-1.12
%P 155-168
Markdown (Informal)
[Causal Abstraction for Chain-of-Thought Reasoning in Arithmetic Word Problems](https://aclanthology.org/2023.blackboxnlp-1.12) (Tan, BlackboxNLP-WS 2023)
ACL
Juanhe (TJ) Tan. 2023. Causal Abstraction for Chain-of-Thought Reasoning in Arithmetic Word Problems. In Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP, pages 155–168, Singapore. Association for Computational Linguistics.