@inproceedings{yang-wan-2022-investigating,
title = "Investigating Metric Diversity for Evaluating Long Document Summarisation",
author = "Yang, Cai and
Wan, Stephen",
editor = "Cohan, Arman and
Feigenblat, Guy and
Freitag, Dayne and
Ghosal, Tirthankar and
Herrmannova, Drahomira and
Knoth, Petr and
Lo, Kyle and
Mayr, Philipp and
Shmueli-Scheuer, Michal and
de Waard, Anita and
Wang, Lucy Lu",
booktitle = "Proceedings of the Third Workshop on Scholarly Document Processing",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.sdp-1.13/",
pages = "115--125",
abstract = "Long document summarisation, a challenging summarisation scenario, is the focus of the recently proposed LongSumm shared task. One of the limitations of this shared task has been its use of a single family of metrics for evaluation (the ROUGE metrics). In contrast, other fields, like text generation, employ multiple metrics. We replicated the LongSumm evaluation using multiple test set samples (vs. the single test set of the official shared task) and investigated how different metrics might complement each other in this evaluation framework. We show that under this more rigorous evaluation, (1) some of the key learnings from LongSumm 2020 and 2021 still hold, but the relative ranking of systems changes, (2) the use of additional metrics reveals additional high-quality summaries missed by ROUGE, and (3) SPICE is a candidate metric for summarisation evaluation for LongSumm."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-wan-2022-investigating">
<titleInfo>
<title>Investigating Metric Diversity for Evaluating Long Document Summarisation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cai</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Scholarly Document Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arman</namePart>
<namePart type="family">Cohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Feigenblat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayne</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drahomira</namePart>
<namePart type="family">Herrmannova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Knoth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Mayr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli-Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anita</namePart>
<namePart type="family">de Waard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucy</namePart>
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Long document summarisation, a challenging summarisation scenario, is the focus of the recently proposed LongSumm shared task. One of the limitations of this shared task has been its use of a single family of metrics for evaluation (the ROUGE metrics). In contrast, other fields, like text generation, employ multiple metrics. We replicated the LongSumm evaluation using multiple test set samples (vs. the single test set of the official shared task) and investigated how different metrics might complement each other in this evaluation framework. We show that under this more rigorous evaluation, (1) some of the key learnings from LongSumm 2020 and 2021 still hold, but the relative ranking of systems changes, (2) the use of additional metrics reveals additional high-quality summaries missed by ROUGE, and (3) SPICE is a candidate metric for summarisation evaluation for LongSumm.</abstract>
<identifier type="citekey">yang-wan-2022-investigating</identifier>
<location>
<url>https://aclanthology.org/2022.sdp-1.13/</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>115</start>
<end>125</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Investigating Metric Diversity for Evaluating Long Document Summarisation
%A Yang, Cai
%A Wan, Stephen
%Y Cohan, Arman
%Y Feigenblat, Guy
%Y Freitag, Dayne
%Y Ghosal, Tirthankar
%Y Herrmannova, Drahomira
%Y Knoth, Petr
%Y Lo, Kyle
%Y Mayr, Philipp
%Y Shmueli-Scheuer, Michal
%Y de Waard, Anita
%Y Wang, Lucy Lu
%S Proceedings of the Third Workshop on Scholarly Document Processing
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F yang-wan-2022-investigating
%X Long document summarisation, a challenging summarisation scenario, is the focus of the recently proposed LongSumm shared task. One of the limitations of this shared task has been its use of a single family of metrics for evaluation (the ROUGE metrics). In contrast, other fields, like text generation, employ multiple metrics. We replicated the LongSumm evaluation using multiple test set samples (vs. the single test set of the official shared task) and investigated how different metrics might complement each other in this evaluation framework. We show that under this more rigorous evaluation, (1) some of the key learnings from LongSumm 2020 and 2021 still hold, but the relative ranking of systems changes, (2) the use of additional metrics reveals additional high-quality summaries missed by ROUGE, and (3) SPICE is a candidate metric for summarisation evaluation for LongSumm.
%U https://aclanthology.org/2022.sdp-1.13/
%P 115-125
Markdown (Informal)
[Investigating Metric Diversity for Evaluating Long Document Summarisation](https://aclanthology.org/2022.sdp-1.13/) (Yang & Wan, SDP 2022)
ACL