@inproceedings{pouran-ben-veyseh-etal-2024-mcecr,
title = "{MCECR}: A Novel Dataset for Multilingual Cross-Document Event Coreference Resolution",
author = "Pouran Ben Veyseh, Amir and
Lai, Viet Dac and
Nguyen, Chien and
Dernoncourt, Franck and
Nguyen, Thien",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.245",
doi = "10.18653/v1/2024.findings-naacl.245",
pages = "3869--3880",
abstract = "Event coreference resolution (ECR) is a critical task in information extraction of natural language processing, aiming to identify and link event mentions across multiple documents. Despite recent progress, existing datasets for ECR primarily focus on within-document event coreference and English text, lacking cross-document ECR datasets for multiple languages beyond English. To address this issue, this work presents the first multiligual dataset for cross-document ECR, called MCECR (Multilingual Cross-Document Event Coreference Resolution), that manually annotates a diverse collection of documents for event mentions and coreference in five languages, i.e., English, Spanish, Hindi, Turkish, and Ukrainian. Using sampled articles from Wikinews over various topics as the seeds, our dataset fetches related news articles from the Google search engine to increase the number of non-singleton event clusters. In total, we annotate 5,802 news articles, providing a substantial and varied dataset for multilingual ECR in both within-document and cross-document scenarios. Extensive analysis of the proposed dataset reveals the challenging nature of multilingual event coreference resolution tasks, promoting MCECR as a strong benchmark dataset for future research in this area.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pouran-ben-veyseh-etal-2024-mcecr">
<titleInfo>
<title>MCECR: A Novel Dataset for Multilingual Cross-Document Event Coreference Resolution</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Pouran Ben Veyseh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viet</namePart>
<namePart type="given">Dac</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chien</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thien</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Event coreference resolution (ECR) is a critical task in information extraction of natural language processing, aiming to identify and link event mentions across multiple documents. Despite recent progress, existing datasets for ECR primarily focus on within-document event coreference and English text, lacking cross-document ECR datasets for multiple languages beyond English. To address this issue, this work presents the first multiligual dataset for cross-document ECR, called MCECR (Multilingual Cross-Document Event Coreference Resolution), that manually annotates a diverse collection of documents for event mentions and coreference in five languages, i.e., English, Spanish, Hindi, Turkish, and Ukrainian. Using sampled articles from Wikinews over various topics as the seeds, our dataset fetches related news articles from the Google search engine to increase the number of non-singleton event clusters. In total, we annotate 5,802 news articles, providing a substantial and varied dataset for multilingual ECR in both within-document and cross-document scenarios. Extensive analysis of the proposed dataset reveals the challenging nature of multilingual event coreference resolution tasks, promoting MCECR as a strong benchmark dataset for future research in this area.</abstract>
<identifier type="citekey">pouran-ben-veyseh-etal-2024-mcecr</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.245</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.245</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3869</start>
<end>3880</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MCECR: A Novel Dataset for Multilingual Cross-Document Event Coreference Resolution
%A Pouran Ben Veyseh, Amir
%A Lai, Viet Dac
%A Nguyen, Chien
%A Dernoncourt, Franck
%A Nguyen, Thien
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F pouran-ben-veyseh-etal-2024-mcecr
%X Event coreference resolution (ECR) is a critical task in information extraction of natural language processing, aiming to identify and link event mentions across multiple documents. Despite recent progress, existing datasets for ECR primarily focus on within-document event coreference and English text, lacking cross-document ECR datasets for multiple languages beyond English. To address this issue, this work presents the first multiligual dataset for cross-document ECR, called MCECR (Multilingual Cross-Document Event Coreference Resolution), that manually annotates a diverse collection of documents for event mentions and coreference in five languages, i.e., English, Spanish, Hindi, Turkish, and Ukrainian. Using sampled articles from Wikinews over various topics as the seeds, our dataset fetches related news articles from the Google search engine to increase the number of non-singleton event clusters. In total, we annotate 5,802 news articles, providing a substantial and varied dataset for multilingual ECR in both within-document and cross-document scenarios. Extensive analysis of the proposed dataset reveals the challenging nature of multilingual event coreference resolution tasks, promoting MCECR as a strong benchmark dataset for future research in this area.
%R 10.18653/v1/2024.findings-naacl.245
%U https://aclanthology.org/2024.findings-naacl.245
%U https://doi.org/10.18653/v1/2024.findings-naacl.245
%P 3869-3880
Markdown (Informal)
[MCECR: A Novel Dataset for Multilingual Cross-Document Event Coreference Resolution](https://aclanthology.org/2024.findings-naacl.245) (Pouran Ben Veyseh et al., Findings 2024)
ACL