@inproceedings{nikiforova-etal-2020-geo,
title = "Geo-Aware Image Caption Generation",
author = "Nikiforova, Sofia and
Deoskar, Tejaswini and
Paperno, Denis and
Winter, Yoad",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.280/",
doi = "10.18653/v1/2020.coling-main.280",
pages = "3143--3156",
abstract = "Standard image caption generation systems produce generic descriptions of images and do not utilize any contextual information or world knowledge. In particular, they are unable to generate captions that contain references to the geographic context of an image, for example, the location where a photograph is taken or relevant geographic objects around an image location. In this paper, we develop a geo-aware image caption generation system, which incorporates geographic contextual information into a standard image captioning pipeline. We propose a way to build an image-specific representation of the geographic context and adapt the caption generation network to produce appropriate geographic names in the image descriptions. We evaluate our system on a novel captioning dataset that contains contextualized captions and geographic metadata and achieve substantial improvements in BLEU, ROUGE, METEOR and CIDEr scores. We also introduce a new metric to assess generated geographic references directly and empirically demonstrate our system`s ability to produce captions with relevant and factually accurate geographic referencing."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nikiforova-etal-2020-geo">
<titleInfo>
<title>Geo-Aware Image Caption Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sofia</namePart>
<namePart type="family">Nikiforova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tejaswini</namePart>
<namePart type="family">Deoskar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Paperno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoad</namePart>
<namePart type="family">Winter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Standard image caption generation systems produce generic descriptions of images and do not utilize any contextual information or world knowledge. In particular, they are unable to generate captions that contain references to the geographic context of an image, for example, the location where a photograph is taken or relevant geographic objects around an image location. In this paper, we develop a geo-aware image caption generation system, which incorporates geographic contextual information into a standard image captioning pipeline. We propose a way to build an image-specific representation of the geographic context and adapt the caption generation network to produce appropriate geographic names in the image descriptions. We evaluate our system on a novel captioning dataset that contains contextualized captions and geographic metadata and achieve substantial improvements in BLEU, ROUGE, METEOR and CIDEr scores. We also introduce a new metric to assess generated geographic references directly and empirically demonstrate our system's ability to produce captions with relevant and factually accurate geographic referencing.</abstract>
<identifier type="citekey">nikiforova-etal-2020-geo</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.280</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.280/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>3143</start>
<end>3156</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Geo-Aware Image Caption Generation
%A Nikiforova, Sofia
%A Deoskar, Tejaswini
%A Paperno, Denis
%A Winter, Yoad
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F nikiforova-etal-2020-geo
%X Standard image caption generation systems produce generic descriptions of images and do not utilize any contextual information or world knowledge. In particular, they are unable to generate captions that contain references to the geographic context of an image, for example, the location where a photograph is taken or relevant geographic objects around an image location. In this paper, we develop a geo-aware image caption generation system, which incorporates geographic contextual information into a standard image captioning pipeline. We propose a way to build an image-specific representation of the geographic context and adapt the caption generation network to produce appropriate geographic names in the image descriptions. We evaluate our system on a novel captioning dataset that contains contextualized captions and geographic metadata and achieve substantial improvements in BLEU, ROUGE, METEOR and CIDEr scores. We also introduce a new metric to assess generated geographic references directly and empirically demonstrate our system's ability to produce captions with relevant and factually accurate geographic referencing.
%R 10.18653/v1/2020.coling-main.280
%U https://aclanthology.org/2020.coling-main.280/
%U https://doi.org/10.18653/v1/2020.coling-main.280
%P 3143-3156
Markdown (Informal)
[Geo-Aware Image Caption Generation](https://aclanthology.org/2020.coling-main.280/) (Nikiforova et al., COLING 2020)
ACL
Sofia Nikiforova, Tejaswini Deoskar, Denis Paperno, and Yoad Winter. 2020. Geo-Aware Image Caption Generation. In *Proceedings of the 28th International Conference on Computational Linguistics*, pages 3143–3156, Barcelona, Spain (Online). International Committee on Computational Linguistics.