% Conference paper (ISVC 2025 proceedings, Lecture Notes in Computer Science).
% The citation key is kept unchanged so existing \cite commands still resolve.
% Names use the unambiguous "Last, First" form; the DOI is stored bare (no LaTeX escapes).
@inproceedings{871d1ab950e342969d3fc761ba9eb141,
  author    = {Mocanu, Bogdan and Tapu, Ruxandra},
  title     = {Seeing Through Words: A Zero-Shot Multimodal Audio Description System with Foundation Models},
  booktitle = {Advances in Visual Computing -- 20th International Symposium, {ISVC} 2025, Proceedings},
  editor    = {Bebis, George and Ye, Jinwei and Wang, Yuxiong and {Konakovic Lukovic}, Mina and Kalantari, Nima Khademi and Cho, Isaac and Yang, Yalong and Dimara, Evanthia and Brehmer, Matthew},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  pages     = {85--97},
  year      = {2026},
  month     = jan,
  day       = {1},
  doi       = {10.1007/978-3-032-14495-9_7},
  isbn      = {9783032144942},
  language  = {English},
  keywords  = {Audio description, Character recognition, Semantic prompting, Temporal memory, Video understanding},
  abstract  = {Audio description (AD) plays a crucial role in making video content accessible to visually impaired audiences, yet current approaches often rely on expensive supervised training or struggle to capture temporal and narrative consistency. We introduce a training-free framework that integrates vision--language models (VLMs) with large language models (LLMs) through three complementary mechanisms: semantic-constrained prompting to reduce irrelevant content, adaptive character reasoning for accurate entity grounding, and a memory structure that aligns fine-grained shot-level cues with longer scene-level context. This design allows the system to generate temporally coherent and context-aware AD without requiring additional training data. Evaluation on the MAD-eval-Named and TV-AD benchmarks demonstrates consistent improvements over state-of-the-art training-free methods, with gains in both lexical and semantic quality metrics.},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2026.; 20th International Symposium on Visual Computing, ISVC 2025 ; Conference date: 17-11-2025 Through 19-11-2025},
}