% Workshop paper (IVMSP 2022). Cleaned-up version of an exported record:
%  - month was jan with day 1, which contradicts the conference date kept in
%    the note field (26-29 June 2022); month is now jun and the non-standard
%    day field is dropped (the original values looked like an export
%    placeholder -- confirm against the publisher record if exact dates matter).
%  - series duplicated booktitle verbatim and was removed so styles do not
%    print the proceedings title twice.
%  - author names use the unambiguous "Last, First" form; acronyms in
%    booktitle are brace-protected against style recasing.
@inproceedings{c7678aa20cdf45cbbd3f5d22c4ee5aa1,
  author    = {Mocanu, Bogdan and Tapu, Ruxandra},
  title     = {Audio-Video Fusion with Double Attention for Multimodal Emotion Recognition},
  booktitle = {{IVMSP} 2022 - 2022 {IEEE} 14th Image, Video, and Multidimensional Signal Processing Workshop},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2022},
  month     = jun,
  doi       = {10.1109/IVMSP54334.2022.9816349},
  keywords  = {cross-fusion, emotion recognition, spatial attention, temporal attention},
  abstract  = {Recently, the multimodal emotion recognition has become a hot topic of research, within the affective computing community, due to its robust performances. In this paper, we propose to analyze emotions in an end-to-end manner based on various convolutional neural networks (CNN) architectures and attention mechanisms. Specifically, we develop a new framework that integrates the spatial and temporal attention into a visual 3D-CNN and temporal attention into an audio 2D-CNN in order to capture the intra-modal features characteristics. Further, the system is extended with an audio-video cross-attention fusion approach that effectively exploits the relationship across the two modalities. The proposed method achieves 87.89\% of accuracy on RAVDESS dataset. When compared with state-of-the art methods our system demonstrates accuracy gains of more than 1.89\%.},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2022 IEEE.; 14th IEEE Image, Video, and Multidimensional Signal Processing Workshop, IVMSP 2022 ; Conference date: 26-06-2022 Through 29-06-2022},
}