This paper proposes a new strategy for learning powerful cross-modal embeddings for audio-to-video synchronisation. Here, we set up the problem as one of cross-modal retrieval, where the objective is to find the most relevant audio segment given a short video clip. The method builds on recent advances in learning representations from cross-modal self-supervision. The main contributions of this paper are as follows: (1) we propose a new learning strategy where the embeddings are learnt via a multi-way matching problem, as opposed to the binary classification (matching or non-matching) problem proposed by recent papers; (2) we demonstrate that the performance of this method far exceeds the existing baselines on the synchronisation task; (3) we apply the learnt embeddings to visual speech recognition and show that these self-supervised representations match the performance of representations learnt end-to-end in a fully-supervised manner.
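For illustration, below is a minimal sketch of the multi-way matching objective described in contribution (1): given a video clip embedding, the model must pick the one synchronised audio segment out of N candidates, trained with softmax cross-entropy rather than a binary match/non-match loss. The function name `multiway_matching_loss`, the negative-Euclidean-distance scoring, and the tensor shapes are assumptions made here for the sketch, not the authors' released implementation.

```python
import torch
import torch.nn.functional as F

def multiway_matching_loss(video_emb, audio_embs, target_idx):
    """N-way matching loss: synchronisation as picking the one
    matching audio segment out of N candidates per video clip.

    video_emb:  (B, D)     embedding of each video clip
    audio_embs: (B, N, D)  N candidate audio-segment embeddings per clip
                           (e.g. temporal shifts of the same track)
    target_idx: (B,)       index of the truly synchronised candidate
    """
    # Negative squared Euclidean distance as the matching score,
    # so closer audio-video pairs receive larger logits.
    diffs = audio_embs - video_emb.unsqueeze(1)   # (B, N, D)
    logits = -diffs.pow(2).sum(dim=-1)            # (B, N)
    # Softmax cross-entropy over the N candidates: a multi-way
    # classification, not a binary matching decision.
    return F.cross_entropy(logits, target_idx)

# Usage with random stand-in embeddings (batch 8, 5 candidates, dim 512):
B, N, D = 8, 5, 512
v = torch.randn(B, D, requires_grad=True)
a = torch.randn(B, N, D, requires_grad=True)
loss = multiway_matching_loss(v, a, torch.randint(0, N, (B,)))
loss.backward()
```

In a real pipeline, `v` and `a` would come from the video and audio encoder networks, and the non-matching candidates would typically be temporally shifted segments of the same audio track, so gradients flow back into both encoders.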
Title of host publication: 2019 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2019 - Proceedings
Publisher: Institute of Electrical and Electronics Engineers Inc.
Number of pages: 5
Publication status: Published - May 2019
Event: 44th IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2019 - Brighton, United Kingdom
Duration: 12 May 2019 → 17 May 2019
Publication series: ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings
Conference: 44th IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2019
Bibliographical note: Publisher Copyright © 2019 IEEE.
All Science Journal Classification (ASJC) codes:
- Signal Processing
- Electrical and Electronic Engineering