@misc{10481/70588,
  year         = {2020},
  month        = {10},
  url          = {http://hdl.handle.net/10481/70588},
  abstract     = {Visual voice activity detection (V-VAD) uses visual features to predict whether a person is speaking or not. V-VAD is useful whenever audio VAD (A-VAD) is inefficient, either because the acoustic signal is difficult to analyze or because it is simply missing. We propose two deep architectures for V-VAD, one based on facial landmarks and one based on optical flow. Moreover, available datasets, used for learning and for testing V-VAD, lack content variability. We introduce a novel methodology to automatically create and annotate very large in-the-wild datasets -- WildVVAD -- based on combining A-VAD with face detection and tracking. A thorough empirical evaluation shows the advantage of training the proposed deep V-VAD models with this dataset.},
  organization = {European Commission 871245 SPRING; Multidisciplinary Institute in Artificial Intelligence (MIAI) ANR-19-P3IA-0003},
  publisher    = {IEEE},
  title        = {Learning Visual Voice Activity Detection with an Automatically Annotated Dataset},
  author       = {Guy, Sylvain and Lathuilière, Stéphane and Mesejo Santiago, Pablo and Horaud, Radu}
}