The Impact of Coverbal Visual Cues on Speech Intelligibility and Cognitive Load in Virtual Reality Environments
In natural communication settings, auditory information, including spatial cues and spectro-temporal variations, is typically accompanied by corresponding coverbal visual cues, specifically information about the talker’s location and lip movements. These visual cues can enhance speech intelligibility in noisy environments compared to audio-only conditions but may also increase cognitive load as listeners process information from multiple modalities simultaneously. While numerous studies have examined the effects of isolated visual cues on auditory processing, often using faces displayed on computer screens, the interplay and relevance of these cues in natural scenarios with fully rendered embodied conversational agents remain largely underexplored. This study aims to identify which coverbal visual cues enhance speech intelligibility and which contribute to an increased cognitive load in a virtual reality (VR) environment. Specifically, we evaluate two types of visual cues that directly correspond to information conveyed in the auditory signal: spatial information and lip movements. Using a conversational setting in VR, we assess speech intelligibility and cognitive load with the Oldenburger Sentence Test (OLSA) and verbal response times under varying levels of coverbal information. By investigating these cues in a more natural VR setting, the results contribute to a deeper understanding of multimodal speech processing.
% Bibliographic record: The Journal of the Acoustical Society of America,
% volume 157, issue "4_Supplement", page A159, April 2025.
% Notes for maintainers:
%   - the underscore in the issue label is escaped so it typesets outside math mode;
%   - month uses the standard three-letter macro (unquoted) so styles can localise it;
%   - the item occupies a single page, so no page range is given;
%   - the accented author name uses a brace-wrapped LaTeX escape, which works
%     with both classic BibTeX and Biber.
@article{10.1121/10.0037748,
  author   = {Ermert, Cosima A. and B{\"o}nsch, Andrea and Kuhlen, Torsten W. and Fels, Janina},
  title    = {The impact of coverbal visual cues on speech intelligibility and cognitive load in virtual reality environments},
  journal  = {The Journal of the Acoustical Society of America},
  volume   = {157},
  number   = {4\_Supplement},
  pages    = {A159},
  year     = {2025},
  month    = apr,
  abstract = {In natural communication settings, auditory information, including spatial cues and spectro-temporal variations, is typically accompanied by corresponding coverbal visual cues, specifically information about the talker’s location and lip movements. These visual cues can enhance speech intelligibility in noisy environments compared to audio-only conditions but may also increase cognitive load as listeners process information from multiple modalities simultaneously. While numerous studies have examined the effects of isolated visual cues on auditory processing, often using faces displayed on computer screens, the interplay and relevance of these cues in natural scenarios with fully rendered embodied conversational agents remain largely underexplored. This study aims to identify which coverbal visual cues enhance speech intelligibility and which contribute to an increased cognitive load in a virtual reality (VR) environment. Specifically, we evaluate two types of visual cues that directly correspond to information conveyed in the auditory signal: spatial information and lip movements. Using a conversational setting in VR, we assess speech intelligibility and cognitive load with the Oldenburger Sentence Test (OLSA) and verbal response times under varying levels of coverbal information. By investigating these cues in a more natural VR setting, the results contribute to a deeper understanding of multimodal speech processing. [This work was funded by the German Research Foundation (DFG): SPP2236—444724862.]},
  issn     = {0001-4966},
  doi      = {10.1121/10.0037748},
  url      = {https://doi.org/10.1121/10.0037748},
}