Low-level Fusion of Audio and Video Feature for Multi-modal Emotion Recognition (bibtex)
  author = {Wimmer, Matthias and Schuller, Björn and Arsic, Dejan and Radig,
	Bernd and Rigoll, Gerhard},
  title = {Low-level Fusion of Audio and Video Feature for Multi-modal Emotion
	Recognition},
  booktitle = {3rd International Conference on Computer Vision Theory and Applications},
  year = {2008},
  volume = {2},
  pages = {145--151},
  address = {Madeira, Portugal},
  month = jan,
  abstract = {Bimodal emotion recognition through audiovisual feature fusion has
	been shown superior over each individual modality in the past. Still,
	synchronization of the two streams is a challenge, as many vision
	approaches work on a frame basis opposing audio turn- or chunk-basis.
	Therefore, late fusion schemes such as simple logic or voting strategies
	are commonly used for the overall estimation of underlying affect.
	However, early fusion is known to be more effective in many other
	multimodal recognition tasks. We therefore suggest a combined analysis
	by descriptive statistics of audio and video Low-Level-Descriptors
	for subsequent static {SVM} Classification. This strategy also allows
	for a combined feature-space optimization which will be discussed
	herein. The high effectiveness of this approach is shown on a database
	of 11.5h containing six emotional situations in an airplane scenario.}
