
# Arousal - Dominance - Valence

A dimensional Speech Emotion Recognition model that uses WavLM and wav2vec 2.0 simultaneously. It achieves 0.6760566 valence CCC on MSP-Podcast Test 1 and is used as the teacher model for wav2small (arXiv).


Wav2Small: Distilling Wav2Vec2 to 72K parameters for Low-Resource Speech Emotion Recognition.
Dionyssos Kounadis-Bastian, Oliver Schrüfer, Anna Derington, Hagen Wierstorf,
Florian Eyben, Felix Burkhardt, Björn Schuller.
arXiv preprint, 2024.
CCC on MSP-Podcast v1.7:

| Split  | Valence   | Dominance | Arousal   |
|--------|-----------|-----------|-----------|
| Test 1 | 0.6760566 | 0.6840044 | 0.7620181 |
| Test 2 | 0.4229267 | 0.4684658 | 0.4857733 |
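
The Concordance Correlation Coefficient (CCC) reported above measures agreement between predicted and annotated values, penalising both decorrelation and mean/scale offsets. A minimal sketch of the metric, assuming NumPy arrays of predictions and gold labels (the actual evaluation script is not part of this card):

```python
import numpy as np

def ccc(pred, gold):
    """Concordance Correlation Coefficient of two 1-D arrays."""
    mu_p, mu_g = pred.mean(), gold.mean()
    cov = ((pred - mu_p) * (gold - mu_g)).mean()
    return 2 * cov / (pred.var() + gold.var() + (mu_p - mu_g) ** 2)
```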

## Usage

```python
import types

import librosa
import torch
import torch.nn as nn
from transformers import AutoModelForAudioClassification
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

device = 'cpu'


class RegressionHead(nn.Module):
    r"""Regression head predicting arousal / dominance / valence."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = torch.tanh(self.dense(x))
        return self.out_proj(x)


class Dawn(Wav2Vec2PreTrainedModel):
    r"""wav2vec 2.0 A/D/V model, https://arxiv.org/abs/2203.07378"""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)

    def forward(self, x):
        '''x: (batch, audio-samples-16KHz)'''
        # zero-mean / unit-variance normalisation per utterance
        x = x - x.mean(1, keepdim=True)
        variance = (x * x).mean(1, keepdim=True) + 1e-7
        out = self.wav2vec2(x / variance.sqrt())
        # mean-pool over time, then regress and clip A/D/V to [0, 1]
        return self.classifier(out[0].mean(1)).clip(0, 1)


def _infer(self, x):
    '''x: (batch, audio-samples-16KHz)'''
    # normalise the waveform (note the plus sign, as in the upstream repo)
    x = (x + self.config.mean) / self.config.std
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # attentive statistics pooling: attention-weighted mean and std over time
    h = self.pool_model.sap_linear(x).tanh()
    w = torch.matmul(h, self.pool_model.attention)
    w = w.softmax(1)
    mu = (x * w).sum(1)
    std = ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
    x = torch.cat([mu, std], 1)
    return self.ser_model(x)


# WavLM

base = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
    trust_remote_code=True  # model class definitions live in the 3loi repo
).to(device).eval()
base.forward = types.MethodType(_infer, base)  # use _infer (above) as forward

# Wav2Vec2

dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
).to(device).eval()


def wav2small(x):
    # average the A/D/V predictions of the two teachers
    return .5 * dawn(x) + .5 * base(x)


x, _ = librosa.load('test.wav', sr=base.config.sampling_rate)  # 16 kHz mono

with torch.no_grad():
    pred = wav2small(torch.from_numpy(x[None, :]).to(device))
print(f'Arousal   = {pred[0, 0]:.3f}\n'
      f'Dominance = {pred[0, 1]:.3f}\n'
      f'Valence   = {pred[0, 2]:.3f}')
```
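
The combined prediction is a `(batch, 3)` tensor ordered arousal, dominance, valence, with values in `[0, 1]` (the `Dawn` branch clips explicitly). A minimal helper for labelling the outputs of a single file, assuming the `base`, `dawn` and `wav2small` definitions above (`predict_adv` is a hypothetical name, not part of this repo):

```python
def predict_adv(path):
    # hypothetical helper, not part of this repo
    wav, _ = librosa.load(path, sr=base.config.sampling_rate)
    with torch.no_grad():
        a, d, v = wav2small(torch.from_numpy(wav[None, :]).to(device))[0]
    return {'arousal': a.item(), 'dominance': d.item(), 'valence': v.item()}
```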