hsiangyualex committed on
Commit
d1add7a
1 Parent(s): 0fc1d10

Upload 41 files

Files changed (41)
  1. app.py +133 -0
  2. audio_models.py +389 -0
  3. ckpt/emo_dim_model/config.json +122 -0
  4. ckpt/emo_dim_model/model.safetensors +3 -0
  5. ckpt/emo_dim_model/preprocessor_config.json +9 -0
  6. ckpt/emo_dim_model/vocab.json +1 -0
  7. ckpt/sepformer-wham-enhancement/decoder.ckpt +3 -0
  8. ckpt/sepformer-wham-enhancement/encoder.ckpt +3 -0
  9. ckpt/sepformer-wham-enhancement/hyperparams.yaml +66 -0
  10. ckpt/sepformer-wham-enhancement/masknet.ckpt +3 -0
  11. ckpt/ser_cn_audio/config.json +74 -0
  12. ckpt/ser_cn_audio/preprocessor_config.json +9 -0
  13. ckpt/ser_cn_audio/pytorch_model.bin +3 -0
  14. ckpt/ser_en_audio/config.json +148 -0
  15. ckpt/ser_en_audio/model.safetensors +3 -0
  16. ckpt/ser_en_audio/optimizer.pt +3 -0
  17. ckpt/ser_en_audio/preprocessor_config.json +10 -0
  18. ckpt/ser_en_audio/rng_state_0.pth +3 -0
  19. ckpt/ser_en_audio/rng_state_1.pth +3 -0
  20. ckpt/ser_en_audio/rng_state_2.pth +3 -0
  21. ckpt/ser_en_audio/rng_state_3.pth +3 -0
  22. ckpt/ser_en_audio/scheduler.pt +3 -0
  23. ckpt/ser_en_audio/trainer_state.json +652 -0
  24. ckpt/ser_en_audio/training_args.bin +3 -0
  25. ckpt/ser_en_text/config.json +45 -0
  26. ckpt/ser_en_text/merges.txt +0 -0
  27. ckpt/ser_en_text/pytorch_model.bin +3 -0
  28. ckpt/ser_en_text/special_tokens_map.json +1 -0
  29. ckpt/ser_en_text/tokenizer.json +0 -0
  30. ckpt/ser_en_text/tokenizer_config.json +1 -0
  31. ckpt/ser_en_text/training_args.bin +3 -0
  32. ckpt/ser_en_text/vocab.json +0 -0
  33. ckpt/zh-2-en/config.json +60 -0
  34. ckpt/zh-2-en/generation_config.json +16 -0
  35. ckpt/zh-2-en/metadata.json +1 -0
  36. ckpt/zh-2-en/pytorch_model.bin +3 -0
  37. ckpt/zh-2-en/rust_model.ot +3 -0
  38. ckpt/zh-2-en/source.spm +0 -0
  39. ckpt/zh-2-en/target.spm +0 -0
  40. ckpt/zh-2-en/tokenizer_config.json +1 -0
  41. ckpt/zh-2-en/vocab.json +0 -0
app.py ADDED
@@ -0,0 +1,133 @@
+ import os
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+ import time
+ import shutil
+ import argparse
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import gradio as gr
+ from glob import glob
+ from audio_models import EnglishEmotionModel
+
+
+ def classify(audio, model_choice, preprocess, weight):
+     return en_model.predict(audio, model_choice, preprocess, weight)
+
+
+ def handle_feedback(audio: str, model_prediction, unsatisfied, true_label, savedir='./user_feedback'):
+     os.makedirs(savedir, exist_ok=True)
+     if unsatisfied:
+         audio_path = os.path.join(savedir, f"{int(time.time())}.wav")
+         # save the audio attached to the user feedback
+         if isinstance(audio, str):  # a temporary file path
+             shutil.copy(audio, audio_path)
+         elif isinstance(audio, tuple):  # raw audio data
+             sr, data = audio
+             sf.write(audio_path, data, sr)  # librosa.output.write_wav was removed in librosa >= 0.8
+         else:
+             raise ValueError("Invalid audio input")
+         # save the model prediction and true label as a Python dict, serialized with torch.save
+         feedback = {
+             "audio_path": audio_path,
+             "model_prediction": model_prediction,
+             "true_label": true_label
+         }
+         feedback_path = os.path.join(savedir, f"{int(time.time())}.pkl")
+         torch.save(feedback, feedback_path)
+         return f"Feedback submitted: True Label = {true_label}, Model Prediction = {model_prediction}"
+     return "Thank you for using our SER demo!"
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--duration', type=int, default=10, help='duration of audio')
+     parser.add_argument('--sr', type=int, default=16000, help='sampling rate of audio')
+     parser.add_argument('--device', type=str, default='cuda', help='device index to run model')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+     args = parse_args()
+     en_model = EnglishEmotionModel(duration=args.duration, sr=args.sr, device=torch.device(args.device))
+     english_audio_paths = glob('audio_files/english/*.wav')
+     english_audio_paths.sort()
+     english_audio_paths = {f"English Audio {idx + 1}": path for idx, path in enumerate(english_audio_paths)}
+
+     # refresh the audio dropdown choices
+     def update_audio_options():
+         return gr.update(choices=list(english_audio_paths.keys()), value="English Audio 1")
+
+     # update the audio player with the selected file
+     def update_audio_file(audio_selection):
+         return english_audio_paths[audio_selection]
+
+     with gr.Blocks() as demo:
+         # tabbed interface
+         # with gr.Tab("Demo (Built-In Audio)"):
+         #     gr.Markdown("""## Automatic Emotion Recognition Demo \n
+         #     This is a demo for audio emotion recognition.
+         #     Note that the model is still under active development. Please feel free to report any issues. \n
+         #     The Chinese model is based on HuBERT and the English model is based on Wav2Vec2.""")
+         #     with gr.Row():
+         #         with gr.Column():
+         #             # dropdown for selecting a clip, defaulting to the first English clip
+         #             audio_dropdown = gr.Dropdown(list(english_audio_paths.keys()), label="Select Audio", value="English Audio 1", interactive=True)
+         #             # audio player, defaulting to English Audio 1
+         #             audio_player = gr.Audio(value=english_audio_paths["English Audio 1"], interactive=False)
+         #             slider = gr.Slider(label='Context Weight', minimum=0, maximum=1, step=0.01, value=0.6)
+         #         with gr.Column():
+         #             # display the emotion classification result
+         #             emotion_label = gr.Label(label="Emotion Prediction")
+         #             dim_label = gr.Plot(label="Emotion Dimension")
+         #             transcripts = gr.Textbox(label="Transcription", type='text', lines=5, max_lines=20, placeholder="Transcription")
+         #             # button that refreshes the emotion classification result
+         #             classify_button = gr.Button("Classify Emotion")
+
+         #     audio_dropdown.change(
+         #         fn=update_audio_file,
+         #         inputs=audio_dropdown,
+         #         outputs=audio_player
+         #     )
+
+         #     # refresh the emotion classification result on click
+         #     classify_button.click(base_classify, inputs=[audio_player, slider], outputs=[emotion_label, dim_label, transcripts])
+
+         with gr.Tab("Speech Emotion Recognition Demo"):
+             gr.Markdown("""## Interactive SER Demo \n
+             Please upload audio via file path or microphone. If you are recording audio via microphone, please make sure that the audio is clear. \n
+             Performance can be affected by environmental noise. \n
+             If you are recording in a noisy environment, please enable the noise reduction option. Note that this will lead to a slight deterioration in performance.\n""")
+             with gr.Row():
+                 with gr.Column():
+                     audio = gr.Audio(sources=['microphone', 'upload'], type='filepath')
+                     text = gr.Textbox(label="Transcription", type='text', lines=5, max_lines=20, placeholder="Transcription")
+                     model_choice = gr.Dropdown(choices=['中文', 'English'], label='语言 / Language', value='中文')
+                     with gr.Accordion("Advanced Settings", open=False):
+                         preprocess = gr.Checkbox(label='Noise Reduction (Do not tick the box unless the environment is noisy)', value=False)
+                         weight_slider = gr.Slider(label='Context Weight', minimum=0, maximum=1, step=0.01, value=0.6)
+                     demo_button = gr.Button("Analyze Emotion")
+                 with gr.Column():
+                     emotion_pred = gr.Label(label="Emotion Prediction")
+                     dim_pred = gr.Plot(label="Emotion Dimension")
+
+             with gr.Accordion("Feedback", open=False) as feedback_section:
+                 gr.Markdown("### User Feedback")
+                 satisfied_checkbox = gr.Checkbox(label="Are you unsatisfied with the result?", value=False)
+                 true_label_dropdown = gr.Dropdown(
+                     label="Select the correct label",
+                     choices=["angry", "disgust", "fearful", "happy", "neutral", "sad", "surprised"],
+                 )
+                 submit_feedback_button = gr.Button("Submit Feedback")
+                 feedback_result = gr.Textbox(label="Feedback Result", interactive=False)
+
+             demo_button.click(classify, inputs=[audio, model_choice, preprocess, weight_slider], outputs=[emotion_pred, dim_pred, text])
+             submit_feedback_button.click(handle_feedback, inputs=[audio, emotion_pred, satisfied_checkbox, true_label_dropdown], outputs=[feedback_result])
+
+     demo.launch(share=True)
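
For reference, each record written by handle_feedback above is a plain Python dict serialized with torch.save. A minimal sketch of reading the collected feedback back for inspection or later fine-tuning (the loop below is illustrative and not part of this upload; the ./user_feedback directory is created by the demo at runtime):

import glob
import torch

# iterate over every feedback record saved by the demo
for path in sorted(glob.glob('./user_feedback/*.pkl')):
    # each record is a dict with keys: audio_path, model_prediction, true_label
    feedback = torch.load(path, weights_only=False)  # records are pickled dicts, not tensors
    print(feedback['audio_path'], feedback['true_label'], feedback['model_prediction'])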
audio_models.py ADDED
@@ -0,0 +1,389 @@
+ import os
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+ # os.environ['HF_HUB_OFFLINE'] = '1'
+ import librosa
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import whisper
+ from transformers import AutoConfig, AutoModelForAudioClassification, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, \
+     Wav2Vec2FeatureExtractor, Wav2Vec2PreTrainedModel, HubertPreTrainedModel, HubertModel, Wav2Vec2Model
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from speechbrain.inference.separation import SepformerSeparation as separator
+
+
+ class HubertClassificationHead(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.classifier_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_class)
+
+     def forward(self, x):
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class RegressionHead(nn.Module):
+     r"""Regression head for arousal/dominance/valence prediction."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class HubertForSpeechClassification(HubertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.hubert = HubertModel(config)
+         self.classifier = HubertClassificationHead(config)
+         self.init_weights()
+
+     def forward(self, x):
+         outputs = self.hubert(x)
+         hidden_states = outputs[0]
+         x = torch.mean(hidden_states, dim=1)
+         x = self.classifier(x)
+         return SequenceClassifierOutput(
+             loss=None,
+             logits=x,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class Wav2VecForSpeechRegression(Wav2Vec2PreTrainedModel):
+     r"""Speech emotion regressor (arousal/dominance/valence)."""
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = RegressionHead(config)
+         self.init_weights()
+
+     def forward(
+         self,
+         input_values,
+     ):
+         outputs = self.wav2vec2(input_values)
+         hidden_states = outputs[0]
+         hidden_states = torch.mean(hidden_states, dim=1)
+         logits = self.classifier(hidden_states)
+         return hidden_states, logits
+
+
+ class EmotionModel:
+     def __init__(self, duration: int = 6, sr: int = 16000, device: torch.device = "cuda", use_text: bool = True):
+         # basic configurations
+         self.device = device
+         self.duration = duration
+         self.sr = sr
+         self.use_text = use_text
+         # audio config
+         self.audio_id2label = {}
+         self.processor = None
+         self.audio_model = None
+         # text config
+         self.text_id2label = {}  # note that id2label should be identical; classification is performed on the intersection of classes
+         self.tokenizer = None
+         self.text_model = None
+         # noise reduction
+         self.nr_model = separator.from_hparams(source="speechbrain/sepformer-wham-enhancement", savedir='ckpt/sepformer-wham-enhancement', run_opts={'device': 'cuda'})
+         # speech-to-text (ASR) using openai-whisper
+         self.tts_model = whisper.load_model('turbo', device=device)
+
+     def preprocess_audio(self, speech):
+         """
+         Preprocess the audio: noise reduction (and, optionally, silence removal).
+         Args:
+             speech: audio waveform.
+         """
+         # noise reduction
+         speech = self.nr_model.separate_batch(torch.as_tensor(speech).unsqueeze(0))[0, :, 0].detach().cpu().numpy()
+         # speech = nr.reduce_noise(y=speech, sr=self.sr, stationary=True)
+         # # remove silence in the segment
+         # speech, index = librosa.effects.trim(speech, top_db=40)
+         return speech
+
+     def load_audio(self, audio, preprocess: bool = True):
+         """
+         Load the audio segment into an np.ndarray.
+         Args:
+             audio: audio file path or audio data;
+             preprocess: bool, whether to run the preprocess function.
+         """
+         if isinstance(audio, str):
+             # load the speech and resample it to the target sampling rate
+             speech, _ = librosa.load(path=audio, sr=self.sr)
+             speech = librosa.to_mono(speech)
+             # clip the very beginning and end of the audio
+             speech = speech[int(0.5 * self.sr):int(-0.1 * self.sr)]
+         elif isinstance(audio, tuple):
+             assert len(audio) == 2, "audio tuple must have 2 elements: sr and speech"
+             orig_sr, orig_speech = audio
+             speech = librosa.resample(orig_speech.astype(np.float32), orig_sr=orig_sr, target_sr=self.sr)
+             speech = librosa.to_mono(speech)
+         else:
+             raise ValueError("audio must be a file path or audio data, got type: {}".format(type(audio)))
+         if preprocess:
+             speech = self.preprocess_audio(speech)
+         return speech
+
+     def id2label(self, id2label, indices, scores):
+         """
+         Map class indices to labels.
+         Args:
+             id2label: dict mapping class index to label;
+             indices: emotion class indices;
+             scores: emotion class scores.
+         """
+         output = {}
+         for idx, score in zip(indices, scores):
+             if idx in id2label.keys():
+                 output[id2label[idx]] = score
+         return output
+
+     def normalize_scores(self, audio_result, text_result, audio_weight: float = 0.25, text_weight: float = 0.75):
+         """
+         Merge the audio and text scores with the given weights and renormalize them.
+         Args:
+             audio_result: a dict of audio predictions, keys being emotion labels and values being scores;
+             text_result: a dict of text predictions, keys being emotion labels and values being scores;
+             audio_weight: float, weight for audio;
+             text_weight: float, weight for text.
+         """
+         audio_result = {k: v * audio_weight for k, v in audio_result.items()}
+         text_result = {k: v * text_weight for k, v in text_result.items()}
+         # merge the results; the order of classes should be the same
+         result = {}
+         for k in audio_result.keys():
+             result[k] = audio_result[k] + text_result[k]
+         # normalize the scores so they sum to 1
+         total = sum(result.values())
+         result = {k: v / total for k, v in result.items()}
+         return result
+
+     def audio_pred(self, inputs):
+         """
+         Predict the emotion class of an audio segment.
+         Args:
+             inputs: audio inputs.
+         """
+         speech = self.processor(inputs, padding="max_length", truncation=True, max_length=self.duration * self.sr,
+                                 return_tensors="pt", sampling_rate=self.sr).input_values.to(self.device)
+         with torch.no_grad():
+             logits = self.audio_model(speech).logits
+         scores, indices = torch.sort(logits.squeeze().detach().cpu(), descending=True)
+         scores = F.softmax(scores, dim=0).numpy()
+         indices = indices.numpy()
+         return self.id2label(self.audio_id2label, indices, scores)
+
+     def text_pred(self, text):
+         """
+         Predict the emotion class of a transcript.
+         Args:
+             text: text produced by the ASR model.
+         """
+         inputs = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to(self.device)
+         with torch.no_grad():
+             logits = self.text_model(**inputs).logits
+         scores, indices = torch.sort(logits.squeeze().detach().cpu(), descending=True)
+         scores = F.softmax(scores, dim=0).numpy()
+         indices = indices.numpy()
+         return self.id2label(self.text_id2label, indices, scores)
+
+     def predict(self, audio, preprocess: bool = True, scores_only: bool = False):
+         """
+         Run prediction based on the recipe.
+         """
+         speech = self.load_audio(audio, preprocess=preprocess)
+         audio_scores = self.audio_pred(speech)
+         if not self.use_text:
+             return audio_scores if not scores_only else list(audio_scores.values())
+         output = self.tts_model.transcribe(speech)
+         text, language = output['text'], output['language']
+         text_scores = self.text_pred(text)
+         result = self.normalize_scores(audio_scores, text_scores)
+         return result if not scores_only else list(result.values())
+
+
+ class EnglishEmotionModel(EmotionModel):
+     def __init__(self, duration: int = 6, sr: int = 16000, device: torch.device = "cuda", use_text: bool = True):
+         super().__init__(duration, sr, device, use_text)
+         # English audio model
+         self.processor = Wav2Vec2FeatureExtractor.from_pretrained("./ckpt/ser_en_audio")
+         self.audio_model = AutoModelForAudioClassification.from_pretrained("./ckpt/ser_en_audio").eval()
+         self.audio_id2label = {
+             0: "angry",
+             6: "disgust",
+             9: "fearful",
+             10: "happy",
+             11: "neutral",
+             12: "sad",
+             13: "surprised"
+         }
+         # Chinese audio model
+         config = AutoConfig.from_pretrained(pretrained_model_name_or_path="./ckpt/ser_cn_audio")
+         self.cn_processor = Wav2Vec2FeatureExtractor.from_pretrained("./ckpt/ser_cn_audio")
+         self.cn_audio_model = HubertForSpeechClassification.from_pretrained("./ckpt/ser_cn_audio", config=config).eval()
+         self.cn_audio_id2label = {
+             0: "angry",
+             1: "fearful",
+             2: "happy",
+             3: "neutral",
+             4: "sad",
+             5: "surprised"
+         }
+         # English text model
+         self.text_model = AutoModelForSequenceClassification.from_pretrained("./ckpt/ser_en_text").eval()
+         self.tokenizer = AutoTokenizer.from_pretrained("./ckpt/ser_en_text")
+         self.text_id2label = {
+             0: "angry",
+             1: "disgust",
+             2: "fearful",
+             3: "happy",
+             4: "neutral",
+             5: "sad",
+             6: "surprised"
+         }
+         self.audio_model.to(self.device)
+         self.text_model.to(self.device)
+         self.cn_audio_model.to(self.device)
+         # load the MSP-DIM model
+         self.msp_dim = Wav2VecForSpeechRegression.from_pretrained('./ckpt/emo_dim_model').to(device)
+         self.msp_processor = Wav2Vec2FeatureExtractor.from_pretrained('./ckpt/emo_dim_model')
+         # self.msp_dim = Wav2VecForSpeechRegression.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(device)
+         # self.msp_processor = Wav2Vec2FeatureExtractor.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim')
+         # load the translation model (CN -> EN)
+         # self.translator = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-zh-en').to(device)
+         # self.translator_tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-zh-en')
+         self.translator = AutoModelForSeq2SeqLM.from_pretrained('./ckpt/zh-2-en').to(device)
+         self.translator_tokenizer = AutoTokenizer.from_pretrained('./ckpt/zh-2-en')
+
+     def plot_dim(self, dim_result, line_height=1):
+         # set up the figure
+         fig, ax = plt.subplots(figsize=(7, 3), dpi=300)
+
+         # bar chart parameters
+         labels = list(dim_result.keys())    # labels (arousal, valence, dominance)
+         values = list(dim_result.values())  # value for each label
+         colors = ['blue', 'red', 'green']   # one color per label
+
+         # draw one bar per dimension
+         for i, (label, value) in enumerate(dim_result.items()):
+             # draw the bar
+             ax.barh(i, value, color=colors[i], height=line_height, align='center')
+             # draw the border, spanning -1 to 1
+             rect = patches.Rectangle((-1, i - line_height/2), 2, line_height, edgecolor='black', facecolor='none', linewidth=1)
+             ax.add_patch(rect)
+
+         # set the x-axis range and style
+         ax.set_xlim(-1.0, 1.1)
+         ax.axvline(0, color='black', linewidth=1)  # center line
+
+         # axis labels and ticks
+         ax.set_xticks([-1, 0, 1])
+         ax.set_xticklabels(['Low (-1)', 'Neutral (0)', 'High (1)'])
+         ax.set_yticks(range(len(dim_result)))
+         ax.set_yticklabels(labels)
+
+         # remove the outer frame
+         ax.spines['top'].set_visible(False)
+         ax.spines['right'].set_visible(False)
+         ax.spines['left'].set_visible(False)
+         ax.spines['bottom'].set_visible(False)
+
+         # hide the y-axis grid lines
+         ax.grid(False)
+
+         # finalize the figure
+         plt.tight_layout()
+         return fig
+
+     def dim_pred(self, inputs, return_plot=True):
+         inputs = self.msp_processor(inputs, return_tensors="pt", padding=False, truncation=True, max_length=160000, sampling_rate=16000).input_values.to(self.device)
+         with torch.no_grad():
+             hidden_states, logits = self.msp_dim(input_values=inputs)
+         logits = logits[0].clamp_(0, 1).detach().cpu().numpy()
+         logits = (logits - 0.5) * 2  # remap to (-1, 1)
+         result = {'arousal': logits[0], 'valence': logits[2], 'dominance': logits[1]}
+         if return_plot:
+             result = self.plot_dim(result)
+         return result
+
+     def cn_audio_pred(self, inputs):
+         """
+         Predict the emotion class of an audio segment with the Chinese model.
+         Args:
+             inputs: audio inputs.
+         """
+         speech = self.cn_processor(inputs, padding="max_length", truncation=True, max_length=self.duration * self.sr,
+                                    return_tensors="pt", sampling_rate=self.sr).input_values.to(self.device)
+         with torch.no_grad():
+             logits = self.cn_audio_model(speech).logits
+         scores, indices = torch.sort(logits.squeeze().detach().cpu(), descending=True)
+         scores = F.softmax(scores, dim=0).numpy()
+         indices = indices.numpy()
+         return self.id2label(self.cn_audio_id2label, indices, scores)
+
+     def predict(self, audio, language_choice, preprocess: bool = True, text_weight: float = 0.5):
+         """
+         Run prediction based on the recipe.
+         Args:
+             audio: audio file path or audio data;
+             language_choice: str, "中文" or "English";
+             preprocess: bool, whether to run the preprocess function;
+             text_weight: the weight given to the text prediction.
+         """
+         speech = self.load_audio(audio, preprocess=preprocess)
+         if language_choice == '中文':
+             audio_scores = self.cn_audio_pred(speech)
+         else:
+             audio_scores = self.audio_pred(speech)
+         if not self.use_text:
+             dim_result = self.dim_pred(speech)
+             return audio_scores, dim_result, None
+         output = self.tts_model.transcribe(speech)
+         text, language = output['text'], output['language']
+         if language != 'en':
+             inputs = self.translator_tokenizer(text, return_tensors="pt").to(self.device)
+             output = self.translator.generate(**inputs)
+             text_en = self.translator_tokenizer.decode(output[0], skip_special_tokens=True)
+             text_scores = self.text_pred(text_en)
+         else:
+             text_scores = self.text_pred(text)
+         result = self.normalize_scores(audio_scores, text_scores, audio_weight=1-text_weight, text_weight=text_weight)
+         dim_result = self.dim_pred(speech)
+         # returns: emotion scores, dimension plot, transcript
+         return result, dim_result, text
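
Outside the Gradio app, the EnglishEmotionModel defined above can be driven directly. A minimal sketch, assuming the ckpt/ checkpoints from this upload are in place and a CUDA device is available; 'sample.wav' is a placeholder file name:

import torch
from audio_models import EnglishEmotionModel

# mirrors the setup in app.py
model = EnglishEmotionModel(duration=10, sr=16000, device=torch.device('cuda'))
# predict() returns the fused class scores, an arousal/valence/dominance plot, and the Whisper transcript
emotion_scores, dim_plot, transcript = model.predict('sample.wav', 'English', preprocess=False, text_weight=0.5)
print(emotion_scores)
print(transcript)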
ckpt/emo_dim_model/config.json ADDED
@@ -0,0 +1,122 @@
1
+ {
2
+ "_name_or_path": "torch",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForSpeechClassification"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.1,
55
+ "finetuning_task": "wav2vec2_reg",
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_dropout_prob": 0.1,
60
+ "hidden_size": 1024,
61
+ "id2label": {
62
+ "0": "arousal",
63
+ "1": "dominance",
64
+ "2": "valence"
65
+ },
66
+ "initializer_range": 0.02,
67
+ "intermediate_size": 4096,
68
+ "label2id": {
69
+ "arousal": 0,
70
+ "dominance": 1,
71
+ "valence": 2
72
+ },
73
+ "layer_norm_eps": 1e-05,
74
+ "layerdrop": 0.1,
75
+ "mask_feature_length": 10,
76
+ "mask_feature_min_masks": 0,
77
+ "mask_feature_prob": 0.0,
78
+ "mask_time_length": 10,
79
+ "mask_time_min_masks": 2,
80
+ "mask_time_prob": 0.05,
81
+ "model_type": "wav2vec2",
82
+ "num_adapter_layers": 3,
83
+ "num_attention_heads": 16,
84
+ "num_codevector_groups": 2,
85
+ "num_codevectors_per_group": 320,
86
+ "num_conv_pos_embedding_groups": 16,
87
+ "num_conv_pos_embeddings": 128,
88
+ "num_feat_extract_layers": 7,
89
+ "num_hidden_layers": 12,
90
+ "num_negatives": 100,
91
+ "output_hidden_size": 1024,
92
+ "pad_token_id": 0,
93
+ "pooling_mode": "mean",
94
+ "problem_type": "regression",
95
+ "proj_codevector_dim": 768,
96
+ "tdnn_dilation": [
97
+ 1,
98
+ 2,
99
+ 3,
100
+ 1,
101
+ 1
102
+ ],
103
+ "tdnn_dim": [
104
+ 512,
105
+ 512,
106
+ 512,
107
+ 512,
108
+ 1500
109
+ ],
110
+ "tdnn_kernel": [
111
+ 5,
112
+ 3,
113
+ 3,
114
+ 1,
115
+ 1
116
+ ],
117
+ "torch_dtype": "float32",
118
+ "transformers_version": "4.17.0.dev0",
119
+ "use_weighted_layer_sum": false,
120
+ "vocab_size": null,
121
+ "xvector_output_dim": 512
122
+ }
ckpt/emo_dim_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efa5ac1a13b2d2f42182738e44794b1eb4c0cdd221a8b4ae11304c3a5f5fae95
+ size 661375508
ckpt/emo_dim_model/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
ckpt/emo_dim_model/vocab.json ADDED
@@ -0,0 +1 @@
+ {}
ckpt/sepformer-wham-enhancement/decoder.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4703b15d23ad5dd4c9b6b93b09539cf0048ba2e58a36c71a62fb860d5b0d343f
+ size 17272
ckpt/sepformer-wham-enhancement/encoder.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c6b3e53a4061b81b7b0abf6a7faac8ee9714e0509a49ac60d249488e347430c
+ size 17272
ckpt/sepformer-wham-enhancement/hyperparams.yaml ADDED
@@ -0,0 +1,66 @@
+ # ################################
+ # Model: Pretrained SepFormer for speech enhancement
+ # Dataset : WHAM!
+ # ################################
+
+ num_spks: 1
+ sample_rate: 8000
+
+ # Encoder parameters
+ N_encoder_out: 256
+ out_channels: 256
+ kernel_size: 16
+ kernel_stride: 8
+
+ # Specifying the network
+ Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
+     kernel_size: 16
+     out_channels: 256
+
+ SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+     num_layers: 8
+     d_model: 256
+     nhead: 8
+     d_ffn: 1024
+     dropout: 0
+     use_positional_encoding: true
+     norm_before: true
+
+ SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+     num_layers: 8
+     d_model: 256
+     nhead: 8
+     d_ffn: 1024
+     dropout: 0
+     use_positional_encoding: true
+     norm_before: true
+
+ MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
+     num_spks: 1
+     in_channels: 256
+     out_channels: 256
+     num_layers: 2
+     K: 250
+     intra_model: !ref <SBtfintra>
+     inter_model: !ref <SBtfinter>
+     norm: ln
+     linear_layer_after_inter_intra: false
+     skip_around_intra: true
+
+ Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
+     in_channels: 256
+     out_channels: 1
+     kernel_size: 16
+     stride: 8
+     bias: false
+
+ modules:
+     encoder: !ref <Encoder>
+     decoder: !ref <Decoder>
+     masknet: !ref <MaskNet>
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         encoder: !ref <Encoder>
+         masknet: !ref <MaskNet>
+         decoder: !ref <Decoder>
ckpt/sepformer-wham-enhancement/masknet.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:111312d682aee6b72610b83edc0dcf253d7a62f745136cd2828113fb75fbb6e4
+ size 112849478
ckpt/ser_cn_audio/config.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "_name_or_path": "TencentGameMate/chinese-hubert-base",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertForSpeechClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_dropout": 0.1,
11
+ "classifier_proj_size": 256,
12
+ "conv_bias": false,
13
+ "conv_dim": [
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512
21
+ ],
22
+ "conv_kernel": [
23
+ 10,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 3,
28
+ 2,
29
+ 2
30
+ ],
31
+ "conv_stride": [
32
+ 5,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2,
38
+ 2
39
+ ],
40
+ "ctc_loss_reduction": "sum",
41
+ "ctc_zero_infinity": false,
42
+ "do_stable_layer_norm": false,
43
+ "eos_token_id": 2,
44
+ "feat_extract_activation": "gelu",
45
+ "feat_extract_norm": "group",
46
+ "feat_proj_dropout": 0.0,
47
+ "feat_proj_layer_norm": true,
48
+ "final_dropout": 0.1,
49
+ "hidden_act": "gelu",
50
+ "hidden_dropout": 0.1,
51
+ "hidden_size": 768,
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 3072,
54
+ "layer_norm_eps": 1e-05,
55
+ "layerdrop": 0.1,
56
+ "mask_feature_length": 10,
57
+ "mask_feature_min_masks": 0,
58
+ "mask_feature_prob": 0.0,
59
+ "mask_time_length": 10,
60
+ "mask_time_min_masks": 2,
61
+ "mask_time_prob": 0.05,
62
+ "model_type": "hubert",
63
+ "num_attention_heads": 12,
64
+ "num_class": 6,
65
+ "num_conv_pos_embedding_groups": 16,
66
+ "num_conv_pos_embeddings": 128,
67
+ "num_feat_extract_layers": 7,
68
+ "num_hidden_layers": 12,
69
+ "pad_token_id": 0,
70
+ "torch_dtype": "float32",
71
+ "transformers_version": "4.24.0",
72
+ "use_weighted_layer_sum": false,
73
+ "vocab_size": 32
74
+ }
ckpt/ser_cn_audio/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
ckpt/ser_cn_audio/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cd2286572750ab6f4cf3d1a5283cf6c92b4a8ae9e87f38ebb515439a56c5b53
+ size 379939475
ckpt/ser_en_audio/config.json ADDED
@@ -0,0 +1,148 @@
1
+ {
2
+ "_name_or_path": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
3
+ "activation_dropout": 0.05,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": true,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.05,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.05,
58
+ "hidden_size": 1024,
59
+ "id2label": {
60
+ "0": "angry",
61
+ "1": "anxious",
62
+ "2": "apologetic",
63
+ "3": "assertive",
64
+ "4": "calm",
65
+ "5": "concerned",
66
+ "6": "disgust",
67
+ "7": "encouraging",
68
+ "8": "excited",
69
+ "9": "fearful",
70
+ "10": "happy",
71
+ "11": "neutral",
72
+ "12": "sad",
73
+ "13": "surprised"
74
+ },
75
+ "initializer_range": 0.02,
76
+ "intermediate_size": 4096,
77
+ "label2id": {
78
+ "angry": 0,
79
+ "anxious": 1,
80
+ "apologetic": 2,
81
+ "assertive": 3,
82
+ "calm": 4,
83
+ "concerned": 5,
84
+ "disgust": 6,
85
+ "encouraging": 7,
86
+ "excited": 8,
87
+ "fearful": 9,
88
+ "happy": 10,
89
+ "neutral": 11,
90
+ "sad": 12,
91
+ "surprised": 13
92
+ },
93
+ "layer_norm_eps": 1e-05,
94
+ "layerdrop": 0.05,
95
+ "mask_channel_length": 10,
96
+ "mask_channel_min_space": 1,
97
+ "mask_channel_other": 0.0,
98
+ "mask_channel_prob": 0.0,
99
+ "mask_channel_selection": "static",
100
+ "mask_feature_length": 10,
101
+ "mask_feature_min_masks": 0,
102
+ "mask_feature_prob": 0.0,
103
+ "mask_time_length": 10,
104
+ "mask_time_min_masks": 2,
105
+ "mask_time_min_space": 1,
106
+ "mask_time_other": 0.0,
107
+ "mask_time_prob": 0.05,
108
+ "mask_time_selection": "static",
109
+ "model_type": "wav2vec2",
110
+ "num_adapter_layers": 3,
111
+ "num_attention_heads": 16,
112
+ "num_codevector_groups": 2,
113
+ "num_codevectors_per_group": 320,
114
+ "num_conv_pos_embedding_groups": 16,
115
+ "num_conv_pos_embeddings": 128,
116
+ "num_feat_extract_layers": 7,
117
+ "num_hidden_layers": 24,
118
+ "num_negatives": 100,
119
+ "output_hidden_size": 1024,
120
+ "pad_token_id": 0,
121
+ "proj_codevector_dim": 256,
122
+ "tdnn_dilation": [
123
+ 1,
124
+ 2,
125
+ 3,
126
+ 1,
127
+ 1
128
+ ],
129
+ "tdnn_dim": [
130
+ 512,
131
+ 512,
132
+ 512,
133
+ 512,
134
+ 1500
135
+ ],
136
+ "tdnn_kernel": [
137
+ 5,
138
+ 3,
139
+ 3,
140
+ 1,
141
+ 1
142
+ ],
143
+ "torch_dtype": "float32",
144
+ "transformers_version": "4.45.2",
145
+ "use_weighted_layer_sum": false,
146
+ "vocab_size": 33,
147
+ "xvector_output_dim": 512
148
+ }
ckpt/ser_en_audio/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03a81b9298d54f85fec594b071c6fa8d32483876219ad0912f7324aaff2c3d72
+ size 1262871640
ckpt/ser_en_audio/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:334204fa336cc008160506dd3272196fde73f9b74039d8b0c2e9eac8c8998acf
+ size 2525994320
ckpt/ser_en_audio/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2ProcessorWithLM",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
ckpt/ser_en_audio/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cc40fede51cbf483771073afe9c2d734758236aad4608d6f0393100f5015429
+ size 15024
ckpt/ser_en_audio/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58842f3563b84c1197fe10d111b14adbe13041260bf73977633194268f547050
+ size 15024
ckpt/ser_en_audio/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a770c18a5735b020b633ee5c0dc23f405eb5244709c33f644d9f449cdbc2331
+ size 15024
ckpt/ser_en_audio/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ca2caa02ebff6c447147c9295ca4a54048a4e2d5d2d45b27fad60090f0084fc
+ size 15024
ckpt/ser_en_audio/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8017f56711090ed38368e1dea349327d91f7f162e079c14836498d327bbfab77
+ size 1064
ckpt/ser_en_audio/trainer_state.json ADDED
@@ -0,0 +1,652 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 30.0,
5
+ "eval_steps": 500,
6
+ "global_step": 18600,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.8064516129032258,
13
+ "grad_norm": 5.6446027755737305,
14
+ "learning_rate": 9.798387096774194e-05,
15
+ "loss": 0.8653,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.8519116311913649,
21
+ "eval_f1": 0.5612972035236246,
22
+ "eval_loss": 0.41698312759399414,
23
+ "eval_precision": 0.6519175226159025,
24
+ "eval_recall": 0.5331925780455707,
25
+ "eval_runtime": 444.2569,
26
+ "eval_samples_per_second": 22.314,
27
+ "eval_steps_per_second": 0.698,
28
+ "step": 620
29
+ },
30
+ {
31
+ "epoch": 1.6129032258064515,
32
+ "grad_norm": 2.4346652030944824,
33
+ "learning_rate": 9.596774193548387e-05,
34
+ "loss": 0.3288,
35
+ "step": 1000
36
+ },
37
+ {
38
+ "epoch": 2.0,
39
+ "eval_accuracy": 0.9004337738323414,
40
+ "eval_f1": 0.7645083858520498,
41
+ "eval_loss": 0.27656903862953186,
42
+ "eval_precision": 0.775442200715531,
43
+ "eval_recall": 0.7599977765696228,
44
+ "eval_runtime": 441.6079,
45
+ "eval_samples_per_second": 22.448,
46
+ "eval_steps_per_second": 0.702,
47
+ "step": 1240
48
+ },
49
+ {
50
+ "epoch": 2.4193548387096775,
51
+ "grad_norm": 3.350771903991699,
52
+ "learning_rate": 9.395161290322582e-05,
53
+ "loss": 0.2313,
54
+ "step": 1500
55
+ },
56
+ {
57
+ "epoch": 3.0,
58
+ "eval_accuracy": 0.9010390396449107,
59
+ "eval_f1": 0.7136957134951721,
60
+ "eval_loss": 0.30654314160346985,
61
+ "eval_precision": 0.7918800656051795,
62
+ "eval_recall": 0.680703182570182,
63
+ "eval_runtime": 443.8741,
64
+ "eval_samples_per_second": 22.333,
65
+ "eval_steps_per_second": 0.698,
66
+ "step": 1860
67
+ },
68
+ {
69
+ "epoch": 3.225806451612903,
70
+ "grad_norm": 1.5124069452285767,
71
+ "learning_rate": 9.193548387096774e-05,
72
+ "loss": 0.1733,
73
+ "step": 2000
74
+ },
75
+ {
76
+ "epoch": 4.0,
77
+ "eval_accuracy": 0.909210128114597,
78
+ "eval_f1": 0.8132841024537738,
79
+ "eval_loss": 0.3023378252983093,
80
+ "eval_precision": 0.8052977470471581,
81
+ "eval_recall": 0.8366718533903973,
82
+ "eval_runtime": 444.6165,
83
+ "eval_samples_per_second": 22.296,
84
+ "eval_steps_per_second": 0.697,
85
+ "step": 2480
86
+ },
87
+ {
88
+ "epoch": 4.032258064516129,
89
+ "grad_norm": 1.1565921306610107,
90
+ "learning_rate": 8.991935483870968e-05,
91
+ "loss": 0.1336,
92
+ "step": 2500
93
+ },
94
+ {
95
+ "epoch": 4.838709677419355,
96
+ "grad_norm": 7.502715110778809,
97
+ "learning_rate": 8.790322580645162e-05,
98
+ "loss": 0.1044,
99
+ "step": 3000
100
+ },
101
+ {
102
+ "epoch": 5.0,
103
+ "eval_accuracy": 0.9228286088974075,
104
+ "eval_f1": 0.7768284943233322,
105
+ "eval_loss": 0.27094921469688416,
106
+ "eval_precision": 0.8088427869040292,
107
+ "eval_recall": 0.7605468716950611,
108
+ "eval_runtime": 446.1958,
109
+ "eval_samples_per_second": 22.217,
110
+ "eval_steps_per_second": 0.695,
111
+ "step": 3100
112
+ },
113
+ {
114
+ "epoch": 5.645161290322581,
115
+ "grad_norm": 4.222163677215576,
116
+ "learning_rate": 8.588709677419356e-05,
117
+ "loss": 0.0891,
118
+ "step": 3500
119
+ },
120
+ {
121
+ "epoch": 6.0,
122
+ "eval_accuracy": 0.9190961363865631,
123
+ "eval_f1": 0.836047484191284,
124
+ "eval_loss": 0.2973528802394867,
125
+ "eval_precision": 0.8333888690415643,
126
+ "eval_recall": 0.8462180948065121,
127
+ "eval_runtime": 445.7518,
128
+ "eval_samples_per_second": 22.239,
129
+ "eval_steps_per_second": 0.695,
130
+ "step": 3720
131
+ },
132
+ {
133
+ "epoch": 6.451612903225806,
134
+ "grad_norm": 0.9999768137931824,
135
+ "learning_rate": 8.387096774193549e-05,
136
+ "loss": 0.0738,
137
+ "step": 4000
138
+ },
139
+ {
140
+ "epoch": 7.0,
141
+ "eval_accuracy": 0.9198022798345606,
142
+ "eval_f1": 0.8339480533055649,
143
+ "eval_loss": 0.32465115189552307,
144
+ "eval_precision": 0.8531984900149399,
145
+ "eval_recall": 0.8313244274621819,
146
+ "eval_runtime": 444.2156,
147
+ "eval_samples_per_second": 22.316,
148
+ "eval_steps_per_second": 0.698,
149
+ "step": 4340
150
+ },
151
+ {
152
+ "epoch": 7.258064516129032,
153
+ "grad_norm": 0.9860001802444458,
154
+ "learning_rate": 8.185483870967743e-05,
155
+ "loss": 0.0617,
156
+ "step": 4500
157
+ },
158
+ {
159
+ "epoch": 8.0,
160
+ "eval_accuracy": 0.9283768788459599,
161
+ "eval_f1": 0.8488005438808053,
162
+ "eval_loss": 0.2583344280719757,
163
+ "eval_precision": 0.8446683685244591,
164
+ "eval_recall": 0.8628243687115127,
165
+ "eval_runtime": 446.3564,
166
+ "eval_samples_per_second": 22.209,
167
+ "eval_steps_per_second": 0.695,
168
+ "step": 4960
169
+ },
170
+ {
171
+ "epoch": 8.064516129032258,
172
+ "grad_norm": 0.3879972994327545,
173
+ "learning_rate": 7.983870967741936e-05,
174
+ "loss": 0.0574,
175
+ "step": 5000
176
+ },
177
+ {
178
+ "epoch": 8.870967741935484,
179
+ "grad_norm": 4.836447715759277,
180
+ "learning_rate": 7.78225806451613e-05,
181
+ "loss": 0.0492,
182
+ "step": 5500
183
+ },
184
+ {
185
+ "epoch": 9.0,
186
+ "eval_accuracy": 0.9225259759911227,
187
+ "eval_f1": 0.816817658073689,
188
+ "eval_loss": 0.34519901871681213,
189
+ "eval_precision": 0.8676454880398878,
190
+ "eval_recall": 0.7877172029441207,
191
+ "eval_runtime": 447.3198,
192
+ "eval_samples_per_second": 22.161,
193
+ "eval_steps_per_second": 0.693,
194
+ "step": 5580
195
+ },
196
+ {
197
+ "epoch": 9.67741935483871,
198
+ "grad_norm": 0.07304174453020096,
199
+ "learning_rate": 7.580645161290323e-05,
200
+ "loss": 0.0419,
201
+ "step": 6000
202
+ },
203
+ {
204
+ "epoch": 10.0,
205
+ "eval_accuracy": 0.9251487945122566,
206
+ "eval_f1": 0.8469126432067672,
207
+ "eval_loss": 0.33950358629226685,
208
+ "eval_precision": 0.8519618889252182,
209
+ "eval_recall": 0.8489192187654137,
210
+ "eval_runtime": 446.1732,
211
+ "eval_samples_per_second": 22.218,
212
+ "eval_steps_per_second": 0.695,
213
+ "step": 6200
214
+ },
215
+ {
216
+ "epoch": 10.483870967741936,
217
+ "grad_norm": 3.8705227375030518,
218
+ "learning_rate": 7.379032258064516e-05,
219
+ "loss": 0.0331,
220
+ "step": 6500
221
+ },
222
+ {
223
+ "epoch": 11.0,
224
+ "eval_accuracy": 0.9118329466357309,
225
+ "eval_f1": 0.8376640160936598,
226
+ "eval_loss": 0.4379476308822632,
227
+ "eval_precision": 0.8312605064526365,
228
+ "eval_recall": 0.8501611125751187,
229
+ "eval_runtime": 444.981,
230
+ "eval_samples_per_second": 22.277,
231
+ "eval_steps_per_second": 0.697,
232
+ "step": 6820
233
+ },
234
+ {
235
+ "epoch": 11.290322580645162,
236
+ "grad_norm": 0.8799965381622314,
237
+ "learning_rate": 7.177419354838711e-05,
238
+ "loss": 0.0353,
239
+ "step": 7000
240
+ },
241
+ {
242
+ "epoch": 12.0,
243
+ "eval_accuracy": 0.9295874104710986,
244
+ "eval_f1": 0.8496199395886753,
245
+ "eval_loss": 0.3607427775859833,
246
+ "eval_precision": 0.8474190215073337,
247
+ "eval_recall": 0.8606261174217371,
248
+ "eval_runtime": 445.7391,
249
+ "eval_samples_per_second": 22.239,
250
+ "eval_steps_per_second": 0.695,
251
+ "step": 7440
252
+ },
253
+ {
254
+ "epoch": 12.096774193548388,
255
+ "grad_norm": 1.5596448183059692,
256
+ "learning_rate": 6.975806451612904e-05,
257
+ "loss": 0.0315,
258
+ "step": 7500
259
+ },
260
+ {
261
+ "epoch": 12.903225806451612,
262
+ "grad_norm": 1.5588679313659668,
263
+ "learning_rate": 6.774193548387096e-05,
264
+ "loss": 0.0289,
265
+ "step": 8000
266
+ },
267
+ {
268
+ "epoch": 13.0,
269
+ "eval_accuracy": 0.9273681024916776,
270
+ "eval_f1": 0.8508873409327621,
271
+ "eval_loss": 0.3614977300167084,
272
+ "eval_precision": 0.8526893632091166,
273
+ "eval_recall": 0.8572290757862527,
274
+ "eval_runtime": 486.6477,
275
+ "eval_samples_per_second": 20.37,
276
+ "eval_steps_per_second": 0.637,
277
+ "step": 8060
278
+ },
279
+ {
280
+ "epoch": 13.709677419354838,
281
+ "grad_norm": 0.03195716440677643,
282
+ "learning_rate": 6.57258064516129e-05,
283
+ "loss": 0.0261,
284
+ "step": 8500
285
+ },
286
+ {
287
+ "epoch": 14.0,
288
+ "eval_accuracy": 0.9272672248562494,
289
+ "eval_f1": 0.8630758757277098,
290
+ "eval_loss": 0.36916211247444153,
291
+ "eval_precision": 0.8586072821035567,
292
+ "eval_recall": 0.8724563229411526,
293
+ "eval_runtime": 472.9747,
294
+ "eval_samples_per_second": 20.959,
295
+ "eval_steps_per_second": 0.655,
296
+ "step": 8680
297
+ },
298
+ {
299
+ "epoch": 14.516129032258064,
300
+ "grad_norm": 0.2278767228126526,
301
+ "learning_rate": 6.370967741935485e-05,
302
+ "loss": 0.0239,
303
+ "step": 9000
304
+ },
305
+ {
306
+ "epoch": 15.0,
307
+ "eval_accuracy": 0.9261575708665389,
308
+ "eval_f1": 0.8540811504617694,
309
+ "eval_loss": 0.4022212028503418,
310
+ "eval_precision": 0.8616150833457688,
311
+ "eval_recall": 0.8575704808190433,
312
+ "eval_runtime": 487.8531,
313
+ "eval_samples_per_second": 20.32,
314
+ "eval_steps_per_second": 0.635,
315
+ "step": 9300
316
+ },
317
+ {
318
+ "epoch": 15.32258064516129,
319
+ "grad_norm": 0.053363025188446045,
320
+ "learning_rate": 6.169354838709678e-05,
321
+ "loss": 0.0255,
322
+ "step": 9500
323
+ },
324
+ {
325
+ "epoch": 16.0,
326
+ "eval_accuracy": 0.9373549883990719,
327
+ "eval_f1": 0.8401806488376877,
328
+ "eval_loss": 0.33772599697113037,
329
+ "eval_precision": 0.8640083214464809,
330
+ "eval_recall": 0.8396382260534664,
331
+ "eval_runtime": 462.9155,
332
+ "eval_samples_per_second": 21.414,
333
+ "eval_steps_per_second": 0.67,
334
+ "step": 9920
335
+ },
336
+ {
337
+ "epoch": 16.129032258064516,
338
+ "grad_norm": 0.11232730746269226,
339
+ "learning_rate": 5.9677419354838715e-05,
340
+ "loss": 0.0199,
341
+ "step": 10000
342
+ },
343
+ {
344
+ "epoch": 16.93548387096774,
345
+ "grad_norm": 0.48599764704704285,
346
+ "learning_rate": 5.7661290322580655e-05,
347
+ "loss": 0.0196,
348
+ "step": 10500
349
+ },
350
+ {
351
+ "epoch": 17.0,
352
+ "eval_accuracy": 0.9315040855442348,
353
+ "eval_f1": 0.8327759456218216,
354
+ "eval_loss": 0.3767533302307129,
355
+ "eval_precision": 0.8591712003111222,
356
+ "eval_recall": 0.8342081502036376,
357
+ "eval_runtime": 513.732,
358
+ "eval_samples_per_second": 19.296,
359
+ "eval_steps_per_second": 0.603,
360
+ "step": 10540
361
+ },
362
+ {
363
+ "epoch": 17.741935483870968,
364
+ "grad_norm": 0.019181491807103157,
365
+ "learning_rate": 5.5645161290322576e-05,
366
+ "loss": 0.0192,
367
+ "step": 11000
368
+ },
369
+ {
370
+ "epoch": 18.0,
371
+ "eval_accuracy": 0.9199031574699889,
372
+ "eval_f1": 0.8519369732275665,
373
+ "eval_loss": 0.45619821548461914,
374
+ "eval_precision": 0.8639403373066186,
375
+ "eval_recall": 0.8504081404701405,
376
+ "eval_runtime": 504.1002,
377
+ "eval_samples_per_second": 19.665,
378
+ "eval_steps_per_second": 0.615,
379
+ "step": 11160
380
+ },
381
+ {
382
+ "epoch": 18.548387096774192,
383
+ "grad_norm": 0.03667838126420975,
384
+ "learning_rate": 5.362903225806452e-05,
385
+ "loss": 0.0136,
386
+ "step": 11500
387
+ },
388
+ {
389
+ "epoch": 19.0,
390
+ "eval_accuracy": 0.9227277312619793,
391
+ "eval_f1": 0.792027873615724,
392
+ "eval_loss": 0.43276721239089966,
393
+ "eval_precision": 0.8409174974381394,
394
+ "eval_recall": 0.770137877267392,
395
+ "eval_runtime": 551.2838,
396
+ "eval_samples_per_second": 17.982,
397
+ "eval_steps_per_second": 0.562,
398
+ "step": 11780
399
+ },
400
+ {
401
+ "epoch": 19.35483870967742,
402
+ "grad_norm": 4.472720623016357,
403
+ "learning_rate": 5.161290322580645e-05,
404
+ "loss": 0.0206,
405
+ "step": 12000
406
+ },
407
+ {
408
+ "epoch": 20.0,
409
+ "eval_accuracy": 0.9308988197316654,
410
+ "eval_f1": 0.8577732939026992,
411
+ "eval_loss": 0.4217771291732788,
412
+ "eval_precision": 0.8689115082442737,
413
+ "eval_recall": 0.8597514273918758,
414
+ "eval_runtime": 494.2923,
415
+ "eval_samples_per_second": 20.055,
416
+ "eval_steps_per_second": 0.627,
417
+ "step": 12400
418
+ },
419
+ {
420
+ "epoch": 20.161290322580644,
421
+ "grad_norm": 0.23360569775104523,
422
+ "learning_rate": 4.959677419354839e-05,
423
+ "loss": 0.0145,
424
+ "step": 12500
425
+ },
426
+ {
427
+ "epoch": 20.967741935483872,
428
+ "grad_norm": 11.598882675170898,
429
+ "learning_rate": 4.7580645161290326e-05,
430
+ "loss": 0.0136,
431
+ "step": 13000
432
+ },
433
+ {
434
+ "epoch": 21.0,
435
+ "eval_accuracy": 0.9211136890951276,
436
+ "eval_f1": 0.8543572369365581,
437
+ "eval_loss": 0.4980121850967407,
438
+ "eval_precision": 0.8536818621962678,
439
+ "eval_recall": 0.8631371178427569,
440
+ "eval_runtime": 509.7213,
441
+ "eval_samples_per_second": 19.448,
442
+ "eval_steps_per_second": 0.608,
443
+ "step": 13020
444
+ },
445
+ {
446
+ "epoch": 21.774193548387096,
447
+ "grad_norm": 0.007597383111715317,
448
+ "learning_rate": 4.556451612903226e-05,
449
+ "loss": 0.0129,
450
+ "step": 13500
451
+ },
452
+ {
453
+ "epoch": 22.0,
454
+ "eval_accuracy": 0.9353374356905074,
455
+ "eval_f1": 0.8522499542325414,
456
+ "eval_loss": 0.37319281697273254,
457
+ "eval_precision": 0.8670789223875701,
458
+ "eval_recall": 0.8504049853786648,
459
+ "eval_runtime": 479.4485,
460
+ "eval_samples_per_second": 20.676,
461
+ "eval_steps_per_second": 0.647,
462
+ "step": 13640
463
+ },
464
+ {
465
+ "epoch": 22.580645161290324,
466
+ "grad_norm": 0.007587379310280085,
467
+ "learning_rate": 4.3548387096774194e-05,
468
+ "loss": 0.0097,
469
+ "step": 14000
470
+ },
471
+ {
472
+ "epoch": 23.0,
473
+ "eval_accuracy": 0.9339251487945123,
474
+ "eval_f1": 0.8622667883302407,
475
+ "eval_loss": 0.39090433716773987,
476
+ "eval_precision": 0.8768600760726798,
477
+ "eval_recall": 0.8598596008497976,
478
+ "eval_runtime": 549.8766,
479
+ "eval_samples_per_second": 18.028,
480
+ "eval_steps_per_second": 0.564,
481
+ "step": 14260
482
+ },
483
+ {
484
+ "epoch": 23.387096774193548,
485
+ "grad_norm": 0.0043108644895255566,
486
+ "learning_rate": 4.1532258064516135e-05,
487
+ "loss": 0.0124,
488
+ "step": 14500
489
+ },
490
+ {
491
+ "epoch": 24.0,
492
+ "eval_accuracy": 0.9375567436699284,
493
+ "eval_f1": 0.8596119869481005,
494
+ "eval_loss": 0.3828715682029724,
495
+ "eval_precision": 0.8788495808974863,
496
+ "eval_recall": 0.8587605197643191,
497
+ "eval_runtime": 518.7131,
498
+ "eval_samples_per_second": 19.111,
499
+ "eval_steps_per_second": 0.598,
500
+ "step": 14880
501
+ },
502
+ {
503
+ "epoch": 24.193548387096776,
504
+ "grad_norm": 0.013451021164655685,
505
+ "learning_rate": 3.951612903225806e-05,
506
+ "loss": 0.0082,
507
+ "step": 15000
508
+ },
509
+ {
510
+ "epoch": 25.0,
511
+ "grad_norm": 0.0018616759916767478,
512
+ "learning_rate": 3.7500000000000003e-05,
513
+ "loss": 0.0097,
514
+ "step": 15500
515
+ },
516
+ {
517
+ "epoch": 25.0,
518
+ "eval_accuracy": 0.9202057903762736,
519
+ "eval_f1": 0.8674300779359293,
520
+ "eval_loss": 0.4943585991859436,
521
+ "eval_precision": 0.8615130325799393,
522
+ "eval_recall": 0.8783789279568988,
523
+ "eval_runtime": 466.3291,
524
+ "eval_samples_per_second": 21.258,
525
+ "eval_steps_per_second": 0.665,
526
+ "step": 15500
527
+ },
528
+ {
529
+ "epoch": 25.806451612903224,
530
+ "grad_norm": 0.004003328271210194,
531
+ "learning_rate": 3.548387096774194e-05,
532
+ "loss": 0.0113,
533
+ "step": 16000
534
+ },
535
+ {
536
+ "epoch": 26.0,
537
+ "eval_accuracy": 0.9377584989407848,
538
+ "eval_f1": 0.8685888939122031,
539
+ "eval_loss": 0.3813043534755707,
540
+ "eval_precision": 0.870556797543307,
541
+ "eval_recall": 0.8777354113914896,
542
+ "eval_runtime": 698.3043,
543
+ "eval_samples_per_second": 14.196,
544
+ "eval_steps_per_second": 0.444,
545
+ "step": 16120
546
+ },
547
+ {
548
+ "epoch": 26.612903225806452,
549
+ "grad_norm": 19.115692138671875,
550
+ "learning_rate": 3.346774193548387e-05,
551
+ "loss": 0.0081,
552
+ "step": 16500
553
+ },
554
+ {
555
+ "epoch": 27.0,
556
+ "eval_accuracy": 0.9331181277110865,
557
+ "eval_f1": 0.8731036089741192,
558
+ "eval_loss": 0.433142751455307,
559
+ "eval_precision": 0.8634855877520003,
560
+ "eval_recall": 0.886410634024776,
561
+ "eval_runtime": 912.9943,
562
+ "eval_samples_per_second": 10.858,
563
+ "eval_steps_per_second": 0.34,
564
+ "step": 16740
565
+ },
566
+ {
567
+ "epoch": 27.419354838709676,
568
+ "grad_norm": 0.0007239320548251271,
569
+ "learning_rate": 3.1451612903225806e-05,
570
+ "loss": 0.0071,
571
+ "step": 17000
572
+ },
573
+ {
574
+ "epoch": 28.0,
575
+ "eval_accuracy": 0.9340260264299405,
576
+ "eval_f1": 0.8482513205734062,
577
+ "eval_loss": 0.401883065700531,
578
+ "eval_precision": 0.8646923523178319,
579
+ "eval_recall": 0.8467129210887174,
580
+ "eval_runtime": 829.4485,
581
+ "eval_samples_per_second": 11.951,
582
+ "eval_steps_per_second": 0.374,
583
+ "step": 17360
584
+ },
585
+ {
586
+ "epoch": 28.225806451612904,
587
+ "grad_norm": 0.008651689626276493,
588
+ "learning_rate": 2.9435483870967743e-05,
589
+ "loss": 0.008,
590
+ "step": 17500
591
+ },
592
+ {
593
+ "epoch": 29.0,
594
+ "eval_accuracy": 0.9370523554927872,
595
+ "eval_f1": 0.8750355664355425,
596
+ "eval_loss": 0.3932338356971741,
597
+ "eval_precision": 0.8682629852956361,
598
+ "eval_recall": 0.8885135737659093,
599
+ "eval_runtime": 591.0085,
600
+ "eval_samples_per_second": 16.773,
601
+ "eval_steps_per_second": 0.525,
602
+ "step": 17980
603
+ },
604
+ {
605
+ "epoch": 29.032258064516128,
606
+ "grad_norm": 0.0007982092211022973,
607
+ "learning_rate": 2.7419354838709678e-05,
608
+ "loss": 0.0077,
609
+ "step": 18000
610
+ },
611
+ {
612
+ "epoch": 29.838709677419356,
613
+ "grad_norm": 0.0016565436962991953,
614
+ "learning_rate": 2.5403225806451615e-05,
615
+ "loss": 0.0066,
616
+ "step": 18500
617
+ },
618
+ {
619
+ "epoch": 30.0,
620
+ "eval_accuracy": 0.940986583274488,
621
+ "eval_f1": 0.8921279164005701,
622
+ "eval_loss": 0.38233834505081177,
623
+ "eval_precision": 0.9059714000838046,
624
+ "eval_recall": 0.8867045383809143,
625
+ "eval_runtime": 470.8987,
626
+ "eval_samples_per_second": 21.051,
627
+ "eval_steps_per_second": 0.658,
628
+ "step": 18600
629
+ }
630
+ ],
631
+ "logging_steps": 500,
632
+ "max_steps": 24800,
633
+ "num_input_tokens_seen": 0,
634
+ "num_train_epochs": 40,
635
+ "save_steps": 500,
636
+ "stateful_callbacks": {
637
+ "TrainerControl": {
638
+ "args": {
639
+ "should_epoch_stop": false,
640
+ "should_evaluate": false,
641
+ "should_log": false,
642
+ "should_save": true,
643
+ "should_training_stop": false
644
+ },
645
+ "attributes": {}
646
+ }
647
+ },
648
+ "total_flos": 3.607823021993519e+20,
649
+ "train_batch_size": 8,
650
+ "trial_name": null,
651
+ "trial_params": null
652
+ }
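The entries above are the tail of the Hugging Face Trainer's trainer_state.json for the English audio SER checkpoint: log_history interleaves step-level training records (loss, grad_norm, learning_rate) with per-epoch eval_* metrics, and within this fragment the best eval_f1 (about 0.892) is reached at epoch 30 / step 18600. A minimal sketch, assuming the checkpoint directory is unpacked locally as ckpt/ser_en_audio, for locating the best epoch programmatically:

import json

with open("ckpt/ser_en_audio/trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries are the ones that carry eval_* keys.
evals = [e for e in state["log_history"] if "eval_f1" in e]
best = max(evals, key=lambda e: e["eval_f1"])
print(f"best epoch {best['epoch']:.0f} (step {best['step']}): eval_f1={best['eval_f1']:.4f}")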
ckpt/ser_en_audio/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e954588f3e2bd63d010ca82fe9ecff999d276a6113425e5185f8f5bb8f0caa3b
+ size 5240
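The three lines above form a Git LFS pointer (spec version, SHA-256 object id, byte size) rather than the weights themselves; every *.bin, *.safetensors and *.ckpt entry in this commit is stored the same way, so the real payload is resolved by git lfs pull or by the Hub client at download time. A minimal sketch with huggingface_hub — the repo_id below is a placeholder, not taken from this commit:

from huggingface_hub import hf_hub_download

# Placeholder repo_id -- substitute the actual Space/model repo that hosts this checkpoint.
path = hf_hub_download(
    repo_id="user/emotion-recognition-space",
    repo_type="space",
    filename="ckpt/ser_en_audio/training_args.bin",
)
print(path)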
ckpt/ser_en_text/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "_name_or_path": "distilroberta-base",
+ "architectures": [
+ "RobertaForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "id2label": {
+ "0": "anger",
+ "1": "disgust",
+ "2": "fear",
+ "3": "joy",
+ "4": "neutral",
+ "5": "sadness",
+ "6": "surprise"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "label2id": {
+ "anger": 0,
+ "disgust": 1,
+ "fear": 2,
+ "joy": 3,
+ "neutral": 4,
+ "sadness": 5,
+ "surprise": 6
+ },
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 514,
+ "model_type": "roberta",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 6,
+ "pad_token_id": 1,
+ "position_embedding_type": "absolute",
+ "problem_type": "single_label_classification",
+ "transformers_version": "4.6.1",
+ "type_vocab_size": 1,
+ "use_cache": true,
+ "vocab_size": 50265
+ }
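The config above describes a DistilRoBERTa sequence classifier fine-tuned for seven emotion classes (anger, disgust, fear, joy, neutral, sadness, surprise), i.e. the text side of the English SER setup. A minimal sketch of loading it for inference, assuming the local path from this repo's layout:

from transformers import pipeline

# Text-based emotion head: DistilRoBERTa with a 7-way classification layer.
text_ser = pipeline("text-classification", model="ckpt/ser_en_text", top_k=None)
print(text_ser("I can't believe we finally made it!"))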
ckpt/ser_en_text/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/ser_en_text/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dde1eadd81741344dd707d1c482a3293810eb895c873053213ccdb2b57ca9e95
+ size 328544361
ckpt/ser_en_text/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
ckpt/ser_en_text/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/ser_en_text/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilroberta-base"}
ckpt/ser_en_text/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ed7a68d54395ab0be21726d6fcf25f942ed459b16387bbf9cf251051986766f
+ size 2415
ckpt/ser_en_text/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/zh-2-en/config.json ADDED
@@ -0,0 +1,60 @@
+ {
+ "_name_or_path": "/tmp/Helsinki-NLP/opus-mt-zh-en",
+ "activation_dropout": 0.0,
+ "activation_function": "swish",
+ "add_bias_logits": false,
+ "add_final_layer_norm": false,
+ "architectures": [
+ "MarianMTModel"
+ ],
+ "attention_dropout": 0.0,
+ "bad_words_ids": [
+ [
+ 65000
+ ]
+ ],
+ "bos_token_id": 0,
+ "classif_dropout": 0.0,
+ "classifier_dropout": 0.0,
+ "d_model": 512,
+ "decoder_attention_heads": 8,
+ "decoder_ffn_dim": 2048,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 6,
+ "decoder_start_token_id": 65000,
+ "decoder_vocab_size": 65001,
+ "dropout": 0.1,
+ "encoder_attention_heads": 8,
+ "encoder_ffn_dim": 2048,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 6,
+ "eos_token_id": 0,
+ "extra_pos_embeddings": 65001,
+ "forced_eos_token_id": 0,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2"
+ },
+ "init_std": 0.02,
+ "is_encoder_decoder": true,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2
+ },
+ "max_length": 512,
+ "max_position_embeddings": 512,
+ "model_type": "marian",
+ "normalize_before": false,
+ "normalize_embedding": false,
+ "num_beams": 6,
+ "num_hidden_layers": 6,
+ "pad_token_id": 65000,
+ "scale_embedding": true,
+ "share_encoder_decoder_embeddings": true,
+ "static_position_embeddings": true,
+ "transformers_version": "4.22.0.dev0",
+ "use_cache": true,
+ "vocab_size": 65001
+ }
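This is the standard Helsinki-NLP opus-mt-zh-en MarianMT configuration (6 encoder and 6 decoder layers, d_model 512, shared 65,001-token SentencePiece vocabulary), presumably used to translate Chinese speech transcripts into English before the English text emotion model scores them. A minimal sketch of loading the local checkpoint and translating one sentence:

from transformers import MarianMTModel, MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("ckpt/zh-2-en")
model = MarianMTModel.from_pretrained("ckpt/zh-2-en")

# Beam search settings (num_beams=6, max_length=512) come from generation_config.json.
batch = tokenizer(["今天的天气真好。"], return_tensors="pt", padding=True)
outputs = model.generate(**batch)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))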
ckpt/zh-2-en/generation_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "bad_words_ids": [
+ [
+ 65000
+ ]
+ ],
+ "bos_token_id": 0,
+ "decoder_start_token_id": 65000,
+ "eos_token_id": 0,
+ "forced_eos_token_id": 0,
+ "max_length": 512,
+ "num_beams": 6,
+ "pad_token_id": 65000,
+ "renormalize_logits": true,
+ "transformers_version": "4.32.0.dev0"
+ }
ckpt/zh-2-en/metadata.json ADDED
@@ -0,0 +1 @@
+ {"hf_name":"zho-eng","source_languages":"zho","target_languages":"eng","opus_readme_url":"https:\/\/github.com\/Helsinki-NLP\/Tatoeba-Challenge\/tree\/master\/models\/zho-eng\/README.md","original_repo":"Tatoeba-Challenge","tags":["translation"],"languages":["zh","en"],"src_constituents":["cmn_Hans","nan","nan_Hani","gan","yue","cmn_Kana","yue_Hani","wuu_Bopo","cmn_Latn","yue_Hira","cmn_Hani","cjy_Hans","cmn","lzh_Hang","lzh_Hira","cmn_Hant","lzh_Bopo","zho","zho_Hans","zho_Hant","lzh_Hani","yue_Hang","wuu","yue_Kana","wuu_Latn","yue_Bopo","cjy_Hant","yue_Hans","lzh","cmn_Hira","lzh_Yiii","lzh_Hans","cmn_Bopo","cmn_Hang","hak_Hani","cmn_Yiii","yue_Hant","lzh_Kana","wuu_Hani"],"tgt_constituents":["eng"],"src_multilingual":false,"tgt_multilingual":false,"prepro":" normalization + SentencePiece (spm32k,spm32k)","url_model":"https:\/\/object.pouta.csc.fi\/Tatoeba-MT-models\/zho-eng\/opus-2020-07-17.zip","url_test_set":"https:\/\/object.pouta.csc.fi\/Tatoeba-MT-models\/zho-eng\/opus-2020-07-17.test.txt","src_alpha3":"zho","tgt_alpha3":"eng","short_pair":"zh-en","chrF2_score":0.548,"bleu":36.1,"brevity_penalty":0.948,"ref_len":82826.0,"src_name":"Chinese","tgt_name":"English","train_date":"2020-07-17","src_alpha2":"zh","tgt_alpha2":"en","prefer_old":false,"long_pair":"zho-eng","helsinki_git_sha":"480fcbe0ee1bf4774bcbe6226ad9f58e63f6c535","transformers_git_sha":"2207e5d8cb224e954a7cba69fa4ac2309e9ff30b","port_machine":"brutasse","port_time":"2020-08-21-14:41"}
ckpt/zh-2-en/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d8ceb91d103ef89400c9d9d62328b4858743cf8924878aee3b8afc594242ce0
+ size 312087009
ckpt/zh-2-en/rust_model.ot ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:859d0e2531693a5f003ea110aa5cee1b3439cea362980668923126bbb11d56de
+ size 578358061
ckpt/zh-2-en/source.spm ADDED
Binary file (805 kB). View file
 
ckpt/zh-2-en/target.spm ADDED
Binary file (807 kB). View file
 
ckpt/zh-2-en/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"target_lang": "eng", "source_lang": "zho"}
ckpt/zh-2-en/vocab.json ADDED
The diff for this file is too large to render. See raw diff