marinone94
/

whisper-medium-swedish

@@ -7,35 +7,36 @@ tags:
 - generated_from_trainer
 datasets:
 - mozilla-foundation/common_voice_11_0
-metrics:
-- wer
 model-index:
 - name: Whisper Medium Swedish
   results:
   - task:
-      name: Automatic Speech Recognition
       type: automatic-speech-recognition
     dataset:
-      name: mozilla-foundation/common_voice_11_0 sv-SE
       type: mozilla-foundation/common_voice_11_0
       config: sv-SE
       split: test
-      args: sv-SE
     metrics:
     - name: Wer
       type: wer
-      value: 11.37780883775938
 ---
-<!-- This model card has been generated automatically according to the information the Trainer had access to. You
-should probably proofread and complete it, then remove this comment. -->
 # Whisper Medium Swedish
-This model is a fine-tuned version of [marinone94/whisper-medium-nordic](https://huggingface.co/marinone94/whisper-medium-nordic) on the mozilla-foundation/common_voice_11_0 sv-SE dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2970
-- Wer: 11.3778
 ## Model description
@@ -61,17 +62,9 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
 - lr_scheduler_warmup_steps: 250
-- training_steps: 2500
 - mixed_precision_training: Native AMP
-### Training results
-| Training Loss | Epoch | Step | Validation Loss | Wer     |
-|:-------------:|:-----:|:----:|:---------------:|:-------:|
-| 0.0146        | 3.02  | 1000 | 0.2546          | 11.9423 |
-| 0.0017        | 6.04  | 2000 | 0.2970          | 11.3778 |
 ### Framework versions
 - Transformers 4.26.0.dev0

 - generated_from_trainer
 datasets:
 - mozilla-foundation/common_voice_11_0
+- babelbox/babelbox_voice
+- google/fleurs
 model-index:
 - name: Whisper Medium Swedish
   results:
   - task:
       type: automatic-speech-recognition
+      name: Automatic Speech Recognition
     dataset:
+      name: mozilla-foundation/common_voice_11_0
       type: mozilla-foundation/common_voice_11_0
       config: sv-SE
       split: test
     metrics:
     - name: Wer
       type: wer
+      value: 9.89
 ---
 # Whisper Medium Swedish
+This model is a fine-tuned version of [Whisper Medium Nordic](https://huggingface.co/marinone94/whisper-medium-nordic) on the [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) (train+validation), the [babelbox/babelbox_voice](https://huggingface.co/datasets/babelbox/babelbox_voice) (NST SV - train split) and the [google/fleurs](https://huggingface.co/datasets/google/fleurs) (sv_se - train+validation+test) datasets.
 It achieves the following results on the evaluation set:
+- eval_loss: 0.2483
+- eval_wer: 9.8914
+- eval_runtime: 2924.8709
+- eval_samples_per_second: 1.733
+- eval_steps_per_second: 0.108
+- step: 0
 ## Model description
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
 - lr_scheduler_warmup_steps: 250
+- training_steps: 5000
 - mixed_precision_training: Native AMP
 ### Framework versions
 - Transformers 4.26.0.dev0

all_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
     "eval_loss": 0.24834245443344116,
-    "eval_runtime": 2999.4256,
-    "eval_samples_per_second": 1.69,
-    "eval_steps_per_second": 0.106,
     "eval_wer": 9.891409525857435,
     "train_loss": 0.025400285175442697,
     "train_runtime": 51804.3597,

 {
     "epoch": 1.0,
     "eval_loss": 0.24834245443344116,
+    "eval_runtime": 2924.8709,
+    "eval_samples_per_second": 1.733,
+    "eval_steps_per_second": 0.108,
     "eval_wer": 9.891409525857435,
     "train_loss": 0.025400285175442697,
     "train_runtime": 51804.3597,

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "marinone94/whisper-medium-nordic",
   "activation_dropout": 0.0,
   "activation_function": "gelu",
   "architectures": [

 {
+  "_name_or_path": ".",
   "activation_dropout": 0.0,
   "activation_function": "gelu",
   "architectures": [

eval_results.json CHANGED Viewed

@@ -1,8 +1,7 @@
 {
-    "epoch": 1.0,
     "eval_loss": 0.24834245443344116,
-    "eval_runtime": 2999.4256,
-    "eval_samples_per_second": 1.69,
-    "eval_steps_per_second": 0.106,
     "eval_wer": 9.891409525857435
 }

 {
     "eval_loss": 0.24834245443344116,
+    "eval_runtime": 2924.8709,
+    "eval_samples_per_second": 1.733,
+    "eval_steps_per_second": 0.108,
     "eval_wer": 9.891409525857435
 }

run.sh CHANGED Viewed

@@ -1,5 +1,5 @@
 python run_speech_recognition_seq2seq_streaming.py \
-	--model_name_or_path="marinone94/whisper-medium-nordic" \
 	--dataset_train_name="mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,google/fleurs" \
 	--dataset_train_config_name="sv-SE,nst,sv_se" \
 	--language="swedish" \
@@ -30,6 +30,7 @@ python run_speech_recognition_seq2seq_streaming.py \
 	--load_best_model_at_end \
 	--gradient_checkpointing \
 	--fp16 \
 	--predict_with_generate \
 	--do_normalize_eval \
 	--streaming \

 python run_speech_recognition_seq2seq_streaming.py \
+	--model_name_or_path="." \
 	--dataset_train_name="mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,google/fleurs" \
 	--dataset_train_config_name="sv-SE,nst,sv_se" \
 	--language="swedish" \
 	--load_best_model_at_end \
 	--gradient_checkpointing \
 	--fp16 \
+	--do_eval \
 	--predict_with_generate \
 	--do_normalize_eval \
 	--streaming \

tokenizer_config.json CHANGED Viewed

@@ -19,7 +19,7 @@
   },
   "errors": "replace",
   "model_max_length": 1024,
-  "name_or_path": "marinone94/whisper-medium-nordic",
   "pad_token": null,
   "processor_class": "WhisperProcessor",
   "return_attention_mask": false,

   },
   "errors": "replace",
   "model_max_length": 1024,
+  "name_or_path": ".",
   "pad_token": null,
   "processor_class": "WhisperProcessor",
   "return_attention_mask": false,