marinone94
committed on
Commit
•
060c28e
1
Parent(s):
7780ee2
Remove the step that builds the LM-backed processor and pushes it to the Hub from the script
Browse files
run_speech_recognition_ctc.py
CHANGED
@@ -31,7 +31,6 @@ import numpy as np
|
|
31 |
import torch
|
32 |
import wandb
|
33 |
from datasets import DatasetDict, load_dataset, load_metric
|
34 |
-
from pyctcdecode import build_ctcdecoder
|
35 |
|
36 |
import transformers
|
37 |
from transformers import (
|
@@ -743,24 +742,7 @@ def main():
|
|
743 |
trainer.push_to_hub(**kwargs)
|
744 |
else:
|
745 |
trainer.create_model_card(**kwargs)
|
746 |
-
|
747 |
-
if training_args.push_lm_to_hub:
|
748 |
-
vocab_dict = processor.tokenizer.get_vocab()
|
749 |
-
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
|
750 |
-
|
751 |
-
decoder = build_ctcdecoder(
|
752 |
-
labels=list(sorted_vocab_dict.keys()),
|
753 |
-
kenlm_model_path="5gram_sv_lm.bin",
|
754 |
-
)
|
755 |
-
|
756 |
-
processor_with_lm = Wav2Vec2ProcessorWithLM(
|
757 |
-
feature_extractor=processor.feature_extractor,
|
758 |
-
tokenizer=processor.tokenizer,
|
759 |
-
decoder=decoder
|
760 |
-
)
|
761 |
-
processor_with_lm.save_pretrained(repo_name)
|
762 |
-
processor_with_lm.push_to_hub(**kwargs)
|
763 |
-
|
764 |
return results
|
765 |
|
766 |
|
|
|
31 |
import torch
|
32 |
import wandb
|
33 |
from datasets import DatasetDict, load_dataset, load_metric
|
|
|
34 |
|
35 |
import transformers
|
36 |
from transformers import (
|
|
|
742 |
trainer.push_to_hub(**kwargs)
|
743 |
else:
|
744 |
trainer.create_model_card(**kwargs)
|
745 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
746 |
return results
|
747 |
|
748 |
|