hanasim committed on
Commit
abb8a1f
1 Parent(s): 870c049

Training in progress, step 200

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 73,
3
+ "<s>": 72
4
+ }
breeze-listen-w2v2-ml.log ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 01/29/2024 19:52:08 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
2
+ 01/29/2024 19:52:08 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
3
+ _n_gpu=1,
4
+ adafactor=False,
5
+ adam_beta1=0.9,
6
+ adam_beta2=0.999,
7
+ adam_epsilon=1e-08,
8
+ auto_find_batch_size=False,
9
+ bf16=False,
10
+ bf16_full_eval=False,
11
+ data_seed=None,
12
+ dataloader_drop_last=False,
13
+ dataloader_num_workers=0,
14
+ dataloader_persistent_workers=False,
15
+ dataloader_pin_memory=True,
16
+ dataloader_prefetch_factor=None,
17
+ ddp_backend=None,
18
+ ddp_broadcast_buffers=None,
19
+ ddp_bucket_cap_mb=None,
20
+ ddp_find_unused_parameters=None,
21
+ ddp_timeout=1800,
22
+ debug=[],
23
+ deepspeed=None,
24
+ disable_tqdm=False,
25
+ dispatch_batches=None,
26
+ do_eval=True,
27
+ do_predict=False,
28
+ do_train=True,
29
+ eval_accumulation_steps=None,
30
+ eval_delay=0,
31
+ eval_steps=200,
32
+ evaluation_strategy=IntervalStrategy.STEPS,
33
+ fp16=True,
34
+ fp16_backend=auto,
35
+ fp16_full_eval=False,
36
+ fp16_opt_level=O1,
37
+ fsdp=[],
38
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
39
+ fsdp_min_num_params=0,
40
+ fsdp_transformer_layer_cls_to_wrap=None,
41
+ full_determinism=False,
42
+ gradient_accumulation_steps=1,
43
+ gradient_checkpointing=True,
44
+ gradient_checkpointing_kwargs=None,
45
+ greater_is_better=None,
46
+ group_by_length=True,
47
+ half_precision_backend=auto,
48
+ hub_always_push=False,
49
+ hub_model_id=simpragma/breeze-listen-w2v2-ml,
50
+ hub_private_repo=False,
51
+ hub_strategy=HubStrategy.EVERY_SAVE,
52
+ hub_token=<HUB_TOKEN>,
53
+ ignore_data_skip=False,
54
+ include_inputs_for_metrics=False,
55
+ include_num_input_tokens_seen=False,
56
+ include_tokens_per_second=False,
57
+ jit_mode_eval=False,
58
+ label_names=None,
59
+ label_smoothing_factor=0.0,
60
+ learning_rate=0.001,
61
+ length_column_name=input_length,
62
+ load_best_model_at_end=False,
63
+ local_rank=0,
64
+ log_level=passive,
65
+ log_level_replica=warning,
66
+ log_on_each_node=True,
67
+ logging_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-ml/runs/Jan29_19-52-08_knight,
68
+ logging_first_step=False,
69
+ logging_nan_inf_filter=True,
70
+ logging_steps=500,
71
+ logging_strategy=IntervalStrategy.STEPS,
72
+ lr_scheduler_kwargs={},
73
+ lr_scheduler_type=SchedulerType.LINEAR,
74
+ max_grad_norm=1.0,
75
+ max_steps=-1,
76
+ metric_for_best_model=None,
77
+ mp_parameters=,
78
+ neftune_noise_alpha=None,
79
+ no_cuda=False,
80
+ num_train_epochs=4.0,
81
+ optim=OptimizerNames.ADAMW_BNB,
82
+ optim_args=None,
83
+ output_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-ml,
84
+ overwrite_output_dir=True,
85
+ past_index=-1,
86
+ per_device_eval_batch_size=8,
87
+ per_device_train_batch_size=4,
88
+ prediction_loss_only=False,
89
+ push_to_hub=True,
90
+ push_to_hub_model_id=None,
91
+ push_to_hub_organization=None,
92
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
93
+ ray_scope=last,
94
+ remove_unused_columns=True,
95
+ report_to=[],
96
+ resume_from_checkpoint=None,
97
+ run_name=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-ml,
98
+ save_on_each_node=False,
99
+ save_only_model=False,
100
+ save_safetensors=True,
101
+ save_steps=200,
102
+ save_strategy=IntervalStrategy.STEPS,
103
+ save_total_limit=3,
104
+ seed=42,
105
+ skip_memory_metrics=True,
106
+ split_batches=False,
107
+ tf32=None,
108
+ torch_compile=False,
109
+ torch_compile_backend=None,
110
+ torch_compile_mode=None,
111
+ torchdynamo=None,
112
+ tpu_metrics_debug=False,
113
+ tpu_num_cores=None,
114
+ use_cpu=False,
115
+ use_ipex=False,
116
+ use_legacy_prediction_loop=False,
117
+ use_mps_device=False,
118
+ warmup_ratio=0.0,
119
+ warmup_steps=100,
120
+ weight_decay=0.0,
121
+ )
config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/mms-1b-all",
3
+ "activation_dropout": 0.05,
4
+ "adapter_attn_dim": 16,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.05,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 1024,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.05,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.05,
58
+ "hidden_size": 1280,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 5120,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "wav2vec2",
70
+ "num_adapter_layers": 3,
71
+ "num_attention_heads": 16,
72
+ "num_codevector_groups": 2,
73
+ "num_codevectors_per_group": 320,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 48,
78
+ "num_negatives": 100,
79
+ "output_hidden_size": 1280,
80
+ "pad_token_id": 71,
81
+ "proj_codevector_dim": 1024,
82
+ "tdnn_dilation": [
83
+ 1,
84
+ 2,
85
+ 3,
86
+ 1,
87
+ 1
88
+ ],
89
+ "tdnn_dim": [
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 1500
95
+ ],
96
+ "tdnn_kernel": [
97
+ 5,
98
+ 3,
99
+ 3,
100
+ 1,
101
+ 1
102
+ ],
103
+ "torch_dtype": "float32",
104
+ "transformers_version": "4.38.0.dev0",
105
+ "use_weighted_layer_sum": false,
106
+ "vocab_size": 74,
107
+ "xvector_output_dim": 512
108
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8646b820c7003d6bcb643ef8b492cd503c3de511327cb68c36f2fb036f8d2b3
3
+ size 3859111256
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": true,
19
+ "normalized": false,
20
+ "rstrip": true,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "[UNK]",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": true,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "70": {
4
+ "content": "[UNK]",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "71": {
12
+ "content": "[PAD]",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "72": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "73": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "[PAD]",
42
+ "processor_class": "Wav2Vec2Processor",
43
+ "replace_word_delimiter_char": " ",
44
+ "target_lang": "mal",
45
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
+ "unk_token": "[UNK]",
47
+ "word_delimiter_token": "|"
48
+ }
train-ctc-model.sh ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/bash
2
+
3
+ #
4
+ # This script runs CTC speech recognition fine-tuning as a single local process (no DeepSpeed config is passed)
5
+ #
6
+
7
+ # CHANGE THESE AS PER YOUR REQUIREMENTS
8
+
9
+ # LANG as it is referred to in the dataset (note: LANG is also the name of the locale environment variable)
10
+ LANG=ml # 2 letter ISO code for the language
11
+ LANG_ISO_3=mal # 3 letter ISO code for the language
12
+ LANGUAGE=Malayalam # Full language name as per Whisper convention
13
+
14
+ # For Mozilla Common Voice datasets, use the following (currently enabled)
15
+ DATASET="mozilla-foundation/common_voice_16_0"
16
+ TEXT_COLUMN="sentence"
17
+
18
+ # For Google Fleurs datasets, uncomment the following
19
+ # DATASET="google/fleurs"
20
+ # TEXT_COLUMN="transcription"
21
+
22
+ # Custom datasets
23
+ #DATASET="parambharat/kannada_asr_corpus"
24
+ #TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}
25
+
26
+ # Function to get the fine-tuning learning rate for a given model size (not called anywhere in this script)
27
+ get_fine_tuning_lr() {
28
+ local model_size=$1
29
+ local lr
30
+
31
+ case $model_size in
32
+ "tiny")
33
+ lr="3.75e-5"
34
+ ;;
35
+ "base")
36
+ lr="2.5e-5"
37
+ ;;
38
+ "small")
39
+ lr="1.25e-5"
40
+ ;;
41
+ "medium")
42
+ lr="6.25e-6"
43
+ ;;
44
+ "large")
45
+ lr="4.375e-6"
46
+ ;;
47
+ "large-v2")
48
+ lr="5e-6"
49
+ ;;
50
+ *)
51
+ echo "Invalid model size"
52
+ exit 1
53
+ ;;
54
+ esac
55
+
56
+ echo $lr
57
+ }
58
+
59
+ SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
60
+ SCRIPT_DIR=$(realpath $(dirname "${BASH_SOURCE[0]}"))
61
+
62
+ # Port to use
63
+ export MASTER_PORT="${MASTER_PORT:-29500}"
64
+ echo "Using master_port for deepspeech: ${MASTER_PORT}"
65
+
66
+ export "MASTER_ADDR"="localhost"
67
+ export "RANK"="0"
68
+ export "LOCAL_RANK"="0"
69
+ export "WORLD_SIZE"="1"
70
+
71
+ # Base model variant
72
+ MODEL=w2v2
73
+
74
+ # Model names and other stuff
75
+ BASE_MODEL="facebook/mms-1b-all"
76
+
77
+ JUST_LANG=${LANG%%_*}
78
+ MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}"
79
+
80
+ OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
81
+ echo "OUTDIR: ${OUTDIR}"
82
+
83
+ # Training parameters you can tweak. Feel free to directly change any of the parameters below.
84
+
85
+ MAX_EPOCHS=4
86
+ TRAIN_BATCH_SIZE=4
87
+ EVAL_BATCH_SIZE=4
88
+ LEARNING_RATE="1e-3"
89
+
90
+ EVAL_STEPS="200"
91
+ SAVE_STEPS="200"
92
+
93
+ # Create dir
94
+ mkdir -p ${OUTDIR}
95
+
96
+ # --overwrite_output_dir \
97
+
98
+ # If you want to resume from an existing checkpoint, include the following argument as well. Modify the checkpoint directory.
99
+ # --resume_from_checkpoint="${MY_MODEL}/checkpoint-400" \
100
+
101
+ echo "================ TRAINING: START ================"
102
+
103
+ python ${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py \
104
+ --dataset_name="${DATASET}" \
105
+ --model_name_or_path="${BASE_MODEL}" \
106
+ --dataset_config_name="${LANG}" \
107
+ --target_language="${LANG_ISO_3}" \
108
+ --output_dir="${OUTDIR}" \
109
+ --num_train_epochs="${MAX_EPOCHS}" \
110
+ --per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
111
+ --learning_rate="${LEARNING_RATE}" \
112
+ --warmup_steps="100" \
113
+ --evaluation_strategy="steps" \
114
+ --text_column_name="${TEXT_COLUMN}" \
115
+ --length_column_name="input_length" \
116
+ --save_steps="${SAVE_STEPS}" \
117
+ --eval_steps="${EVAL_STEPS}" \
118
+ --save_total_limit="3" \
119
+ --optim="adamw_bnb_8bit" \
120
+ --hub_model_id "simpragma/${MY_MODEL}" \
121
+ --gradient_checkpointing \
122
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
123
+ --fp16 \
124
+ --group_by_length \
125
+ --do_train \
126
+ --do_eval \
127
+ --push_to_hub \
128
+ | tee ${OUTDIR}/${MY_MODEL}.log
129
+
130
+ # Copy the script to the output directory so that we can recreate the model
131
+ cp ${SCRIPT_PATH} ${OUTDIR}
132
+
133
+ echo "================ TRAINING: DONE ================"
134
+
135
+ exit 0
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5285fada4bc7d4d9e8126ada302ea99fc90ca747677c26a4e7da55b072d8194e
3
+ size 4856
vocab.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mal": {
3
+ "[PAD]": 71,
4
+ "[UNK]": 70,
5
+ "|": 0,
6
+ "ം": 1,
7
+ "ഃ": 2,
8
+ "അ": 3,
9
+ "ആ": 4,
10
+ "ഇ": 5,
11
+ "ഈ": 6,
12
+ "ഉ": 7,
13
+ "ഊ": 8,
14
+ "എ": 9,
15
+ "ഏ": 10,
16
+ "ഐ": 11,
17
+ "ഒ": 12,
18
+ "ഓ": 13,
19
+ "ക": 14,
20
+ "ഖ": 15,
21
+ "ഗ": 16,
22
+ "ഘ": 17,
23
+ "ങ": 18,
24
+ "ച": 19,
25
+ "ഛ": 20,
26
+ "ജ": 21,
27
+ "ഞ": 22,
28
+ "ട": 23,
29
+ "ഠ": 24,
30
+ "ഡ": 25,
31
+ "ഢ": 26,
32
+ "ണ": 27,
33
+ "ത": 28,
34
+ "ഥ": 29,
35
+ "ദ": 30,
36
+ "ധ": 31,
37
+ "ന": 32,
38
+ "പ": 33,
39
+ "ഫ": 34,
40
+ "ബ": 35,
41
+ "ഭ": 36,
42
+ "മ": 37,
43
+ "യ": 38,
44
+ "ര": 39,
45
+ "റ": 40,
46
+ "ല": 41,
47
+ "ള": 42,
48
+ "ഴ": 43,
49
+ "വ": 44,
50
+ "ശ": 45,
51
+ "ഷ": 46,
52
+ "സ": 47,
53
+ "ഹ": 48,
54
+ "ാ": 49,
55
+ "ി": 50,
56
+ "ീ": 51,
57
+ "ു": 52,
58
+ "ൂ": 53,
59
+ "ൃ": 54,
60
+ "െ": 55,
61
+ "േ": 56,
62
+ "ൈ": 57,
63
+ "ൊ": 58,
64
+ "ോ": 59,
65
+ "ൌ": 60,
66
+ "്": 61,
67
+ "ൗ": 62,
68
+ "ൺ": 63,
69
+ "ൻ": 64,
70
+ "ർ": 65,
71
+ "ൽ": 66,
72
+ "ൾ": 67,
73
+ "ൿ": 68,
74
+ "’": 69
75
+ }
76
+ }