lengyue233 committed
Commit 16b7417
1 Parent(s): 6d3f4b0

First model version

Files changed (5)
  1. .gitignore +1 -0
  2. README.md +28 -0
  3. config.json +71 -0
  4. convert.py +150 -0
  5. pytorch_model.bin +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ content-vec-best-legacy-500.pt
README.md CHANGED
@@ -1,3 +1,31 @@
  ---
  license: mit
  ---
+
+ # Content Vec Best
+ Official Repo: [ContentVec](https://github.com/auspicious3000/contentvec)
+ This repo brings the fairseq ContentVec model to HuggingFace Transformers.
+
+ ## How to use
+ To use this model, you need to define the following class:
+ ```python
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+ ```
+
+ and then load the model with
+ ```python
+ model = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
+
+ x = model(audio)["last_hidden_state"]
+ x = model.final_proj(x)
+ ```
+
+ ## How to convert
+ You need to download the ContentVec_legacy model from the official repo, and then run
+ ```bash
+ python convert.py
+ ```
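The README snippet above leaves the imports and the audio loading to the reader. Below is a minimal end-to-end sketch of the same loading path, assuming a 16 kHz mono recording; the torchaudio calls and the `example.wav` path are illustrative assumptions, not part of this repo.

```python
import torch
import torchaudio
from torch import nn
from transformers import HubertModel


class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        # ContentVec's extra projection: hidden_size (768) -> classifier_proj_size (256)
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)


model = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
model.eval()

# "example.wav" is a placeholder path; the model expects 16 kHz mono audio.
audio, sr = torchaudio.load("example.wav")
audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0, keepdim=True)

with torch.no_grad():
    hidden = model(audio)["last_hidden_state"]   # (1, frames, 768)
    features = model.final_proj(hidden)          # (1, frames, 256)

print(features.shape)
```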
config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "activation_dropout": 0.1,
+   "apply_spec_augment": true,
+   "architectures": [
+     "HubertModelWithFinalProj"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.0,
+   "feat_proj_layer_norm": true,
+   "final_dropout": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "hubert",
+   "num_attention_heads": 12,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.3",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32
+ }
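Two values in this config are worth calling out: the `conv_stride` entries multiply out to 320, so 16 kHz audio yields roughly one 768-dimensional frame every 20 ms, and `classifier_proj_size` is the 256-dimensional output of `final_proj`. A small sketch, assuming this config.json sits in the working directory, that reads those numbers back:

```python
import json
import math

with open("config.json") as f:
    cfg = json.load(f)

# Total hop of the conv feature extractor: 5*2*2*2*2*2*2 = 320 samples per frame.
hop = math.prod(cfg["conv_stride"])
print(hop, "samples per frame ->", 16000 / hop, "frames per second at 16 kHz")

# final_proj maps the encoder width down to the ContentVec feature size.
print(cfg["hidden_size"], "->", cfg["classifier_proj_size"])
```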
convert.py ADDED
@@ -0,0 +1,150 @@
+ import torch
+ from torch import nn
+ from transformers import HubertConfig, HubertModel
+ import logging
+
+ # Ignore fairseq's logger
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
+ logging.getLogger("torch.distributed.nn.jit.instantiator").setLevel(logging.WARNING)
+
+ from fairseq import checkpoint_utils
+
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+     ["content-vec-best-legacy-500.pt"], suffix=""
+ )
+ model = models[0]
+ model.eval()
+ model.eval()
+
+
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+ # Default Config
+ hubert = HubertModelWithFinalProj(HubertConfig())
+
+ # huggingface: fairseq
+ mapping = {
+     "masked_spec_embed": "mask_emb",
+     "encoder.layer_norm.bias": "encoder.layer_norm.bias",
+     "encoder.layer_norm.weight": "encoder.layer_norm.weight",
+     "encoder.pos_conv_embed.conv.bias": "encoder.pos_conv.0.bias",
+     "encoder.pos_conv_embed.conv.weight_g": "encoder.pos_conv.0.weight_g",
+     "encoder.pos_conv_embed.conv.weight_v": "encoder.pos_conv.0.weight_v",
+     "feature_projection.layer_norm.bias": "layer_norm.bias",
+     "feature_projection.layer_norm.weight": "layer_norm.weight",
+     "feature_projection.projection.bias": "post_extract_proj.bias",
+     "feature_projection.projection.weight": "post_extract_proj.weight",
+     "final_proj.bias": "final_proj.bias",
+     "final_proj.weight": "final_proj.weight",
+ }
+
+ # Convert encoder
+ for layer in range(12):
+     for j in ["q", "k", "v"]:
+         mapping[
+             f"encoder.layers.{layer}.attention.{j}_proj.weight"
+         ] = f"encoder.layers.{layer}.self_attn.{j}_proj.weight"
+         mapping[
+             f"encoder.layers.{layer}.attention.{j}_proj.bias"
+         ] = f"encoder.layers.{layer}.self_attn.{j}_proj.bias"
+
+     mapping[
+         f"encoder.layers.{layer}.final_layer_norm.bias"
+     ] = f"encoder.layers.{layer}.final_layer_norm.bias"
+     mapping[
+         f"encoder.layers.{layer}.final_layer_norm.weight"
+     ] = f"encoder.layers.{layer}.final_layer_norm.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.layer_norm.bias"
+     ] = f"encoder.layers.{layer}.self_attn_layer_norm.bias"
+     mapping[
+         f"encoder.layers.{layer}.layer_norm.weight"
+     ] = f"encoder.layers.{layer}.self_attn_layer_norm.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.attention.out_proj.bias"
+     ] = f"encoder.layers.{layer}.self_attn.out_proj.bias"
+     mapping[
+         f"encoder.layers.{layer}.attention.out_proj.weight"
+     ] = f"encoder.layers.{layer}.self_attn.out_proj.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.intermediate_dense.bias"
+     ] = f"encoder.layers.{layer}.fc1.bias"
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.intermediate_dense.weight"
+     ] = f"encoder.layers.{layer}.fc1.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.output_dense.bias"
+     ] = f"encoder.layers.{layer}.fc2.bias"
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.output_dense.weight"
+     ] = f"encoder.layers.{layer}.fc2.weight"
+
+ # Convert Conv Layers
+ for layer in range(7):
+     mapping[
+         f"feature_extractor.conv_layers.{layer}.conv.weight"
+     ] = f"feature_extractor.conv_layers.{layer}.0.weight"
+
+     if layer != 0:
+         continue
+
+     mapping[
+         f"feature_extractor.conv_layers.{layer}.layer_norm.weight"
+     ] = f"feature_extractor.conv_layers.{layer}.2.weight"
+     mapping[
+         f"feature_extractor.conv_layers.{layer}.layer_norm.bias"
+     ] = f"feature_extractor.conv_layers.{layer}.2.bias"
+
+ hf_keys = set(hubert.state_dict().keys())
+ fair_keys = set(model.state_dict().keys())
+
+ hf_keys -= set(mapping.keys())
+ fair_keys -= set(mapping.values())
+
+ for i, j in zip(sorted(hf_keys), sorted(fair_keys)):
+     print(i, j)
+
+ print(hf_keys, fair_keys)
+ print(len(hf_keys), len(fair_keys))
+
+ # try loading the weights
+ new_state_dict = {}
+ for k, v in mapping.items():
+     new_state_dict[k] = model.state_dict()[v]
+
+ x = hubert.load_state_dict(new_state_dict, strict=False)
+ print(x)
+ hubert.eval()
+
+ with torch.no_grad():
+     new_input = torch.randn(1, 16384)
+
+     result1 = hubert(new_input, output_hidden_states=True)["hidden_states"][9]
+     result1 = hubert.final_proj(result1)
+
+     result2 = model.extract_features(
+         **{
+             "source": new_input,
+             "padding_mask": torch.zeros(1, 16384, dtype=torch.bool),
+             # "features_only": True,
+             "output_layer": 9,
+         }
+     )[0]
+     result2 = model.final_proj(result2)
+
+     assert torch.allclose(result1, result2, atol=1e-3)
+
+     print("Sanity check passed")
+
+ # Save huggingface model
+ hubert.save_pretrained(".")
+ print("Saved model")
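After convert.py finishes, one quick check is to reload the exported weights from the output directory with the class defined in the README and confirm the feature shapes. A minimal sketch, assuming the script was run in the current directory (the shape comments assume the 16384-sample dummy input used above):

```python
import torch
from torch import nn
from transformers import HubertModel


class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)


# Load the converted checkpoint saved by hubert.save_pretrained(".").
reloaded = HubertModelWithFinalProj.from_pretrained(".")
reloaded.eval()

with torch.no_grad():
    dummy = torch.randn(1, 16384)                    # ~1 s of 16 kHz audio
    hidden = reloaded(dummy)["last_hidden_state"]    # (1, 50, 768)
    projected = reloaded.final_proj(hidden)          # (1, 50, 256)

print(hidden.shape, projected.shape)
```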
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
+ size 378342945
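These three lines are a Git LFS pointer: the actual weights are stored out of band, and the oid records their SHA-256. A small sketch for checking that a locally downloaded pytorch_model.bin matches the pointer (the local path is an assumption):

```python
import hashlib

# SHA-256 recorded in the LFS pointer above.
expected = "d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e"

h = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    # Hash in 1 MiB chunks to avoid loading the ~378 MB file at once.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(h.hexdigest() == expected)
```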