Update README.md
Browse files
README.md
CHANGED
@@ -47,8 +47,8 @@ if not os.path.exists("whisper-vq-stoks-medium-en+pl-fixed.model"):
|
|
47 |
vq_model = RQBottleneckTransformer.load_model(
|
48 |
"whisper-vq-stoks-medium-en+pl-fixed.model"
|
49 |
).to(device)
|
|
|
50 |
def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device=device):
|
51 |
-
vq_model.ensure_whisper(device)
|
52 |
|
53 |
wav, sr = torchaudio.load(audio_path)
|
54 |
if sr != 16000:
|
@@ -59,19 +59,6 @@ def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device=device):
|
|
59 |
|
60 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
61 |
return f'<|sound_start|>{result}<|sound_end|>'
|
62 |
-
|
63 |
-
def audio_to_sound_tokens_transcript(audio_path, target_bandwidth=1.5, device=device):
|
64 |
-
vq_model.ensure_whisper(device)
|
65 |
-
|
66 |
-
wav, sr = torchaudio.load(audio_path)
|
67 |
-
if sr != 16000:
|
68 |
-
wav = torchaudio.functional.resample(wav, sr, 16000)
|
69 |
-
with torch.no_grad():
|
70 |
-
codes = vq_model.encode_audio(wav.to(device))
|
71 |
-
codes = codes[0].cpu().tolist()
|
72 |
-
|
73 |
-
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
74 |
-
return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
|
75 |
```
|
76 |
|
77 |
Then, we can run inference on the model the same as with any other LLM.
|
@@ -136,7 +123,7 @@ We utilize [torchtune](https://github.com/pytorch/torchtune) library for the lat
|
|
136 |
| Parameter | Instruction Fine-tuning |
|
137 |
|----------------------------|-------------------------|
|
138 |
| **Epoch** | 1 |
|
139 |
-
| **Global batch size** |
|
140 |
| **Learning Rate** | 7e-5 |
|
141 |
| **Learning Scheduler** | Cosine with warmup |
|
142 |
| **Optimizer** | Adam torch fused |
|
|
|
47 |
vq_model = RQBottleneckTransformer.load_model(
|
48 |
"whisper-vq-stoks-medium-en+pl-fixed.model"
|
49 |
).to(device)
|
50 |
+
vq_model.ensure_whisper(device)
|
51 |
def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device=device):
|
|
|
52 |
|
53 |
wav, sr = torchaudio.load(audio_path)
|
54 |
if sr != 16000:
|
|
|
59 |
|
60 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
61 |
return f'<|sound_start|>{result}<|sound_end|>'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
```
|
63 |
|
64 |
Then, we can run inference on the model the same as with any other LLM.
|
|
|
123 |
| Parameter | Instruction Fine-tuning |
|
124 |
|----------------------------|-------------------------|
|
125 |
| **Epoch** | 1 |
|
126 |
+
| **Global batch size** | 256 |
|
127 |
| **Learning Rate** | 7e-5 |
|
128 |
| **Learning Scheduler** | Cosine with warmup |
|
129 |
| **Optimizer** | Adam torch fused |
|