fish-speech-1 / tools /whisper_asr.py
lengyue233's picture
Init hf space integration
0a3525d verified
raw
history blame
3.49 kB
"""
Used to transcribe all audio files in one folder into another folder.
e.g.
Directory structure:
--pre_data_root
----SP_1
------01.wav
------02.wav
------......
----SP_2
------01.wav
------02.wav
------......
Use
python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
to transcribe the first speaker.
Use
python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
to transcribe the second speaker.
Note: Be aware of your audio sample rate. By default the output keeps the sample rate of each input file; pass --sample-rate to resample.
"""
from pathlib import Path
import click
import librosa
import soundfile as sf
import whisper
from loguru import logger
from merge_asr_files import merge_and_delete_files
from tqdm import tqdm
from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
@click.command()
@click.option("--model-size", default="large", help="Size of the Whisper model")
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
    "--save-dir", required=True, help="Directory to save processed audio files"
)
@click.option(
    "--sample-rate",
    default=None,
    type=int,
    help="Output sample rate, default to input sample rate",
)
@click.option("--device", default="cuda", help="Device to use")
@click.option("--language", default="ZH", help="Language of the transcription")
def main(model_size, audio_dir, save_dir, sample_rate, device, language):
    """Transcribe the audio files under --audio-dir and split them by segment.

    Every Whisper segment is written to --save-dir as an audio clip plus a
    ``.lab`` transcript, mirroring the directory layout of --audio-dir.
    \f

    Args:
        model_size: Whisper model name passed to ``whisper.load_model``.
        audio_dir: Root directory that is searched recursively for audio.
        save_dir: Root directory for the generated clips and transcripts.
        sample_rate: Target sample rate; ``None`` keeps each file's own rate.
        device: Device the Whisper model is loaded on.
        language: Language hint passed to ``model.transcribe``.
    """
    logger.info("Loading / Downloading OpenAI Whisper model...")
    model = whisper.load_model(
        name=model_size,
        device=device,
        download_root=str(Path(".cache/whisper").resolve()),
    )
    logger.info("Model loaded.")

    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    # Every file written below is recorded so it can be handed to
    # merge_and_delete_files once all inputs have been processed.
    original_files = []
    audio_files = list_files(
        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
    )
    for file_path in tqdm(audio_files, desc="Processing audio file"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        # Mirror the input's sub-directory structure below save_dir.
        rel_path = Path(file_path).relative_to(audio_dir)
        output_dir = save_path / rel_path.parent
        output_dir.mkdir(parents=True, exist_ok=True)

        # Skip inputs whose "<stem>.wav" / "<stem>.lab" pair already exists.
        # NOTE(review): presumably these are produced by merge_and_delete_files
        # on a previous run - confirm against merge_asr_files.
        merged_audio = output_dir / f"{rel_path.stem}.wav"
        merged_label = output_dir / f"{rel_path.stem}.lab"
        if merged_audio.exists() and merged_label.exists():
            continue

        audio, sr = librosa.load(file_path, sr=sample_rate, mono=False)
        transcription = model.transcribe(str(file_path), language=language)

        for segment in transcription.get("segments", []):
            segment_id = segment["id"]
            text = segment["text"]
            start = segment["start"]
            end = segment["end"]

            # Samples live on the last axis, so slicing with an ellipsis works
            # for both mono (1-D) and multi-channel (2-D) audio.
            extract = audio[..., int(start * sr) : int(end * sr)]
            audio_save_path = output_dir / f"{file_stem}-{segment_id}{file_suffix}"
            # librosa yields (channels, samples) for multi-channel input while
            # soundfile expects (frames, channels); transposing fixes the
            # layout and is a no-op for 1-D mono audio.
            sf.write(
                audio_save_path,
                extract.T,
                samplerate=sr,
            )
            original_files.append(audio_save_path)

            transcript_save_path = output_dir / f"{file_stem}-{segment_id}.lab"
            transcript_save_path.write_text(text, encoding="utf-8")
            original_files.append(transcript_save_path)

    merge_and_delete_files(save_dir, original_files)
# Script entry point: run the click command only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()