Save transcription or translation to txt file

by Softology - opened

How do I save the results of the ASR transcription and translation to a txt file? I.e., for this code:

from transformers import pipeline
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large")
asr("audio_king.mp3", generate_kwargs={'task': 'transcribe', 'language': 'no'})
asr("audio_king.mp3", generate_kwargs={'task': 'translate', 'language': 'no'})

Thanks.

Nasjonalbiblioteket AI Lab org
transcription_result = asr(input_file, generate_kwargs={'task': 'transcribe', 'language': 'no'})
output_file = "my_transcriptions.txt"

with open(output_file, 'w', encoding='utf-8') as file:
    file.write(transcription_result['text'])

This works for those who may need it.

import sys

sys.stdout.write("Imports ...\n")
sys.stdout.flush()

from transformers import pipeline

# Load the model
sys.stdout.write("Loading the model ...\n")
sys.stdout.flush()
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large")

# Transcribe
sys.stdout.write("Transcribing ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", generate_kwargs={'task': 'transcribe', 'language': 'no'})
with open('output_transcribe.txt', 'w', encoding='utf-8') as fh:
    fh.write(text['text'])

# Translate
sys.stdout.write("Translating ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", generate_kwargs={'task': 'translate', 'language': 'en'})
with open('output_translate.txt', 'w', encoding='utf-8') as fh:
    fh.write(text['text'])

sys.stdout.write("Done\n")
sys.stdout.flush()
Softology changed discussion status to closed
transcription_result = asr(input_file, generate_kwargs={'task': 'transcribe', 'language': 'no'})
output_file = "my_transcriptions.txt"

with open(output_file, 'w', encoding='utf-8') as file:
    file.write(transcription_result['text'])

Thanks, I found the same answer :)

Also, how do we tell it to use the GPU?
The above code seems to be CPU only, and appending .to("cuda") to the pipeline does not work in this case.

Softology changed discussion status to open

asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device="cuda") seems to work.
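If the script might also run on a machine without a GPU, the device can be picked with a fallback. A minimal sketch (torch is already a dependency of the pipeline):

import torch
from transformers import pipeline

# Use the GPU if one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device=device)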

OK, the final code to handle longer MP3 files and use the GPU is...

import sys

sys.stdout.write("Imports ...\n")
sys.stdout.flush()

from transformers import pipeline

# Load the model
sys.stdout.write("Loading the model ...\n")
sys.stdout.flush()
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device="cuda")

# Transcribe
sys.stdout.write("Transcribing ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'transcribe', 'language': 'no'})
with open('output_transcribe.txt', 'w', encoding='utf-8') as fh:
    fh.write(text['text'])

# Translate
sys.stdout.write("Translating ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'translate', 'language': 'en'})
with open('output_translate.txt', 'w', encoding='utf-8') as fh:
    fh.write(text['text'])

sys.stdout.write("Done\n")
sys.stdout.flush()

Really nice. I just need to figure out how to get timestamps into the fh.write(text['text']) output, since return_timestamps=True doesn't affect the written text by default, and all the translations end up on a single line.

# Writing output with timestamps
with open('output_transcribe02.txt', 'w', encoding='utf-8') as fh:
    for chunk in text['chunks']:
        start_time, end_time = chunk['timestamp']
        transcribed_text = chunk['text']
        fh.write(f"{start_time}-{end_time}: {transcribed_text}\n")

Solved

For completeness, here is the final script and the environment setup commands used, for anyone else who wants a more complete example to try locally.

import sys

sys.stdout.write("Imports ...\n")
sys.stdout.flush()

from transformers import pipeline

# Load the model
sys.stdout.write("Loading the model ...\n")
sys.stdout.flush()
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device="cuda")

# Transcribe
sys.stdout.write("Transcribing ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'transcribe', 'language': 'no'})

with open('output_transcribe.txt', 'w', encoding='utf-8') as fh:
    fh.write(text['text'])

with open('output_transcribe_timestamps.txt', 'w', encoding='utf-8') as fh:
    for chunk in text['chunks']:
        start_time, end_time = chunk['timestamp']
        transcribed_text = chunk['text']
        fh.write(f"{start_time}-{end_time}: {transcribed_text}\n")

# Translate
sys.stdout.write("Translating ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'translate', 'language': 'en'})

with open('output_translate.txt', 'w', encoding='utf-8') as fh:
    fh.write(text['text'])

with open('output_translate_timestamps.txt', 'w', encoding='utf-8') as fh:
    for chunk in text['chunks']:
        start_time, end_time = chunk['timestamp']
        transcribed_text = chunk['text']
        fh.write(f"{start_time}-{end_time}: {transcribed_text}\n")

sys.stdout.write("Done\n")
sys.stdout.flush()

Environment setup pip commands used:

python -m pip install --upgrade pip
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts wheel==0.40.0
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts numba==0.57.0
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts tqdm==4.65.0
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts transformers==4.35.2
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116

On Windows you also need to have ffmpeg.exe in the same folder as the script (or on your PATH).
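A quick sanity check after installing, to confirm the CUDA build of torch is working and ffmpeg can be found (minimal sketch):

import shutil
import torch

# Should print True if the CUDA build of torch installed correctly
print("CUDA available:", torch.cuda.is_available())
# Should print a path, not None (on Windows, shutil.which also searches the current directory)
print("ffmpeg:", shutil.which("ffmpeg"))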

I'm currently using

from transformers import pipeline
asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large")
result = asr("king.mp3", generate_kwargs={'task': 'transcribe', 'language': 'no'})
print(result)

Is it possible to change the language to "sami" to get a Sámi transcription? And if so, can I use translation to English simultaneously?

Nasjonalbiblioteket AI Lab org

@Flameglory , no, unfortunately not. While the NB-Whisper models can translate North Sámi to Norwegian, they cannot transcribe to Sámi, as Sámi is not supported by Whisper. For Sámi transcription a different model is needed. We have a working prototype: https://huggingface.co/NbAiLab/whisper-large-sme, but it does not support timestamps at the moment. There's a serverless demo that can be accessed here: https://huggingface.co/spaces/versae/whisper-sami-demo
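For anyone who wants to try the prototype, loading it should follow the same pipeline pattern as above. An untested sketch; the audio file name is just a placeholder, and as noted the model does not return timestamps:

from transformers import pipeline

# Prototype North Sámi ASR model; no timestamp support at the moment
asr_sme = pipeline("automatic-speech-recognition", "NbAiLab/whisper-large-sme", device="cuda")
result = asr_sme("audio_sami.mp3", chunk_length_s=28)  # placeholder file name
print(result['text'])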
