Chris Bracegirdle committed
Commit 149046c
1 Parent(s): 38db600
Files changed (1)
  1. app.py +56 -41
app.py CHANGED
@@ -1,43 +1,58 @@
-import torch
-import librosa
-import json
-# Load model directly
-from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
-
-pipe = pipeline("automatic-speech-recognition", model="dmatekenya/whisper-large-v3-chichewa")
-
-def transcribe(audio_file_mic=None, audio_file_upload=None, language="English (eng)"):
-    if audio_file_mic:
-        audio_file = audio_file_mic
-    elif audio_file_upload:
-        audio_file = audio_file_upload
-    else:
-        return "Please upload an audio file or record one"
-
-    # Make sure audio is 16kHz
-    # speech, sample_rate = librosa.load(audio_file)
-    # if sample_rate != 16000:
-    #     speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
-
-    # Keep the same model in memory and simply switch out the language adapters by calling load_adapter() for the model and set_target_lang() for the tokenizer
-    # language_code = iso_codes[language]
-    # processor.tokenizer.set_target_lang(language_code)
-    # model.load_adapter(language_code)
-
-    result = pipe(audio_file)
-    return result["text"]
-
-
-description = ''''''
-
-iface = gr.Interface(fn=transcribe,
-                     inputs=[
-                         gr.Audio(source="microphone", type="filepath", label="Record Audio"),
-                         gr.Audio(source="upload", type="filepath", label="Upload Audio"),
-                     ],
-                     outputs=gr.Textbox(label="Transcription"),
-                     description=description
-                     )
-iface.launch()
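The 16 kHz resampling that the old handler left commented out matters only when the pipeline is fed raw arrays rather than file paths: given a path, the pipeline decodes and resamples via ffmpeg itself, but a raw array must already match the feature extractor's 16 kHz rate. A minimal sketch of what those commented lines would do if enabled, reusing the `pipe` and `audio_file` names from the old code above:

```python
import librosa

# audio_file: the filepath from the Gradio handler above.
# sr=None keeps the file's native rate instead of librosa's 22050 Hz default.
speech, sample_rate = librosa.load(audio_file, sr=None)

# Whisper's feature extractor expects 16 kHz input.
if sample_rate != 16000:
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)

# Raw samples are passed to the pipeline as a dict with their sampling rate.
result = pipe({"array": speech, "sampling_rate": 16000})
print(result["text"])
```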
+import torch
+
 import gradio as gr
+import yt_dlp as youtube_dl
+from transformers import pipeline
+from transformers.pipelines.audio_utils import ffmpeg_read
+
+import tempfile
+import os
+
+MODEL_NAME = "dmatekenya/whisper-large-v3-chichewa"
+BATCH_SIZE = 8
+FILE_LIMIT_MB = 1000
+YT_LENGTH_LIMIT_S = 3600  # limit YouTube files to 1 hour
+
+device = 0 if torch.cuda.is_available() else "cpu"
+
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
+
+
+def transcribe(inputs, task):
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+    return text
+
+
+demo = gr.Blocks()
+
+file_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
+        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
+    ],
+    outputs="text",
+    layout="horizontal",
+    theme="huggingface",
+    title="Whisper Large V3: Transcribe Audio",
+    description=(
+        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned Whisper"
+        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+        " of arbitrary length."
+    ),
+    allow_flagging="never",
+)
+
+with demo:
+    gr.TabbedInterface([file_transcribe], ["Audio file"])
+
+demo.launch(enable_queue=True)
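The new file imports `yt_dlp` and `ffmpeg_read` and defines `FILE_LIMIT_MB` and `YT_LENGTH_LIMIT_S`, yet the hunk wires up only the file tab, so those imports and constants go unused as committed. A sketch of how a YouTube tab would typically plug into the same `pipe`, following the stock Whisper demo pattern these imports suggest; the helper names below are hypothetical, not part of this commit:

```python
import os
import tempfile

import yt_dlp as youtube_dl
from transformers.pipelines.audio_utils import ffmpeg_read

# Assumes `gr`, `pipe`, `BATCH_SIZE`, and `YT_LENGTH_LIMIT_S` from app.py above.

def download_yt_audio(yt_url, filename):
    # Hypothetical helper: check duration before downloading anything.
    info = youtube_dl.YoutubeDL().extract_info(yt_url, download=False)
    if info.get("duration", 0) > YT_LENGTH_LIMIT_S:
        raise gr.Error(f"Video is longer than the {YT_LENGTH_LIMIT_S} second limit.")
    ydl_opts = {"outtmpl": filename, "format": "bestaudio[ext=m4a]"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])

def yt_transcribe(yt_url, task):
    # Hypothetical helper: download to a temp dir, then transcribe the bytes.
    with tempfile.TemporaryDirectory() as tmpdir:
        filepath = os.path.join(tmpdir, "audio.m4a")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            audio_bytes = f.read()
    # Decode the raw bytes to a float waveform at the model's sampling rate.
    sr = pipe.feature_extractor.sampling_rate
    inputs = {"array": ffmpeg_read(audio_bytes, sr), "sampling_rate": sr}
    return pipe(inputs, batch_size=BATCH_SIZE,
                generate_kwargs={"task": task}, return_timestamps=True)["text"]
```

Either path ends in the same chunked call: `chunk_length_s=30` splits long audio into 30-second windows that are decoded `BATCH_SIZE` at a time and merged, which is what lets the description claim transcription of files "of arbitrary length".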