PhuongPhan committed
Commit 10a3d05
• 1 Parent(s): fe06633

Create app.py

Files changed (1): app.py (+136, -0)
app.py ADDED
@@ -0,0 +1,136 @@
import torch
import spaces
import gradio as gr
import ctranslate2

from transformers import pipeline
from huggingface_hub import snapshot_download
from sentencepiece import SentencePieceProcessor

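# ---------------------------------------------------------------
# Speech-to-text: Whisper via the transformers ASR pipeline
# ---------------------------------------------------------------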
MODEL_NAME = "openai/whisper-small"

# Run on the first CUDA device when available, otherwise on CPU.
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,  # split long recordings into 30-second chunks
    device=device,
)

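# Keep the decoder in transcription mode; no language token is forced, so
# Whisper detects the spoken language automatically.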
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(task="transcribe")

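# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of each call
# (capped at 240 seconds here).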
@spaces.GPU(duration=240)
def transcribe(audio):
    # gr.Audio(type="filepath") supplies a single file path, whether the audio
    # was uploaded or recorded with the microphone, so one parameter suffices.
    text = pipe(audio)["text"]
    return text


# ---------------------------------------------------------------
# Translation: MADLAD-400 3B converted to CTranslate2
# ---------------------------------------------------------------

model_name = "santhosh/madlad400-3b-ct2"
model_path = snapshot_download(model_name)

tokenizer = SentencePieceProcessor()
tokenizer.load(f"{model_path}/sentencepiece.model")
translator = ctranslate2.Translator(model_path)
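# Scan the first 460 vocabulary entries, where MADLAD keeps its <2xx>
# target-language tokens (e.g. <2en>, <2fr>), and strip the markup to
# collect the bare language codes for the dropdown.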
tokens = [tokenizer.decode(i) for i in range(460)]
lang_codes = [token[2:-1] for token in tokens if token.startswith("<2")]


@spaces.GPU(duration=240)
def translate(input_text, target_language):
    # MADLAD expects the target language as a <2xx> token prefixed to the input.
    input_tokens = tokenizer.encode(f"<2{target_language}> {input_text}", out_type=str)
    results = translator.translate_batch(
        [input_tokens],
        batch_type="tokens",
        beam_size=1,  # greedy decoding keeps latency low
        no_repeat_ngram_size=1,
    )
    translated_sentence = tokenizer.decode(results[0].hypotheses[0])
    return translated_sentence

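# Illustrative usage (exact output depends on the model):
#   translate("good morning", "fr")  ->  a French rendering such as "bonjour"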


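# ---------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------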
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """
            <div style="text-align: left;">
            <a href='https://huggingface.co/PhuongPhan'><img style='display: inline-block; margin: 0; padding: 0;' src='https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg' alt='Follow me on HF'></a>
            <a href='https://huggingface.co/Chunte'><img style='display: inline-block; margin: 0; padding: 0;' src='https://img.shields.io/badge/GitHub%20Pages-121013?logo=github&logoColor=white' alt='GitHub Pages'></a>
            </div>
            """)

        gr.Markdown("<h1 style='text-align: center;'>🎀 Speech to Text & Translation πŸ—£οΈ</h1>")

        gr.HTML(
            "<p style='text-align: center'>"
            "🐀 <a href='https://huggingface.co/openai/whisper-small' target='_blank'>OpenAI Whisper</a> | "
            "πŸ§‘β€πŸ’» <a href='https://huggingface.co/google/madlad400-3b-mt' target='_blank'>Google Madlad</a>"
            "</p>")

        gr.Markdown("<p style='text-align: center;'><i>Upload an audio file or use your microphone to transcribe speech, then translate it into different languages.</i></p>")

        with gr.Row():
            # First interface: transcription
            gr.Markdown("## πŸŽ™οΈ Transcribe Audio")
            gr.Markdown("---")
            audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")
            transcribe_button = gr.Button("Transcribe")
            transcribed_output = gr.Textbox(label="Transcribed Text")
            transcribe_button.click(transcribe, inputs=audio_input, outputs=transcribed_output)

        with gr.Row():
            # Second interface: translation
            gr.Markdown("## 🌐 Translate Text 🌐")
            gr.Markdown("---")
            lang_dropdown = gr.Dropdown(lang_codes, value="en", label="Target Language")
            translate_button = gr.Button("Translate")
            translated_output = gr.Textbox(label="Translated Text")
            translate_button.click(translate, inputs=[transcribed_output, lang_dropdown], outputs=translated_output)

        gr.Markdown("---")
        with gr.Accordion("See Details", open=False):

            gr.Markdown("---")
            gr.Markdown('''

            ## Description πŸ“

            > Uses the OpenAI Whisper Small model to transcribe audio files into text, and the Google MADLAD model to translate the transcribed text into multiple languages.
            > Enables users to convert spoken words into written text.
            > Supports various use cases, including transcription of audio files, detection of phrases, speech-to-text generation, and translation of text.

            ## How it Works 🫢

            - Upload an audio file or record a new one directly in the app.
            - Transcribe the audio into text; the result can be copied for further use.
            - Optionally, translate the transcribed text into other languages.

            ## Usage πŸ€—

            1. Transcribe audio files for note-taking, research, or content creation
            2. Detect phrases or keywords in audio recordings for data analysis or market research
            3. Generate text from speech for speech-to-text applications, such as subtitles, closed captions, or voice assistants
            4. Use the app for language learning, by transcribing audio files in a foreign language and practicing pronunciation
            5. Translate the transcribed text into multiple languages for global communication

            ## Disclaimer πŸ™…β€β™‚οΈ

            > This app is for personal use only and should not be used for commercial purposes.
            > The Whisper Small and MADLAD models are pre-trained and may not always produce accurate results. ''')

demo.queue(max_size=20)  # allow up to 20 queued requests
demo.launch()