Aryan Wadhawan committed on
Commit e25c52f
1 Parent(s): f561f73
.history/app_20230718132721.py ADDED
File without changes
.history/app_20230718133117.py ADDED
@@ -0,0 +1,24 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+ waveform, sample_rate = librosa.load('harvard.wav', sr=16000)  # resample to 16 kHz
+
+ input_values = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_values
+
+ with torch.no_grad():
+     logits = model(input_values).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)
+
+ def showTranscription(transcription):
+     return transcription
+
+ iface = gr.Interface(fn=showTranscription, inputs="text", outputs="text")
+ iface.launch()
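Note: this first snapshot already contains the whole pipeline the Space is built around: load audio at 16 kHz (the rate this XLSR checkpoint expects), run the CTC model, take the argmax over the logits, and `batch_decode` the predicted ids into a phoneme string. A minimal self-contained sketch of that flow, substituting a synthetic tone for the `harvard.wav` file, which is not part of this commit (the tone, and everything except the checkpoint name, is an illustration rather than the committed code):

```python
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Same checkpoint as in the snapshot: wav2vec2 fine-tuned for phoneme-level CTC.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")

# One second of a 440 Hz tone stands in for real speech, just to make the
# sketch self-contained; real input should be 16 kHz mono float audio.
sample_rate = 16000
t = np.linspace(0, 1, sample_rate, endpoint=False)
waveform = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits  # shape: (batch, frames, vocab)

predicted_ids = torch.argmax(logits, dim=-1)
# batch_decode collapses repeated CTC ids and returns one phoneme string per item.
print(processor.batch_decode(predicted_ids))
```

`batch_decode` returns a list with one string per batch item, which matters for the later snapshots that wire its output into a Gradio text component.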
.history/app_20230718133128.py ADDED
@@ -0,0 +1,24 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+ waveform, sample_rate = librosa.load('harvard.wav', sr=16000)  # resample to 16 kHz
+
+ input_values = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_values
+
+ with torch.no_grad():
+     logits = model(input_values).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)
+
+ def showTranscription(transcription):
+     return transcription
+
+ iface = gr.Interface(fn=showTranscription, inputs="text", outputs="text")
+ iface.launch()
.history/app_20230718133340.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+ import base64
+
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+ waveform, sample_rate = librosa.load(
+     "harvard.wav", sr=16000
+ )  # resample to 16 kHz
+
+ input_values = processor(
+     waveform, sampling_rate=sample_rate, return_tensors="pt"
+ ).input_values
+
+ with torch.no_grad():
+     logits = model(input_values).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)
+
+
+ def showTranscription(transcription):
+     return transcription
+
+
+ iface = gr.Interface(fn=showTranscription, inputs="text", outputs="text")
+ iface.launch()
.history/app_20230718133558.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+ import base64
+
+
+ def lark(audioAsB64):
+     with open("audio.wav", "wb") as preWaveform:
+         preWaveform.write(base64.b64decode(audioAsB64))
+
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+ waveform, sample_rate = librosa.load(
+     "harvard.wav", sr=16000
+ )  # resample to 16 kHz
+
+ input_values = processor(
+     waveform, sampling_rate=sample_rate, return_tensors="pt"
+ ).input_values
+
+ with torch.no_grad():
+     logits = model(input_values).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)
+
+
+ iface = gr.Interface(fn=lark, inputs="text", outputs="text")
+ iface.launch()
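Note: this snapshot introduces `lark`, which turns a base64-encoded upload back into a WAV file on disk via `base64.b64decode`. A standalone sketch of the full round trip, using only the standard library and a hypothetical `sample.wav` (both file names here are placeholders):

```python
import base64

# Client side (hypothetical): read a WAV file and encode its bytes as base64 text.
with open("sample.wav", "rb") as f:
    audio_as_b64 = base64.b64encode(f.read()).decode("ascii")

# Server side, mirroring lark() above: decode the text payload back into
# raw bytes and write them out as a WAV file.
with open("audio.wav", "wb") as pre_waveform:
    pre_waveform.write(base64.b64decode(audio_as_b64))
```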
.history/app_20230718133701.py ADDED
@@ -0,0 +1,36 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+ import base64
+
+
+ def lark(audioAsB64):
+     # convert b64 audio to wav
+     with open("audio.wav", "wb") as preWaveform:
+         preWaveform.write(base64.b64decode(audioAsB64))
+
+     # processing
+     processor = Wav2Vec2Processor.from_pretrained(
+         "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
+     )
+     model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+     waveform, sample_rate = librosa.load(
+         "harvard.wav", sr=16000
+     )  # resample to 16 kHz
+
+     input_values = processor(
+         waveform, sampling_rate=sample_rate, return_tensors="pt"
+     ).input_values
+
+     with torch.no_grad():
+         logits = model(input_values).logits
+
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)
+
+
+ iface = gr.Interface(fn=lark, inputs="text", outputs="text")
+ iface.launch()
.history/app_20230718133728.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+ import base64
+
+
+ def lark(audioAsB64):
+     # convert b64 audio to wav
+     with open("audio.wav", "wb") as preWaveform:
+         preWaveform.write(base64.b64decode(audioAsB64))
+
+     # processing
+     processor = Wav2Vec2Processor.from_pretrained(
+         "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
+     )
+     model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+     waveform, sample_rate = librosa.load(
+         "harvard.wav", sr=16000
+     )  # resample to 16 kHz
+
+     input_values = processor(
+         waveform, sampling_rate=sample_rate, return_tensors="pt"
+     ).input_values
+
+     with torch.no_grad():
+         logits = model(input_values).logits
+
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)
+
+     return transcription
+
+
+ iface = gr.Interface(fn=lark, inputs="text", outputs="text")
+ iface.launch()
.history/app_20230718134339.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import phonemizer
+ import librosa
+ import base64
+
+
+ def lark(audioAsB64):
+     # convert b64 audio to wav
+     with open("audio.wav", "wb") as preWaveform:
+         preWaveform.write(base64.b64decode(audioAsB64))
+
+     # processing
+     processor = Wav2Vec2Processor.from_pretrained(
+         "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
+     )
+     model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+
+     waveform, sample_rate = librosa.load(
+         "harvard.wav", sr=16000
+     )  # resample to 16 kHz
+
+     input_values = processor(
+         waveform, sampling_rate=sample_rate, return_tensors="pt"
+     ).input_values
+
+     with torch.no_grad():
+         logits = model(input_values).logits
+
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)
+
+     return transcription
+
+
+ iface = gr.Interface(fn=lark, inputs="text", outputs="text")
+ iface.launch()
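Note: even in this last snapshot, `lark` writes the decoded upload to `audio.wav` but then transcribes the hard-coded `harvard.wav`, reloads the model on every request, and returns the whole list produced by `batch_decode`. A sketch of the version the snapshots appear to be converging on, with those points addressed (the checkpoint name and Gradio wiring come from the snapshots; the restructuring is an assumption about intent, not the committed code):

```python
import base64

import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the checkpoint once at startup instead of on every request.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")


def lark(audio_as_b64: str) -> str:
    # Decode the base64 payload and persist it as a WAV file.
    with open("audio.wav", "wb") as pre_waveform:
        pre_waveform.write(base64.b64decode(audio_as_b64))

    # Transcribe the file we just wrote, resampled to the 16 kHz the model expects.
    waveform, sample_rate = librosa.load("audio.wav", sr=16000)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    # batch_decode returns a list; hand Gradio the single phoneme string.
    return processor.batch_decode(predicted_ids)[0]


iface = gr.Interface(fn=lark, inputs="text", outputs="text")
iface.launch()
```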
.history/packages_20230718132731.txt ADDED
File without changes
.history/packages_20230718132746.txt ADDED
File without changes
.history/packages_20230718132842.txt ADDED
@@ -0,0 +1 @@
+ espeak
.history/requirements_20230718132726.txt ADDED
File without changes
.history/requirements_20230718132835.txt ADDED
@@ -0,0 +1,4 @@
+ phonemizer
+ librosa
+ transformers
+ torch
.history/requirements_20230718133331.txt ADDED
@@ -0,0 +1,5 @@
+ phonemizer
+ librosa
+ transformers
+ torch
+ base64
.history/requirements_20230718134813.txt ADDED
@@ -0,0 +1,4 @@
+ phonemizer
+ librosa
+ transformers
+ torch
.history/requirements_20230718134828.txt ADDED
@@ -0,0 +1,4 @@
+ phonemizer
+ librosa
+ transformers
+ torch
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "[python]": {
+         "editor.defaultFormatter": "ms-python.black-formatter"
+     },
+     "python.formatting.provider": "none"
+ }
requirements.txt CHANGED
@@ -1,5 +1,4 @@
  phonemizer
  librosa
  transformers
- torch
- base64
+ torch