File size: 3,208 Bytes
1c82b7f
1210c2f
1c82b7f
09a1240
1210c2f
 
09a1240
 
ce955af
09a1240
1c82b7f
ce955af
 
1c82b7f
 
 
ce955af
 
 
1c82b7f
 
ce955af
 
 
 
 
 
 
dfb8f1c
ce955af
 
 
 
 
 
 
1210c2f
 
 
ce955af
 
1c82b7f
ce955af
1c82b7f
 
ce955af
 
 
 
 
 
 
 
1210c2f
ce955af
 
1c82b7f
ce955af
1c82b7f
ce955af
1c82b7f
ce955af
 
1c82b7f
 
 
ce955af
 
 
 
 
 
 
 
a463d9d
ce955af
a463d9d
ce955af
a463d9d
 
 
ce955af
09a1240
1c82b7f
 
 
ce955af
 
98a3806
dfb8f1c
 
abca8c5
 
ce955af
 
1c82b7f
1210c2f
1c82b7f
ce955af
dfb8f1c
1210c2f
 
dfb8f1c
1c82b7f
ce955af
1c82b7f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
from time import time

import gradio as gr
import requests

from languages import LANGUAGES

# API key for Gladia, taken from the environment (None when the variable is unset).
GLADIA_API_KEY = os.getenv("GLADIA_API_KEY")

# HTTP headers sent with every request to the Gladia API.
headers = {"accept": "application/json", "x-gladia-key": GLADIA_API_KEY}

# Language-detection modes. `transcribe` selects its mode from this list by
# position, so the order of the entries matters.
ACCEPTED_LANGUAGE_BEHAVIOUR = [
    "manual",  # index 0
    "automatic single language",  # index 1
    "automatic multiple languages",  # index 2
]


def transcribe(
    audio: str = None,
) -> str:
    """
    Transcribe an audio file to text using the Gladia API.

    The file is uploaded as multipart form data, together with the language
    detection mode, to the audio-transcription endpoint. The transcriptions of
    the returned segments are then joined into a single string.
    Get your api key at gladia.io !

    Parameters:
    audio (str): The path to the audio file to transcribe.

    Returns:
    str: The transcribed text, with segments separated by single spaces. If no
    audio is given, or the API answers with a status other than 200, a
    human-readable error message is returned instead.

    Raises:
    OSError: If the audio file cannot be opened or read. Exceptions raised by
    `requests.post` (e.g. network failures) are not caught here either.
    """
    ERROR_MESSAGE = "Sorry, an error occured with your request :/"
    DEFAULT_MANUAL_LANGUAGE = "english"

    # Nothing to upload: answer right away instead of sending a request that
    # carries no audio part.
    if not audio:
        return ERROR_MESSAGE

    language_behaviour = ACCEPTED_LANGUAGE_BEHAVIOUR[2]

    # Read the file inside a context manager so the handle is always closed;
    # requests accepts raw bytes as the content of a multipart file field.
    with open(audio, "rb") as audio_file:
        audio_bytes = audio_file.read()

    # Fields with a `None` filename are sent as plain form values.
    files = {
        "language_behaviour": (None, language_behaviour),
        "audio": (audio, audio_bytes, "audio/wav"),
    }

    # The language field is only sent in manual mode. With the mode selected
    # above this branch is not taken; it is kept so that switching
    # `language_behaviour` to "manual" works without further changes.
    if language_behaviour == "manual":
        files["language"] = (None, DEFAULT_MANUAL_LANGUAGE)

    response = requests.post(
        "https://api.gladia.io/audio/text/audio-transcription/",
        headers=headers,
        files=files,
    )

    if response.status_code != 200:
        # Keep the raw API answer in the logs to help debugging.
        print(response.content, response.status_code)

        return ERROR_MESSAGE

    # we have 2 outputs:
    # prediction and prediction_raw
    # prediction_raw has more details about the processing
    # and other debugging detailed element you might be
    # interested in
    segments = response.json()["prediction"]

    # Each segment is expected to carry its text under "transcription".
    return " ".join(segment["transcription"] for segment in segments)


# Markdown/HTML blurb passed to the interface as its description.
_DESCRIPTION = """Gladia.io Whisper large-v2 fast audio transcription API
    is able to perform fast audio transcriptions for any audio / video (around 10s per hour) .<br/>For more details and a benchmark ran on multiple Speech-To-Text providers, please visit
    [our post](https://medium.com/@gladia.io/gladia-alpha-launch-redefining-what-s-possible-with-speech-to-text-ai-686dd4312a86) on Medium.
    <br/><br/>
    You are more than welcome to join us on [Slack](https://gladia-io.slack.com)
    and don't forget to get your own API key on [Gladia.io](https://gladia.io/) during the free alpha !
    """

# Sample recordings offered as example inputs: one row per example, one value
# per input component.
_EXAMPLES = [
    ["examples/good.will.hunting.wav"],
    ["examples/wolf.of.wall.street.wav"],
]

# Single upload widget; configured with type="filepath" so the callback
# receives a path to the uploaded file.
_audio_input = gr.Audio(label="Audio file", source="upload", type="filepath")

# Wire the `transcribe` callback to the upload widget and a JSON output.
iface = gr.Interface(
    fn=transcribe,
    inputs=[_audio_input],
    outputs="json",
    title="Gladia.io fast audio transcription",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
)

# Enable the request queue, then start the app.
iface.queue()
iface.launch()