File size: 3,837 Bytes
1c82b7f
1210c2f
1c82b7f
09a1240
1210c2f
 
09a1240
 
ce955af
09a1240
1c82b7f
ce955af
 
1c82b7f
 
 
ce955af
 
 
1c82b7f
 
ce955af
 
 
 
 
 
 
dfb8f1c
ce955af
 
 
 
 
 
 
1210c2f
 
 
ce955af
ce11ee8
 
ce955af
1c82b7f
ce955af
ce11ee8
 
 
bcf8a6e
1c82b7f
 
ce955af
 
 
 
 
 
 
 
1210c2f
ce955af
 
1c82b7f
ce955af
1c82b7f
ce955af
1c82b7f
ce955af
 
1c82b7f
 
 
ce955af
 
 
 
 
 
 
 
5eb3657
a463d9d
ce955af
a463d9d
5eb3657
a463d9d
5eb3657
 
0a7c28f
5eb3657
 
a463d9d
ce955af
5c867fd
1c82b7f
 
eae71ec
1c82b7f
ce955af
 
5798954
dfb8f1c
 
abca8c5
 
ce955af
 
1c82b7f
1210c2f
1c82b7f
5eb3657
dfb8f1c
1210c2f
 
dfb8f1c
1c82b7f
ce955af
1c82b7f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
from time import time

import gradio as gr
import requests

from languages import LANGUAGES

# Gladia API key, read from the environment; None when the variable is unset.
GLADIA_API_KEY = os.getenv("GLADIA_API_KEY")

# Headers attached to every request sent to the Gladia API.
headers = {"accept": "application/json", "x-gladia-key": GLADIA_API_KEY}

# Values accepted for the "language_behaviour" request field;
# transcribe() selects one of them by index.
ACCEPTED_LANGUAGE_BEHAVIOUR = [
    "manual", "automatic single language", "automatic multiple languages",
]


def _format_segments(segments) -> str:
    """Render diarized transcription segments as a single HTML string.

    A bold "Speaker" header is emitted each time the speaker changes to a
    speaker other than "unknown"; every other segment is appended to the
    running text, preceded by a single space.
    """
    parts = []
    current_speaker = ""
    for segment in segments:
        speaker = segment["speaker"]
        if speaker != current_speaker and speaker != "unknown":
            current_speaker = speaker
            parts.append(
                "<br/><br/><b> Speaker:" + str(speaker) + ":</b> "
                + segment["transcription"]
            )
        else:
            parts.append(" " + segment["transcription"])
    return "".join(parts)


def transcribe(
    audio: str = None,
) -> tuple:
    """
    Transcribe an audio file to text using the Gladia API.

    The file is uploaded as a multipart form to the audio-transcription
    endpoint with diarization enabled (at most two speakers) and automatic
    multiple-language detection.
    Get your api key at gladia.io !

    Parameters:
    audio (str): The path to the audio file to transcribe.

    Returns:
    tuple: A pair ``(html, raw)``. ``html`` is the transcription rendered as
    HTML with one bold header per speaker change; ``raw`` is the
    ``prediction_raw`` entry of the API response. When no audio is given or
    the API does not answer with status 200, ``html`` is an error message
    and ``raw`` is None, so the caller always receives two values.
    """
    # Without a file there is nothing to upload: skip the network round trip.
    if not audio:
        return "Please provide an audio file to transcribe.", None

    DEFAULT_MANUAL_LANGUAGE = "english"

    language_behaviour = ACCEPTED_LANGUAGE_BEHAVIOUR[2]

    # Plain form fields are encoded as (None, value) so that they are sent
    # as regular multipart fields rather than as file uploads.
    files = {
        "language_behaviour": (None, language_behaviour),
        "noise_reduction": (None, "false"),
        'output_format': (None, 'json'),
        'toggle_diarization': (None, 'true'),
        'diarization_max_speakers': (None, '2'),
    }

    # The context manager guarantees the file handle is closed once the
    # upload is over, including when the request raises.
    with open(audio, "rb") as audio_file:
        files["audio"] = (audio, audio_file, "audio/wav")

        # The language field is only sent in manual mode.
        if language_behaviour == "manual":
            files["language"] = (None, DEFAULT_MANUAL_LANGUAGE)

        response = requests.post(
            "https://api.gladia.io/audio/text/audio-transcription/",
            headers=headers,
            files=files,
        )

    if response.status_code != 200:
        print(response.content, response.status_code)

        # Two values, like the success path, so both outputs get a value.
        return "Sorry, an error occured with your request :/", None

    # we have 2 outputs:
    # prediction and prediction_raw
    # prediction_raw has more details about the processing
    # and other debugging detailed element you might be
    # interested in
    payload = response.json()  # parse the body once and reuse it

    return _format_segments(payload["prediction"]), payload["prediction_raw"]



# Web UI: a single uploaded audio file goes in; the formatted HTML transcript
# and the raw JSON prediction returned by `transcribe` come out.
iface = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(label="Audio file", source="upload", type="filepath")],
    outputs=["html", "json"],
    title="Gladia.io fast audio transcription",
    description="""Gladia.io Whisper large-v2 fast audio transcription API
    is able to perform fast audio transcriptions for any audio / video (less than a minute per hour) .<br/>For more details and a benchmark ran on multiple Speech-To-Text providers, please visit
    [our post](https://medium.com/@gladia.io/gladia-alpha-launch-redefining-what-s-possible-with-speech-to-text-ai-686dd4312a86) on Medium.
    <br/><br/>
    You are more than welcome to join us on [Slack](https://gladia-io.slack.com)
    and don't forget to get your own API key on [Gladia.io](https://gladia.io/) during the free alpha !
    """,
    # Sample clips offered in the UI.
    examples=[["examples/good.will.hunting.wav"], ["examples/wolf.of.wall.street.wav"]],
)

# Enable the request queue, then start the app.
iface.queue()
iface.launch()