import os
from time import time

import gradio as gr
import requests

from languages import LANGUAGES

# API key is read from the environment; requests will fail (non-200) if unset.
GLADIA_API_KEY = os.environ.get("GLADIA_API_KEY")

headers = {
    "accept": "application/json",
    "x-gladia-key": GLADIA_API_KEY,
}

ACCEPTED_LANGUAGE_BEHAVIOUR = [
    "manual",
    "automatic single language",
    "automatic multiple languages",
]


def _format_transcription(segments: list) -> str:
    """Render diarized segments as HTML, grouping consecutive text per speaker.

    Parameters:
        segments (list): dicts with at least "speaker" and "transcription"
            keys, as returned in the Gladia API "prediction" field.

    Returns:
        str: HTML string starting a new "Speaker:N:" paragraph each time an
        identified speaker changes; "unknown" segments are appended to the
        current speaker's text.
    """
    output = ""
    current_speaker = ""
    for segment in segments:
        speaker = segment["speaker"]
        if speaker != current_speaker and speaker != "unknown":
            current_speaker = speaker
            # NOTE(review): the separator literal was garbled in the original
            # source; "<br/><br/>" matches the "html" output component — confirm.
            output += "<br/><br/>Speaker:" + str(speaker) + ": " + segment["transcription"]
        else:
            output += " " + segment["transcription"]
    return output


def transcribe(audio: str = None):
    """Transcribe audio to text using the Gladia API.

    Sends the given audio file to the Gladia transcription endpoint with
    diarization enabled and formats the result for display.

    Get your API key at gladia.io!

    Parameters:
        audio (str): Path to the audio file to transcribe.

    Returns:
        tuple: (HTML-formatted transcription, raw prediction metadata).
        On error, returns (error message, empty dict) so that both declared
        Gradio outputs ("html" and "json") are always fed — the original
        returned a bare string on error, mismatching the output count.
    """
    DEFAULT_MANUAL_LANGUAGE = "english"
    language_behaviour = ACCEPTED_LANGUAGE_BEHAVIOUR[2]

    files = {
        "language_behaviour": (None, language_behaviour),
        "noise_reduction": (None, "false"),
        "output_format": (None, "json"),
        "toggle_diarization": (None, "true"),
    }

    # The language field is only honoured in "manual" mode; in the
    # automatic modes the API ignores it anyway.
    if language_behaviour == "manual":
        files["language"] = (None, DEFAULT_MANUAL_LANGUAGE)

    # Without an audio part the API call cannot succeed — fail fast.
    if not audio:
        return "Please provide an audio file to transcribe.", {}

    # Context manager guarantees the file handle is closed
    # (the original leaked it).
    with open(audio, "rb") as audio_file:
        files["audio"] = (audio, audio_file, "audio/wav")
        response = requests.post(
            "https://api.gladia.io/audio/text/audio-transcription/",
            headers=headers,
            files=files,
            timeout=600,  # long recordings take a while; never hang forever
        )

    if response.status_code != 200:
        print(response.content, response.status_code)
        return "Sorry, an error occurred with your request :/", {}

    # The API returns two fields: "prediction" (the diarized segments) and
    # "prediction_raw" (more detailed processing/debugging information).
    payload = response.json()
    return _format_transcription(payload["prediction"]), payload["prediction_raw"]


iface = gr.Interface(
    title="Gladia.io fast audio transcription",
    description="""Gladia.io Whisper large-v2 fast audio transcription API is able to perform fast audio transcriptions for any audio / video (around 10s per hour) .
For more details and a benchmark ran on multiple Speech-To-Text providers, please visit [our post](https://medium.com/@gladia.io/gladia-alpha-launch-redefining-what-s-possible-with-speech-to-text-ai-686dd4312a86) on Medium.

You are more than welcome to join us on [Slack](https://gladia-io.slack.com) and don't forget to get your own API key on [Gladia.io](https://gladia.io/) during the free alpha ! """,
    fn=transcribe,
    inputs=[
        gr.Audio(label="Audio file", source="upload", type="filepath"),
    ],
    outputs=["html", "json"],
    examples=[
        ["examples/good.will.hunting.wav"],
        ["examples/wolf.of.wall.street.wav"],
    ],
)

iface.queue()
iface.launch()