import os
from time import time

import requests
import gradio as gr

from languages import LANGUAGES

GLADIA_API_KEY = os.environ.get("GLADIA_API_KEY")

headers = {
    "accept": "application/json",
    "x-gladia-key": GLADIA_API_KEY,
}

ACCEPTED_LANGUAGE_BEHAVIOUR = [
    "manual",
    "automatic single language",
    "automatic multiple languages",
]


def transcribe(
    audio: str = None,
    language_behaviour: str = ACCEPTED_LANGUAGE_BEHAVIOUR[2],
    language: str = "english",
) -> dict:
    """
    Transcribe audio to text using the Gladia API.

    Sends the given audio file to the API and returns the transcribed text.
    Get your API key at gladia.io!

    Parameters:
        audio (str): The path to the audio file to transcribe.
        language_behaviour (str): Determines how language detection is performed.
            Must be one of ["manual", "automatic single language",
            "automatic multiple languages"].
            If "manual", the language field must be provided and the API will
            transcribe the audio in the given language.
            If "automatic single language", the language of the audio is
            detected automatically by the API and the whole transcription is
            forced into that single language.
            If "automatic multiple languages", the language is detected
            automatically for each sentence, allowing code-switching across
            97 languages.
        language (str): The language of the audio file. Ignored if
            language_behaviour is set to one of the "automatic*" modes.

    Returns:
        dict: A dictionary containing the transcribed text and other metadata
        about the transcription process. If an error occurs, a string with an
        error message is returned instead.
    """
    files = {
        "language_behaviour": (None, language_behaviour),
    }

    if audio:
        files["audio"] = (audio, open(audio, "rb"), "audio/wav")
    else:
        return "Please provide an audio file to transcribe."

    # The language field is only needed in manual mode; the API ignores it
    # for the automatic* behaviours anyway.
    if language_behaviour == "manual":
        files["language"] = (None, language)

    start_transfer = time()
    response = requests.post(
        "https://api.gladia.io/audio/text/audio-transcription/",
        headers=headers,
        files=files,
    )
    end_transfer = time()

    if response.status_code != 200:
        print(response.content, response.status_code)
        return "Sorry, an error occurred with your request :/"

    # The API returns two outputs: "prediction" and "prediction_raw".
    # prediction_raw has more details about the processing and other
    # debugging elements you might be interested in.
    output = response.json()["prediction_raw"]

    output["metadata"]["client_total_execution_time"] = end_transfer - start_transfer
    output["metadata"]["data_transfer_time"] = (
        output["metadata"]["client_total_execution_time"]
        - output["metadata"]["total_transcription_time"]
    )
    output["metadata"]["api_server_transcription_time"] = output["metadata"][
        "total_transcription_time"
    ]
    del output["metadata"]["original_mediainfo"]

    return output


iface = gr.Interface(
    title="Gladia.io fast audio transcription",
    description="""Gladia.io Whisper large-v2 fast audio transcription API is able to perform fast audio transcriptions for any audio / video.
For more details and a benchmark run on multiple Speech-To-Text providers, please visit [our post](https://medium.com/@gladia.io/gladia-alpha-launch-redefining-what-s-possible-with-speech-to-text-ai-686dd4312a86) on Medium.

You are more than welcome to join our [Slack](https://gladia-io.slack.com) to discuss with us, and don't forget to get your own API key on [Gladia.io](https://gladia.io/) during the free alpha!
""",
    fn=transcribe,
    inputs=[
        gr.Audio(label="Audio file to transcribe", source="upload", type="filepath"),
        gr.Dropdown(
            label="""Language transcription behaviour:
If "manual", the language field must be provided and the API will transcribe the audio in the given language.
If "automatic single language", the language of the audio will be automatically detected by the API but the transcription will be forced into a single language.
If "automatic multiple languages", the language of the audio will be automatically detected by the API for each sentence, allowing code-switching across 97 languages.
""",
            choices=ACCEPTED_LANGUAGE_BEHAVIOUR,
            value=ACCEPTED_LANGUAGE_BEHAVIOUR[1],
        ),
        gr.Dropdown(
            choices=sorted(LANGUAGES.keys()),
            label="Language (only used if language behaviour is set to manual)",
            value="english",
        ),
    ],
    outputs="json",
    examples=[
        ["examples/good.will.hunting.wav", ACCEPTED_LANGUAGE_BEHAVIOUR[1], "english"],
        ["examples/wolf.of.wall.street.wav", ACCEPTED_LANGUAGE_BEHAVIOUR[1], "english"],
    ],
)

iface.queue()
iface.launch()
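
# A minimal sketch of calling transcribe() directly, without going through the
# Gradio UI. This assumes GLADIA_API_KEY is set in the environment and that the
# bundled example file exists locally; the exact keys available in the returned
# dictionary come from the Gladia API response and may differ. Kept commented
# out so the Space's behaviour is unchanged.
#
#     result = transcribe(
#         audio="examples/good.will.hunting.wav",
#         language_behaviour="manual",
#         language="english",
#     )
#     # client_total_execution_time is added by transcribe() itself
#     print(result["metadata"]["client_total_execution_time"])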