import os
from time import time

import gradio as gr
import requests

from languages import LANGUAGES

GLADIA_API_KEY = os.environ.get("GLADIA_API_KEY")

headers = {
    "accept": "application/json",
    "x-gladia-key": GLADIA_API_KEY,
}
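# NOTE: GLADIA_API_KEY is expected to be set in the environment (e.g. as a
# Space secret); without it the requests below are unauthenticated and the
# API will reject them.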
ACCEPTED_LANGUAGE_BEHAVIOUR = [
    "manual",
    "automatic single language",
    "automatic multiple languages",
]
def transcribe(
    audio_url: str = None,
    audio: str = None,
    video: str = None,
    language_behaviour: str = ACCEPTED_LANGUAGE_BEHAVIOUR[2],
    language: str = "english",
) -> dict:
    """
    Transcribe audio to text using the Gladia API.

    Sends a request to the API with the given audio file or audio URL and
    returns the transcription. Find your API key at gladia.io.
    See the commented usage sketch after this function for a minimal call.

    Parameters:
        audio_url (str): The URL of the audio file to transcribe. Ignored if a
            local audio or video file is provided.
        audio (str): The path to the audio file to transcribe.
        video (str): The path to the video file. If provided, its content is
            sent in the audio field and takes priority over the audio file.
        language_behaviour (str): Determines how language detection is performed.
            Must be one of [
                "manual",
                "automatic single language",
                "automatic multiple languages"
            ]
            If "manual", the language field must be provided and the API will
            transcribe the audio in the given language.
            If "automatic single language", the language of the audio is
            automatically detected by the API and the transcription is forced
            to a single language.
            If "automatic multiple languages", the language is automatically
            detected by the API for each sentence, allowing code-switching
            across 97 languages.
        language (str): The language of the audio file. Ignored if
            language_behaviour is set to one of the "automatic" modes.

    Returns:
        dict: A dictionary containing the transcribed text and other metadata
            about the transcription process. If an error occurs, the function
            returns a string with an error message instead.
    """
    files = {
        "language_behaviour": (None, language_behaviour),
    }
    # if a video file is provided, its content is sent in the audio field;
    # priority is given to the video, then the audio file, then the URL
    if video:
        audio = video
    if audio:
        # read the file so the handle is closed before the request is sent
        with open(audio, "rb") as audio_file:
            files["audio"] = (audio, audio_file.read(), "audio/wav")
    else:
        files["audio_url"] = (None, audio_url)
    # only send the language field for the "manual" behaviour; with the
    # "automatic*" behaviours the API ignores it anyway
    if language_behaviour == "manual":
        files["language"] = (None, language)
    start_transfer = time()
    response = requests.post(
        "https://api.gladia.io/audio/text/audio-transcription/",
        headers=headers,
        files=files,
    )
    end_transfer = time()

    if response.status_code != 200:
        print(response.content, response.status_code)
        return "Sorry, an error occurred with your request :/"

    # the API returns 2 outputs: prediction and prediction_raw;
    # prediction_raw has more details about the processing and other
    # debugging details you might be interested in
    output = response.json()["prediction_raw"]
    output["metadata"]["client_total_execution_time"] = end_transfer - start_transfer
    output["metadata"]["data_transfer_time"] = (
        output["metadata"]["client_total_execution_time"]
        - output["metadata"]["total_transcription_time"]
    )
    output["metadata"]["api_server_transcription_time"] = output["metadata"]["total_transcription_time"]
    del output["metadata"]["original_mediainfo"]
    return output
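# Minimal usage sketch, kept as a comment so importing this module only builds
# and launches the demo below. "sample.wav" is a hypothetical local file, and
# GLADIA_API_KEY must be set for the request to succeed:
#
#     result = transcribe(audio="sample.wav", language_behaviour="manual", language="english")
#     if isinstance(result, dict):
#         print(result["metadata"]["api_server_transcription_time"])
#     else:
#         print(result)  # error message returned by transcribe()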
iface = gr.Interface(
    title="Gladia.io fast audio transcription",
    description="""The Gladia.io Whisper large-v2 fast audio transcription API
    can perform fast transcription of any audio/video file or URL.<br/><br/>
    However, for faster performance it is preferable to provide<br/>
    16 kHz WAV with 16-bit encoding (pcm_u16be) to avoid extra conversion time.<br/>
    The "automatic single language" language detection behaviour may also<br/>
    slow down the process slightly (by a few milliseconds).
    <br/>
    Here is a benchmark run on multiple speech-to-text providers:
    ![Benchmarks](https://storage.gra.cloud.ovh.net/v1/AUTH_90df0bdc74f749ce86783e6550b1e4aa/public-files/benchmark.png)<br/>
    Join our [Slack](https://gladia-io.slack.com) to discuss with us.<br/><br/>
    Get your own API key on [Gladia.io](https://gladia.io/) during the free alpha.
    """,
    fn=transcribe,
    inputs=[
        gr.Textbox(
            lines=1,
            label="Audio/Video URL to transcribe",
        ),
        gr.Audio(label="or Audio file to transcribe", source="upload", type="filepath"),
        gr.Video(label="or Video file to transcribe", source="upload", type="filepath"),
        gr.Dropdown(
            label="""Language transcription behaviour:\n
            If "manual", the language field must be provided and the API will transcribe the audio in the given language.
            If "automatic single language", the language of the audio is automatically detected by the API
            and the transcription is forced to a single language.
            If "automatic multiple languages", the language is automatically detected by the API for
            each sentence, allowing code-switching across 97 languages.
            """,
            choices=ACCEPTED_LANGUAGE_BEHAVIOUR,
            value=ACCEPTED_LANGUAGE_BEHAVIOUR[1],
        ),
        gr.Dropdown(
            choices=sorted(LANGUAGES),
            label="Language (only if language behaviour is set to manual)",
            value="english",
        ),
    ],
    outputs="json",
)

iface.queue()
iface.launch()
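# The launched demo can also be queried programmatically with gradio_client
# (a sketch under assumptions: the Space URL below is a placeholder, the
# default "/predict" endpoint is exposed, and the installed gradio_client
# matches this Gradio version). Unused file inputs are passed as None:
#
#     from gradio_client import Client
#
#     client = Client("https://<your-space-url>")
#     result = client.predict(
#         "https://example.com/sample.wav",  # audio_url
#         None,                              # audio file
#         None,                              # video file
#         "automatic single language",       # language behaviour
#         "english",                         # language (ignored here)
#         api_name="/predict",
#     )
#     print(result)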