Spaces:

abrar-adnan
/

speech-analyzer

Running

App Files Community

speech-analyzer / app.py

abrar-adnan

added audio transcription

197af76 over 1 year ago

raw

history blame

6.44 kB

	import gradio as gr
	import os
	import cv2
	import face_recognition
	from fastai.vision.all import load_learner
	import time
	import base64
	from deepface import DeepFace
	import torchaudio
	import moviepy.editor as mp
	from transformers import WhisperProcessor, WhisperForConditionalGeneration

	# import pathlib
	# temp = pathlib.PosixPath
	# pathlib.PosixPath = pathlib.WindowsPath

	# Face-detector backend names passed to DeepFace via ``detector_backend``.
	# The list is indexed by position elsewhere in this file, so the order matters.
	backends = [
	    'opencv',      # index 0
	    'ssd',         # index 1
	    'dlib',        # index 2
	    'mtcnn',       # index 3
	    'retinaface',  # index 4
	    'mediapipe',   # index 5
	]

	_WHISPER_CHECKPOINT = "openai/whisper-tiny"
	# Whisper checkpoints expect 16 kHz mono audio.
	_WHISPER_SAMPLE_RATE = 16000
	# Filled on first use so the checkpoint is not re-loaded for every request.
	_whisper_cache = {}


	def _load_whisper():
	    """Return the cached ``(processor, model)`` pair, loading it on first use."""
	    if not _whisper_cache:
	        processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
	        whisper_model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
	        # Do not force language/task tokens; let the model predict them.
	        whisper_model.config.forced_decoder_ids = None
	        _whisper_cache["pair"] = (processor, whisper_model)
	    return _whisper_cache["pair"]


	def getTranscription(path):
	    """Transcribe the audio track of the video at *path* with Whisper.

	    The audio track is written to ``audio.wav`` in the working directory,
	    resampled to 16 kHz, and its first channel is fed to the
	    ``openai/whisper-tiny`` checkpoint.

	    Args:
	        path: Path of a video file readable by MoviePy.

	    Returns:
	        The decoded transcription text, or an empty string when the video
	        has no audio track.
	    """
	    clip = mp.VideoFileClip(path)
	    try:
	        if clip.audio is None:
	            # Silent video: there is nothing to transcribe.
	            return ""
	        clip.audio.write_audiofile(r"audio.wav")
	    finally:
	        # Release the reader held by the clip even if the export fails.
	        clip.close()

	    waveform, sample_rate = torchaudio.load("audio.wav")
	    resampler = torchaudio.transforms.Resample(sample_rate, _WHISPER_SAMPLE_RATE)
	    # Keep only the first channel; the result is a 1-D tensor of samples.
	    waveform = resampler(waveform)[0]

	    processor, whisper_model = _load_whisper()

	    # Passing sampling_rate lets the feature extractor verify the input rate.
	    # NOTE(review): a single generate() call on these features presumably only
	    # covers Whisper's fixed input window, not arbitrarily long audio - confirm.
	    input_features = processor(
	        waveform.numpy(),
	        sampling_rate=_WHISPER_SAMPLE_RATE,
	        return_tensors="pt",
	    ).input_features
	    predicted_ids = whisper_model.generate(input_features)

	    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

	    return transcription[0]


	# Exported fastai learner used by video_processing to label each face crop
	# as 'on_camera' or 'off_camera'; loaded once when the module is imported.
	_gaze_model_path = "gaze-recognizer-v3.pkl"
	model = load_learner(_gaze_model_path)

	def video_processing(video_file, encoded_video):
	    """Estimate how often the detected faces in a video look at the camera.

	    The video is transcribed (the transcription is printed), then sampled
	    frame by frame. Every face found in a sampled frame is passed to
	    DeepFace for emotion scores and to the gaze model for an
	    'on_camera' / 'off_camera' label.

	    Args:
	        video_file: Path of the video to analyse. Ignored when
	            *encoded_video* is non-empty.
	        encoded_video: Base64-encoded video bytes. When non-empty, they are
	            decoded into ``temp_video.mp4`` and that file is analysed instead.

	    Returns:
	        ``str`` of the on-camera percentage (``on_camera / total * 100``), or
	        a 'no face detected ...' message when no face was classified.
	    """
	    temp_path = "temp_video.mp4"
	    # Only the last frame of every 72-frame group is analysed
	    # (presumably ~3 seconds at 24 fps - confirm against the input videos).
	    frames_per_sample = 24 * 3
	    # (dictionary key in the DeepFace result, label used in the printed summary)
	    emotion_labels = (
	        ('angry', 'anger'),
	        ('disgust', 'disgust'),
	        ('fear', 'fear'),
	        ('happy', 'happy'),
	        ('sad', 'sad'),
	        ('surprise', 'surprise'),
	        ('neutral', 'neutral'),
	    )
	    emotion_totals = {key: 0 for key, _ in emotion_labels}
	    emotion_count = 0
	    on_camera = 0
	    off_camera = 0
	    total = 0

	    video_capture = None
	    try:
	        # Truthiness check also covers None, which b64decode would reject.
	        if encoded_video:
	            decoded_file_data = base64.b64decode(encoded_video)
	            with open(temp_path, "wb") as f:
	                f.write(decoded_file_data)
	            video_file = temp_path

	        start_time = time.time()

	        transcription = getTranscription(video_file)
	        print(transcription)

	        video_capture = cv2.VideoCapture(video_file)

	        while True:
	            # Skip ahead; `frame` ends up holding the last frame that was read.
	            for _ in range(frames_per_sample):
	                ret, frame = video_capture.read()
	                if not ret:
	                    break

	            # If there are no more frames, break out of the loop
	            if not ret:
	                break

	            # Face detection runs on a grayscale copy of the frame.
	            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
	            face_locations = face_recognition.face_locations(gray)

	            for top, right, bottom, left in face_locations:
	                face_image = gray[top:bottom, left:right]
	                color_image = frame[top:bottom, left:right]
	                if face_image.size == 0:
	                    # Degenerate box: cv2.resize would fail on an empty crop.
	                    continue

	                # Resize the face image to the size fed to the gaze model
	                resized_face_image = cv2.resize(face_image, (128, 128))

	                try:
	                    emotion = DeepFace.analyze(
	                        color_image,
	                        actions=['emotion'],
	                        detector_backend=backends[2],
	                        enforce_detection=False,
	                    )  # 2,3, 4 works
	                except Exception as e:
	                    # Best effort: a failed emotion analysis must not stop the
	                    # gaze classification, and must not be counted.
	                    print(f"Emotion analysis failed for a face: {e}")
	                else:
	                    scores = emotion[0]['emotion']
	                    print(scores)
	                    for key in emotion_totals:
	                        emotion_totals[key] += scores[key]
	                    emotion_count += 1

	                # Predict the class of the resized face image using the model
	                result = model.predict(resized_face_image)
	                print(result[0])
	                if result[0] == 'on_camera':
	                    on_camera += 1
	                elif result[0] == 'off_camera':
	                    off_camera += 1
	                total += 1
	    finally:
	        # Always release the capture and remove the temporary upload, even
	        # when one of the analysis steps raises.
	        if video_capture is not None:
	            video_capture.release()
	        cv2.destroyAllWindows()
	        if os.path.exists(temp_path):
	            os.remove(temp_path)

	    try:
	        gaze_percentage = on_camera / total * 100
	    except ZeroDivisionError as e:
	        print(f"An error occurred while processing the video: {e}")
	        gaze_percentage = f'no face detected Total = {total},on_camera = {on_camera},off_camera = {off_camera}'
	    print(f'Total = {total},on_camera = {on_camera},off_camera = {off_camera}')

	    end_time = time.time()
	    print(f'Time taken: {end_time-start_time}')
	    print(gaze_percentage)

	    # Average each emotion score over the faces that were successfully
	    # analysed; with no analysed face the averages stay at 0.
	    if emotion_count:
	        emotion = {key: value / emotion_count for key, value in emotion_totals.items()}
	    else:
	        emotion = dict(emotion_totals)

	    for key, label in emotion_labels:
	        print(f'total {label} percentage = {emotion[key]}')
	    return str(gaze_percentage)


	# Gradio UI: a video upload plus a text box (base64-encoded video) in,
	# the text returned by video_processing out.
	demo = gr.Interface(
	    fn=video_processing,
	    inputs=["video", "text"],
	    outputs="text",
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()