Spaces:
Sleeping
Sleeping
File size: 3,578 Bytes
63f899c 8eca9ee 63f899c 59d9186 63f899c 8eca9ee 0f1045d bd786ec 7b988f1 c39b894 63f899c 8eca9ee 7b988f1 a6075c0 59d9186 63f899c 59d9186 751c5b7 59d9186 7b988f1 59d9186 63f899c 59d9186 63f899c 8eca9ee 7b988f1 614db49 7b988f1 8eca9ee 7b988f1 a6075c0 614db49 69dd2a2 7b988f1 8eca9ee a6075c0 614db49 a6075c0 7b988f1 a6075c0 7b988f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import os
import shutil
from huggingface_hub import snapshot_download
import gradio as gr
import numpy as np
from PIL import Image
import soundfile as sf
import argparse
import uuid
# Run relative to this file so configs/ and pretrained_models/ paths resolve
# regardless of the caller's working directory.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
from scripts.inference import inference_process

# True when running inside the original shared Space (no GPU budget there).
# Use .get() so a missing SPACE_ID (e.g. running locally, outside HF Spaces)
# does not raise KeyError at import time.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")
if not is_shared_ui:
    # Fetch model weights only on duplicated/private instances; the shared
    # Space never runs inference, so the download would be wasted there.
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
def check_image_square(image_path):
    """Validate that the uploaded face image is square.

    Args:
        image_path: Filesystem path of the image selected in the UI.

    Returns:
        The unchanged ``image_path`` when the image is square.

    Raises:
        gr.Error: Shown to the user when width != height.
    """
    # Context manager releases the file handle promptly: PIL's Image.open is
    # lazy and would otherwise keep the file open after we return.
    with Image.open(image_path) as image:
        if image.width != image.height:
            raise gr.Error("The uploaded image is not square. Please upload a square image.")
    return image_path
def convert_audio_to_wav(audio_path):
    """Ensure the driving audio is a ``.wav`` file.

    Paths already ending in ``.wav`` are returned untouched. Anything else is
    decoded with soundfile and rewritten as a sibling ``.wav`` file whose path
    is returned instead.
    """
    # Fast path: already the format the pipeline expects.
    if audio_path.endswith('.wav'):
        return audio_path
    samples, sample_rate = sf.read(audio_path)
    # Swap the extension: "speech.mp3" -> "speech.wav".
    converted_path = audio_path.rsplit('.', 1)[0] + '.wav'
    sf.write(converted_path, samples, sample_rate)
    return converted_path
def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
    """Run the hallo talking-head pipeline and return the output video path.

    Args:
        source_image: Path to the (square) face image.
        driving_audio: Path to the driving ``.wav`` audio.
        pose_weight, face_weight, lip_weight, face_expand_ratio: Generation
            weights forwarded verbatim to the inference config.
        progress: Gradio progress tracker (mirrors tqdm output in the UI).

    Raises:
        gr.Error: When invoked on the public shared Space.
    """
    # The shared Space has no GPU budget; only duplicated instances may run.
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")
    # Unique file name so concurrent requests never clobber each other.
    video_path = f'output-{uuid.uuid4()}.mp4'
    inference_args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=video_path,
        pose_weight=pose_weight,
        face_weight=face_weight,
        lip_weight=lip_weight,
        face_expand_ratio=face_expand_ratio,
        checkpoint=None,
    )
    inference_process(inference_args)
    return video_path
# --- UI definition ---------------------------------------------------------
# NOTE(review): original indentation was stripped in this copy; the layout
# below (generate button and output video in the second column) is the
# reconstruction consistent with the statement order — confirm against the
# upstream Space if the visual arrangement matters.
with gr.Blocks(theme='freddyaboulton/dracula_revamped@0.3.8') as demo:
    gr.Markdown(
        """
# Talking Head Generation
Upload a face image and driving audio, and adjust the weights to generate a talking head video.
> **Note:**
> - The face should be the main focus, making up 50%-70% of the image.
> - The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
> - To make it work, duplicate the Space and run it on your own profile using a private GPU.
> - An L4 costs US$0.80/h.
"""
    )
    with gr.Row():
        with gr.Column():
            # Inputs: face image and the audio that drives lip/head motion.
            # Both use type="filepath" so handlers receive plain paths.
            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input")
            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input")
        with gr.Column():
            with gr.Accordion("Advanced Settings", open=False):
                # Generation weights forwarded verbatim to run_inference;
                # values here are the defaults shown in the UI.
                pose_weight = gr.Slider(minimum=0.0, value=1.5, label="Pose Weight")
                face_weight = gr.Slider(minimum=0.0, value=1.0, label="Face Weight")
                lip_weight = gr.Slider(minimum=0.0, value=1.1, label="Lip Weight")
                face_expand_ratio = gr.Slider(minimum=0.0, value=1.2, label="Face Expand Ratio")
            generate = gr.Button("Generate", elem_id="generate-button")
            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")

    # Eager input validation/normalization: reject non-square images and
    # transcode non-wav audio as soon as the user supplies them.
    avatar_face.change(fn=check_image_square, inputs=avatar_face, outputs=avatar_face)
    driving_audio.change(fn=convert_audio_to_wav, inputs=driving_audio, outputs=driving_audio)
    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
        outputs=output_video
    )
demo.launch()
|