import os
import time
import pdb
import cuid
import gradio as gr
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
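# ProjectDir resolves two levels above this file (assumed to be the repository root);
# the MuseV checkpoints are stored under <repo root>/checkpoints.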
def download_model():
    """Download the MuseV checkpoints from the HuggingFace Hub if they are not present locally."""
    if not os.path.exists(CheckpointsDir):
        print("Checkpoints not found, start downloading...")
        tic = time.time()
        snapshot_download(
            repo_id="TMElyralab/MuseV",
            local_dir=CheckpointsDir,
            max_workers=8,
        )
        toc = time.time()
        print(f"download cost {toc - tic:.1f} seconds")
    else:
        print("Model already downloaded.")
download_model() # for huggingface deployment.
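# The gradio inference entrypoints are imported only after download_model() has run,
# since these modules may expect the checkpoints to already be on disk when imported.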
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
def update_shape(image):
    """Return (width, height) of the uploaded image, or the default 512x768 if no image is given."""
    if image is not None:
        h, w, _ = image.shape
    else:
        h, w = 768, 512
    return w, h
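# update_shape is intended to be wired to image.change(...) below (currently commented out)
# so that W and H are auto-filled from the uploaded reference image.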
class ConcatenateBlock(gr.blocks.Block):
    """Accumulate selected control options into a comma-separated string.

    Used purely as a state holder; it is never added to the Blocks layout.
    """

    def __init__(self, options):
        self.options = options
        self.current_string = ""

    def update_string(self, new_choice):
        if new_choice and new_choice not in self.current_string.split(", "):
            if self.current_string == "":
                self.current_string = new_choice
            else:
                self.current_string += ", " + new_choice
        return self.current_string


def process_input(new_choice):
    return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
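# Example: process_input("canny") returns ("canny", ""); a later process_input("depth") returns ("canny, depth", "").
# Repeated choices are ignored by update_string.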
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "# MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising\n"
        "\n"
        "Zhiqiang Xia\\*, Zhaokang Chen\\*, Bin Wu†, Chao Li, Kwok-Wai Hung, Chao Zhan, Yingjie He, Wenjiang Zhou "
        "(\\*Equal Contribution, †Corresponding Author, benbinwu@tencent.com)\n"
        "\n"
        "Lyra Lab, Tencent Music Entertainment\n"
        "\n"
        "[Github Repo] | [ArXiv (Coming Soon)] | [Project Page (Coming Soon)]\n"
        "\n"
        "If MuseV is useful, please help star the repo, which is important to open-source projects. Thanks!"
    )
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
gr.Markdown("seed=-1 means that the seeds run each time are different")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
btn1 = gr.Button("Generate")
out = gr.outputs.Video()
# pdb.set_trace()
with gr.Row():
board = gr.Dataframe(
value=[["", "", ""]] * 3,
interactive=False,
type="array",
label="Demo Video",
)
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn1.click(
fn=online_t2v_inference,
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
outputs=out,
)
with gr.Tab("Video to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
btn2 = gr.Button("Generate")
out1 = gr.outputs.Video()
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn2.click(
fn=online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
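# queue() routes long-running inference calls through a request queue instead of blocking the app;
# server_name="0.0.0.0" listens on all interfaces, which is typically required for containerized or Spaces deployments.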
demo.queue().launch(
share=False, debug=True, server_name=ip_address, server_port=port_number
)