import os
import time
import pdb

import cuid
import gradio as gr
from huggingface_hub import snapshot_download

ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")


def download_model():
    """Download the MuseV checkpoints from the HuggingFace Hub if they are not present locally."""
    if not os.path.exists(CheckpointsDir):
        print("Checkpoints not found, start downloading...")
        tic = time.time()
        snapshot_download(
            repo_id="TMElyralab/MuseV",
            local_dir=CheckpointsDir,
            max_workers=8,
        )
        toc = time.time()
        print(f"download cost {toc - tic} seconds")
    else:
        print("Model already downloaded.")


download_model()  # for huggingface deployment.

from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference


def update_shape(image):
    # Return (width, height) of the condition image, or the 768x512 default when
    # no image is given. Only referenced by the commented-out image.change hooks below.
    if image is not None:
        h, w, _ = image.shape
    else:
        h, w = 768, 512
    return w, h


class ConcatenateBlock(gr.blocks.Block):
    # Accumulates selected control options into a comma-separated string.
    # Only used by the ControlNet selector UI, which is currently commented out below.
    def __init__(self, options):
        self.options = options
        self.current_string = ""

    def update_string(self, new_choice):
        if new_choice and new_choice not in self.current_string.split(", "):
            if self.current_string == "":
                self.current_string = new_choice
            else:
                self.current_string += ", " + new_choice
        return self.current_string


def process_input(new_choice):
    return concatenate_block.update_string(new_choice), ""


control_options = [
    "pose", "pose_body", "pose_hand", "pose_face", "pose_hand_body", "pose_hand_face",
    "dwpose", "dwpose_face", "dwpose_hand", "dwpose_body", "dwpose_body_hand",
    "canny", "tile", "hed", "hed_scribble", "depth", "pidi", "normal_bae",
    "lineart", "lineart_anime", "zoe", "sam", "mobile_sam", "leres", "content",
    "face_detector",
]
concatenate_block = ConcatenateBlock(control_options)

css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(

        "# MuseV: Infinite-length and High Fidelity Virtual Human Video Generation "
        "with Visual Conditioned Parallel Denoising\n\n"
        "Zhiqiang Xia\\*, Zhaokang Chen\\*, Bin Wu, Chao Li, Kwok-Wai Hung, Chao Zhan, "
        "Yingjie He, Wenjiang Zhou "
        "(\\*Equal Contribution, Corresponding Author, benbinwu@tencent.com)\n\n"
        "Lyra Lab, Tencent Music Entertainment\n\n"
        "[Github Repo] [ArXiv(Coming Soon)] [Project Page(Coming Soon)]\n\n"
        "If MuseV is useful, please help star the repo~ This is important to open-source projects. Thanks!"
" ) with gr.Tab("Text to Video"): with gr.Row(): with gr.Column(): prompt = gr.Textbox(label="Prompt") image = gr.Image(label="VisionCondImage") gr.Markdown("seed=-1 means that the seeds run each time are different") seed = gr.Number(label="Seed", value=-1) video_length = gr.Number(label="Video Length", value=12) fps = gr.Number(label="Generate Video FPS", value=6) gr.Markdown( ( "If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n" "The shorter the image size, the larger the motion amplitude, and the lower video quality.\n" "The longer the W&H, the smaller the motion amplitude, and the higher video quality" ) ) with gr.Row(): w = gr.Number(label="Width", value=-1) h = gr.Number(label="Height", value=-1) img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0) btn1 = gr.Button("Generate") out = gr.outputs.Video() # pdb.set_trace() with gr.Row(): board = gr.Dataframe( value=[["", "", ""]] * 3, interactive=False, type="array", label="Demo Video", ) # image.change(fn=update_shape, inputs=[image], outputs=[w, h]) btn1.click( fn=online_t2v_inference, inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio], outputs=out, ) with gr.Tab("Video to Video"): with gr.Row(): with gr.Column(): prompt = gr.Textbox(label="Prompt") gr.Markdown( ( "pose of VisionCondImage should be same as of the first frame of the video. " "its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL." ) ) image = gr.Image(label="VisionCondImage") video = gr.Video(label="ReferVideo") # radio = gr.inputs.Radio(, label="Select an option") # ctr_button = gr.inputs.Button(label="Add ControlNet List") # output_text = gr.outputs.Textbox() processor = gr.Textbox( label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}", value="dwpose_body_hand", ) gr.Markdown("seed=-1 means that seeds are different in every run") seed = gr.Number(label="Seed", value=-1) video_length = gr.Number(label="Video Length", value=12) fps = gr.Number(label="Generate Video FPS", value=6) gr.Markdown( ( "If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n" "The shorter the image size, the larger the motion amplitude, and the lower video quality. \n" "The longer the W&H, the smaller the motion amplitude, and the higher video quality. " ) ) with gr.Row(): w = gr.Number(label="Width", value=-1) h = gr.Number(label="Height", value=-1) img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0) btn2 = gr.Button("Generate") out1 = gr.outputs.Video() # image.change(fn=update_shape, inputs=[image], outputs=[w, h]) btn2.click( fn=online_v2v_inference, inputs=[ prompt, image, video, processor, seed, fps, w, h, video_length, img_edge_ratio, ], outputs=out1, ) # Set the IP and port ip_address = "0.0.0.0" # Replace with your desired IP address port_number = 7860 # Replace with your desired port number demo.queue().launch( share=False, debug=True, server_name=ip_address, server_port=port_number )