#from __future__ import absolute_import
import sys
import io
import os
sys.argv = ['GPT_eval_multi.py']
# Add the project root directory to sys.path
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(1, PROJECT_ROOT)
CKPT_ROOT = "/cfs-datasets/public_models/motion"
from .options import option_transformer as option_trans
import clip
import torch
import cv2
import numpy as np
from .models import vqvae as vqvae
from .models import t2m_trans as trans
import warnings
from .visualization import plot_3d_global as plot_3d
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
from PIL import Image
import time
import random
warnings.filterwarnings('ignore')
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
from math import cos, sin, radians
args = option_trans.get_args_parser()
args.dataname = 't2m'
args.resume_pth = os.path.join(CKPT_ROOT, 'pretrained/VQVAE/net_last.pth')
args.resume_trans = os.path.join(CKPT_ROOT, 'pretrained/VQTransformer_corruption05/net_best_fid.pth')
args.down_t = 2
args.depth = 3
args.block_size = 51
def replace_space_with_underscore(s):
    return s.replace(' ', '_')
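# Example (illustrative): replace_space_with_underscore("walk around") -> "walk_around"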
def Rz(angle):
    """Rotation matrix about the z-axis; angle in degrees."""
    theta = radians(angle)
    return np.array([[cos(theta), -sin(theta), 0],
                     [sin(theta),  cos(theta), 0],
                     [0,           0,          1]])

def Rx(angle):
    """Rotation matrix about the x-axis; angle in degrees."""
    theta = radians(angle)
    return np.array([[1, 0,           0],
                     [0, cos(theta), -sin(theta)],
                     [0, sin(theta),  cos(theta)]])
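# Usage sketch (illustrative; these helpers are not otherwise exercised in this
# file): rotating an (N, 3) keypoint array 90 degrees about the x-axis would be
# `rotated = keypoints @ Rx(90).T`; Rz works the same way about the z-axis.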
def generate_cuid():
    """Generate a pseudo-unique id from the current millisecond timestamp plus a random hex suffix."""
    timestamp = hex(int(time.time() * 1000))[2:]
    random_str = hex(random.randint(0, 0xfffff))[2:]
    return (timestamp + random_str).zfill(10)
def smpl_to_openpose18(smpl_keypoints):
    '''
    Mapping of the 22-keypoint SMPL skeleton (kinematic chains):
    [0, 2, 5, 8, 11]
        Left leg: from the pelvis (joint 0) through the left thigh (2),
        left shin (5), left foot (8), and left toes (11).
    [0, 1, 4, 7, 10]
        Right leg: from the pelvis (joint 0) through the right thigh (1),
        right shin (4), right foot (7), and right toes (10).
    [0, 3, 6, 9, 12, 15]
        Torso: from the pelvis (joint 0) through the spine (3), neck (6),
        head (9), left shoulder (12), and right shoulder (15).
    [9, 14, 17, 19, 21]
        Left arm: from the left shoulder (9) through the left upper arm (14),
        left forearm (17), left wrist (19), and left hand (21).
    [9, 13, 16, 18, 20]
        Right arm: from the right shoulder (9) through the right upper arm (13),
        right forearm (16), right wrist (18), and right hand (20).
    The current conversion to OpenPose ignores SMPL's shoulder (collar) keypoints.
    '''
    openpose_keypoints = np.zeros((18, 3))
    openpose_keypoints[0] = smpl_keypoints[9]   # nose
    openpose_keypoints[0][1] = openpose_keypoints[0][1] + 0.3
    openpose_keypoints[1] = smpl_keypoints[6]   # neck
    openpose_keypoints[2] = smpl_keypoints[16]  # right shoulder
    openpose_keypoints[3] = smpl_keypoints[18]  # right elbow
    openpose_keypoints[4] = smpl_keypoints[20]  # right wrist
    openpose_keypoints[5] = smpl_keypoints[17]  # left shoulder
    openpose_keypoints[6] = smpl_keypoints[19]  # left elbow
    openpose_keypoints[7] = smpl_keypoints[21]  # left wrist
    # TODO: experiment -- re-center the neck between the shoulders and keep the
    # nose's height relative to the neck consistent.
    openpose_keypoints[1][0] = (openpose_keypoints[2][0] + openpose_keypoints[5][0]) / 2
    openpose_keypoints[1][1] = (openpose_keypoints[2][1] + openpose_keypoints[5][1]) / 2
    openpose_keypoints[1][2] = (openpose_keypoints[2][2] + openpose_keypoints[5][2]) / 2
    openpose_keypoints[0][1] = openpose_keypoints[1][1] + 0.3
    openpose_keypoints[8] = smpl_keypoints[1]   # right hip
    openpose_keypoints[9] = smpl_keypoints[4]   # right knee
    openpose_keypoints[10] = smpl_keypoints[7]  # right ankle
    openpose_keypoints[11] = smpl_keypoints[2]  # left hip
    openpose_keypoints[12] = smpl_keypoints[5]  # left knee
    openpose_keypoints[13] = smpl_keypoints[8]  # left ankle
    # TODO: experiment -- manually place the face keypoints to test whether they
    # can control the body's facing direction.
    # openpose_keypoints[0][0] = openpose_keypoints[0][0] + 0.3  # probe axis 0 (horizontal, rightward)
    # openpose_keypoints[0][2] = openpose_keypoints[0][2]        # probe axis 2 (outward)
    # openpose_keypoints[0][1] = openpose_keypoints[0][1] + 0.5  # probe axis 1 (vertical, upward)
    openpose_keypoints[14] = openpose_keypoints[0]  # right eye
    openpose_keypoints[14][1] = openpose_keypoints[14][1] + 0.05
    openpose_keypoints[14][0] = openpose_keypoints[14][0] + 0.3 * (openpose_keypoints[2][0] - openpose_keypoints[1][0])
    openpose_keypoints[14][2] = openpose_keypoints[14][2] + 0.3 * (openpose_keypoints[2][2] - openpose_keypoints[1][2])
    openpose_keypoints[15] = openpose_keypoints[0]  # left eye
    openpose_keypoints[15][1] = openpose_keypoints[15][1] + 0.05
    openpose_keypoints[15][0] = openpose_keypoints[15][0] + 0.3 * (openpose_keypoints[5][0] - openpose_keypoints[1][0])
    openpose_keypoints[15][2] = openpose_keypoints[15][2] + 0.3 * (openpose_keypoints[5][2] - openpose_keypoints[1][2])
    openpose_keypoints[16] = openpose_keypoints[0]  # right ear
    openpose_keypoints[16][0] = openpose_keypoints[16][0] + 0.7 * (openpose_keypoints[2][0] - openpose_keypoints[1][0])
    openpose_keypoints[16][2] = openpose_keypoints[16][2] + 0.7 * (openpose_keypoints[2][2] - openpose_keypoints[1][2])
    openpose_keypoints[17] = openpose_keypoints[0]  # left ear
    openpose_keypoints[17][0] = openpose_keypoints[17][0] + 0.7 * (openpose_keypoints[5][0] - openpose_keypoints[1][0])
    openpose_keypoints[17][2] = openpose_keypoints[17][2] + 0.7 * (openpose_keypoints[5][2] - openpose_keypoints[1][2])
    return openpose_keypoints
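# Minimal sanity check (illustrative, safe to remove): a zero-filled (22, 3)
# SMPL pose should map to an (18, 3) OpenPose array.
assert smpl_to_openpose18(np.zeros((22, 3))).shape == (18, 3)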
# TODO: debug only, needs to be deleted before upload
## load the CLIP model and datasets
clip_model, clip_preprocess = clip.load("ViT-B/32", device=torch.device('cuda'), jit=False, download_root=CKPT_ROOT)  # must set jit=False for training
clip.model.convert_weights(clip_model)  # actually unnecessary: CLIP weights are already float16 by default
clip_model.eval()
for p in clip_model.parameters():
    p.requires_grad = False
print("loaded CLIP model")
net = vqvae.HumanVQVAE(args,  # use args to define different parameters in different quantizers
                       args.nb_code,
                       args.code_dim,
                       args.output_emb_width,
                       args.down_t,
                       args.stride_t,
                       args.width,
                       args.depth,
                       args.dilation_growth_rate)
trans_encoder = trans.Text2Motion_Transformer(num_vq=args.nb_code,
                                              embed_dim=1024,
                                              clip_dim=args.clip_dim,
                                              block_size=args.block_size,
                                              num_layers=9,
                                              n_head=16,
                                              drop_out_rate=args.drop_out_rate,
                                              fc_rate=args.ff_rate)
print('loading checkpoint from {}'.format(args.resume_pth))
ckpt = torch.load(args.resume_pth, map_location='cpu')
net.load_state_dict(ckpt['net'], strict=True)
net.eval()
net.cuda()
print('loading transformer checkpoint from {}'.format(args.resume_trans))
ckpt = torch.load(args.resume_trans, map_location='cpu')
trans_encoder.load_state_dict(ckpt['trans'], strict=True)
trans_encoder.eval()
trans_encoder.cuda()
mean = torch.from_numpy(np.load(os.path.join(CKPT_ROOT, 'checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/mean.npy'))).cuda()
std = torch.from_numpy(np.load(os.path.join(CKPT_ROOT, 'checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/std.npy'))).cuda()
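# mean/std are the dataset normalization statistics saved at training time; the
# generated motion is de-normalized below as pred_pose * std + mean before the
# xyz joint positions are recovered with recover_from_ric.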
def get_open_pose(text, height, width, save_path, video_length):
    out_root = os.path.dirname(os.path.abspath(__file__))  # base directory for the rendered frames
    clip_text = [text]
    print(f"Motion Prompt: {text}")
    # cuid = generate_cuid()
    # print(f"Motion Generation cuid: {cuid}")
    # clip_text = ["the person jump and spin twice,then running straight and sit down. "]  # generation from a single prompt is supported
    # change the text here
    tokens = clip.tokenize(clip_text, truncate=False).cuda()
    feat_clip_text = clip_model.encode_text(tokens).float()
    index_motion = trans_encoder.sample(feat_clip_text[0:1], False)
    pred_pose = net.forward_decoder(index_motion)
    from utils.motion_process import recover_from_ric
    pred_xyz = recover_from_ric((pred_pose * std + mean).float(), 22)
    xyz = pred_xyz.reshape(1, -1, 22, 3)
    np.save('motion.npy', xyz.detach().cpu().numpy())
    pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(), clip_text, ['smpl.gif'])
    res = xyz.detach().cpu().numpy()
    points_3d_list = res[0]
    frame_num = points_3d_list.shape[0]
    open_pose_list = np.array(points_3d_list)
    print("The total SMPL sequence shape is: " + str(open_pose_list.shape))
    max_val = np.max(open_pose_list, axis=(0, 1))
    min_val = np.min(open_pose_list, axis=(0, 1))
    print("Per-axis maximum of the 3D coordinates:", max_val)
    print("Per-axis minimum of the 3D coordinates:", min_val)
    check = smpl_to_openpose18(open_pose_list[0])  # 18 keypoints
    print("********SMPL_2_OpenPose_List(14/18)********")
    print(check)
    print("*************************")
    print(f"Total Frame Number: {frame_num}")
    img_list = []
    for step in tqdm(range(0, frame_num)):
        # render one frame
        dpi = 84
        fig = plt.figure(figsize=(width / dpi, height / dpi), dpi=dpi)
        ax = fig.add_subplot(111, projection='3d')
        limits = 2
        ax.set_xlim(-limits * 0.7, limits * 0.7)
        ax.set_ylim(0, limits * 1.5)  # up/down
        ax.set_zlim(0, limits * 1.5)  # front/back
        ax.grid(False)
        # ax.dist = 1
        ax.set_box_aspect([1.4, 1.5, 1.5], zoom=3.5)  # axis aspect ratio; TODO: this ratio may be wrong and points can fall outside the axis range
        # keypoint coordinates, one (x, y, z) per row
        keypoints = smpl_to_openpose18(open_pose_list[step])  # 18 keypoints
        # kinematic chain; currently only the body part is used
        kinematic_chain = [(0, 1), (1, 2), (2, 3), (3, 4), (1, 5), (5, 6), (6, 7), (1, 8), (8, 9), (9, 10), (1, 11), (11, 12), (12, 13), (0, 14), (14, 16), (0, 15), (15, 17)]
        # kinematic_chain = [(0, 1), (1, 2), (2, 3), (3, 4), (1, 5), (5, 6), (6, 7), (1, 8), (8, 9), (9, 10), (1, 11), (11, 12), (12, 13)]
        # RGB colors
        colors = [(0, 0, 255), (0, 255, 255), (0, 255, 0), (255, 0, 0), (255, 0, 255), (255, 192, 203), (0, 165, 255), (19, 69, 139), (173, 216, 230), (34, 139, 34), (0, 0, 128), (184, 134, 11), (139, 0, 139), (0, 100, 0), (0, 255, 255), (0, 255, 0), (216, 191, 216), (255, 255, 224)]
        # colors = [(0, 0, 255), (0, 255, 255), (0, 255, 0), (255, 0, 0), (255, 0, 255), (255, 192, 203), (0, 165, 255), (19, 69, 139), (173, 216, 230), (34, 139, 34), (0, 0, 128), (184, 134, 11), (139, 0, 139), (0, 100, 0)]
        # 18 points
        joint_colors = [(255, 0, 0), (255, 85, 0), (255, 170, 0), (255, 255, 0), (170, 255, 0), (85, 255, 0), (0, 255, 0), (0, 255, 85), (0, 255, 170), (0, 255, 255), (0, 170, 255), (0, 85, 255), (0, 0, 255), (85, 0, 255), (170, 0, 255), (255, 0, 255), (255, 0, 170), (255, 0, 85), (255, 0, 0)]
        # 14-point trunk-only variant
        # joint_colors = [(255, 0, 0), (255, 85, 0), (255, 170, 0), (255, 255, 0), (170, 255, 0), (85, 255, 0), (0, 255, 0), (0, 255, 85), (0, 255, 170), (0, 255, 255), (0, 170, 255), (0, 85, 255), (0, 0, 255), (85, 0, 255), (170, 0, 255)]
        # chain lines use 60% of the joint color; matplotlib expects color values in [0, 1]
        joint_rgb_color2 = []
        kinematic_chain_rgb_color2 = []
        for color in joint_colors:
            joint_rgb_color2.append(tuple([x / 255 for x in color]))
            kinematic_chain_rgb_color2.append(tuple([x * 0.6 / 255 for x in color]))
        # visualize the result
        for i in range(0, 18):
            # draw the keypoint
            ax.scatter(keypoints[i][0], keypoints[i][1], keypoints[i][2], s=50, c=joint_rgb_color2[i], marker='o')
            # draw the kinematic chain
            for j in range(len(kinematic_chain)):
                if kinematic_chain[j][1] == i:
                    ax.plot([keypoints[kinematic_chain[j][0]][0], keypoints[kinematic_chain[j][1]][0]], [keypoints[kinematic_chain[j][0]][1], keypoints[kinematic_chain[j][1]][1]], [keypoints[kinematic_chain[j][0]][2], keypoints[kinematic_chain[j][1]][2]], c=kinematic_chain_rgb_color2[i], linewidth=5)
        # adjust the viewing angle
        ax.view_init(elev=110, azim=-90)
        plt.axis('off')
        # save the figure, then read it back as an image array
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        image_tmp_path = str(f"{save_path}/{str(step)}.jpg")
        plt.savefig(os.path.join(out_root, image_tmp_path))  # saved as RGB
        plt.close(fig)  # avoid accumulating open figures across frames
        img = cv2.imread(os.path.join(out_root, image_tmp_path))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_list.append(img)
    res = []
    if len(img_list) >= video_length:
        key_frame_sample_step = int(len(img_list) / video_length)
    else:
        print("ERROR: requested video_length exceeds the number of generated frames")
        key_frame_sample_step = 1
    for i in range(0, len(img_list), key_frame_sample_step):
        res.append(img_list[i])
    return res
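# Hedged usage sketch (not part of the original pipeline): the RGB frames
# returned by get_open_pose can be assembled into a video with OpenCV. The
# output path and fps below are illustrative assumptions.
def frames_to_video(frames, out_path='openpose.mp4', fps=8):
    h, w = frames[0].shape[:2]
    writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
    for frame in frames:
        # frames were converted to RGB above; VideoWriter expects BGR
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    writer.release()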
def offline_get_open_pose(text, motion_text, height, width, save_path):
    # motion_text = text
    clip_text = [text]
    print(f"Motion Prompt: {text}")
    cuid = generate_cuid()
    print(f"Motion Generation cuid: {cuid}")
    # clip_text = ["the person jump and spin twice,then running straight and sit down. "]  # generation from a single prompt is supported
    # change the text here
    tokens = clip.tokenize(clip_text, truncate=False).cuda()
    feat_clip_text = clip_model.encode_text(tokens).float()
    index_motion = trans_encoder.sample(feat_clip_text[0:1], False)
    pred_pose = net.forward_decoder(index_motion)
    from utils.motion_process import recover_from_ric
    pred_xyz = recover_from_ric((pred_pose * std + mean).float(), 22)
    xyz = pred_xyz.reshape(1, -1, 22, 3)
    res = xyz.detach().cpu().numpy()
    os.makedirs(save_path, exist_ok=True)  # np.save does not create missing directories
    np.save(f'{save_path}/{replace_space_with_underscore(motion_text)}.npy', res)
    pose_vis = plot_3d.draw_to_batch(res, clip_text, ['smpl.gif'])
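# Usage sketch (illustrative values): offline_get_open_pose("a person waves",
# "a person waves", 512, 512, "./motions") writes the raw (1, T, 22, 3) joint
# sequence to ./motions/a_person_waves.npy; height and width are currently unused here.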
if __name__ == "__main__":
    text = "walk around, jump, run straight."
    # save_path and video_length below are example values; get_open_pose requires all five arguments
    pose = get_open_pose(text, 512, 512, './openpose_frames', 16)