# MuseV-test/mmcm/t2p/text2pose.py
#from __future__ import absolute_import
import sys
import io
import os
sys.argv = ['GPT_eval_multi.py']  # override argv so option_trans.get_args_parser() does not pick up the caller's CLI arguments
# Add the project root directory to sys.path
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(1, PROJECT_ROOT)
CKPT_ROOT="/cfs-datasets/public_models/motion"
from .options import option_transformer as option_trans
print(sys.path[0])
import clip
import torch
import cv2
import numpy as np
from .models import vqvae as vqvae
from .models import t2m_trans as trans
import warnings
from .visualization import plot_3d_global as plot_3d
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
from PIL import Image
import time
import random
warnings.filterwarnings('ignore')
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
from math import cos,sin,radians
args = option_trans.get_args_parser()
args.dataname = 't2m'
args.resume_pth = os.path.join(CKPT_ROOT,'pretrained/VQVAE/net_last.pth')
args.resume_trans = os.path.join(CKPT_ROOT,'pretrained/VQTransformer_corruption05/net_best_fid.pth')
args.down_t = 2
args.depth = 3
args.block_size = 51
def replace_space_with_underscore(s):
return s.replace(' ', '_')
def Rz(angle):
theta=radians(angle)
return np.array([[cos(theta), -sin(theta), 0],
[sin(theta), cos(theta), 0],
[0, 0, 1]])
def Rx(angle):
theta=radians(angle)
return np.array(
[[1, 0, 0],
[0 , cos(theta), -sin(theta)],
[0, sin(theta), cos(theta)]])
def generate_cuid():
timestamp = hex(int(time.time() * 1000))[2:]
random_str = hex(random.randint(0, 0xfffff))[2:]
return (timestamp + random_str).zfill(10)
def smpl_to_openpose18(smpl_keypoints):
'''
Mapping notes for the 22-keypoint SMPL skeleton:
[0, 2, 5, 8, 11]
Left-leg chain: starting from the pelvis (joint 0), it connects the left thigh (2), left calf (5), left foot (8), and left toes (11).
[0, 1, 4, 7, 10]
Right-leg chain: starting from the pelvis (0), it connects the right thigh (1), right calf (4), right foot (7), and right toes (10).
[0, 3, 6, 9, 12, 15]
Torso chain: starting from the pelvis (0), it connects the spine (3), neck (6), head (9), left shoulder (12), and right shoulder (15).
[9, 14, 17, 19, 21]
Left-arm chain: starting from the left shoulder (9), it connects the left upper arm (14), left forearm (17), left wrist (19), and left hand (21).
[9, 13, 16, 18, 20]
Right-arm chain: starting from the right shoulder (9), it connects the right upper arm (13), right forearm (16), right wrist (18), and right hand (20).
The SMPL shoulder keypoints are currently ignored in the conversion to OpenPose.
'''
openpose_keypoints = np.zeros((18, 3))
openpose_keypoints[0] = smpl_keypoints[9] # nose
openpose_keypoints[0][1] = openpose_keypoints[0][1]+0.3 #
openpose_keypoints[1] = smpl_keypoints[6] # neck
openpose_keypoints[2] = smpl_keypoints[16] # right shoulder
openpose_keypoints[3] = smpl_keypoints[18] # right elbow
openpose_keypoints[4] = smpl_keypoints[20] # right wrist
openpose_keypoints[5] = smpl_keypoints[17] # left shoulder
openpose_keypoints[6] = smpl_keypoints[19] # left elbow
openpose_keypoints[7] = smpl_keypoints[21] # left wrist
#TODO: Experiment: raise the neck keypoint and keep the nose keypoint's relative height consistent with the neck
openpose_keypoints[1][0]=(openpose_keypoints[2][0]+openpose_keypoints[5][0])/2
openpose_keypoints[1][1]=(openpose_keypoints[2][1]+openpose_keypoints[5][1])/2
openpose_keypoints[1][2]=(openpose_keypoints[2][2]+openpose_keypoints[5][2])/2
openpose_keypoints[0][1] = openpose_keypoints[1][1]+0.3 #
openpose_keypoints[8] = smpl_keypoints[1] # right hip
openpose_keypoints[9] = smpl_keypoints[4] # right knee
openpose_keypoints[10] = smpl_keypoints[7] # right ankle
openpose_keypoints[11] = smpl_keypoints[2] # left hip
openpose_keypoints[12] = smpl_keypoints[5] # left knee
openpose_keypoints[13] = smpl_keypoints[8] # left ankle
#TODO: Experiment: manually set the face keypoints to test whether the body orientation can be controlled
#openpose_keypoints[0][0] = openpose_keypoints[0][0]+0.3  # test direction of axis 0 (horizontal, to the right)
#openpose_keypoints[0][2] = openpose_keypoints[0][2]  # test direction of axis 2 (outward)
#openpose_keypoints[0][1] = openpose_keypoints[0][1]+0.5  # test direction of axis 1 (vertical, upward)
openpose_keypoints[14] = openpose_keypoints[0] # right eye
openpose_keypoints[14][1]=openpose_keypoints[14][1]+0.05
openpose_keypoints[14][0]=openpose_keypoints[14][0]+0.3*(openpose_keypoints[2][0]-openpose_keypoints[1][0])
openpose_keypoints[14][2]=openpose_keypoints[14][2]+0.3*(openpose_keypoints[2][2]-openpose_keypoints[1][2])
openpose_keypoints[15] = openpose_keypoints[0] # left eye
openpose_keypoints[15][1]=openpose_keypoints[15][1]+0.05
openpose_keypoints[15][0]=openpose_keypoints[15][0]+0.3*(openpose_keypoints[5][0]-openpose_keypoints[1][0])
openpose_keypoints[15][2]=openpose_keypoints[15][2]+0.3*(openpose_keypoints[5][2]-openpose_keypoints[1][2])
openpose_keypoints[16] = openpose_keypoints[0] # right ear
openpose_keypoints[16][0]=openpose_keypoints[16][0]+0.7*(openpose_keypoints[2][0]-openpose_keypoints[1][0])
openpose_keypoints[16][2]=openpose_keypoints[16][2]+0.7*(openpose_keypoints[2][2]-openpose_keypoints[1][2])
openpose_keypoints[17] = openpose_keypoints[0] # left ear
openpose_keypoints[17][0]=openpose_keypoints[17][0]+0.7*(openpose_keypoints[5][0]-openpose_keypoints[1][0])
openpose_keypoints[17][2]=openpose_keypoints[17][2]+0.7*(openpose_keypoints[5][2]-openpose_keypoints[1][2])
return openpose_keypoints
# TODO: debug only, needs to be deleted before upload
## load clip model and datasets
clip_model, clip_preprocess = clip.load("ViT-B/32", device=torch.device('cuda'), jit=False, download_root=CKPT_ROOT) # Must set jit=False for training
clip.model.convert_weights(clip_model)  # actually unnecessary: CLIP weights are already float16 by default
clip_model.eval()
for p in clip_model.parameters():
p.requires_grad = False
print("loaded CLIP model")
net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers
args.nb_code,
args.code_dim,
args.output_emb_width,
args.down_t,
args.stride_t,
args.width,
args.depth,
args.dilation_growth_rate)
trans_encoder = trans.Text2Motion_Transformer(num_vq=args.nb_code,
embed_dim=1024,
clip_dim=args.clip_dim,
block_size=args.block_size,
num_layers=9,
n_head=16,
drop_out_rate=args.drop_out_rate,
fc_rate=args.ff_rate)
print ('loading checkpoint from {}'.format(args.resume_pth))
ckpt = torch.load(args.resume_pth, map_location='cpu')
net.load_state_dict(ckpt['net'], strict=True)
net.eval()
net.cuda()
print ('loading transformer checkpoint from {}'.format(args.resume_trans))
ckpt = torch.load(args.resume_trans, map_location='cpu')
trans_encoder.load_state_dict(ckpt['trans'], strict=True)
trans_encoder.eval()
trans_encoder.cuda()
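# dataset mean/std used to de-normalize the decoder output (pred_pose*std+mean) before recovering joint positions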
mean = torch.from_numpy(np.load(os.path.join(CKPT_ROOT,'./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/mean.npy'))).cuda()
std = torch.from_numpy(np.load(os.path.join(CKPT_ROOT,'./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/std.npy'))).cuda()
def get_open_pose(text,height,width,save_path,video_length):
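# Generate an OpenPose-style 3D keypoint image sequence from a text motion prompt.
# text: motion description fed to CLIP; height/width: output image size in pixels;
# save_path: directory where per-frame .jpg renders are written; video_length: target
# number of frames to sample from the generated motion. Returns a list of RGB image arrays.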
CKPT_ROOT = os.path.dirname(os.path.abspath(__file__))  # shadows the global CKPT_ROOT; inside this function it is the module directory used for saving frames
clip_text=[text]
print(f"Motion Prompt: {text}")
# cuid=generate_cuid()
# print(f"Motion Generation cuid: {cuid}")
# clip_text = ["the person jump and spin twice,then running straght and sit down. "] #支持单个token的生成
# change the text here
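# pipeline: tokenize -> CLIP text features -> transformer samples VQ codebook indices -> VQ-VAE decoder -> recover_from_ric to 22-joint xyz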
text = clip.tokenize(clip_text, truncate=False).cuda()
feat_clip_text = clip_model.encode_text(text).float()
index_motion = trans_encoder.sample(feat_clip_text[0:1], False)
pred_pose = net.forward_decoder(index_motion)
from utils.motion_process import recover_from_ric
pred_xyz = recover_from_ric((pred_pose*std+mean).float(), 22)
xyz = pred_xyz.reshape(1, -1, 22, 3)
np.save('motion.npy', xyz.detach().cpu().numpy())
pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, ['smpl.gif'])
res=xyz.detach().cpu().numpy()
points_3d_list=res[0]
frame_num=points_3d_list.shape[0]
open_pose_list=np.array(points_3d_list)
print("The total SMPL sequence shape is : "+str(open_pose_list.shape))
max_val = np.max(open_pose_list, axis=(0, 1))
min_val = np.min(open_pose_list, axis=(0, 1))
print("三维坐标在坐标系上的最大值:", max_val)
print("三维坐标在坐标系上的最小值:", min_val)
check = smpl_to_openpose18(open_pose_list[0])  # 18 keypoints
print("********SMPL_2_OpenPose_List(14/18)********")
print(check)
print("*************************")
print(f"Total Frame Number: {frame_num}")
img_list=[]
for step in tqdm(range(0,frame_num)):
# render the frame
dpi=84
fig =plt.figure(figsize=(width/dpi, height/dpi), dpi=dpi)
ax = fig.add_subplot(111, projection='3d')
limits=2
ax.set_xlim(-limits*0.7, limits*0.7)
ax.set_ylim(0, limits*1.5)  # up/down
ax.set_zlim(0, limits*1.5)  # front/back
ax.grid(False)
#ax.dist = 1
ax.set_box_aspect([1.4, 1.5, 1.5],zoom=3.5)  # axis aspect ratio; TODO: this ratio may be off and can push points outside the plot range
# keypoint coordinates, one (x, y, z) row per joint
keypoints = smpl_to_openpose18(open_pose_list[step])  # 18 keypoints
# kinematic chain; currently only the body part is used
kinematic_chain = [(0, 1), (1, 2), (2, 3), (3, 4), (1, 5), (5, 6), (6, 7), (1, 8), (8, 9), (9, 10), (1, 11), (11, 12), (12, 13), (0, 14), (14, 16), (0, 15), (15, 17)]
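# each (parent, child) pair is an OpenPose-18 limb: nose-neck, right arm (1-2-3-4), left arm (1-5-6-7), right leg (1-8-9-10), left leg (1-11-12-13), and the eye/ear links from the nose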
#kinematic_chain = [(0, 1), (1, 2), (2, 3), (3, 4), (1, 5), (5, 6), (6, 7), (1, 8), (8, 9), (9, 10), (1, 11), (11, 12), (12, 13)]
# colors (RGB)
colors = [(0, 0, 255), (0, 255, 255), (0, 255, 0), (255, 0, 0), (255, 0, 255), (255, 192, 203), (0, 165, 255), (19, 69, 139), (173, 216, 230), (34, 139, 34), (0, 0, 128), (184, 134, 11), (139, 0, 139), (0, 100, 0), (0, 255, 255), (0, 255, 0), (216, 191, 216), (255, 255, 224)]
#colors=[(0, 0, 255), (0, 255, 255), (0, 255, 0), (255, 0, 0), (255, 0, 255), (255, 192, 203), (0, 165, 255), (19, 69, 139), (173, 216, 230), (34, 139, 34), (0, 0, 128), (184, 134, 11), (139, 0, 139), (0, 100, 0)]
# 18 joints
joint_colors=[(255,0,0),(255,85,0),(255,170,0),(255,255,0),(170,255,0),(85,255,0),(0,255,0),(0,255,85),(0,255,170),(0,255,255),(0,170,255),(0,85,255),(0,0,255),(85,0,255),(170,0,255),(255,0,255),(255,0,170),(255,0,85),(255,0,0)]
# 14-joint body only
#joint_colors=[(255,0,0),(255,85,0),(255,170,0),(255,255,0),(170,255,0),(85,255,0),(0,255,0),(0,255,85),(0,255,170),(0,255,255),(0,170,255),(0,85,255),(0,0,255),(85,0,255),(170,0,255)]
# kinematic-chain (limb) lines use 60% of the joint color
# matplotlib expects color components in [0, 1]
joint_rgb_color2=[]
kinematic_chain_rgb_color2=[]
for color in joint_colors:
joint_rgb_color2.append(tuple([x/255 for x in color]))
kinematic_chain_rgb_color2.append(tuple([x*0.6/255 for x in color]))  # limb lines at 60% of the joint color
# visualize the result
for i in range(0,18):
# draw the keypoint
ax.scatter(keypoints[i][0], keypoints[i][1], keypoints[i][2], s=50, c=joint_rgb_color2[i], marker='o')
# draw the kinematic chain (limb lines)
for j in range(len(kinematic_chain)):
if kinematic_chain[j][1] == i:
ax.plot([keypoints[kinematic_chain[j][0]][0], keypoints[kinematic_chain[j][1]][0]], [keypoints[kinematic_chain[j][0]][1], keypoints[kinematic_chain[j][1]][1]], [keypoints[kinematic_chain[j][0]][2], keypoints[kinematic_chain[j][1]][2]], c=kinematic_chain_rgb_color2[i], linewidth=5)
# adjust the camera view
ax.view_init(elev=110, azim=-90)
plt.axis('off')
# save the frame image
# then read it back as an image array
if not os.path.exists(save_path):
os.makedirs(save_path)
image_tmp_path=str(f"{save_path}/{str(step)}.jpg")
plt.savefig(os.path.join(CKPT_ROOT,image_tmp_path))  # saved as RGB
img=cv2.imread(os.path.join(CKPT_ROOT,image_tmp_path))
img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)  # cv2 reads BGR; convert back to RGB
img_list.append(img)
plt.close(fig)  # free the figure so memory does not grow with the number of frames
res=[]
if len(img_list)>=video_length:
key_frame_sample_step=int(len(img_list)/video_length)
else:
print("ERROR: video length is too long")
key_frame_sample_step=1
for i in range(0,len(img_list),key_frame_sample_step):
res.append(img_list[i])
return res
def offline_get_open_pose(text,motion_text,height,width,save_path):
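# Offline variant: generate the motion for `text`, save the raw SMPL xyz sequence to
# `<save_path>/<motion_text with spaces replaced by underscores>.npy`, and render `smpl.gif`.
# `height` and `width` are accepted but not used here.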
#motion_text=text
clip_text=[text]
print(f"Motion Prompt: {text}")
cuid=generate_cuid()
print(f"Motion Generation cuid: {cuid}")
# clip_text = ["the person jump and spin twice,then running straght and sit down. "] #支持单个token的生成
# change the text here
text = clip.tokenize(clip_text, truncate=False).cuda()
feat_clip_text = clip_model.encode_text(text).float()
index_motion = trans_encoder.sample(feat_clip_text[0:1], False)
pred_pose = net.forward_decoder(index_motion)
from utils.motion_process import recover_from_ric
pred_xyz = recover_from_ric((pred_pose*std+mean).float(), 22)
xyz = pred_xyz.reshape(1, -1, 22, 3)
res=xyz.detach().cpu().numpy()
np.save(f'{save_path}/{replace_space_with_underscore(motion_text)}.npy', res)
pose_vis = plot_3d.draw_to_batch(res,clip_text, ['smpl.gif'])
if __name__ == "__main__":
text="walk around, jump, run straght."
pose = get_open_pose(text,512,512)
#pdb.set_trace()