# MotionCLR/models/unet.py
import clip
import math
import torch
import torch.nn.functional as F
from torch import nn
import numpy as np
from einops.layers.torch import Rearrange
from einops import rearrange
import matplotlib.pyplot as plt
import os
# Custom LayerNorm class to handle fp16
class CustomLayerNorm(nn.LayerNorm):
def forward(self, x: torch.Tensor):
if self.weight.dtype == torch.float32:
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)
else:
return super().forward(x)
# Function to replace LayerNorm in CLIP model with CustomLayerNorm
def replace_layer_norm(model):
    for name, module in model.named_children():
        if isinstance(module, nn.LayerNorm):
            new_norm = CustomLayerNorm(module.normalized_shape, eps=module.eps, elementwise_affine=module.elementwise_affine)
            if module.elementwise_affine:  # keep the pretrained LayerNorm parameters instead of re-initializing them
                new_norm.weight.data.copy_(module.weight.data)
                new_norm.bias.data.copy_(module.bias.data)
            setattr(model, name, new_norm.cuda())
        else:
            replace_layer_norm(module)  # Recursively apply to all submodules
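# Global per-denoising-step logs of attention maps:
# MONITOR_ATTN collects cross-attention maps and SELF_ATTN collects self-attention maps,
# one sub-list per sampling step (new sub-lists are appended in forward_with_cfg).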
MONITOR_ATTN = []
SELF_ATTN = []
def vis_attn(att, out_path, step, layer, shape, type_="self", lines=True):
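    """Save an attention map `att` (numpy array) as a line plot or a heatmap, plus the raw values as .npy."""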
    if lines:
        plt.figure(figsize=(10, 3))
        for token_index in range(att.shape[1]):
            plt.plot(att[:, token_index], label=f"Token {token_index}")
        plt.title("Attention Values for Each Token")
        plt.xlabel("time")
        plt.ylabel("Attention Value")
        plt.legend(loc="upper right", bbox_to_anchor=(1.15, 1))
        suffix = "lines"
    else:
        plt.figure(figsize=(10, 10))
        plt.imshow(att.transpose(), cmap="viridis", aspect="auto")
        plt.colorbar()
        plt.title("Attention Matrix Heatmap")
        plt.ylabel("time")
        plt.xlabel("time")
        suffix = "heatmap"
    # save the figure and the raw attention values next to it
    savepath = os.path.join(out_path, f"vis-{type_}/step{step}/layer{layer}_{suffix}_{shape}.png")
    os.makedirs(os.path.dirname(savepath), exist_ok=True)
    plt.savefig(savepath, bbox_inches="tight")
    np.save(savepath.replace(".png", ".npy"), att)
    plt.close()  # close the figure so repeated logging does not leak memory
def zero_module(module):
"""
Zero out the parameters of a module and return it.
"""
for p in module.parameters():
p.detach().zero_()
return module
class FFN(nn.Module):
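    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> zero-initialized Linear, with a residual connection."""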
def __init__(self, latent_dim, ffn_dim, dropout):
super().__init__()
self.linear1 = nn.Linear(latent_dim, ffn_dim)
self.linear2 = zero_module(nn.Linear(ffn_dim, latent_dim))
self.activation = nn.GELU()
self.dropout = nn.Dropout(dropout)
def forward(self, x):
y = self.linear2(self.dropout(self.activation(self.linear1(x))))
y = x + y
return y
class Conv1dAdaGNBlock(nn.Module):
"""
Conv1d --> GroupNorm --> scale,shift --> Mish
"""
def __init__(self, inp_channels, out_channels, kernel_size, n_groups=4):
super().__init__()
self.out_channels = out_channels
self.block = nn.Conv1d(
inp_channels, out_channels, kernel_size, padding=kernel_size // 2
)
self.group_norm = nn.GroupNorm(n_groups, out_channels)
        self.activation = nn.Mish()
def forward(self, x, scale, shift):
"""
Args:
x: [bs, nfeat, nframes]
scale: [bs, out_feat, 1]
shift: [bs, out_feat, 1]
"""
x = self.block(x)
batch_size, channels, horizon = x.size()
x = rearrange(
x, "batch channels horizon -> (batch horizon) channels"
) # [bs*seq, nfeats]
x = self.group_norm(x)
x = rearrange(
x.reshape(batch_size, horizon, channels),
"batch horizon channels -> batch channels horizon",
)
x = ada_shift_scale(x, shift, scale)
        return self.activation(x)
class SelfAttention(nn.Module):
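    """
    Temporal self-attention over motion frames.

    Besides standard attention, the forward pass contains training-free editing hooks
    (style transfer, example-based generation, time shift, attention replacement)
    that are switched on through `edit_config`.
    """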
def __init__(
self,
latent_dim,
text_latent_dim,
num_heads: int = 8,
dropout: float = 0.0,
log_attn=False,
edit_config=None,
):
super().__init__()
self.num_head = num_heads
self.norm = nn.LayerNorm(latent_dim)
self.query = nn.Linear(latent_dim, latent_dim)
self.key = nn.Linear(latent_dim, latent_dim)
self.value = nn.Linear(latent_dim, latent_dim)
self.dropout = nn.Dropout(dropout)
self.edit_config = edit_config
self.log_attn = log_attn
def forward(self, x):
"""
x: B, T, D
xf: B, N, L
"""
B, T, D = x.shape
N = x.shape[1]
assert N == T
H = self.num_head
# B, T, 1, D
query = self.query(self.norm(x)).unsqueeze(2)
# B, 1, N, D
key = self.key(self.norm(x)).unsqueeze(1)
query = query.view(B, T, H, -1)
key = key.view(B, N, H, -1)
        # style transfer motion editing
        style_transfer = self.edit_config.style_tranfer.use  # the config key keeps its original spelling
        if style_transfer:
if (
len(SELF_ATTN)
<= self.edit_config.style_tranfer.style_transfer_steps_end
):
query[1] = query[0]
# example based motion generation
example_based = self.edit_config.example_based.use
if example_based:
if len(SELF_ATTN) == self.edit_config.example_based.example_based_steps_end:
temp_seed = self.edit_config.example_based.temp_seed
for id_ in range(query.shape[0] - 1):
with torch.random.fork_rng():
torch.manual_seed(temp_seed)
tensor = query[0]
chunks = torch.split(
tensor, self.edit_config.example_based.chunk_size, dim=0
)
shuffled_indices = torch.randperm(len(chunks))
shuffled_chunks = [chunks[i] for i in shuffled_indices]
shuffled_tensor = torch.cat(shuffled_chunks, dim=0)
query[1 + id_] = shuffled_tensor
temp_seed += self.edit_config.example_based.temp_seed_bar
# time shift motion editing (q, k)
time_shift = self.edit_config.time_shift.use
if time_shift:
if len(MONITOR_ATTN) <= self.edit_config.time_shift.time_shift_steps_end:
part1 = int(
key.shape[1] * self.edit_config.time_shift.time_shift_ratio // 1
)
part2 = int(
key.shape[1]
* (1 - self.edit_config.time_shift.time_shift_ratio)
// 1
)
q_front_part = query[0, :part1, :, :]
q_back_part = query[0, -part2:, :, :]
new_q = torch.cat((q_back_part, q_front_part), dim=0)
query[1] = new_q
k_front_part = key[0, :part1, :, :]
k_back_part = key[0, -part2:, :, :]
new_k = torch.cat((k_back_part, k_front_part), dim=0)
key[1] = new_k
# B, T, N, H
attention = torch.einsum("bnhd,bmhd->bnmh", query, key) / math.sqrt(D // H)
weight = self.dropout(F.softmax(attention, dim=2))
# for counting the step and logging attention maps
try:
attention_matrix = (
weight[0, :, :].mean(dim=-1).detach().cpu().numpy().astype(float)
)
SELF_ATTN[-1].append(attention_matrix)
        except Exception:
            # logging is best-effort; SELF_ATTN may be empty outside of forward_with_cfg
            pass
# attention manipulation for replacement
attention_manipulation = self.edit_config.manipulation.use
if attention_manipulation:
if len(SELF_ATTN) <= self.edit_config.manipulation.manipulation_steps_end:
weight[1, :, :, :] = weight[0, :, :, :]
value = self.value(self.norm(x)).view(B, N, H, -1)
# time shift motion editing (v)
if time_shift:
if len(MONITOR_ATTN) <= self.edit_config.time_shift.time_shift_steps_end:
v_front_part = value[0, :part1, :, :]
v_back_part = value[0, -part2:, :, :]
new_v = torch.cat((v_back_part, v_front_part), dim=0)
value[1] = new_v
y = torch.einsum("bnmh,bmhd->bnhd", weight, value).reshape(B, T, D)
return y
class TimestepEmbedder(nn.Module):
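    """Fixed sinusoidal embedding table, indexed by the integer diffusion timestep."""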
def __init__(self, d_model, max_len=5000):
super(TimestepEmbedder, self).__init__()
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
self.register_buffer("pe", pe)
def forward(self, x):
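        # this file assumes CUDA execution; move the embedding table to the GPU on first use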
self.pe = self.pe.cuda()
return self.pe[x]
class Downsample1d(nn.Module):
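    """Strided Conv1d that halves the temporal resolution."""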
def __init__(self, dim):
super().__init__()
self.conv = nn.Conv1d(dim, dim, 3, 2, 1)
def forward(self, x):
self.conv = self.conv.cuda()
return self.conv(x)
class Upsample1d(nn.Module):
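    """Transposed Conv1d that doubles the temporal resolution."""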
def __init__(self, dim_in, dim_out=None):
super().__init__()
dim_out = dim_out or dim_in
self.conv = nn.ConvTranspose1d(dim_in, dim_out, 4, 2, 1)
def forward(self, x):
self.conv = self.conv.cuda()
return self.conv(x)
class Conv1dBlock(nn.Module):
"""
Conv1d --> GroupNorm --> Mish
"""
def __init__(self, inp_channels, out_channels, kernel_size, n_groups=4, zero=False):
super().__init__()
self.out_channels = out_channels
self.block = nn.Conv1d(
inp_channels, out_channels, kernel_size, padding=kernel_size // 2
)
self.norm = nn.GroupNorm(n_groups, out_channels)
self.activation = nn.Mish()
if zero:
# zero init the convolution
nn.init.zeros_(self.block.weight)
nn.init.zeros_(self.block.bias)
def forward(self, x):
"""
Args:
x: [bs, nfeat, nframes]
"""
x = self.block(x)
batch_size, channels, horizon = x.size()
x = rearrange(
x, "batch channels horizon -> (batch horizon) channels"
) # [bs*seq, nfeats]
x = self.norm(x)
x = rearrange(
x.reshape(batch_size, horizon, channels),
"batch horizon channels -> batch channels horizon",
)
return self.activation(x)
def ada_shift_scale(x, shift, scale):
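    """Feature-wise modulation used by the AdaGN blocks: x * (1 + scale) + shift."""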
return x * (1 + scale) + shift
class ResidualTemporalBlock(nn.Module):
def __init__(
self,
inp_channels,
out_channels,
embed_dim,
kernel_size=5,
zero=True,
n_groups=8,
dropout: float = 0.1,
adagn=True,
):
super().__init__()
self.adagn = adagn
self.blocks = nn.ModuleList(
[
# adagn only the first conv (following guided-diffusion)
(
Conv1dAdaGNBlock(inp_channels, out_channels, kernel_size, n_groups)
if adagn
else Conv1dBlock(inp_channels, out_channels, kernel_size)
),
Conv1dBlock(
out_channels, out_channels, kernel_size, n_groups, zero=zero
),
]
)
self.time_mlp = nn.Sequential(
nn.Mish(),
# adagn = scale and shift
nn.Linear(embed_dim, out_channels * 2 if adagn else out_channels),
Rearrange("batch t -> batch t 1"),
)
self.dropout = nn.Dropout(dropout)
if zero:
nn.init.zeros_(self.time_mlp[1].weight)
nn.init.zeros_(self.time_mlp[1].bias)
self.residual_conv = (
nn.Conv1d(inp_channels, out_channels, 1)
if inp_channels != out_channels
else nn.Identity()
)
def forward(self, x, time_embeds=None):
"""
x : [ batch_size x inp_channels x nframes ]
t : [ batch_size x embed_dim ]
returns: [ batch_size x out_channels x nframes ]
"""
if self.adagn:
scale, shift = self.time_mlp(time_embeds).chunk(2, dim=1)
out = self.blocks[0](x, scale, shift)
else:
out = self.blocks[0](x) + self.time_mlp(time_embeds)
out = self.blocks[1](out)
out = self.dropout(out)
return out + self.residual_conv(x)
class CrossAttention(nn.Module):
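    """
    Cross-attention from motion frames (queries) to text tokens (keys/values).

    The forward pass also implements training-free editing hooks driven by `edit_config`:
    attention reweighting for (de-)emphasis, motion erasing, and attention replacement.
    """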
def __init__(
self,
latent_dim,
text_latent_dim,
num_heads: int = 8,
dropout: float = 0.0,
log_attn=False,
edit_config=None,
):
super().__init__()
self.num_head = num_heads
self.norm = nn.LayerNorm(latent_dim)
self.text_norm = nn.LayerNorm(text_latent_dim)
self.query = nn.Linear(latent_dim, latent_dim)
self.key = nn.Linear(text_latent_dim, latent_dim)
self.value = nn.Linear(text_latent_dim, latent_dim)
self.dropout = nn.Dropout(dropout)
self.edit_config = edit_config
self.log_attn = log_attn
def forward(self, x, xf):
"""
x: B, T, D
xf: B, N, L
"""
B, T, D = x.shape
N = xf.shape[1]
H = self.num_head
# B, T, 1, D
query = self.query(self.norm(x)).unsqueeze(2)
# B, 1, N, D
key = self.key(self.text_norm(xf)).unsqueeze(1)
query = query.view(B, T, H, -1)
key = key.view(B, N, H, -1)
# B, T, N, H
attention = torch.einsum("bnhd,bmhd->bnmh", query, key) / math.sqrt(D // H)
weight = self.dropout(F.softmax(attention, dim=2))
# attention reweighting for (de)-emphasizing motion
if self.edit_config.reweighting_attn.use:
reweighting_attn = self.edit_config.reweighting_attn.reweighting_attn_weight
if self.edit_config.reweighting_attn.idx == -1:
# read idxs from txt file
with open("./assets/reweighting_idx.txt", "r") as f:
idxs = f.readlines()
else:
# gradio demo mode
idxs = [0, self.edit_config.reweighting_attn.idx]
idxs = [int(idx) for idx in idxs]
for i in range(len(idxs)):
weight[i, :, 1 + idxs[i]] = weight[i, :, 1 + idxs[i]] + reweighting_attn
weight[i, :, 1 + idxs[i] + 1] = (
weight[i, :, 1 + idxs[i] + 1] + reweighting_attn
)
# for counting the step and logging attention maps
try:
attention_matrix = (
weight[0, :, 1 : 1 + 3]
.mean(dim=-1)
.detach()
.cpu()
.numpy()
.astype(float)
)
MONITOR_ATTN[-1].append(attention_matrix)
        except Exception:
            # logging is best-effort; MONITOR_ATTN may be empty outside of forward_with_cfg
            pass
        # erasing motion (i.e., de-emphasizing the motion within a time window)
        erasing_motion = self.edit_config.erasing_motion.use
        if erasing_motion:
            reweighting_attn = self.edit_config.erasing_motion.erasing_motion_weight
            begin = self.edit_config.erasing_motion.time_start
            end = self.edit_config.erasing_motion.time_end
            idx = self.edit_config.erasing_motion.idx
            if abs(reweighting_attn) > 0.01:
                weight[1, int(T * begin) : int(T * end), idx] = (
                    weight[1, int(T * begin) : int(T * end), idx] * reweighting_attn
                )
                weight[1, int(T * begin) : int(T * end), idx + 1] = (
                    weight[1, int(T * begin) : int(T * end), idx + 1] * reweighting_attn
                )
# attention manipulation for motion replacement
manipulation = self.edit_config.manipulation.use
if manipulation:
if (
len(MONITOR_ATTN)
<= self.edit_config.manipulation.manipulation_steps_end_crossattn
):
word_idx = self.edit_config.manipulation.word_idx
weight[1, :, : 1 + word_idx, :] = weight[0, :, : 1 + word_idx, :]
weight[1, :, 1 + word_idx + 1 :, :] = weight[
0, :, 1 + word_idx + 1 :, :
]
value = self.value(self.text_norm(xf)).view(B, N, H, -1)
y = torch.einsum("bnmh,bmhd->bnhd", weight, value).reshape(B, T, D)
return y
class ResidualCLRAttentionLayer(nn.Module):
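    """Residual wrapper that applies optional temporal self-attention followed by text cross-attention."""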
def __init__(
self,
dim1,
dim2,
num_heads: int = 8,
dropout: float = 0.1,
no_eff: bool = False,
self_attention: bool = False,
log_attn=False,
edit_config=None,
):
super(ResidualCLRAttentionLayer, self).__init__()
self.dim1 = dim1
self.dim2 = dim2
self.num_heads = num_heads
# Multi-Head Attention Layer
if no_eff:
self.cross_attention = CrossAttention(
latent_dim=dim1,
text_latent_dim=dim2,
num_heads=num_heads,
dropout=dropout,
log_attn=log_attn,
edit_config=edit_config,
)
else:
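            # NOTE: LinearCrossAttention (efficient attention, used when no_eff=False)
            # is not defined in this file; it must be provided by the surrounding codebase.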
self.cross_attention = LinearCrossAttention(
latent_dim=dim1,
text_latent_dim=dim2,
num_heads=num_heads,
dropout=dropout,
log_attn=log_attn,
)
if self_attention:
self.self_attn_use = True
self.self_attention = SelfAttention(
latent_dim=dim1,
text_latent_dim=dim2,
num_heads=num_heads,
dropout=dropout,
log_attn=log_attn,
edit_config=edit_config,
)
else:
self.self_attn_use = False
def forward(self, input_tensor, condition_tensor, cond_indices):
"""
input_tensor :B, D, L
condition_tensor: B, L, D
"""
if cond_indices.numel() == 0:
return input_tensor
# self attention
if self.self_attn_use:
x = input_tensor
x = x.permute(0, 2, 1) # (batch_size, seq_length, feat_dim)
x = self.self_attention(x)
x = x.permute(0, 2, 1) # (batch_size, feat_dim, seq_length)
input_tensor = input_tensor + x
x = input_tensor
# cross attention
x = x[cond_indices].permute(0, 2, 1) # (batch_size, seq_length, feat_dim)
x = self.cross_attention(x, condition_tensor[cond_indices])
x = x.permute(0, 2, 1) # (batch_size, feat_dim, seq_length)
input_tensor[cond_indices] = input_tensor[cond_indices] + x
return input_tensor
class CLRBlock(nn.Module):
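    """One UNet stage: residual temporal convolution -> CLR attention (self + cross) -> feed-forward."""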
def __init__(
self,
dim_in,
dim_out,
cond_dim,
time_dim,
adagn=True,
zero=True,
no_eff=False,
self_attention=False,
dropout: float = 0.1,
log_attn=False,
edit_config=None,
) -> None:
super().__init__()
self.conv1d = ResidualTemporalBlock(
dim_in, dim_out, embed_dim=time_dim, adagn=adagn, zero=zero, dropout=dropout
)
self.clr_attn = ResidualCLRAttentionLayer(
dim1=dim_out,
dim2=cond_dim,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
)
self.ffn = FFN(dim_out, dim_out * 4, dropout=dropout)
def forward(self, x, t, cond, cond_indices=None):
x = self.conv1d(x, t)
x = self.clr_attn(x, cond, cond_indices)
x = self.ffn(x.permute(0, 2, 1)).permute(0, 2, 1)
return x
class CondUnet1D(nn.Module):
"""
Diffusion's style UNET with 1D convolution and adaptive group normalization for motion suquence denoising,
cross-attention to introduce conditional prompts (like text).
"""
def __init__(
self,
input_dim,
cond_dim,
dim=128,
dim_mults=(1, 2, 4, 8),
dims=None,
time_dim=512,
adagn=True,
zero=True,
dropout=0.1,
no_eff=False,
self_attention=False,
log_attn=False,
edit_config=None,
):
super().__init__()
if not dims:
            dims = [input_dim, *map(lambda m: int(dim * m), dim_mults)]  # e.g. [input_dim, dim, 2*dim, 4*dim, 8*dim]
print("dims: ", dims, "mults: ", dim_mults)
in_out = list(zip(dims[:-1], dims[1:]))
self.time_mlp = nn.Sequential(
TimestepEmbedder(time_dim),
nn.Linear(time_dim, time_dim * 4),
nn.Mish(),
nn.Linear(time_dim * 4, time_dim),
)
self.downs = nn.ModuleList([])
self.ups = nn.ModuleList([])
for ind, (dim_in, dim_out) in enumerate(in_out):
self.downs.append(
nn.ModuleList(
[
CLRBlock(
dim_in,
dim_out,
cond_dim,
time_dim,
adagn=adagn,
zero=zero,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
),
CLRBlock(
dim_out,
dim_out,
cond_dim,
time_dim,
adagn=adagn,
zero=zero,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
),
Downsample1d(dim_out),
]
)
)
mid_dim = dims[-1]
self.mid_block1 = CLRBlock(
dim_in=mid_dim,
dim_out=mid_dim,
cond_dim=cond_dim,
time_dim=time_dim,
adagn=adagn,
zero=zero,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
)
self.mid_block2 = CLRBlock(
dim_in=mid_dim,
dim_out=mid_dim,
cond_dim=cond_dim,
time_dim=time_dim,
adagn=adagn,
zero=zero,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
)
last_dim = mid_dim
for ind, dim_out in enumerate(reversed(dims[1:])):
self.ups.append(
nn.ModuleList(
[
Upsample1d(last_dim, dim_out),
CLRBlock(
dim_out * 2,
dim_out,
cond_dim,
time_dim,
adagn=adagn,
zero=zero,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
),
CLRBlock(
dim_out,
dim_out,
cond_dim,
time_dim,
adagn=adagn,
zero=zero,
no_eff=no_eff,
dropout=dropout,
self_attention=self_attention,
log_attn=log_attn,
edit_config=edit_config,
),
]
)
)
last_dim = dim_out
self.final_conv = nn.Conv1d(dim_out, input_dim, 1)
if zero:
nn.init.zeros_(self.final_conv.weight)
nn.init.zeros_(self.final_conv.bias)
def forward(
self,
x,
t,
cond,
cond_indices,
):
self.time_mlp = self.time_mlp.cuda()
temb = self.time_mlp(t)
h = []
for block1, block2, downsample in self.downs:
block1 = block1.cuda()
block2 = block2.cuda()
x = block1(x, temb, cond, cond_indices)
x = block2(x, temb, cond, cond_indices)
h.append(x)
x = downsample(x)
self.mid_block1 = self.mid_block1.cuda()
self.mid_block2 = self.mid_block2.cuda()
x = self.mid_block1(x, temb, cond, cond_indices)
x = self.mid_block2(x, temb, cond, cond_indices)
for upsample, block1, block2 in self.ups:
x = upsample(x)
x = torch.cat((x, h.pop()), dim=1)
block1 = block1.cuda()
block2 = block2.cuda()
x = block1(x, temb, cond, cond_indices)
x = block2(x, temb, cond, cond_indices)
self.final_conv = self.final_conv.cuda()
x = self.final_conv(x)
return x
class MotionCLR(nn.Module):
"""
Diffuser's style UNET for text-to-motion task.
"""
def __init__(
self,
input_feats,
base_dim=128,
dim_mults=(1, 2, 2, 2),
dims=None,
adagn=True,
zero=True,
dropout=0.1,
no_eff=False,
time_dim=512,
latent_dim=256,
cond_mask_prob=0.1,
clip_dim=512,
clip_version="ViT-B/32",
text_latent_dim=256,
text_ff_size=2048,
text_num_heads=4,
activation="gelu",
num_text_layers=4,
self_attention=False,
vis_attn=False,
edit_config=None,
out_path=None,
):
super().__init__()
self.input_feats = input_feats
self.dim_mults = dim_mults
self.base_dim = base_dim
self.latent_dim = latent_dim
self.cond_mask_prob = cond_mask_prob
self.vis_attn = vis_attn
self.counting_map = []
self.out_path = out_path
        print(
            f"The T2M UNet masks the text prompt with probability {self.cond_mask_prob} during training"
        )
# text encoder
self.embed_text = nn.Linear(clip_dim, text_latent_dim)
self.clip_version = clip_version
self.clip_model = self.load_and_freeze_clip(clip_version)
replace_layer_norm(self.clip_model)
textTransEncoderLayer = nn.TransformerEncoderLayer(
d_model=text_latent_dim,
nhead=text_num_heads,
dim_feedforward=text_ff_size,
dropout=dropout,
activation=activation,
)
self.textTransEncoder = nn.TransformerEncoder(
textTransEncoderLayer, num_layers=num_text_layers
)
self.text_ln = nn.LayerNorm(text_latent_dim)
self.unet = CondUnet1D(
input_dim=self.input_feats,
cond_dim=text_latent_dim,
dim=self.base_dim,
dim_mults=self.dim_mults,
adagn=adagn,
zero=zero,
dropout=dropout,
no_eff=no_eff,
dims=dims,
time_dim=time_dim,
self_attention=self_attention,
log_attn=self.vis_attn,
edit_config=edit_config,
)
self.clip_model = self.clip_model.cuda()
self.embed_text = self.embed_text.cuda()
self.textTransEncoder = self.textTransEncoder.cuda()
self.text_ln = self.text_ln.cuda()
self.unet = self.unet.cuda()
def encode_text(self, raw_text, device):
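        """Encode raw text with the frozen CLIP backbone, then project and refine it with the trainable text transformer; returns [B, n_tokens, text_latent_dim]."""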
self.clip_model.token_embedding = self.clip_model.token_embedding.to(device)
self.clip_model.transformer = self.clip_model.transformer.to(device)
self.clip_model.ln_final = self.clip_model.ln_final.to(device)
with torch.no_grad():
texts = clip.tokenize(raw_text, truncate=True).to(
device
) # [bs, context_length] # if n_tokens > 77 -> will truncate
x = self.clip_model.token_embedding(texts).type(self.clip_model.dtype).to(device) # [batch_size, n_ctx, d_model]
x = x + self.clip_model.positional_embedding.type(self.clip_model.dtype).to(device)
x = x.permute(1, 0, 2) # NLD -> LND
x = self.clip_model.transformer(x)
x = self.clip_model.ln_final(x).type(
self.clip_model.dtype
) # [len, batch_size, 512]
self.embed_text = self.embed_text.to(device)
x = self.embed_text(x) # [len, batch_size, 256]
self.textTransEncoder = self.textTransEncoder.to(device)
x = self.textTransEncoder(x)
self.text_ln = self.text_ln.to(device)
x = self.text_ln(x)
# T, B, D -> B, T, D
xf_out = x.permute(1, 0, 2)
ablation_text = False
if ablation_text:
xf_out[:, 1:, :] = xf_out[:, 0, :].unsqueeze(1)
return xf_out
def load_and_freeze_clip(self, clip_version):
clip_model, _ = clip.load( # clip_model.dtype=float32
clip_version, device="cpu", jit=False
) # Must set jit=False for training
# Freeze CLIP weights
clip_model.eval()
for p in clip_model.parameters():
p.requires_grad = False
return clip_model
def mask_cond(self, bs, force_mask=False):
"""
mask motion condition , return contitional motion index in the batch
"""
if force_mask:
cond_indices = torch.empty(0)
elif self.training and self.cond_mask_prob > 0.0:
mask = torch.bernoulli(
torch.ones(
bs,
)
* self.cond_mask_prob
) # 1-> use null_cond, 0-> use real cond
mask = 1.0 - mask
cond_indices = torch.nonzero(mask).squeeze(-1)
else:
cond_indices = torch.arange(bs)
return cond_indices
def forward(
self,
x,
timesteps,
text=None,
uncond=False,
enc_text=None,
):
"""
Args:
x: [batch_size, nframes, nfeats],
timesteps: [batch_size] (int)
text: list (batch_size length) of strings with input text prompts
uncond: whethere using text condition
Returns: [batch_size, seq_length, nfeats]
"""
B, T, _ = x.shape
x = x.transpose(1, 2) # [bs, nfeats, nframes]
if enc_text is None:
enc_text = self.encode_text(text, x.device) # [bs, seqlen, text_dim]
cond_indices = self.mask_cond(x.shape[0], force_mask=uncond)
        # NOTE: pad the time axis to a multiple of 16 so it survives the UNet's downsampling
        padding_needed = (16 - (T % 16)) % 16
        x = F.pad(x, (0, padding_needed), value=0)
x = self.unet(
x,
t=timesteps,
cond=enc_text,
cond_indices=cond_indices,
        )  # [bs, nfeats, nframes]
        x = x[:, :, :T].transpose(1, 2)  # [bs, nframes, nfeats]
return x
def forward_with_cfg(self, x, timesteps, text=None, enc_text=None, cfg_scale=2.5):
"""
Args:
x: [batch_size, nframes, nfeats],
timesteps: [batch_size] (int)
text: list (batch_size length) of strings with input text prompts
Returns: [batch_size, max_frames, nfeats]
"""
global SELF_ATTN
global MONITOR_ATTN
MONITOR_ATTN.append([])
SELF_ATTN.append([])
B, T, _ = x.shape
x = x.transpose(1, 2) # [bs, nfeats, nframes]
if enc_text is None:
enc_text = self.encode_text(text, x.device) # [bs, seqlen, text_dim]
cond_indices = self.mask_cond(B)
        # NOTE: pad the time axis to a multiple of 16 so it survives the UNet's downsampling
        padding_needed = (16 - (T % 16)) % 16
        x = F.pad(x, (0, padding_needed), value=0)
combined_x = torch.cat([x, x], dim=0)
combined_t = torch.cat([timesteps, timesteps], dim=0)
out = self.unet(
x=combined_x,
t=combined_t,
cond=enc_text,
cond_indices=cond_indices,
) # [bs, nfeats, nframes]
        out = out[:, :, :T].transpose(1, 2)  # [bs, nframes, nfeats]
out_cond, out_uncond = torch.split(out, len(out) // 2, dim=0)
        if self.vis_attn:
i = len(MONITOR_ATTN)
attnlist = MONITOR_ATTN[-1]
print(i, "cross", len(attnlist))
for j, att in enumerate(attnlist):
vis_attn(
att,
out_path=self.out_path,
step=i,
layer=j,
shape="_".join(map(str, att.shape)),
type_="cross",
)
attnlist = SELF_ATTN[-1]
print(i, "self", len(attnlist))
for j, att in enumerate(attnlist):
vis_attn(
att,
out_path=self.out_path,
step=i,
layer=j,
shape="_".join(map(str, att.shape)),
type_="self",
lines=False,
)
if len(SELF_ATTN) % 10 == 0:
SELF_ATTN = []
MONITOR_ATTN = []
return out_uncond + (cfg_scale * (out_cond - out_uncond))
if __name__ == "__main__":
device = "cuda:0"
n_feats = 263
num_frames = 196
text_latent_dim = 256
dim_mults = [2, 2, 2, 2]
base_dim = 512
model = MotionCLR(
input_feats=n_feats,
text_latent_dim=text_latent_dim,
base_dim=base_dim,
dim_mults=dim_mults,
adagn=True,
zero=True,
dropout=0.1,
no_eff=True,
cond_mask_prob=0.1,
self_attention=True,
)
model = model.to(device)
from utils.model_load import load_model_weights
checkpoint_path = "/comp_robot/chenlinghao/StableMoFusion/checkpoints/t2m/self_attn—fulllayer-ffn-drop0_1-lr1e4/model/latest.tar"
new_state_dict = {}
checkpoint = torch.load(checkpoint_path)
ckpt2 = checkpoint.copy()
ckpt2["model_ema"] = {}
ckpt2["encoder"] = {}
for key, value in list(checkpoint["model_ema"].items()):
new_key = key.replace(
"cross_attn", "clr_attn"
) # Replace 'cross_attn' with 'clr_attn'
ckpt2["model_ema"][new_key] = value
for key, value in list(checkpoint["encoder"].items()):
new_key = key.replace(
"cross_attn", "clr_attn"
) # Replace 'cross_attn' with 'clr_attn'
ckpt2["encoder"][new_key] = value
torch.save(
ckpt2,
"/comp_robot/chenlinghao/CLRpreview/checkpoints/t2m/release/model/latest.tar",
)
dtype = torch.float32
bs = 1
x = torch.rand((bs, 196, 263), dtype=dtype).to(device)
timesteps = torch.randint(low=0, high=1000, size=(bs,)).to(device)
y = ["A man jumps to his left." for i in range(bs)]
length = torch.randint(low=20, high=196, size=(bs,)).to(device)
out = model(x, timesteps, text=y)
print(out.shape)
model.eval()
out = model.forward_with_cfg(x, timesteps, text=y)
print(out.shape)