File size: 50,663 Bytes

import sys

sys.path.append("src")
import torch
import logging
import torch.nn as nn
from qa_mdt.audioldm_train.modules.clap.open_clip import create_model
from qa_mdt.audioldm_train.modules.clap.training.data import get_audio_features

import torchaudio
from transformers import (
    RobertaTokenizer,
    AutoTokenizer,
    T5EncoderModel,
    MT5EncoderModel,
)
import torch.nn.functional as F
from qa_mdt.audioldm_train.modules.audiomae.AudioMAE import Vanilla_AudioMAE
from qa_mdt.audioldm_train.modules.phoneme_encoder.encoder import TextEncoder

from transformers import SpeechT5Processor, AutoTokenizer, GPT2Model, GPT2Tokenizer
from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithTextPrenet

from qa_mdt.audioldm_train.modules.audiomae.sequence_gen.model import CLAP2AudioMAE
from qa_mdt.audioldm_train.modules.audiomae.sequence_gen.sequence_input import (
    Sequence2AudioMAE,
)
import numpy as np
from qa_mdt.audioldm_train.modules.audiomae.sequence_gen.model import Prenet
import json 
with open('offset_pretrained_checkpoints.json', 'r') as config_file:
    config_data = json.load(config_file)

"""
The model forward function can return three types of data:
1. tensor: used directly as conditioning signal
2. dict: where there is a main key as condition, there are also other key that you can use to pass loss function and itermediate result. etc.
3. list: the length is 2, in which the first element is tensor, the second element is attntion mask.

The output shape for the cross attention condition should be:
x,x_mask = [bs, seq_len, emb_dim], [bs, seq_len]

All the returned data, in which will be used as diffusion input, will need to be in float type
"""


class GPT2WordEmbedding(nn.Module):
    def __init__(self):
        super().__init__()
        # self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GPT2Model.from_pretrained("gpt2").wte
        self.device = None

    def get_unconditional_condition(self, batchsize):
        unconditional_condition = ["random"] * batchsize
        return self(unconditional_condition)

    def forward(self, text):
        assert isinstance(text, list)
        if self.device is None:
            self.device = next(self.model.parameters()).device

        tokenization_result = self.tokenizer(text, return_tensors="pt", padding=True)
        input_ids, attn_mask = tokenization_result["input_ids"].to(
            self.device
        ), tokenization_result["attention_mask"].to(self.device)

        input_embed = self.model(input_ids.long())

        return [input_embed, attn_mask]


class ConcateBandWidthCond(nn.Module):
    def __init__(self, latent_t_size, latent_f_size):
        super().__init__()
        self.placeholder = nn.Linear(1, 1)
        self.latent_t_size = latent_t_size
        self.latent_f_size = latent_f_size
        self.device = None

    def get_unconditional_condition(self, batchsize):
        return torch.zeros((batchsize, self.latent_t_size, self.latent_f_size)).to(
            self.device
        )

    def forward(self, mel_spec_bandwidth_cond_extra_channel):
        if self.device is None:
            self.device = mel_spec_bandwidth_cond_extra_channel.device

        return mel_spec_bandwidth_cond_extra_channel


class BandwidthEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(1000, 128)
        nn.init.normal_(self.emb.weight, 0.0, 128**-0.5)
        self.linear_bandwidth = nn.Linear(128, 128)
        self.unconditional_condition = torch.zeros((1, 256))
        self.device = None

    def get_unconditional_condition(self, batchsize):
        return self.unconditional_condition.expand(batchsize, 256)

    def forward(self, bandwidth):

        if self.device is None:
            self.device = next(self.linear_bandwidth.parameters()).device
            self.unconditional_condition = self.unconditional_condition.to(self.device)

        # freq_energy_percentile
        lower_cutoff, higher_cutoff = bandwidth[..., 0], bandwidth[..., 1]
        # lower_cutoff, higher_cutoff = lower_cutoff*0+5, higher_cutoff*0+300

        lower_cutoff_emb = self.linear_bandwidth(self.emb(lower_cutoff.long()))
        higher_cutoff_emb = self.linear_bandwidth(self.emb(higher_cutoff.long()))
        cutoff_emb = torch.cat([lower_cutoff_emb, higher_cutoff_emb], dim=-1)
        # [bs, 256]
        return cutoff_emb


class SpeechT5TextEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.model = SpeechT5EncoderWithTextPrenet.from_pretrained(
            "microsoft/speecht5_tts"
        )
        for p in self.model.parameters():
            p.requires_grad = False
        self.model.eval()

    # Required
    def get_unconditional_condition(self, batchsize):
        device = self.model.device
        hidden_state = torch.zeros((batchsize, 1, 768)).to(device)
        attention_mask = torch.ones((batchsize, 1)).to(device)
        return [hidden_state.float(), attention_mask.float()]

    def forward(self, text):
        with torch.no_grad():
            device = self.model.device
            inputs = self.processor(text=text, return_tensors="pt", padding=True)
            input_ids, attention_mask = inputs["input_ids"].to(device), inputs[
                "attention_mask"
            ].to(device)
            emb = self.model(input_ids, attention_mask)
            emb = emb.last_hidden_state.detach()
        return [emb.float(), attention_mask.float()]


class PhonemeEncoder(nn.Module):
    def __init__(self, vocabs_size=41, pad_length=250, pad_token_id=None):
        super().__init__()
        """
            encoder = PhonemeEncoder(40)
            data = torch.randint(0, 39, (2, 250))
            output = encoder(data)
            import ipdb;ipdb.set_trace()
        """
        assert pad_token_id is not None

        self.device = None
        self.PAD_LENGTH = int(pad_length)
        self.pad_token_id = pad_token_id
        self.pad_token_sequence = torch.tensor([self.pad_token_id] * self.PAD_LENGTH)

        self.text_encoder = TextEncoder(
            n_vocab=vocabs_size,
            out_channels=192,
            hidden_channels=192,
            filter_channels=768,
            n_heads=2,
            n_layers=6,
            kernel_size=3,
            p_dropout=0.1,
        )

        self.learnable_positional_embedding = torch.nn.Parameter(
            torch.zeros((1, 192, self.PAD_LENGTH))
        )  # [batchsize, seqlen, padlen]
        self.learnable_positional_embedding.requires_grad = True

    # Required
    def get_unconditional_condition(self, batchsize):
        unconditional_tokens = self.pad_token_sequence.expand(
            batchsize, self.PAD_LENGTH
        )
        return self(unconditional_tokens)  # Need to return float type

    # def get_unconditional_condition(self, batchsize):

    #     hidden_state = torch.zeros((batchsize, self.PAD_LENGTH, 192)).to(self.device)
    #     attention_mask = torch.ones((batchsize, self.PAD_LENGTH)).to(self.device)
    #     return [hidden_state, attention_mask] # Need to return float type

    def _get_src_mask(self, phoneme):
        src_mask = phoneme != self.pad_token_id
        return src_mask

    def _get_src_length(self, phoneme):
        src_mask = self._get_src_mask(phoneme)
        length = torch.sum(src_mask, dim=-1)
        return length

    # def make_empty_condition_unconditional(self, src_length, text_emb, attention_mask):
    #     # src_length: [bs]
    #     # text_emb: [bs, 192, pad_length]
    #     # attention_mask: [bs, pad_length]
    #     mask = src_length[..., None, None] > 1
    #     text_emb = text_emb * mask

    #     attention_mask[src_length < 1] = attention_mask[src_length < 1] * 0.0 + 1.0
    #     return text_emb, attention_mask

    def forward(self, phoneme_idx):
        if self.device is None:
            self.device = self.learnable_positional_embedding.device
            self.pad_token_sequence = self.pad_token_sequence.to(self.device)

        src_length = self._get_src_length(phoneme_idx)
        text_emb, m, logs, text_emb_mask = self.text_encoder(phoneme_idx, src_length)
        text_emb = text_emb + self.learnable_positional_embedding

        # text_emb, text_emb_mask = self.make_empty_condition_unconditional(src_length, text_emb, text_emb_mask)

        return [
            text_emb.permute(0, 2, 1),
            text_emb_mask.squeeze(1),
        ]  # [2, 250, 192], [2, 250]


class FlanT5HiddenState(nn.Module):
    """
    llama = FlanT5HiddenState()
    data = ["","this is not an empty sentence"]
    encoder_hidden_states = llama(data)
    import ipdb;ipdb.set_trace()
    """

    def __init__(
        self, text_encoder_name=config_data['flan_t5'], freeze_text_encoder=True
    ):
        super().__init__()
        self.freeze_text_encoder = freeze_text_encoder
        ## MODIFIED 
        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
        self.model = T5EncoderModel.from_pretrained("google/flan-t5-large")
        if freeze_text_encoder:
            self.model.eval()
            for p in self.model.parameters():
                p.requires_grad = False
        else:
            print("=> The text encoder is learnable")

        self.empty_hidden_state_cfg = None
        self.device = None

    # Required
    def get_unconditional_condition(self, batchsize):
        param = next(self.model.parameters())
        if self.freeze_text_encoder:
            assert param.requires_grad == False

        # device = param.device
        if self.empty_hidden_state_cfg is None:
            self.empty_hidden_state_cfg, _ = self([""])

        hidden_state = torch.cat([self.empty_hidden_state_cfg] * batchsize).float()
        attention_mask = (
            torch.ones((batchsize, hidden_state.size(1)))
            .to(hidden_state.device)
            .float()
        )
        return [hidden_state, attention_mask]  # Need to return float type

    def forward(self, batch):
        param = next(self.model.parameters())
        if self.freeze_text_encoder:
            assert param.requires_grad == False

        if self.device is None:
            self.device = param.device

        # print("Manually change text")
        # for i in range(len(batch)):
        #     batch[i] = "dog barking"
        try:
            return self.encode_text(batch)
        except Exception as e:
            print(e, batch)
            logging.exception("An error occurred: %s", str(e))

    def encode_text(self, prompt):
        device = self.model.device
        batch = self.tokenizer(
            prompt,
            max_length=128,  # self.tokenizer.model_max_length
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(
            device
        )
        # Get text encoding
        if self.freeze_text_encoder:
            with torch.no_grad():
                encoder_hidden_states = self.model(
                    input_ids=input_ids, attention_mask=attention_mask
                )[0]
        else:
            encoder_hidden_states = self.model(
                input_ids=input_ids, attention_mask=attention_mask
            )[0]
        return [
            encoder_hidden_states.detach(),
            attention_mask.float(),
        ]  # Attention mask == 1 means usable token


class FlanT5HiddenStatePaddedSameLength(nn.Module):
    """
    llama = FlanT5HiddenState()
    data = ["","this is not an empty sentence"]
    encoder_hidden_states = llama(data)
    import ipdb;ipdb.set_trace()
    """

    def __init__(
        self, text_encoder_name="google/flan-t5-large", freeze_text_encoder=True
    ):
        super().__init__()
        self.freeze_text_encoder = freeze_text_encoder
        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
        self.model = T5EncoderModel.from_pretrained("google/flan-t5-large")
        if freeze_text_encoder:
            self.model.eval()
            for p in self.model.parameters():
                p.requires_grad = False
        else:
            print("=> The text encoder is learnable")

        self.empty_hidden_state_cfg = None
        self.device = None

    # Required
    def get_unconditional_condition(self, batchsize):
        param = next(self.model.parameters())
        if self.freeze_text_encoder:
            assert param.requires_grad == False

        # device = param.device
        if self.empty_hidden_state_cfg is None:
            self.empty_hidden_state_cfg, _ = self([""])

        hidden_state = torch.cat([self.empty_hidden_state_cfg] * batchsize).float()
        attention_mask = (
            torch.ones((batchsize, hidden_state.size(1)))
            .to(hidden_state.device)
            .float()
        )
        return [hidden_state, attention_mask]  # Need to return float type

    def forward(self, batch):
        param = next(self.model.parameters())
        if self.freeze_text_encoder:
            assert param.requires_grad == False

        if self.device is None:
            self.device = param.device

        # print("Manually change text")
        # for i in range(len(batch)):
        #     batch[i] = "dog barking"
        try:
            text_embed = self.encode_text(batch)
            return text_embed
        except Exception as e:
            print(e, batch)
            logging.exception("An error occurred: %s", str(e))

    def encode_text(self, prompt):
        device = self.model.device
        batch = self.tokenizer(
            prompt,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(
            device
        )

        # Get text encoding
        if self.freeze_text_encoder:
            with torch.no_grad():
                encoder_hidden_states = self.model(
                    input_ids=input_ids, attention_mask=attention_mask
                )[0]
        else:
            encoder_hidden_states = self.model(
                input_ids=input_ids, attention_mask=attention_mask
            )[0]
        return [
            encoder_hidden_states.detach(),
            attention_mask.float(),
        ]  # Attention mask == 1 means usable token


class CLAPGenAudioMAECond(CLAP2AudioMAE):
    def __init__(
        self,
        cond_stage_config,
        learnable=True,
        pretrained_path=None,
        use_gt_mae_output=None,  # False: does not use AudioMAE GT, True: Use AudioMAE GT
        use_gt_mae_prob=None,
    ):  # The prob of using AudioMAE GT
        super().__init__(base_learning_rate=1e-5, cond_stage_config=cond_stage_config)
        assert use_gt_mae_output is not None and use_gt_mae_prob is not None

        if pretrained_path is not None:
            print("Reload CLAPGenAudioMAECond from %s" % pretrained_path)
            state_dict = torch.load(pretrained_path)["state_dict"]
            self.load_state_dict(state_dict)

        self.use_gt_mae_output = use_gt_mae_output
        self.use_gt_mae_prob = use_gt_mae_prob
        self.learnable = learnable

        if not learnable:
            # Only optimize the GPT2 model
            for p in self.model.parameters():
                p.requires_grad = False
            self.eval()

    # Required
    def get_unconditional_condition(self, batchsize):
        return_dict = self.cfg_uncond(batchsize)
        return return_dict

    def forward(self, batch):
        # The conditional module can return both tensor or dictionaries
        # The returned tensor will be corresponding to the cond_stage_key
        # The returned dict will have keys that correspond to the cond_stage_key
        ret_dict = {}
        if self.use_gt_mae_output and torch.rand(1).item() < self.use_gt_mae_prob:
            cond_dict = self.get_input(batch)
            # Used as condition
            ret_dict["crossattn_clap_to_audiomae_feature"] = [
                cond_dict["crossattn_audiomae_pooled"][0],
                torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float(),
            ]  # Input sequence and mask
        else:
            # Used as condition
            input_embeds, cond_dict = self.generate(batch)
            input_embeds_mask = (
                torch.ones((input_embeds.size(0), input_embeds.size(1)))
                .to(input_embeds.device)
                .float()
            )
            ret_dict["crossattn_clap_to_audiomae_feature"] = [
                input_embeds,
                input_embeds_mask,
            ]  # Input sequence and mask

        # If the following two keys are not in cond_stage_key, then they will not be used as condition
        ret_dict["film_clap_cond1"] = cond_dict[
            "film_clap_cond1"
        ]  # the clap target latent
        ret_dict["crossattn_audiomae_pooled"] = cond_dict[
            "crossattn_audiomae_pooled"
        ]  # audiomae target latent

        if self.learnable and self.training:
            loss = self.training_step(batch, cond_dict=cond_dict)
            ret_dict["noncond_loss_clap2audiomae"] = loss

        return ret_dict


class SequenceGenAudioMAECond(Sequence2AudioMAE):
    def __init__(
        self,
        cond_stage_config,
        base_learning_rate,
        sequence_gen_length,
        sequence_input_key,
        sequence_input_embed_dim,
        batchsize,
        always_output_audiomae_gt=False,
        pretrained_path=None,
        force_reload_pretrain_avoid_overwrite=False,
        learnable=True,
        use_warmup=True,
        use_gt_mae_output=None,  # False: does not use AudioMAE GT, True: Use AudioMAE GT
        use_gt_mae_prob=None,
    ):  # The prob of using AudioMAE GT
        if use_warmup:
            print(
                "Warning: You didn't initialize sequence prediction module with trainer. Set warmup to False. You can still use the warmup scheme from the latent diffusion model."
            )
            use_warmup = False

        super().__init__(
            base_learning_rate=base_learning_rate,
            cond_stage_config=cond_stage_config,
            sequence_gen_length=sequence_gen_length,
            sequence_input_key=sequence_input_key,
            use_warmup=use_warmup,
            sequence_input_embed_dim=sequence_input_embed_dim,
            batchsize=batchsize,
        )

        assert use_gt_mae_output is not None and use_gt_mae_prob is not None
        self.always_output_audiomae_gt = always_output_audiomae_gt
        self.force_reload_pretrain_avoid_overwrite = (
            force_reload_pretrain_avoid_overwrite
        )
        self.pretrained_path = pretrained_path
        if self.force_reload_pretrain_avoid_overwrite:
            self.is_reload = False
        else:
            self.is_reload = True

        self.load_pretrain_model()

        self.use_gt_mae_output = use_gt_mae_output
        self.use_gt_mae_prob = use_gt_mae_prob
        self.learnable = learnable

        if not learnable:
            # Only optimize the GPT2 model
            for p in self.model.parameters():
                p.requires_grad = False
            self.eval()

    def load_pretrain_model(self):
        if self.pretrained_path is not None:
            print("Reload SequenceGenAudioMAECond from %s" % self.pretrained_path)
            state_dict = torch.load(self.pretrained_path)["state_dict"]
            self.load_state_dict(state_dict)

    # Required
    def get_unconditional_condition(self, batchsize):
        return_dict = self.cfg_uncond(batchsize)
        return_dict["crossattn_audiomae_generated"] = [
            return_dict["crossattn_audiomae_pooled"][0],
            torch.ones_like(return_dict["crossattn_audiomae_pooled"][1]).float(),
        ]
        return return_dict

    def forward(self, batch):
        # The conditional module can return both tensor or dictionaries
        # The returned tensor will be corresponding to the cond_stage_key
        # The returned dict will have keys that correspond to the cond_stage_key
        ret_dict = {}

        if self.force_reload_pretrain_avoid_overwrite and not self.is_reload:
            self.load_pretrain_model()
            self.is_reload = True

        self.check_module_param_update()

        if self.always_output_audiomae_gt or (
            self.use_gt_mae_output and torch.rand(1).item() < self.use_gt_mae_prob
        ):
            cond_dict = self.get_input(batch)
            ret_dict["crossattn_audiomae_generated"] = [
                cond_dict["crossattn_audiomae_pooled"][0],
                torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float(),
            ]  # Input sequence and mask
            # _, output = self.training_step(batch, cond_dict=cond_dict, return_output=True)
            # ret_dict["crossattn_audiomae_generated"] = [output, torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float()] # Input sequence and mask
        else:
            if not self.training:
                print("--------------> Generate !!!!!!!!!!!!")
            input_embeds, cond_dict = self.generate(batch)
            # print("Generate Partial!!!!"); input_embeds, cond_dict = self.generate_partial(batch)
            input_embeds_mask = (
                torch.ones((input_embeds.size(0), input_embeds.size(1)))
                .to(input_embeds.device)
                .float()
            )
            ret_dict["crossattn_audiomae_generated"] = [
                input_embeds,
                input_embeds_mask,
            ]  # Input sequence and mask

        # If the following two keys are not in cond_stage_key, then they will not be used as condition
        for key in cond_dict.keys():
            ret_dict[key] = cond_dict[key]

        if self.learnable and self.training:
            loss = self.training_step(batch, cond_dict=cond_dict)
            ret_dict["noncond_loss_clap2audiomae"] = loss

        return ret_dict


class SequenceGenAudioMAECond_AudioMAE_PostNet(Sequence2AudioMAE):
    def __init__(
        self,
        cond_stage_config,
        base_learning_rate,
        sequence_gen_length,
        sequence_input_key,
        sequence_input_embed_dim,
        batchsize,
        always_output_audiomae_gt=False,
        pretrained_path=None,
        use_ar_gen_loss=False,
        force_reload_pretrain_avoid_overwrite=False,
        learnable=True,
        use_warmup=True,
        use_gt_mae_output=None,  # False: does not use AudioMAE GT, True: Use AudioMAE GT
        use_gt_mae_prob=None,
    ):  # The prob of using AudioMAE GT
        if use_warmup:
            print(
                "Warning: You didn't initialize sequence prediction module with trainer. Set warmup to False. You can still use the warmup scheme from the latent diffusion model."
            )
            use_warmup = False

        super().__init__(
            base_learning_rate=base_learning_rate,
            cond_stage_config=cond_stage_config,
            sequence_gen_length=sequence_gen_length,
            sequence_input_key=sequence_input_key,
            use_ar_gen_loss=use_ar_gen_loss,
            use_warmup=use_warmup,
            sequence_input_embed_dim=sequence_input_embed_dim,
            batchsize=batchsize,
        )

        assert use_gt_mae_output is not None and use_gt_mae_prob is not None
        self.always_output_audiomae_gt = always_output_audiomae_gt
        self.force_reload_pretrain_avoid_overwrite = (
            force_reload_pretrain_avoid_overwrite
        )
        self.pretrained_path = pretrained_path
        if self.force_reload_pretrain_avoid_overwrite:
            self.is_reload = False
        else:
            self.is_reload = True

        self.load_pretrain_model()

        self.prenet = Prenet(in_dim=768, sizes=[768, 768, 768], dropout_rate=0.5)

        self.use_gt_mae_output = use_gt_mae_output
        self.use_gt_mae_prob = use_gt_mae_prob
        self.learnable = learnable

        if not learnable:
            # Only optimize the GPT2 model
            for p in self.model.parameters():
                p.requires_grad = False
            self.eval()

    def load_pretrain_model(self):
        if self.pretrained_path is not None:
            print("Reload SequenceGenAudioMAECond from %s" % self.pretrained_path)
            state_dict = torch.load(self.pretrained_path)["state_dict"]
            self.load_state_dict(state_dict)

    # Required
    def get_unconditional_condition(self, batchsize):
        return_dict = self.cfg_uncond(batchsize)
        return_dict["crossattn_audiomae_generated"] = [
            return_dict["crossattn_audiomae_pooled"][0],
            torch.ones_like(return_dict["crossattn_audiomae_pooled"][1]).float(),
        ]
        return return_dict

    def forward(self, batch):
        # The conditional module can return both tensor or dictionaries
        # The returned tensor will be corresponding to the cond_stage_key
        # The returned dict will have keys that correspond to the cond_stage_key
        ret_dict = {}

        if self.force_reload_pretrain_avoid_overwrite and not self.is_reload:
            self.load_pretrain_model()
            self.is_reload = True

        self.check_module_param_update()

        if self.always_output_audiomae_gt or (
            self.use_gt_mae_output and torch.rand(1).item() < self.use_gt_mae_prob
        ):
            cond_dict = self.get_input(batch)
            gt_audiomae = self.prenet(cond_dict["crossattn_audiomae_pooled"][0])
            ret_dict["crossattn_audiomae_generated"] = [
                gt_audiomae,
                torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float(),
            ]  # Input sequence and mask
        else:
            print("--------------> Generate!!!!!!!!!!!!")
            input_embeds, cond_dict = self.generate(batch)
            # input_embeds, cond_dict = self.generate_partial(batch)
            input_embeds = self.prenet(input_embeds)
            input_embeds_mask = (
                torch.ones((input_embeds.size(0), input_embeds.size(1)))
                .to(input_embeds.device)
                .float()
            )
            ret_dict["crossattn_audiomae_generated"] = [
                input_embeds,
                input_embeds_mask,
            ]  # Input sequence and mask

        # If the following two keys are not in cond_stage_key, then they will not be used as condition
        for key in cond_dict.keys():
            ret_dict[key] = cond_dict[key]

        if self.learnable and self.training:
            loss = self.training_step(batch, cond_dict=cond_dict)
            ret_dict["noncond_loss_clap2audiomae"] = loss

        return ret_dict


class AudioMAEConditionCTPoolRandTFSeparated(nn.Module):
    """
    audiomae = AudioMAEConditionCTPool2x2()
    data = torch.randn((4, 1024, 128))
    output = audiomae(data)
    import ipdb;ipdb.set_trace()
    exit(0)
    """

    def __init__(
        self,
        time_pooling_factors=[1, 2, 4, 8],
        freq_pooling_factors=[1, 2, 4, 8],
        eval_time_pooling=None,
        eval_freq_pooling=None,
        mask_ratio=0.0,
        regularization=False,
        no_audiomae_mask=True,
        no_audiomae_average=False,
    ):
        super().__init__()
        self.device = None
        self.time_pooling_factors = time_pooling_factors
        self.freq_pooling_factors = freq_pooling_factors
        self.no_audiomae_mask = no_audiomae_mask
        self.no_audiomae_average = no_audiomae_average

        self.eval_freq_pooling = eval_freq_pooling
        self.eval_time_pooling = eval_time_pooling
        self.mask_ratio = mask_ratio
        self.use_reg = regularization

        self.audiomae = Vanilla_AudioMAE()
        self.audiomae.eval()
        for p in self.audiomae.parameters():
            p.requires_grad = False

    # Required
    def get_unconditional_condition(self, batchsize):
        param = next(self.audiomae.parameters())
        assert param.requires_grad == False
        device = param.device
        # time_pool, freq_pool = max(self.time_pooling_factors), max(self.freq_pooling_factors)
        time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
            self.eval_freq_pooling, 8
        )
        # time_pool = self.time_pooling_factors[np.random.choice(list(range(len(self.time_pooling_factors))))]
        # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
        token_num = int(512 / (time_pool * freq_pool))
        return [
            torch.zeros((batchsize, token_num, 768)).to(device).float(),
            torch.ones((batchsize, token_num)).to(device).float(),
        ]

    def pool(self, representation, time_pool=None, freq_pool=None):
        assert representation.size(-1) == 768
        representation = representation[:, 1:, :].transpose(1, 2)
        bs, embedding_dim, token_num = representation.size()
        representation = representation.reshape(bs, embedding_dim, 64, 8)

        if self.training:
            if time_pool is None and freq_pool is None:
                time_pool = min(
                    64,
                    self.time_pooling_factors[
                        np.random.choice(list(range(len(self.time_pooling_factors))))
                    ],
                )
                freq_pool = min(
                    8,
                    self.freq_pooling_factors[
                        np.random.choice(list(range(len(self.freq_pooling_factors))))
                    ],
                )
                # freq_pool = min(8, time_pool) # TODO here I make some modification.
        else:
            time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
                self.eval_freq_pooling, 8
            )

        self.avgpooling = nn.AvgPool2d(
            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
        )
        self.maxpooling = nn.MaxPool2d(
            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
        )

        pooled = (
            self.avgpooling(representation) + self.maxpooling(representation)
        ) / 2  # [bs, embedding_dim, time_token_num, freq_token_num]
        pooled = pooled.flatten(2).transpose(1, 2)
        return pooled  # [bs, token_num, embedding_dim]

    def regularization(self, x):
        assert x.size(-1) == 768
        x = F.normalize(x, p=2, dim=-1)
        return x

    # Required
    def forward(self, batch, time_pool=None, freq_pool=None):
        assert batch.size(-2) == 1024 and batch.size(-1) == 128

        if self.device is None:
            self.device = batch.device

        batch = batch.unsqueeze(1)
        with torch.no_grad():
            representation = self.audiomae(
                batch,
                mask_ratio=self.mask_ratio,
                no_mask=self.no_audiomae_mask,
                no_average=self.no_audiomae_average,
            )
            representation = self.pool(representation, time_pool, freq_pool)
            if self.use_reg:
                representation = self.regularization(representation)
            return [
                representation,
                torch.ones((representation.size(0), representation.size(1)))
                .to(representation.device)
                .float(),
            ]


class AudioMAEConditionCTPoolRand(nn.Module):
    """
    audiomae = AudioMAEConditionCTPool2x2()
    data = torch.randn((4, 1024, 128))
    output = audiomae(data)
    import ipdb;ipdb.set_trace()
    exit(0)
    """

    def __init__(
        self,
        time_pooling_factors=[1, 2, 4, 8],
        freq_pooling_factors=[1, 2, 4, 8],
        eval_time_pooling=None,
        eval_freq_pooling=None,
        mask_ratio=0.0,
        regularization=False,
        no_audiomae_mask=True,
        no_audiomae_average=False,
    ):
        super().__init__()
        self.device = None
        self.time_pooling_factors = time_pooling_factors
        self.freq_pooling_factors = freq_pooling_factors
        self.no_audiomae_mask = no_audiomae_mask
        self.no_audiomae_average = no_audiomae_average

        self.eval_freq_pooling = eval_freq_pooling
        self.eval_time_pooling = eval_time_pooling
        self.mask_ratio = mask_ratio
        self.use_reg = regularization

        self.audiomae = Vanilla_AudioMAE()
        self.audiomae.eval()
        for p in self.audiomae.parameters():
            p.requires_grad = False

    # Required
    def get_unconditional_condition(self, batchsize):
        param = next(self.audiomae.parameters())
        assert param.requires_grad == False
        device = param.device
        # time_pool, freq_pool = max(self.time_pooling_factors), max(self.freq_pooling_factors)
        time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
            self.eval_freq_pooling, 8
        )
        # time_pool = self.time_pooling_factors[np.random.choice(list(range(len(self.time_pooling_factors))))]
        # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
        token_num = int(512 / (time_pool * freq_pool))
        return [
            torch.zeros((batchsize, token_num, 768)).to(device).float(),
            torch.ones((batchsize, token_num)).to(device).float(),
        ]

    def pool(self, representation, time_pool=None, freq_pool=None):
        assert representation.size(-1) == 768
        representation = representation[:, 1:, :].transpose(1, 2)
        bs, embedding_dim, token_num = representation.size()
        representation = representation.reshape(bs, embedding_dim, 64, 8)

        if self.training:
            if time_pool is None and freq_pool is None:
                time_pool = min(
                    64,
                    self.time_pooling_factors[
                        np.random.choice(list(range(len(self.time_pooling_factors))))
                    ],
                )
                # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
                freq_pool = min(8, time_pool)  # TODO here I make some modification.
        else:
            time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
                self.eval_freq_pooling, 8
            )

        self.avgpooling = nn.AvgPool2d(
            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
        )
        self.maxpooling = nn.MaxPool2d(
            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
        )

        pooled = (
            self.avgpooling(representation) + self.maxpooling(representation)
        ) / 2  # [bs, embedding_dim, time_token_num, freq_token_num]
        pooled = pooled.flatten(2).transpose(1, 2)
        return pooled  # [bs, token_num, embedding_dim]

    def regularization(self, x):
        assert x.size(-1) == 768
        x = F.normalize(x, p=2, dim=-1)
        return x

    # Required
    def forward(self, batch, time_pool=None, freq_pool=None):
        assert batch.size(-2) == 1024 and batch.size(-1) == 128

        if self.device is None:
            self.device = batch.device

        batch = batch.unsqueeze(1)
        with torch.no_grad():
            representation = self.audiomae(
                batch,
                mask_ratio=self.mask_ratio,
                no_mask=self.no_audiomae_mask,
                no_average=self.no_audiomae_average,
            )
            representation = self.pool(representation, time_pool, freq_pool)
            if self.use_reg:
                representation = self.regularization(representation)
            return [
                representation,
                torch.ones((representation.size(0), representation.size(1)))
                .to(representation.device)
                .float(),
            ]


class ConditionalToken(nn.Module):
    def __init__(self, embedding_dim):
        super(ConditionalToken, self).__init__()
        self.embedding_dim = embedding_dim
        # Define the conditional tokens as fixed values
        self.pooling_factor_tokens = {
            1: torch.Tensor([1.0, 0.0] * (embedding_dim // 2)),
            2: torch.Tensor([0.0, 1.0] * (embedding_dim // 2)),
            4: torch.Tensor([1.0, 1.0] * (embedding_dim // 2)),
            8: torch.Tensor([-1.0, 0.0] * (embedding_dim // 2)),
            16: torch.Tensor([0.0, -1.0] * (embedding_dim // 2)),
            32: torch.Tensor([-1.0, -1.0] * (embedding_dim // 2)),
            64: torch.Tensor([0.0, 0.0] * (embedding_dim // 2)),
        }
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, condition, batchsize):
        """
        Returns the conditional token for the given condition.
        """
        if condition not in self.pooling_factor_tokens.keys():
            raise ValueError(f"Unsupported condition: {condition}")
        batched_token = self.pooling_factor_tokens[condition][None, None].expand(
            batchsize, 1, self.embedding_dim
        )
        return batched_token


class AudioMAEConditionCTPoolRandV2(nn.Module):
    """
    audiomae = AudioMAEConditionCTPool2x2()
    data = torch.randn((4, 1024, 128))
    output = audiomae(data)
    import ipdb;ipdb.set_trace()
    exit(0)
    """

    def __init__(
        self,
        time_pooling_factors=[1, 2, 4, 8],
        freq_pooling_factors=[1, 2, 4, 8],
        eval_time_pooling=None,
        eval_freq_pooling=None,
        mask_ratio=0.0,
        regularization=False,
        no_audiomae_mask=True,
        no_audiomae_average=False,
    ):
        super().__init__()
        self.device = None
        self.time_pooling_factors = time_pooling_factors
        self.freq_pooling_factors = freq_pooling_factors
        self.no_audiomae_mask = no_audiomae_mask
        self.no_audiomae_average = no_audiomae_average

        self.eval_freq_pooling = eval_freq_pooling
        self.eval_time_pooling = eval_time_pooling
        self.mask_ratio = mask_ratio
        self.use_reg = regularization

        self.pooling_tokens = ConditionalToken(768)

        self.audiomae = Vanilla_AudioMAE()
        self.audiomae.eval()

        for p in self.audiomae.parameters():
            p.requires_grad = False

    # Required
    def get_unconditional_condition(self, batchsize):
        param = next(self.audiomae.parameters())
        assert param.requires_grad == False
        device = param.device
        # time_pool, freq_pool = max(self.time_pooling_factors), max(self.freq_pooling_factors)
        time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
            self.eval_freq_pooling, 8
        )
        # time_pool = self.time_pooling_factors[np.random.choice(list(range(len(self.time_pooling_factors))))]
        # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
        pool_condition_token = self.pooling_tokens(time_pool, batchsize).to(device)
        token_num = int(512 / (time_pool * freq_pool))

        rep = torch.zeros((batchsize, token_num, 768)).to(device).float()
        rep = torch.cat([rep, pool_condition_token], dim=1)

        return [rep, torch.ones((batchsize, token_num + 1)).to(device).float()]

    def pool(self, representation, time_pool=None, freq_pool=None):
        assert representation.size(-1) == 768
        representation = representation[:, 1:, :].transpose(1, 2)
        bs, embedding_dim, token_num = representation.size()
        representation = representation.reshape(bs, embedding_dim, 64, 8)

        if self.training:
            if time_pool is None and freq_pool is None:
                time_pool = min(
                    64,
                    self.time_pooling_factors[
                        np.random.choice(list(range(len(self.time_pooling_factors))))
                    ],
                )
                # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
                freq_pool = min(8, time_pool)  # TODO here I make some modification.
        else:
            time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
                self.eval_freq_pooling, 8
            )

        self.avgpooling = nn.AvgPool2d(
            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
        )
        self.maxpooling = nn.MaxPool2d(
            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
        )
        pooled = (
            self.avgpooling(representation) + self.maxpooling(representation)
        ) / 2  # [bs, embedding_dim, time_token_num, freq_token_num]
        pooled = pooled.flatten(2).transpose(1, 2)
        return pooled, time_pool, freq_pool  # [bs, token_num, embedding_dim]

    def regularization(self, x):
        assert x.size(-1) == 768
        x = F.normalize(x, p=2, dim=-1)
        return x

    # Required
    def forward(self, batch):
        assert batch.size(-2) == 1024 and batch.size(-1) == 128

        if self.device is None:
            self.device = batch.device

        batch = batch.unsqueeze(1)

        with torch.no_grad():
            representation = self.audiomae(
                batch,
                mask_ratio=self.mask_ratio,
                no_mask=self.no_audiomae_mask,
                no_average=self.no_audiomae_average,
            )
            representation, time_pool, freq_pool = self.pool(representation)
            if self.use_reg:
                representation = self.regularization(representation)
            pool_condition_token = self.pooling_tokens(
                time_pool, representation.size(0)
            ).to(representation.device)
            representation = torch.cat([representation, pool_condition_token], dim=1)

            return [
                representation,
                torch.ones((representation.size(0), representation.size(1)))
                .to(representation.device)
                .float(),
            ]


class BeatDownbeatConditionConcat(nn.Module):
    def __init__(self, latent_t_size, latent_f_size):
        super().__init__()
        self.latent_t_size = latent_t_size
        self.latent_f_size = latent_f_size
        self.device = None

    # Required
    def get_unconditional_condition(self, batchsize):
        return torch.zeros((batchsize, self.latent_t_size, self.latent_f_size)).to(
            self.device
        )

    # Required
    def forward(self, batch):
        if self.device is None:
            self.device = batch.device
        return batch


class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
    def __init__(
        self,
        pretrained_path,
        sampling_rate=16000,
        embed_mode="audio",
        amodel="HTSAT-base",
        unconditional_prob=0.1,
        random_mute=False,
        max_random_mute_portion=0.5,
        training_mode=True,
    ):
        super().__init__()
        self.device = "cpu"
        self.precision = "fp32"
        self.amodel = amodel  # or 'PANN-14'
        self.tmodel = "roberta"  # the best text encoder in our training
        self.enable_fusion = False  # False if you do not want to use the fusion model
        self.fusion_type = "aff_2d"
        self.pretrained = pretrained_path
        self.embed_mode = embed_mode
        self.embed_mode_orig = embed_mode
        self.sampling_rate = sampling_rate
        self.unconditional_prob = unconditional_prob
        self.random_mute = random_mute
        self.tokenize = RobertaTokenizer.from_pretrained(config_data["roberta-base"])
        self.max_random_mute_portion = max_random_mute_portion
        self.training_mode = training_mode
        self.model, self.model_cfg = create_model(
            self.amodel,
            self.tmodel,
            self.pretrained,
            precision=self.precision,
            device=self.device,
            enable_fusion=self.enable_fusion,
            fusion_type=self.fusion_type,
        )
        audio_cfg = self.model_cfg["audio_cfg"]
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=audio_cfg["sample_rate"],
            n_fft=audio_cfg["window_size"],
            win_length=audio_cfg["window_size"],
            hop_length=audio_cfg["hop_size"],
            center=True,
            pad_mode="reflect",
            power=2.0,
            norm=None,
            onesided=True,
            n_mels=64,
            f_min=audio_cfg["fmin"],
            f_max=audio_cfg["fmax"],
        )
        for p in self.model.parameters():
            p.requires_grad = False
        self.unconditional_token = None
        self.model.eval()

    def get_unconditional_condition(self, batchsize):
        self.unconditional_token = self.model.get_text_embedding(
            self.tokenizer(["", ""])
        )[0:1]
        return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)

    def batch_to_list(self, batch):
        ret = []
        for i in range(batch.size(0)):
            ret.append(batch[i])
        return ret

    def make_decision(self, probability):
        if float(torch.rand(1)) < probability:
            return True
        else:
            return False

    def random_uniform(self, start, end):
        val = torch.rand(1).item()
        return start + (end - start) * val

    def _random_mute(self, waveform):
        # waveform: [bs, t-steps]
        t_steps = waveform.size(-1)
        for i in range(waveform.size(0)):
            mute_size = int(
                self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
            )
            mute_start = int(self.random_uniform(0, t_steps - mute_size))
            waveform[i, mute_start : mute_start + mute_size] = 0
        return waveform

    def cos_similarity(self, waveform, text):
        # waveform: [bs, t_steps]
        original_embed_mode = self.embed_mode
        with torch.no_grad():
            self.embed_mode = "audio"
            audio_emb = self(waveform.cuda())
            self.embed_mode = "text"
            text_emb = self(text)
            similarity = F.cosine_similarity(audio_emb, text_emb, dim=2)
        self.embed_mode = original_embed_mode
        return similarity.squeeze()

    def build_unconditional_emb(self):
        self.unconditional_token = self.model.get_text_embedding(
            self.tokenizer(["", ""])
        )[0:1]

    def forward(self, batch):
        # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0
        # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0
        if self.model.training == True and not self.training_mode:
            print(
                "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters."
            )
            self.model, self.model_cfg = create_model(
                self.amodel,
                self.tmodel,
                self.pretrained,
                precision=self.precision,
                device="cuda",
                enable_fusion=self.enable_fusion,
                fusion_type=self.fusion_type,
            )
            for p in self.model.parameters():
                p.requires_grad = False
            self.model.eval()

        if self.unconditional_token is None:
            self.build_unconditional_emb()

        # if(self.training_mode):
        #     assert self.model.training == True
        # else:
        #     assert self.model.training == False

        # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
        if self.embed_mode == "audio":
            if not self.training:
                print("INFO: clap model calculate the audio embedding as condition")
            with torch.no_grad():
                # assert (
                #     self.sampling_rate == 16000
                # ), "We only support 16000 sampling rate"

                # if self.random_mute:
                #     batch = self._random_mute(batch)
                # batch: [bs, 1, t-samples]
                if self.sampling_rate != 48000:
                    batch = torchaudio.functional.resample(
                        batch, orig_freq=self.sampling_rate, new_freq=48000
                    )

                audio_data = batch.squeeze(1)
                mel = self.mel_transform(audio_data)
                audio_dict = get_audio_features(
                    audio_data,
                    mel,
                    480000,
                    data_truncating="fusion",
                    data_filling="repeatpad",
                    audio_cfg=self.model_cfg["audio_cfg"],
                )
                # [bs, 512]
                embed = self.model.get_audio_embedding(audio_dict)
        elif self.embed_mode == "text":
            with torch.no_grad():
                # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
                text_data = self.tokenizer(batch)

                if isinstance(batch, str) or (
                    isinstance(batch, list) and len(batch) == 1
                ):
                    for key in text_data.keys():
                        text_data[key] = text_data[key].unsqueeze(0)

                embed = self.model.get_text_embedding(text_data)

        embed = embed.unsqueeze(1)
        for i in range(embed.size(0)):
            if self.make_decision(self.unconditional_prob):
                embed[i] = self.unconditional_token
        # embed = torch.randn((batch.size(0), 1, 512)).type_as(batch)
        return embed.detach()

    def tokenizer(self, text):
        result = self.tokenize(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        return {k: v.squeeze(0) for k, v in result.items()}


if __name__ == "__main__":
    model = CLAPAudioEmbeddingClassifierFreev2(
        pretrained_path="/mnt/bn/lqhaoheliu/exps/checkpoints/audioldm/ckpt/CLAP.pt",
        embed_mode="text",
        amodel="HTSAT-tiny",
    )
    # data = torch.randn((6, 1, int(16000*10.24)))
    data = ["text", "text"]
    res = model(data)
    import ipdb

    ipdb.set_trace()