import gradio as gr
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torch
from torch import nn
import torchaudio
import torchaudio.transforms as T
import logging
# Input setup credit: https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py


# Console logging for the app: one stream handler at INFO level emitting
# "timestamp;LEVEL;message" records.
formatter = logging.Formatter(
    fmt="%(asctime)s;%(levelname)s;%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

logger = logging.getLogger("whisper-jax-app")
logger.addHandler(ch)
logger.setLevel(logging.INFO)


# Two alternative audio sources, handed positionally to the interface
# callback: an uploaded file and a microphone recording. Both are configured
# with type="filepath".
inputs = [
    gr.components.Audio(type="filepath", label="Add music audio file"),
    # Declared through gr.components like the upload input: the legacy
    # ``gr.inputs`` namespace and its ``optional`` flag are deprecated.
    gr.components.Audio(source="microphone", type="filepath"),
]
# Single text box that shows the callback's return value.
outputs = [gr.components.Textbox()]

# Static text shown on the interface page.
title = "Output the tags of a (music) audio"
description = "An example of using MERT-95M-public to conduct music tagging."
article = ""

# Example clips offered by the interface; none are enabled at the moment.
audio_examples = [
    # ["input/example-1.wav"],
    # ["input/example-2.wav"],
]

# Compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Preprocessor configuration that belongs to the checkpoint loaded below.
processor = Wav2Vec2FeatureExtractor.from_pretrained(
    "m-a-p/MERT-v0-public",
    trust_remote_code=True,
)

# Pretrained MERT model, loaded with trust_remote_code enabled and then moved
# onto the selected device.
model = AutoModel.from_pretrained(
    "m-a-p/MERT-v0-public",
    trust_remote_code=True,
)
model.to(device)

def convert_audio(inputs, microphone):
    """Run an audio clip through the MERT model and report the output shape.

    Args:
        inputs: Path of an uploaded audio file, or ``None`` when nothing was
            uploaded.
        microphone: Path of a microphone recording, or ``None``. When given,
            it takes precedence over ``inputs``.

    Returns:
        A string of the form ``"<device>  :<shape>"`` where ``<shape>`` is the
        shape of the stacked per-layer hidden states with the batch axis
        removed.

    Raises:
        ValueError: If neither an uploaded file nor a recording was provided.
    """
    audio_path = microphone if microphone is not None else inputs
    if audio_path is None:
        # Fail with a clear message instead of an opaque loader error.
        raise ValueError(
            "No audio provided: upload a file or record from the microphone.")

    # torchaudio.load returns a (channels, frames) tensor by default.
    waveform, sample_rate = torchaudio.load(audio_path)

    resample_rate = processor.sampling_rate

    # Make sure the sample rate matches what the preprocessor expects.
    if resample_rate != sample_rate:
        logger.info("setting rate from %s to %s", sample_rate, resample_rate)
        resampler = T.Resample(sample_rate, resample_rate)
        waveform = resampler(waveform)

    # Downmix to a mono (n_sample,) signal. Flattening with view(-1) would
    # concatenate the channels of multi-channel audio one after another;
    # for mono input the mean over the channel axis is the signal itself.
    waveform = waveform.mean(dim=0)

    model_inputs = processor(
        waveform, sampling_rate=resample_rate, return_tensors="pt")
    # Rebind rather than rely on .to() mutating the batch in place.
    model_inputs = model_inputs.to(device)
    with torch.no_grad():
        model_outputs = model(**model_inputs, output_hidden_states=True)

    # One hidden-state tensor per layer; each layer performs differently on
    # different downstream tasks, so the layer to use is chosen empirically.
    # Stacked shape is [layers, batch, time steps, feature_dim] (13 layers and
    # 768 features per the original author's note). Squeeze only the batch
    # axis so a clip with a single time step keeps its time axis.
    all_layer_hidden_states = torch.stack(
        model_outputs.hidden_states).squeeze(1)

    return f"{device}  :{str(all_layer_hidden_states.shape)}"


# iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")
# iface.launch()

# Interface that feeds the audio inputs to ``convert_audio`` and shows its
# return value in the text output; flagging is switched off.
audio_chunked = gr.Interface(
    # Callback and its I/O components.
    fn=convert_audio,
    inputs=inputs,
    outputs=outputs,
    # Page text and example clips.
    title=title,
    description=description,
    article=article,
    examples=audio_examples,
    # No flagging button.
    allow_flagging="never",
)


# Wrap the interface as the single "Audio File" tab of a Blocks app, then
# start the server with show_api=False.
with gr.Blocks() as demo:
    tab_interfaces = [audio_chunked]
    tab_titles = ["Audio File"]
    gr.TabbedInterface(tab_interfaces, tab_titles)
# Request queueing is left disabled:
# demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)