import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import re
import time
from PIL import Image
import torch
import spaces
import subprocess
#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)


# Hub checkpoint shared by the processor and the model weights.
MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

# The processor builds chat prompts and turns text + images into model inputs.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Weights are loaded in bfloat16 and then moved to the GPU.
# NOTE: flash_attention_2 is deliberately not requested here; the option was
# left disabled in this file (_attn_implementation="flash_attention_2").
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")

@spaces.GPU
def model_inference(
    images, text, assistant_prefix, decoding_strategy, temperature, max_new_tokens,
    repetition_penalty, top_p
):
    """Generate a text answer for a query about optional image(s).

    Args:
        images: A single PIL image, a list of PIL images, or None/empty when
            the query is text-only.
        text: The user's query. Must not be empty.
        assistant_prefix: Optional string that is prepended to ``text``
            (separated by a space) before the chat prompt is built.
        decoding_strategy: Either ``"Greedy"`` or ``"Top P Sampling"``.
        temperature: Sampling temperature; only used for ``"Top P Sampling"``.
            A non-positive value falls back to greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        repetition_penalty: Repetition penalty forwarded to ``generate``
            for both decoding strategies.
        top_p: Nucleus-sampling threshold; only used for ``"Top P Sampling"``.

    Returns:
        The decoded completion as a string, without the prompt tokens.

    Raises:
        gr.Error: If ``text`` is empty.
        ValueError: If ``decoding_strategy`` is not a supported value.
    """
    # Normalise `images` to a (possibly empty) list so the rest of the
    # function can treat single-image, multi-image and text-only uniformly.
    if images is None:
        images = []
    elif isinstance(images, Image.Image):
        images = [images]
    else:
        images = list(images)

    # gr.Error only reaches the user when it is *raised*; merely constructing
    # it does nothing and would let an empty query through to the model.
    if not text:
        if images:
            raise gr.Error("Please input a text query along the image(s).")
        raise gr.Error("Please input a query and optionally image(s).")

    # Fold the prefix into the query BEFORE the messages are built; doing it
    # afterwards leaves the prompt untouched and the prefix silently ignored.
    if assistant_prefix:
        text = f"{assistant_prefix} {text}"

    # One image placeholder per supplied image, followed by the text query.
    content = [{"type": "image"} for _ in images]
    content.append({"type": "text", "text": text})
    resulting_messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    # The processor receives one list of images per prompt in the batch;
    # for a text-only query no images are passed at all.
    inputs = processor(
        text=prompt,
        images=[images] if images else None,
        return_tensors="pt",
    )
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    generation_args = {
        "max_new_tokens": int(max_new_tokens),
        "repetition_penalty": repetition_penalty,
    }

    # Explicit validation instead of `assert`, which is stripped under -O.
    if decoding_strategy == "Greedy":
        generation_args["do_sample"] = False
    elif decoding_strategy == "Top P Sampling":
        if temperature is not None and temperature > 0:
            generation_args["do_sample"] = True
            generation_args["temperature"] = temperature
            generation_args["top_p"] = top_p
        else:
            # Sampling requires a strictly positive temperature; a value of 0
            # (which the UI allows) is treated as greedy decoding rather than
            # letting generate() fail.
            generation_args["do_sample"] = False
    else:
        raise ValueError(
            f"Unsupported decoding strategy: {decoding_strategy!r} "
            "(expected 'Greedy' or 'Top P Sampling')."
        )

    # Generate
    generated_ids = model.generate(**inputs, **generation_args)

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs["input_ids"].size(1)
    generated_texts = processor.batch_decode(
        generated_ids[:, prompt_length:], skip_special_tokens=True
    )
    return generated_texts[0]


# Build the demo UI: image/prompt inputs, advanced generation controls,
# clickable examples, and the submit button wired to `model_inference`.
with gr.Blocks() as demo:
    gr.Markdown("## SmolVLM: Small yet Mighty 💫")
    gr.Markdown("Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples.")
    with gr.Column():
        image_input = gr.Image(label="Upload your Image", type="pil", scale=1)
        query_input = gr.Textbox(label="Prompt")
        assistant_prefix = gr.Textbox(label="Assistant Prefix", placeholder="Let's think step by step.")

        submit_btn = gr.Button("Submit")
        output = gr.Textbox(label="Output")

        # Each row follows the order of `inference_inputs` below:
        # image, prompt, assistant prefix, decoding strategy, temperature,
        # max new tokens, repetition penalty, top p.
        examples = [
            ["example_images/rococo.jpg", "What art era is this?", None, "Greedy", 0.4, 512, 1.2, 0.8],
            ["example_images/examples_wat_arun.jpg", "Give me travel tips for the area around this monument.", None, "Greedy", 0.4, 512, 1.2, 0.8],
            ["example_images/examples_invoice.png", "What is the due date and the invoice date?", None, "Greedy", 0.4, 512, 1.2, 0.8],
            ["example_images/s2w_example.png", "What is this UI about?", None, "Greedy", 0.4, 512, 1.2, 0.8],
            ["example_images/examples_weather_events.png", "Where do the severe droughts happen according to this diagram?", None, "Greedy", 0.4, 512, 1.2, 0.8],
        ]

        with gr.Accordion(label="Advanced Generation Parameters", open=False):

            # Hyper-parameters for generation
            max_new_tokens = gr.Slider(
                minimum=8,
                maximum=1024,
                value=512,
                step=1,
                interactive=True,
                label="Maximum number of new tokens to generate",
            )
            # Always visible: the penalty is applied for both decoding
            # strategies, so it must stay adjustable under Greedy as well.
            repetition_penalty = gr.Slider(
                minimum=0.01,
                maximum=5.0,
                value=1.2,
                step=0.01,
                interactive=True,
                label="Repetition penalty",
                info="1.0 is equivalent to no penalty",
            )
            # Sampling-only control: hidden on load because the default
            # strategy is Greedy; shown by the change handler below.
            temperature = gr.Slider(
                minimum=0.0,
                maximum=5.0,
                value=0.4,
                step=0.1,
                interactive=True,
                visible=False,
                label="Sampling temperature",
                info="Higher values will produce more diverse outputs.",
            )
            # Sampling-only control: hidden on load, same as temperature.
            top_p = gr.Slider(
                minimum=0.01,
                maximum=0.99,
                value=0.8,
                step=0.01,
                interactive=True,
                visible=False,
                label="Top P",
                info="Higher values is equivalent to sampling more low-probability tokens.",
            )
            decoding_strategy = gr.Radio(
                [
                    "Greedy",
                    "Top P Sampling",
                ],
                value="Greedy",
                label="Decoding strategy",
                interactive=True,
                info="Greedy always picks the most likely next token; Top P Sampling samples from the most probable tokens.",
            )

            def _sampling_controls_visibility(selection):
                """Show the temperature and top-p sliders only when sampling is selected."""
                show = selection == "Top P Sampling"
                return gr.Slider(visible=show), gr.Slider(visible=show)

            # A single handler keeps both sampling-only sliders in sync with
            # the selected strategy.
            decoding_strategy.change(
                fn=_sampling_controls_visibility,
                inputs=decoding_strategy,
                outputs=[temperature, top_p],
            )

        # Components passed to `model_inference`, in its positional order.
        inference_inputs = [
            image_input,
            query_input,
            assistant_prefix,
            decoding_strategy,
            temperature,
            max_new_tokens,
            repetition_penalty,
            top_p,
        ]

        gr.Examples(
            examples=examples,
            inputs=inference_inputs,
            outputs=output,
            fn=model_inference,
        )

        submit_btn.click(model_inference, inputs=inference_inputs, outputs=output)


# Start serving the interface built above; debug=True is forwarded to Gradio's launch().
demo.launch(debug=True)