"""
app.py
"""

import json
from typing import Any, Dict, List, Optional

import gradio as gr
import numpy as np
import pandas as pd
import spaces
import torch
from huggingface_hub import hf_hub_download
from sklearn.linear_model import RidgeClassifier
from transformers import AutoModel, AutoTokenizer

# Define the list of available Lionguard models
Lionguard_models: List[str] = [
    "dsaidgovsg/Lionguard-binary-v1.0",
    "dsaidgovsg/Lionguard-harassment-v1.0",
    "dsaidgovsg/Lionguard-hateful-v1.0",
    "dsaidgovsg/Lionguard-public_harm-v1.0",
    "dsaidgovsg/Lionguard-self_harm-v1.0",
    "dsaidgovsg/Lionguard-sexual-v1.0",
    "dsaidgovsg/Lionguard-toxic-v1.0",
    "dsaidgovsg/Lionguard-violent-v1.0",
]

def load_config(model_repo: str) -> Dict[str, Any]:
    """
    Load the configuration for a given model repository.

    Args:
        model_repo (str): The model repository name.

    Returns:
        Dict[str, Any]: The configuration dictionary.
    """
    config_path = hf_hub_download(repo_id=model_repo, filename="config.json")
    with open(config_path, 'r') as f:
        return json.load(f)

def load_all_configs() -> Dict[str, Dict[str, Any]]:
    """
    Load configurations for all Lionguard models.

    Returns:
        Dict[str, Dict[str, Any]]: A dictionary of model configurations.
    """
    model_configs = {}
    for model_repo in Lionguard_models:
        model_configs[model_repo] = load_config(model_repo)
    print("All model configurations loaded.")
    return model_configs

@spaces.GPU
def get_embeddings(device: str, data: List[str], config: Dict[str, Any]) -> np.ndarray:
    """
    Generate embeddings for the input data using the specified model configuration.

    Args:
        device (str): The device to use for computations.
        data (List[str]): The input text data.
        config (Dict[str, Any]): The model configuration.

    Returns:
        np.ndarray: The generated embeddings.
    """
    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])
    model = AutoModel.from_pretrained(config['embedding_model'])
    model.eval()
    model.to(device)

    batch_size = config['batch_size']
    num_batches = int(np.ceil(len(data)/batch_size))
    output = []
    for i in range(num_batches):
        sentences = data[i*batch_size:(i+1)*batch_size]
        encoded_input = tokenizer(sentences, max_length=config['max_length'], padding=True, truncation=True, return_tensors='pt')
        encoded_input.to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
            sentence_embeddings = model_output[0][:, 0]
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        output.extend(sentence_embeddings.cpu().numpy())
    return np.array(output)

def set_model_atttributes(model: RidgeClassifier, attributes: Dict[str, Any]) -> RidgeClassifier:
    """
    Set the attributes for the Ridge Classifier model.

    Args:
        model (RidgeClassifier): The Ridge Classifier model.
        attributes (Dict[str, Any]): The attributes to set.

    Returns:
        RidgeClassifier: The updated Ridge Classifier model.
    """
    model.coef_ = np.array(attributes['coef_'])
    model.intercept_ = np.array(attributes['intercept_'])
    model.n_features_in_ = np.array(attributes['n_features_in_'])
    return model

def convert_decision_to_proba(d: np.ndarray) -> np.ndarray:
    """
    Convert decision function values to probabilities.

    Args:
        d (np.ndarray): The decision function values.

    Returns:
        np.ndarray: The converted probabilities.
    """
    d = np.c_[-d, d]
    probs = np.exp(d) / np.sum(np.exp(d), axis=1, keepdims=True)
    return probs

def predict_all(text: str, model_configs: Dict[str, Dict[str, Any]]) -> Optional[pd.DataFrame]:
    """
    Predict probabilities for all Lionguard models given an input text.

    Args:
        text (str): The input text to predict on.
        model_configs (Dict[str, Dict[str, Any]]): The configurations for all models.

    Returns:
        pd.DataFrame: A DataFrame containing prediction probabilities for each category.
    """
    if not text.strip():
        return None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    first_model = next(iter(model_configs))
    config = model_configs[first_model]
    embeddings = get_embeddings(device, [text], config)
    embeddings_df = pd.DataFrame(embeddings)

    results = []
    for model_repo in Lionguard_models:
        if model_repo not in model_configs:
            print(f"Configuration for {model_repo} not found. Skipping...")
            continue
        
        config = model_configs[model_repo]
        
        model_fp = hf_hub_download(repo_id=model_repo, filename=config['model_name'])
        with open(model_fp, 'r') as json_file:
            model_params = json.load(json_file)

        model = RidgeClassifier()
        model_attributes = model_params["attributes"]
        model_params.pop("attributes", None)
        model.set_params(**model_params)
        model = set_model_atttributes(model, model_attributes)
        
        preds = convert_decision_to_proba(model.decision_function(embeddings_df))[:,1]
        
        model_name = model_repo.split('/')[-1].split('-')[1]
        results.append({"Category": model_name, "Probability": float(preds[0])})

    result_df = pd.DataFrame(results).sort_values("Probability", ascending=False)

    if result_df.shape[0] > 0:
        return result_df
    else:
        return None

def create_interface(model_configs: Dict[str, Dict[str, Any]]) -> gr.Interface:
    """
    Create the Gradio interface for the Lionguard demo.

    Args:
        model_configs (Dict[str, Dict[str, Any]]): The configurations for all models.

    Returns:
        gr.Interface: The Gradio interface object.
    """
    return gr.Interface(
        fn=lambda text: predict_all(text, model_configs),
        inputs=gr.Textbox(lines=3, placeholder="Enter text here..."),
        outputs=gr.DataFrame(label="Prediction Probabilities"),
        title="🦁 Lionguard Demo",
        description="Lionguard is a Singapore-contextualized moderation classifier that can serve against unsafe LLM outputs.",
        allow_flagging="never"
    )

if __name__ == "__main__":
    model_configs = load_all_configs()
    iface = create_interface(model_configs)
    iface.launch()