Spaces:

gabrielchua
/

lionguard-demo

Runtime error

App Files Files Community

gabrielchua committed on Jun 23

Commit

073508d

•

1 Parent(s): 3df2176

add demo

Browse files

Files changed (2) hide show

app.py +191 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,191 @@

+"""
+app.py
+"""
+import json
+from typing import Any, Dict, List, Optional
+import gradio as gr
+import numpy as np
+import pandas as pd
+import spaces
+import torch
+from huggingface_hub import hf_hub_download
+from sklearn.linear_model import RidgeClassifier
+from transformers import AutoModel, AutoTokenizer
+# Hugging Face Hub repository IDs of the Lionguard models used by this demo.
+# load_all_configs() fetches a config for each entry and predict_all() iterates
+# them in this order; the displayed category name is the segment between the
+# first and second "-" of the repository name (e.g. "binary", "public_harm").
+Lionguard_models: List[str] = [
+    "dsaidgovsg/Lionguard-binary-v1.0",
+    "dsaidgovsg/Lionguard-harassment-v1.0",
+    "dsaidgovsg/Lionguard-hateful-v1.0",
+    "dsaidgovsg/Lionguard-public_harm-v1.0",
+    "dsaidgovsg/Lionguard-self_harm-v1.0",
+    "dsaidgovsg/Lionguard-sexual-v1.0",
+    "dsaidgovsg/Lionguard-toxic-v1.0",
+    "dsaidgovsg/Lionguard-violent-v1.0",
+]
+def load_config(model_repo: str) -> Dict[str, Any]:
+    """
+    Download and parse the `config.json` of a given model repository.
+    Args:
+        model_repo (str): The Hugging Face Hub repository ID of the model.
+    Returns:
+        Dict[str, Any]: The parsed configuration dictionary.
+    """
+    config_path = hf_hub_download(repo_id=model_repo, filename="config.json")
+    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
+    with open(config_path, 'r', encoding="utf-8") as f:
+        return json.load(f)
+def load_all_configs() -> Dict[str, Dict[str, Any]]:
+    """
+    Fetch the configuration of every repository listed in `Lionguard_models`.
+    Returns:
+        Dict[str, Dict[str, Any]]: Mapping from repository ID to its configuration,
+        in the same order as `Lionguard_models`.
+    """
+    configs_by_repo = {repo_id: load_config(repo_id) for repo_id in Lionguard_models}
+    print("All model configurations loaded.")
+    return configs_by_repo
+@spaces.GPU
+def get_embeddings(device: str, data: List[str], config: Dict[str, Any]) -> np.ndarray:
+    """
+    Embed the input texts with the embedding model named in the configuration.
+    The texts are tokenized and run through the model in batches of
+    `config['batch_size']`. For each text, position 0 along the second axis of the
+    model's first output is taken (presumably the first-token hidden state for a
+    standard `AutoModel` -- confirm for the configured model) and L2-normalised.
+    Args:
+        device (str): The device on which the model and inputs are placed.
+        data (List[str]): The input texts.
+        config (Dict[str, Any]): The model configuration; reads 'tokenizer',
+            'embedding_model', 'batch_size' and 'max_length'.
+    Returns:
+        np.ndarray: The embeddings, one row per input text.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])
+    encoder = AutoModel.from_pretrained(config['embedding_model'])
+    encoder.eval()
+    encoder.to(device)
+    step = config['batch_size']
+    batch_count = int(np.ceil(len(data) / step))
+    batches = [data[k * step:(k + 1) * step] for k in range(batch_count)]
+    rows = []
+    for batch in batches:
+        tokens = tokenizer(
+            batch,
+            max_length=config['max_length'],
+            padding=True,
+            truncation=True,
+            return_tensors='pt',
+        )
+        tokens.to(device)
+        # Inference only: no gradients are needed for the forward pass.
+        with torch.no_grad():
+            first_output = encoder(**tokens)[0]
+            pooled = first_output[:, 0]
+        unit_vectors = torch.nn.functional.normalize(pooled, p=2, dim=1)
+        rows.extend(unit_vectors.cpu().numpy())
+    return np.array(rows)
+def set_model_atttributes(model: RidgeClassifier, attributes: Dict[str, Any]) -> RidgeClassifier:
+    """
+    Restore fitted attributes on a Ridge Classifier from their serialised form.
+    Args:
+        model (RidgeClassifier): The Ridge Classifier model; it is updated in place.
+        attributes (Dict[str, Any]): The attributes to set. Must provide
+            'coef_', 'intercept_' and 'n_features_in_'.
+    Returns:
+        RidgeClassifier: The same model instance with the attributes set.
+    """
+    model.coef_ = np.array(attributes['coef_'])
+    model.intercept_ = np.array(attributes['intercept_'])
+    # scikit-learn documents n_features_in_ as a plain int, so unwrap the value
+    # instead of storing a 0-d array.
+    model.n_features_in_ = int(np.asarray(attributes['n_features_in_']).item())
+    return model
+def convert_decision_to_proba(d: np.ndarray) -> np.ndarray:
+    """
+    Convert decision function values to probabilities.
+    Each decision value d is turned into a two-class softmax over (-d, d).
+    Args:
+        d (np.ndarray): The decision function values, one per sample.
+    Returns:
+        np.ndarray: The converted probabilities, one row per sample; column 1
+        is the probability associated with +d, column 0 with -d.
+    """
+    d = np.c_[-d, d]
+    # Softmax is invariant to a per-row shift. Subtracting the row maximum keeps
+    # exp() from overflowing to inf, which would otherwise yield inf / inf = nan.
+    d = d - np.max(d, axis=1, keepdims=True)
+    exp_d = np.exp(d)
+    return exp_d / np.sum(exp_d, axis=1, keepdims=True)
+def predict_all(text: str, model_configs: Dict[str, Dict[str, Any]]) -> Optional[pd.DataFrame]:
+    """
+    Predict probabilities for all Lionguard models given an input text.
+    Args:
+        text (str): The input text to predict on.
+        model_configs (Dict[str, Dict[str, Any]]): The configurations for all models,
+            keyed by repository ID.
+    Returns:
+        Optional[pd.DataFrame]: A DataFrame with columns "Category" and "Probability",
+        sorted by descending probability. None when the text is empty or blank,
+        when no configurations are supplied, or when no model could be scored.
+    """
+    if not text or not text.strip():
+        return None
+    if not model_configs:
+        # Nothing to score with; also avoids StopIteration from next(iter(...)) below.
+        return None
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # The embeddings are computed once, using the first configuration, and
+    # reused as input for every classifier below.
+    first_model = next(iter(model_configs))
+    embeddings = get_embeddings(device, [text], model_configs[first_model])
+    embeddings_df = pd.DataFrame(embeddings)
+    results = []
+    for model_repo in Lionguard_models:
+        if model_repo not in model_configs:
+            print(f"Configuration for {model_repo} not found. Skipping...")
+            continue
+        config = model_configs[model_repo]
+        model_fp = hf_hub_download(repo_id=model_repo, filename=config['model_name'])
+        with open(model_fp, 'r', encoding="utf-8") as json_file:
+            model_params = json.load(json_file)
+        model = RidgeClassifier()
+        # Separate the fitted attributes from the constructor hyper-parameters.
+        model_attributes = model_params.pop("attributes")
+        model.set_params(**model_params)
+        model = set_model_atttributes(model, model_attributes)
+        preds = convert_decision_to_proba(model.decision_function(embeddings_df))[:,1]
+        # e.g. "dsaidgovsg/Lionguard-public_harm-v1.0" -> "public_harm"
+        model_name = model_repo.split('/')[-1].split('-')[1]
+        results.append({"Category": model_name, "Probability": float(preds[0])})
+    if not results:
+        # An empty frame has no "Probability" column, so sorting it would raise KeyError.
+        return None
+    return pd.DataFrame(results).sort_values("Probability", ascending=False)
+def create_interface(model_configs: Dict[str, Dict[str, Any]]) -> gr.Interface:
+    """
+    Build the Gradio interface for the Lionguard demo.
+    The interface takes a three-line text box as input and shows the result of
+    `predict_all` for that text in a data frame.
+    Args:
+        model_configs (Dict[str, Dict[str, Any]]): The configurations for all models;
+            passed through to `predict_all` on every request.
+    Returns:
+        gr.Interface: The Gradio interface object (not yet launched).
+    """
+    text_input = gr.Textbox(lines=3, placeholder="Enter text here...")
+    table_output = gr.DataFrame(label="Prediction Probabilities")
+    interface_options = {
+        "title": "🦁 Lionguard Demo",
+        "description": "Lionguard is a Singapore-contextualized moderation classifier that can serve against unsafe LLM outputs.",
+        "allow_flagging": "never",
+    }
+    return gr.Interface(
+        fn=lambda text: predict_all(text, model_configs),
+        inputs=text_input,
+        outputs=table_output,
+        **interface_options,
+    )
+# Script entry point: load every model configuration once at start-up, build the
+# Gradio interface around those configurations, and start serving it.
+if __name__ == "__main__":
+    model_configs = load_all_configs()
+    iface = create_interface(model_configs)
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio==4.22.0
+huggingface-hub==0.20.2
+numpy==1.24.4
+pandas==2.2.1
+torch==2.1.2
+scikit-learn==1.3.0
+spaces==0.28.3
+transformers==4.37.2