semantic-entropy-probes

Sleeping

App Files Files Community

s-a-malik committed on Jul 16

Commit

b874271

•

1 Parent(s): f4748d0

debugging

Browse files

Files changed (7) hide show

app_sep.py +189 -0
debug.ipynb +233 -0
model/{spiece.model → 20240625-131035_demo.pkl} +2 -2
model/config.json +0 -40
model/pytorch_model.bin +0 -3
model/special_tokens_map.json +0 -9
model/tokenizer_config.json +0 -17

app_sep.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import os
+import pickle as pkl
+from pathlib import Path
+from threading import Thread
+from typing import List, Optional, Tuple, Iterator
+import gradio as gr
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+# Upper bound of the "Max new tokens" slider in the chat UI.
+MAX_MAX_NEW_TOKENS = 2048
+# Initial value of that slider and the default for generate()'s max_new_tokens.
+DEFAULT_MAX_NEW_TOKENS = 1024
+# Prompt length cap in tokens, overridable through the environment; generate()
+# keeps only the last MAX_INPUT_TOKEN_LENGTH tokens of a longer conversation.
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# Text passed to gr.ChatInterface as the app description.
+# NOTE(review): the text says "yellow", but highlight_text() shades positive
+# scores red and non-positive scores green -- confirm which is intended.
+DESCRIPTION = """\
+# Llama-2 7B Chat with Streamable Semantic Uncertainty Probe
+This Space demonstrates the Llama-2-7b-chat model with an added semantic uncertainty probe.
+The highlighted text shows the model's uncertainty in real-time, with more intense yellow indicating higher uncertainty.
+"""
+if torch.cuda.is_available():
+    model_id = "meta-llama/Llama-2-7b-chat-hf"
+    # TODO load the full model?
+    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.use_default_system_prompt = False
+    # load the probe data
+    # NOTE(review): pickle executes arbitrary code on load; only ship a trusted probe file.
+    # TODO load accuracy and SE probe and compare in different tabs
+    with open("./model/20240625-131035_demo.pkl", "rb") as f:
+        probe_data = pkl.load(f)
+    # take the NQ open one
+    probe_data = probe_data[-2]
+    # Keep the probe under its own name: binding it to `model` would replace the
+    # language model that generate() needs for .generate() and .device.
+    probe = probe_data['t_bmodel']
+    layer_range = probe_data['sep_layer_range']
+    acc_model = probe_data['t_amodel']
+    acc_layer_range = probe_data['ap_layer_range']
+def generate(
+    message: str,
+    chat_history: List[Tuple[str, str]],
+    system_prompt: str,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> Iterator[str]:
+    """Stream the assistant reply as HTML, highlighting each chunk by probe output.
+
+    Generation runs in a background thread and is read through a
+    TextIteratorStreamer. For every streamed chunk the prompt plus the reply so
+    far is run through the model again, the last-token hidden states of the
+    layers in ``layer_range`` are concatenated, and the probe's prediction
+    selects the chunk's background colour.
+
+    Args:
+        message: The new user message.
+        chat_history: Previous (user, assistant) turns.
+        system_prompt: Optional system prompt; skipped when empty.
+        max_new_tokens: Maximum number of tokens to generate.
+        temperature: Sampling temperature.
+        top_p: Nucleus sampling threshold.
+        top_k: Top-k sampling cutoff.
+        repetition_penalty: Repetition penalty passed to ``model.generate``.
+
+    Yields:
+        The cumulative highlighted HTML of the reply after each chunk.
+    """
+    conversation = []
+    if system_prompt:
+        conversation.append({"role": "system", "content": system_prompt})
+    for user, assistant in chat_history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        # Keep the most recent tokens so the latest turns survive the trim.
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        input_ids=input_ids,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        streamer=streamer,
+        output_hidden_states=True,
+        return_dict_in_generate=True,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    generated_text = ""
+    highlighted_text = ""
+    # The streamer can only be consumed once, so plain-text streaming and probe
+    # scoring must share this single loop.
+    # TODO this re-runs the model per chunk; do autoregressive generation instead.
+    for new_text in streamer:
+        if not new_text:
+            # The streamer may emit empty chunks; nothing to score or display.
+            continue
+        generated_text += new_text
+        # Score the reply in the context of the prompt, not as stand-alone text.
+        reply_ids = tokenizer.encode(
+            generated_text, add_special_tokens=False, return_tensors="pt"
+        ).to(model.device)
+        current_input_ids = torch.cat([input_ids, reply_ids], dim=1)
+        with torch.no_grad():
+            outputs = model(current_input_ids, output_hidden_states=True)
+        # hidden_states holds one (batch, seq, hidden) tensor per layer output;
+        # take the last token of each -> (n_layer_outputs, batch, hidden).
+        last_hidden_state = torch.stack(
+            [layer[:, -1, :].float().cpu() for layer in outputs.hidden_states]
+        ).numpy()
+        # TODO potentially need to only compute uncertainty for the last token in sentence?
+        # layer_range is a (start, stop) pair: slice the layers, then join them
+        # along the feature axis -> (batch, n_selected_layers * hidden).
+        start, stop = layer_range
+        probe_input = np.concatenate(last_hidden_state[start:stop], axis=1)
+        # NOTE(review): predict() returns hard class labels, so highlighting is
+        # all-or-nothing; predict_proba would give a graded intensity -- confirm intent.
+        uncertainty_score = probe.predict(probe_input)
+        highlighted_text += highlight_text(new_text, uncertainty_score[0])
+        yield highlighted_text
+def highlight_text(text: str, uncertainty_score: float) -> str:
+    """Wrap ``text`` in an HTML span whose background colour encodes the score.
+
+    Positive scores fade from white towards red as they approach 1; zero and
+    negative scores fade from white towards green as they approach -1.
+
+    Args:
+        text: Raw text to display; it is HTML-escaped before being embedded.
+        uncertainty_score: Score to visualise; values outside [-1, 1] are clamped.
+
+    Returns:
+        An HTML ``<span>`` string with an inline background colour.
+    """
+    import html  # local import keeps this block self-contained
+    # Clamp so each colour channel stays in 0..255; otherwise "%02X" would
+    # render a negative number and produce an invalid colour such as "#FF-FF-FF".
+    score = max(-1.0, min(1.0, float(uncertainty_score)))
+    if score > 0:
+        fade = int(255 * (1 - score))
+        html_color = "#%02X%02X%02X" % (255, fade, fade)
+    else:
+        fade = int(255 * (1 + score))
+        html_color = "#%02X%02X%02X" % (fade, 255, fade)
+    # Escape the model output so characters like "<" or "&" cannot break the markup.
+    return '<span style="background-color: {}; color: black">{}</span>'.format(
+        html_color, html.escape(text)
+    )
+# Chat UI wired to generate(). The additional inputs are listed in the same
+# order as generate()'s parameters after chat_history (system_prompt,
+# max_new_tokens, temperature, top_p, top_k, repetition_penalty), and each
+# slider's initial value matches the corresponding default in generate().
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(label="System prompt", lines=6),
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["What is the capital of France?"],
+        ["Explain the theory of relativity in simple terms."],
+        ["Write a short poem about artificial intelligence."]
+    ],
+    title="Llama-2 7B Chat with Streamable Semantic Uncertainty Probe",
+    description=DESCRIPTION,
+)
+# Launch the Gradio app only when this file is run as a script.
+if __name__ == "__main__":
+    chat_interface.launch()

debug.ipynb ADDED Viewed

	@@ -0,0 +1,233 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
+      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/shreshth/anaconda3/envs/llm-test/lib/python3.11/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n",
+      "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n",
+      "the same time. Both libraries are known to be incompatible and this\n",
+      "can cause random crashes or deadlocks on Linux when loaded in the\n",
+      "same Python program.\n",
+      "Using threadpoolctl may cause crashes or deadlocks. For more\n",
+      "information and possible workarounds, please see\n",
+      "    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n",
+      "\n",
+      "  warnings.warn(msg, RuntimeWarning)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'name': 'nq',\n",
+       " 't_bmodel': LogisticRegression(),\n",
+       " 't_amodel': LogisticRegression(),\n",
+       " 'sep_layer_range': (27, 32),\n",
+       " 'ap_layer_range': (17, 22)}"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# test probe loading \n",
+    "import pickle as pkl\n",
+    "import numpy as np\n",
+    "import sklearn \n",
+    "from sklearn import linear_model\n",
+    "import os\n",
+    "os.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = \"1\"\n",
+    "\n",
+    "# load the probe data\n",
+    "with open(\"./model/20240625-131035_demo.pkl\", \"rb\") as f:\n",
+    "    probe_data = pkl.load(f)\n",
+    "# take the NQ open one\n",
+    "probe_data = probe_data[-2]\n",
+    "probe_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "probe = probe_data['t_bmodel']\n",
+    "layer_range = probe_data['sep_layer_range']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1c0e30b73cab48069e985203c598a9b0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer\n",
+    "\n",
+    "model_id = \"meta-llama/Llama-2-7b-chat-hf\"\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"cpu\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "tokenizer.use_default_system_prompt = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13,  3492,\n",
+      "           526,   263,  8444, 20255, 29889,    13, 29966,   829, 14816, 29903,\n",
+      "          6778,    13,    13,  5816,   338,   278,  7483,   310,  3444, 29973,\n",
+      "           518, 29914, 25580, 29962]]) torch.Size([1, 34])\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from threading import Thread\n",
+    "\n",
+    "system_prompt = \"You are a helpful assistant.\"\n",
+    "message = \"what is the capital of France?\"\n",
+    "max_new_tokens = 100\n",
+    "top_p = 0.9\n",
+    "top_k = 50\n",
+    "temperature = 0.7\n",
+    "repetition_penalty = 1.2\n",
+    "\n",
+    "conversation = []\n",
+    "\n",
+    "conversation.append({\"role\": \"system\", \"content\": system_prompt})\n",
+    "conversation.append({\"role\": \"user\", \"content\": message})\n",
+    "input_ids = tokenizer.apply_chat_template(conversation, return_tensors=\"pt\")\n",
+    "input_ids = input_ids.to(model.device)\n",
+    "print(input_ids, input_ids.shape)\n",
+    "streamer = TextIteratorStreamer(tokenizer, timeout=1000.0, skip_prompt=True, skip_special_tokens=True)\n",
+    "generation_kwargs = dict(\n",
+    "    input_ids=input_ids,\n",
+    "    max_new_tokens=max_new_tokens,\n",
+    "    do_sample=True,\n",
+    "    top_p=top_p,\n",
+    "    top_k=top_k,\n",
+    "    temperature=temperature,\n",
+    "    repetition_penalty=repetition_penalty,\n",
+    "    streamer=streamer,\n",
+    "    output_hidden_states=True,\n",
+    "    return_dict_in_generate=True,\n",
+    ")\n",
+    "\n",
+    "thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
+    "thread.start()\n",
+    "\n",
+    "generated_text = \"\"\n",
+    "highlighted_text = \"\"\n",
+    "\n",
+    "for new_text in streamer:\n",
+    "    print(new_text)\n",
+    "    generated_text += new_text\n",
+    "    current_input_ids = tokenizer.encode(generated_text, return_tensors=\"pt\").to(model.device)\n",
+    "    print(current_input_ids, current_input_ids.shape)\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(current_input_ids, output_hidden_states=True)\n",
+    "        print(outputs)\n",
+    "        hidden = outputs.hidden_states    \n",
+    "        print(hidden.shape)\n",
+    "        # Stack second last token embeddings from all layers \n",
+    "        # if len(hidden) == 1:  # FIX: runtime error for mistral-7b on bioasq\n",
+    "        #     sec_last_input = hidden[0]\n",
+    "        # elif ((n_generated - 2) >= len(hidden)):\n",
+    "        #     sec_last_input = hidden[-2]\n",
+    "        # else:\n",
+    "        #     sec_last_input = hidden[n_generated - 2]\n",
+    "        # sec_last_token_embedding = torch.stack([layer[:, -1, :].cpu() for layer in sec_last_input])\n",
+    "        # print(sec_last_token_embedding.shape)\n",
+    "    last_hidden_state = outputs.hidden_states[-1][:, -1, :].cpu().numpy()\n",
+    "    print(last_hidden_state.shape)  \n",
+    "    # TODO potentially need to only compute uncertainty for the last token in sentence?\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# concat hidden states\n",
+    "\n",
+    "\n",
+    "hidden_states = np.concatenate(np.array(hidden_states)[layer_range], axis=1)\n",
+    "# predict with probe\n",
+    "pred = probe.predict(hidden_states)\n",
+    "print(pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "llm-test",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

model/{spiece.model → 20240625-131035_demo.pkl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5cbdfa8aa7c54c8c5af85b78c309c54a5f2749a20468bf6f60eee007fe6fec1
-size 805634

 version https://git-lfs.github.com/spec/v1
+oid sha256:c48fbd05e51c7b72d97012266d3f483198ec99ce66ed9611eefc347ab2e21360
+size 1313378

model/config.json DELETED Viewed

@@ -1,40 +0,0 @@
-{
-  "_name_or_path": "rinna/japanese-gpt2-medium",
-  "activation_function": "gelu_new",
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.1,
-  "bos_token_id": 1,
-  "embd_pdrop": 0.1,
-  "eos_token_id": 2,
-  "gradient_checkpointing": false,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 1024,
-  "n_head": 16,
-  "n_inner": 4096,
-  "n_layer": 24,
-  "n_positions": 1024,
-  "reorder_and_upcast_attn": false,
-  "resid_pdrop": 0.1,
-  "scale_attn_by_inverse_layer_idx": false,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "task_specific_params": {
-    "text-generation": {
-      "do_sample": true,
-      "max_length": 50
-    }
-  },
-  "torch_dtype": "float32",
-  "transformers_version": "4.25.1",
-  "use_cache": true,
-  "vocab_size": 32000
-}

model/pytorch_model.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:21035c9390e20b77578237469586e51d2275cf889d1c3418c4b57da8c67160c6
-size 1369783965

model/special_tokens_map.json DELETED Viewed

@@ -1,9 +0,0 @@
-{
-  "bos_token": "<s>",
-  "cls_token": "[CLS]",
-  "eos_token": "</s>",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "<unk>"
-}

model/tokenizer_config.json DELETED Viewed

@@ -1,17 +0,0 @@
-{
-  "additional_special_tokens": [],
-  "bos_token": "<s>",
-  "cls_token": "[CLS]",
-  "do_lower_case": true,
-  "eos_token": "</s>",
-  "extra_ids": 0,
-  "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
-  "name_or_path": "rinna/japanese-gpt2-medium",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "sp_model_kwargs": {},
-  "special_tokens_map_file": "/Users/agiats/.cache/huggingface/hub/models--rinna--japanese-gpt2-medium/snapshots/f464b76739c884d8b0479a0a7705b7fa71c3fd5a/special_tokens_map.json",
-  "tokenizer_class": "T5Tokenizer",
-  "unk_token": "<unk>"
-}