Using the model for inference
I followed the formatting listed in the paper and tried to test it on a simple toy problem, but the results I am getting appear to be worse than random. What am I doing wrong?
The problem just randomly creates classification boundaries for one random variable, so it should be trivial, but TabuLa gets an accuracy of 20% on my tests, while a default-config logistic regression classifier trained on the same number of examples as the context gets 95% test accuracy (baseline sketch included below, after the script).
I've composed it into a single script:
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import torch
from sklearn.metrics import accuracy_score
import random
import re
# Load environment variables from .env file
load_dotenv()
# Use the HF_HOME value when loading models
tokenizer = AutoTokenizer.from_pretrained("mlfoundations/tabula-8b", cache_dir=os.getenv('HF_HOME',None))
model = AutoModelForCausalLM.from_pretrained("mlfoundations/tabula-8b", cache_dir=os.getenv('HF_HOME',None))
# Move model to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# model = model.half()
print(f"Using device: {device}")
# Generate toy dataset
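# One integer feature uniformly sampled from [0, 1000]; labels a/b/c assigned by the 33rd/66th percentile split points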
def generate_custom_dataset(n_samples=100):
    X = np.random.randint(0, 1001, n_samples)
    sp1, sp2 = np.percentile(X, [33, 66]).astype(int)
    Y = np.array(['a' if x < sp1 else 'b' if sp1 <= x < sp2 else 'c' for x in X])
    return X, Y, sp1, sp2
# Format data for Tabula
def format_data_for_tabula(X, Y, sp1, sp2, n_context=5):
    data = []
    for x, y in zip(X, Y):
        row = (f"Predict the class: ||a||b||c|| "
               f"The X is {x}. "
               f"What is the class? ||a||b||c||<|endinput|>"
               f"{y}<|endcompletion|>")
        data.append(row)
    context = "\n".join(data[:n_context])
    return context
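# Example of one serialized row produced by the loop above (illustrative values):
# Predict the class: ||a||b||c|| The X is 412. What is the class? ||a||b||c||<|endinput|>b<|endcompletion|>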
def run_test(x_test, sp1, sp2, context):
    y_true = 'a' if x_test < sp1 else 'b' if sp1 <= x_test < sp2 else 'c'
    input_text = (f"{context}\n"
                  f"Predict the class: ||a||b||c|| "
                  f"The X is {x_test}. "
                  f"What is the class? ||a||b||c||<|endinput|>")
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output = model.generate(input_ids, max_new_tokens=5, num_return_sequences=1, no_repeat_ngram_size=2)
    # result = tokenizer.decode(output[0], skip_special_tokens=True)
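    # Decode only the newly generated tokens (skip_special_tokens=False so any end-of-completion marker is preserved for the split below)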
    result = tokenizer.decode(output[0, input_ids.shape[-1]:], skip_special_tokens=False)
    print(f"{result=}")
    spl = result.split("<|endcompletion|>")
    print(f"{spl=}")
    predicted_class = spl[0].strip().lower()
    return y_true, predicted_class
dset_size = 500
X, Y, sp1, sp2 = generate_custom_dataset(n_samples=dset_size)
print(f"Split points: {sp1}, {sp2}")
# Prepare context
context = format_data_for_tabula(X, Y, sp1, sp2, n_context=dset_size)
# Run multiple tests
n_tests = 10
true_classes = []
predicted_classes = []
for i in range(n_tests):
    x_test = np.random.randint(0, 1001)
    y_true, y_pred = run_test(x_test, sp1, sp2, context)
    true_classes.append(y_true)
    predicted_classes.append(y_pred)
    print(f"Test {i + 1}: X = {x_test}, True class = {y_true}, Predicted class = {y_pred}")
# Calculate accuracy
accuracy = accuracy_score(true_classes, predicted_classes)
print(f"\nAccuracy: {accuracy:.4f}")
print("Model loaded successfully!")
Update:
I couldn't get the HF version to work, but I did retry this with the llama.cpp version, and it got 99% accuracy on the same toy problem.
Just gonna leave this here for posterity:
import os
import numpy as np
from sklearn.metrics import accuracy_score
import random
from llama_cpp import Llama
import tqdm
def generate_custom_dataset(n_samples=100):
    X = np.random.randint(0, 1001, n_samples)
    sp1, sp2 = np.percentile(X, [33, 66]).astype(int)
    Y = np.array(['a' if x < sp1 else 'b' if sp1 <= x < sp2 else 'c' for x in X])
    return X, Y, sp1, sp2
def format_data_for_tabula(X, Y, sp1, sp2, n_context=5):
    data = []
    for x, y in zip(X, Y):
        row = (f"Predict the class: ||a||b||c|| "
               f"The X is {x}. "
               f"What is the class? ||a||b||c||<|endinput|>"
               f"{y}<|endcompletion|>")
        data.append(row)
    context = "\n".join(data[:n_context])
    return context
def run_test(x_test, sp1, sp2, context, llm):
    y_true = 'a' if x_test < sp1 else 'b' if sp1 <= x_test < sp2 else 'c'
    input_text = (f"{context}\n"
                  f"Predict the class: ||a||b||c|| "
                  f"The X is {x_test}. "
                  f"What is the class? ||a||b||c||<|endinput|>")
    output = llm(
        input_text,
        max_tokens=5,
        stop=["<|endcompletion|>"],
        echo=True
    )
    result = output['choices'][0]['text']
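    # With echo=True the returned text includes the prompt, so everything after the final <|endinput|> is the model's prediction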
    predicted_class = result.split("<|endinput|>")[-1].strip().lower()
    return y_true, predicted_class
# Initialize llama.cpp model
llm = Llama(
    model_path=r"G:\llama_cpp_models\specialist\tabula-8b-Q8_0\tabula-8b-Q8_0.gguf",
    n_gpu_layers=-1,  # offload all layers to the GPU
    n_ctx=8192,  # context window size
)
print("Model loaded successfully!")
# Generate dataset
dset_size = 100
X, Y, sp1, sp2 = generate_custom_dataset(n_samples=dset_size)
print(f"Split points: {sp1}, {sp2}")
# Prepare context
context = format_data_for_tabula(X, Y, sp1, sp2, n_context=dset_size)
# Run multiple tests
n_tests = 100
true_classes = []
predicted_classes = []
for i in tqdm.tqdm(range(n_tests)):
    x_test = np.random.randint(0, 1001)
    y_true, y_pred = run_test(x_test, sp1, sp2, context, llm)
    true_classes.append(y_true)
    predicted_classes.append(y_pred)
    # print(f"Test {i + 1}: X = {x_test}, True class = {y_true}, Predicted class = {y_pred}")
# Calculate accuracy
accuracy = accuracy_score(true_classes, predicted_classes)
print(f"\nAccuracy: {accuracy:.4f}")
print("Done")
Thanks, I know this issue is closed, but I'm adding a comment in case future readers arrive here :)
There are some extra utilities we would suggest using when performing inference with TabuLa-8B; you can find more info in the (recently added) inference notebook in our repo: https://github.com/mlfoundations/rtfm?tab=readme-ov-file#quickstart---inference. I would not suggest using only the above code, as it doesn't match our internal usage of the model, so we cannot easily verify that it produces identical input-output formatting for the model (although I haven't reviewed the above code line by line to look for issues).
However, your use case -- checking performance on labeled data -- would be best covered by using our evaluation pipeline (which is designed for exactly this case, where the data is already labeled and you want to check zero- or few-shot performance on it). That evaluation code is what we use to evaluate TabuLa-8B; it is also probably much more efficient than the inference utilities (which are designed for usability but not necessarily for performance).