Using the model for inference

#3
by ihexx - opened

I followed the formatting listed in the paper and tried to test it on a simple toy problem, but the results I am getting appear to be worse than random. What am I doing wrong?

The problem just randomly creates classification boundaries for a single random variable, so it should be trivial, but TabuLa got an accuracy of 20% on my tests, while a default-config logistic regression classifier trained on the same number of examples as the context got 95% test accuracy.
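
For reference, the logistic regression baseline was nothing fancy; it was something along these lines, a minimal sketch with sklearn defaults (variable names are just illustrative and exact numbers will vary run to run):

from sklearn.linear_model import LogisticRegression
import numpy as np

# Same toy setup: one integer feature, three classes split at the 33rd/66th percentiles
X = np.random.randint(0, 1001, 500)
sp1, sp2 = np.percentile(X, [33, 66]).astype(int)
Y = np.array(['a' if x < sp1 else 'b' if sp1 <= x < sp2 else 'c' for x in X])

clf = LogisticRegression()  # default config
clf.fit(X[:400].reshape(-1, 1), Y[:400])
print(clf.score(X[400:].reshape(-1, 1), Y[400:]))  # accuracy on held-out samples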

I've composed it into a single script:

import os

import numpy as np
import torch
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load environment variables from .env file
load_dotenv()

# Use the HF_HOME value when loading models
tokenizer = AutoTokenizer.from_pretrained("mlfoundations/tabula-8b", cache_dir=os.getenv('HF_HOME',None))
model = AutoModelForCausalLM.from_pretrained("mlfoundations/tabula-8b", cache_dir=os.getenv('HF_HOME',None))

# Move model to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# model = model.half()
print(f"Using device: {device}")

# Generate toy dataset
def generate_custom_dataset(n_samples=100):
    X = np.random.randint(0, 1001, n_samples)
    sp1, sp2 = np.percentile(X, [33, 66]).astype(int)
    Y = np.array(['a' if x < sp1 else 'b' if sp1 <= x < sp2 else 'c' for x in X])
    return X, Y, sp1, sp2

# Format data for Tabula
def format_data_for_tabula(X, Y, sp1, sp2, n_context=5):
    data = []
    for x, y in zip(X, Y):
        row = (f"Predict the class: ||a||b||c|| "
               f"The X is {x}. "
               f"What is the class? ||a||b||c||<|endinput|>"
               f"{y}<|endcompletion|>")
        data.append(row)
    context = "\n".join(data[:n_context])
    return context

def run_test(x_test, sp1, sp2, context):
    y_true = 'a' if x_test < sp1 else 'b' if sp1 <= x_test < sp2 else 'c'
    input_text = (f"{context}\n"
                  f"Predict the class: ||a||b||c|| "
                  f"The X is {x_test}. "
                  f"What is the class? ||a||b||c||<|endinput|>")

    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output = model.generate(input_ids, max_new_tokens=5, num_return_sequences=1, no_repeat_ngram_size=2)

    # result = tokenizer.decode(output[0], skip_special_tokens=True)
    result = tokenizer.decode(output[0, input_ids.shape[-1]:], skip_special_tokens=False)

    print(f"{result=}")
    spl = result.split("<|endcompletion|>")
    print(f"{spl=}")
    predicted_class = spl[0].strip().lower()

    return y_true, predicted_class

dset_size = 500
X, Y, sp1, sp2 = generate_custom_dataset(n_samples=dset_size)

print(f"Split points: {sp1}, {sp2}")

# Prepare context
context = format_data_for_tabula(X, Y, sp1, sp2, n_context=dset_size)

# Run multiple tests
n_tests = 10
true_classes = []
predicted_classes = []

for i in range(n_tests):
    x_test = np.random.randint(0, 1001)
    y_true, y_pred = run_test(x_test, sp1, sp2, context)
    true_classes.append(y_true)
    predicted_classes.append(y_pred)
    print(f"Test {i + 1}: X = {x_test}, True class = {y_true}, Predicted class = {y_pred}")

# Calculate accuracy
accuracy = accuracy_score(true_classes, predicted_classes)
print(f"\nAccuracy: {accuracy:.4f}")

print("Model loaded successfully!")

Update:
I couldn't get the HF version to work, but I did retry this with the llama.cpp version, and it got 99% accuracy on the same toy problem.
Just gonna leave this here for posterity:

import numpy as np
import tqdm
from llama_cpp import Llama
from sklearn.metrics import accuracy_score


def generate_custom_dataset(n_samples=100):
    X = np.random.randint(0, 1001, n_samples)
    sp1, sp2 = np.percentile(X, [33, 66]).astype(int)
    Y = np.array(['a' if x < sp1 else 'b' if sp1 <= x < sp2 else 'c' for x in X])
    return X, Y, sp1, sp2


def format_data_for_tabula(X, Y, sp1, sp2, n_context=5):
    data = []
    for x, y in zip(X, Y):
        row = (f"Predict the class: ||a||b||c|| "
               f"The X is {x}. "
               f"What is the class? ||a||b||c||<|endinput|>"
               f"{y}<|endcompletion|>")
        data.append(row)
    context = "\n".join(data[:n_context])
    return context


def run_test(x_test, sp1, sp2, context, llm):
    y_true = 'a' if x_test < sp1 else 'b' if sp1 <= x_test < sp2 else 'c'
    input_text = (f"{context}\n"
                  f"Predict the class: ||a||b||c|| "
                  f"The X is {x_test}. "
                  f"What is the class? ||a||b||c||<|endinput|>")

    output = llm(
        input_text,
        max_tokens=5,
        stop=["<|endcompletion|>"],
        echo=True
    )

    result = output['choices'][0]['text']
    predicted_class = result.split("<|endinput|>")[-1].strip().lower()

    return y_true, predicted_class


# Initialize llama.cpp model
llm = Llama(
    model_path=r"G:\llama_cpp_models\specialist\tabula-8b-Q8_0\tabula-8b-Q8_0.gguf",
    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
    n_ctx=8192,  # Set context window
)
print("Model loaded successfully!")

# Generate dataset
dset_size = 100
X, Y, sp1, sp2 = generate_custom_dataset(n_samples=dset_size)
print(f"Split points: {sp1}, {sp2}")

# Prepare context
context = format_data_for_tabula(X, Y, sp1, sp2, n_context=dset_size)

# Run multiple tests
n_tests = 100
true_classes = []
predicted_classes = []

for i in tqdm.tqdm(range(n_tests)):
    x_test = np.random.randint(0, 1001)
    y_true, y_pred = run_test(x_test, sp1, sp2, context, llm)
    true_classes.append(y_true)
    predicted_classes.append(y_pred)
    # print(f"Test {i + 1}: X = {x_test}, True class = {y_true}, Predicted class = {y_pred}")

# Calculate accuracy
accuracy = accuracy_score(true_classes, predicted_classes)
print(f"\nAccuracy: {accuracy:.4f}")
print("Done")
ihexx changed discussion status to closed
ML Foundations org

Thanks! I know this issue is closed, but I'm adding a comment in case future readers arrive here :)

There are some extra utilities we would suggest using when performing inference with TabuLa-8B; you can find more info in the (recently added) inference notebook in our repo: https://github.com/mlfoundations/rtfm?tab=readme-ov-file#quickstart---inference . I would not suggest relying on the above code alone: it doesn't match our internal usage of the model, so we can't easily verify that it produces identical input-output formatting (although I haven't reviewed it line by line to look for issues).

However, your use case of checking performance on labeled data would be best covered by our evaluation pipeline, which is designed for exactly this case: the data is already labeled and you want to check zero- or few-shot performance on it. That evaluation code is what we use to evaluate TabuLa-8B; it is also probably much more efficient than the inference utilities, which are designed for usability rather than performance.

jpgard changed discussion title from Worse than random results to Using the model for inference
