Mihaiii/stablelm-zephyr-3b-OV_FP14-4BIT

This is the quantized version of stablelm-zephyr-3b, obtained by running the steps from here.

You can use it like this (steps taken from the above link):

pip install -q git+https://github.com/huggingface/optimum-intel.git@e22a2ac26b3a6c7854da956d538f784ebeca879b onnx openvino-nightly

then

from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoConfig, AutoTokenizer
from optimum.utils import NormalizedTextConfig, NormalizedConfigManager

# Register a normalized text config under both spellings of the StableLM
# model type (underscore and hyphen). Each entry maps `num_layers` to the
# config's `num_hidden_layers` field and `num_attention_heads` to
# `num_attention_heads`.
for model_type in ('stablelm_epoch', 'stablelm-epoch'):
    NormalizedConfigManager._conf[model_type] = NormalizedTextConfig.with_args(
        num_layers='num_hidden_layers',
        num_attention_heads='num_attention_heads',
    )

# Identifier handed to every `from_pretrained` call below.
model_path = 'Mihaiii/stablelm-zephyr-3b-OV_FP14-4BIT'

# Load the config first (with trust_remote_code=True) and pass it explicitly
# to the OpenVINO model loader; the model is loaded with compile=False and
# stateful=True.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained(
    model_path,
    compile=False,
    config=config,
    stateful=True,
)

# The tokenizer comes from the same identifier, also with trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# A single-turn conversation: one message with the 'user' role.
user_message = {'role': 'user', 'content': 'List 3 synonyms for the word "tiny"'}
prompt = [user_message]

# Render the conversation through the tokenizer's chat template, asking for
# the generation prompt to be added and for the result in 'pt' tensor format.
template_options = {'add_generation_prompt': True, 'return_tensors': 'pt'}
inputs = tokenizer.apply_chat_template(prompt, **template_options)

# Sampling is enabled (do_sample=True) with temperature 0.8, and at most
# 1024 new tokens are generated.
generation_options = {
    'max_new_tokens': 1024,
    'temperature': 0.8,
    'do_sample': True,
}

# Move the templated inputs to the model's device before generating.
tokens = model.generate(inputs.to(model.device), **generation_options)

# Decode the first returned sequence; special tokens are kept in the output.
decoded_text = tokenizer.decode(tokens[0], skip_special_tokens=False)
print(decoded_text)