Update README.md
README.md CHANGED
@@ -90,24 +90,56 @@ pip install einops
 
 You can then run this example code:
 ```python
-import
-from
-
+from transformers import AutoTokenizer, pipeline, logging
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+import argparse
+
+model_name_or_path = "TheBloke/falcon-7b-instruct-GPTQ"
+# You could also download the model locally, and access it there
+# model_name_or_path = "/path/to/TheBloke_falcon-7b-instruct-GPTQ"
+
+model_basename = "gptq_model-4bit-64g"
 
-
-quantized_model_dir = "/path/to/falcon7b-instruct-gptq"
+use_triton = False
 
-
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=False)
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 
-model = AutoGPTQForCausalLM.from_quantized(
+model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
+        model_basename=model_basename,
+        use_safetensors=True,
+        trust_remote_code=True,
+        device="cuda:0",
+        use_triton=use_triton,
+        quantize_config=None)
 
-prompt = "
-prompt_template
+prompt = "Tell me about AI"
+prompt_template=f'''### Human: {prompt}
+### Assistant:'''
 
-
-
+print("\n\n*** Generate:")
+
+input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
 print(tokenizer.decode(output[0]))
+
+# Inference can also be done using transformers' pipeline
+# Note that if you use pipeline, you will see a spurious error message saying the model type is not supported
+# This can be ignored! Or you can hide it with the following logging line:
+# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
+logging.set_verbosity(logging.CRITICAL)
+
+print("*** Pipeline:")
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    temperature=0.7,
+    top_p=0.95,
+    repetition_penalty=1.15
+)
+
+print(pipe(prompt_template)[0]['generated_text'])
 ```
 
 ## Provided files
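The commented-out `model_name_or_path` line in the new example points at a local copy of the repo rather than the Hub ID. As a side note (not part of this commit), here is a minimal sketch of one way to fetch the model locally first, assuming the `huggingface_hub` package is installed and supports `local_dir`; the local path is only a placeholder:

```python
# Sketch only, not part of the diff above: one way to "download the model locally,
# and access it there", as the commented-out line in the example suggests.
# Assumes the huggingface_hub package is installed; the path is a placeholder.
from huggingface_hub import snapshot_download

local_path = "/path/to/TheBloke_falcon-7b-instruct-GPTQ"
snapshot_download(repo_id="TheBloke/falcon-7b-instruct-GPTQ", local_dir=local_path)

# The example code can then load from the local folder instead of the Hub ID:
# model_name_or_path = local_path
```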