vishal0719 committed
Commit 9ccf1f4 • 1 Parent(s): e454491

reverted to the meta-llama model with quantization
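For context on the "with quantization" part, here is a back-of-the-envelope estimate (my arithmetic, not from the commit) of the weight memory that 4-bit NF4 saves on a 7B-parameter model:

# Approximate weight memory only; ignores activations and the small
# overhead of quantization scales/zero-points.
params = 7e9
print(f"fp16 weights: {params * 2 / 1e9:.1f} GB")   # ~14.0 GB
print(f"nf4 weights:  {params * 0.5 / 1e9:.1f} GB")  # ~3.5 GB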

Files changed (1)
  1. app.py +8 -9
app.py CHANGED
@@ -29,20 +29,19 @@ import transformers
 from accelerate import disk_offload
 
 # Model used
-# model_id = 'meta-llama/Llama-2-7b-chat-hf'
-model_id = 'NousResearch/Llama-2-7b-chat-hf'
+model_id = 'meta-llama/Llama-2-7b-chat-hf'
 
 # Detects available device (GPU or CPU)
 device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
 
 # set quantization configuration to load large model with less GPU memory
 # this requires the `bitsandbytes` library
-# bnb_config = transformers.BitsAndBytesConfig(
-#     load_in_4bit=True,
-#     bnb_4bit_quant_type='nf4',
-#     bnb_4bit_use_double_quant=True,
-#     bnb_4bit_compute_dtype=bfloat16
-# )
+bnb_config = transformers.BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type='nf4',
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=bfloat16
+)
 
 # Hugging Face Access Token
 hf_auth = os.environ.get("hf_auth")
@@ -58,7 +57,7 @@ model = transformers.AutoModelForCausalLM.from_pretrained(
     model_id,
     trust_remote_code=True,
     config=model_config,
-    # quantization_config=bnb_config,
+    quantization_config=bnb_config,
     device_map='auto',
     token=hf_auth
 )
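For reference, a self-contained sketch of the loading path as it stands after this commit. The AutoConfig line is my assumption about the rest of app.py (the diff only shows that a model_config object is passed as config=); everything else mirrors the lines in the diff above.

import os

import transformers
from torch import bfloat16, cuda

# Model used (gated repo: the token must have access to meta-llama)
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Detect the available device; bitsandbytes 4-bit loading needs a CUDA GPU
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization with double quantization, computing in bfloat16
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Hugging Face access token, read from the environment as in app.py
hf_auth = os.environ.get("hf_auth")

# Assumption: model_config is built with AutoConfig, as config=model_config implies
model_config = transformers.AutoConfig.from_pretrained(model_id, token=hf_auth)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,  # quantize weights at load time
    device_map='auto',               # let accelerate place layers automatically
    token=hf_auth
)

With device_map='auto', accelerate handles layer placement across the available devices, so no explicit model.to(device) call is needed.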