vishal0719 committed
Commit 9ccf1f4 • 1 Parent(s): e454491

reverted to the meta-llama model with quantization
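For context on the "with quantization" part, here is a back-of-the-envelope estimate (my arithmetic, not from the commit) of the weight memory that 4-bit NF4 saves on a 7B-parameter model:

# Approximate weight memory only; ignores activations and the small
# overhead of quantization scales/zero-points.
params = 7e9
print(f"fp16 weights: {params * 2 / 1e9:.1f} GB")   # ~14.0 GB
print(f"nf4 weights:  {params * 0.5 / 1e9:.1f} GB")  # ~3.5 GB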

Files changed (1)
  1. app.py +8 -9
app.py CHANGED
@@ -29,20 +29,19 @@ import transformers
 from accelerate import disk_offload
 
 # Model used
-# model_id = 'meta-llama/Llama-2-7b-chat-hf'
-model_id = 'NousResearch/Llama-2-7b-chat-hf'
+model_id = 'meta-llama/Llama-2-7b-chat-hf'
 
 # Detects available device (GPU or CPU)
 device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
 
 # set quantization configuration to load large model with less GPU memory
 # this requires the `bitsandbytes` library
-# bnb_config = transformers.BitsAndBytesConfig(
-#     load_in_4bit=True,
-#     bnb_4bit_quant_type='nf4',
-#     bnb_4bit_use_double_quant=True,
-#     bnb_4bit_compute_dtype=bfloat16
-# )
+bnb_config = transformers.BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type='nf4',
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=bfloat16
+)
 
 # Hugging Face Access Token
 hf_auth = os.environ.get("hf_auth")
@@ -58,7 +57,7 @@ model = transformers.AutoModelForCausalLM.from_pretrained(
     model_id,
     trust_remote_code=True,
     config=model_config,
-    # quantization_config=bnb_config,
+    quantization_config=bnb_config,
     device_map='auto',
     token=hf_auth
 )
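For reference, a self-contained sketch of the loading path as it stands after this commit. The AutoConfig line is my assumption about the rest of app.py (the diff only shows that a model_config object is passed as config=); everything else mirrors the lines in the diff above.

import os

import transformers
from torch import bfloat16, cuda

# Model used (gated repo: the token must have access to meta-llama)
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Detect the available device; bitsandbytes 4-bit loading needs a CUDA GPU
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization with double quantization, computing in bfloat16
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Hugging Face access token, read from the environment as in app.py
hf_auth = os.environ.get("hf_auth")

# Assumption: model_config is built with AutoConfig, as config=model_config implies
model_config = transformers.AutoConfig.from_pretrained(model_id, token=hf_auth)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,  # quantize weights at load time
    device_map='auto',               # let accelerate place layers automatically
    token=hf_auth
)

With device_map='auto', accelerate handles layer placement across the available devices, so no explicit model.to(device) call is needed.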