ssaroya
/

gptq_model

Inference Endpoints

Model card Files Files and versions Community

ssaroya committed on May 22, 2023

Commit

a405859

1 Parent(s): e418f96

Update llama_inference_class.py

Browse files

Files changed (1) hide show

llama_inference_class.py +46 -46

llama_inference_class.py CHANGED Viewed

@@ -34,52 +34,52 @@ class ModelInference:
         model.seqlen = 2048
         return model
-def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
-    from transformers import LlamaConfig, LlamaForCausalLM
-    config = LlamaConfig.from_pretrained(model)
-    def noop(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = noop
-    torch.nn.init.uniform_ = noop
-    torch.nn.init.normal_ = noop
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = LlamaForCausalLM(config)
-    torch.set_default_dtype(torch.float)
-    if eval:
-        model = model.eval()
-    layers = find_layers(model)
-    for name in ['lm_head']:
-        if name in layers:
-            del layers[name]
-    quant.make_quant_linear(model, layers, wbits, groupsize)
-    del layers
-    print('Loading model ...')
-    if checkpoint.endswith('.safetensors'):
-        from safetensors.torch import load_file as safe_load
-        model.load_state_dict(safe_load(checkpoint), strict=False)
-    else:
-        model.load_state_dict(torch.load(checkpoint), strict=False)
-    if eval:
-        quant.make_quant_attn(model)
-        quant.make_quant_norm(model)
-        if fused_mlp:
-            quant.make_fused_mlp(model)
-    if warmup_autotune:
-        quant.autotune_warmup_linear(model, transpose=not (eval))
-        if eval and fused_mlp:
-            quant.autotune_warmup_fused(model)
-    model.seqlen = 2048
-    print('Done.')
-    return model
     def generate_text(self, text, min_length=10, max_length=50, top_p=0.95, temperature=0.8):
         input_ids = self.tokenizer.encode(text, return_tensors="pt").to(DEV)

         model.seqlen = 2048
         return model
+    def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
+        from transformers import LlamaConfig, LlamaForCausalLM
+        config = LlamaConfig.from_pretrained(model)
+        def noop(*args, **kwargs):
+            pass
+        torch.nn.init.kaiming_uniform_ = noop
+        torch.nn.init.uniform_ = noop
+        torch.nn.init.normal_ = noop
+        torch.set_default_dtype(torch.half)
+        transformers.modeling_utils._init_weights = False
+        torch.set_default_dtype(torch.half)
+        model = LlamaForCausalLM(config)
+        torch.set_default_dtype(torch.float)
+        if eval:
+            model = model.eval()
+        layers = find_layers(model)
+        for name in ['lm_head']:
+            if name in layers:
+                del layers[name]
+        quant.make_quant_linear(model, layers, wbits, groupsize)
+        del layers
+        print('Loading model ...')
+        if checkpoint.endswith('.safetensors'):
+            from safetensors.torch import load_file as safe_load
+            model.load_state_dict(safe_load(checkpoint), strict=False)
+        else:
+            model.load_state_dict(torch.load(checkpoint), strict=False)
+        if eval:
+            quant.make_quant_attn(model)
+            quant.make_quant_norm(model)
+            if fused_mlp:
+                quant.make_fused_mlp(model)
+        if warmup_autotune:
+            quant.autotune_warmup_linear(model, transpose=not (eval))
+            if eval and fused_mlp:
+                quant.autotune_warmup_fused(model)
+        model.seqlen = 2048
+        print('Done.')
+        return model
     def generate_text(self, text, min_length=10, max_length=50, top_p=0.95, temperature=0.8):
         input_ids = self.tokenizer.encode(text, return_tensors="pt").to(DEV)