masanorihirano committed
Commit: 7a42c18
Parent(s): bed8c52

update

Files changed:
- app.py (+7 -29)
- pyproject.toml (+1 -1)
app.py CHANGED

@@ -9,16 +9,12 @@ from typing import Union
 import gradio as gr
 import requests
 import torch
+import transformers
 from fastchat.conversation import Conversation
-from fastchat.conversation import
-from fastchat.conversation import get_conv_template
-from fastchat.conversation import register_conv_template
-from fastchat.model.model_adapter import BaseAdapter
-from fastchat.model.model_adapter import load_model
-from fastchat.model.model_adapter import model_adapters
+from fastchat.conversation import get_default_conv_template
 from fastchat.serve.cli import SimpleChatIO
-from fastchat.serve.inference import compress_module
 from fastchat.serve.inference import generate_stream
+from fastchat.serve.inference import load_model
 from huggingface_hub import Repository
 from huggingface_hub import snapshot_download
 from peft import LoraConfig
@@ -30,24 +26,8 @@ from transformers import LlamaTokenizer
 from transformers import PreTrainedModel
 from transformers import PreTrainedTokenizerBase
 
-
-class LLaMAdapter(BaseAdapter):
-    "Model adapater for vicuna-v1.1"
-
-    def match(self, model_path: str):
-        return "llama" in model_path
-
-    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
-        model = LlamaForCausalLM.from_pretrained(
-            model_path,
-            low_cpu_mem_usage=True,
-            **from_pretrained_kwargs,
-        )
-        return model, tokenizer
-
-
-model_adapters.insert(-1, LLaMAdapter())
+transformers.AutoTokenizer.from_pretrained = LlamaTokenizer.from_pretrained
+transformers.AutoModelForCausalLM.from_pretrained = LlamaForCausalLM.from_pretrained
 
 
 def load_lora_model(
@@ -67,12 +47,10 @@ def load_lora_model(
         device=device,
         num_gpus=num_gpus,
         max_gpu_memory=max_gpu_memory,
-        load_8bit=
+        load_8bit=True,
         cpu_offloading=cpu_offloading,
         debug=debug,
     )
-    if load_8bit:
-        compress_module(model)
     if lora_weight is not None:
         # model = PeftModelForCausalLM.from_pretrained(model, model_path, **kwargs)
         config = LoraConfig.from_pretrained(lora_weight)
@@ -217,7 +195,7 @@ def evaluate(
         gr.update(interactive=True),
     )
 
-    conv =
+    conv = get_default_conv_template(BASE_MODEL).copy()
 
     conv.append_message(conv.roles[0], instruction)
     conv.append_message(conv.roles[1], None)
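The core of the app.py change replaces the custom adapter registration with a monkey-patch: transformers' Auto entry points are redirected to the Llama classes, so fastchat's generic load_model resolves them without the deleted LLaMAdapter. A minimal sketch of the effect (the checkpoint path is a placeholder):

import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

# Redirect the Auto classes; any later Auto* call now returns the Llama
# implementations directly, which is what the deleted adapter used to do.
transformers.AutoTokenizer.from_pretrained = LlamaTokenizer.from_pretrained
transformers.AutoModelForCausalLM.from_pretrained = LlamaForCausalLM.from_pretrained

tokenizer = transformers.AutoTokenizer.from_pretrained("some/llama-checkpoint")
model = transformers.AutoModelForCausalLM.from_pretrained(
    "some/llama-checkpoint", low_cpu_mem_usage=True
)

The other functional change swaps the hand-built conversation setup for fastchat's template lookup. A hedged sketch of the flow the new line relies on, assuming the fastchat 0.2.3 API the diff imports (the model name and instruction text are placeholders; BASE_MODEL in the diff is the app's configured model path):

from fastchat.conversation import get_default_conv_template

conv = get_default_conv_template("llama-7b").copy()  # template matched by model name
conv.append_message(conv.roles[0], "instruction text")  # user turn
conv.append_message(conv.roles[1], None)  # empty slot for the model's reply
prompt = conv.get_prompt()  # serialized prompt later fed to generate_stream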
pyproject.toml CHANGED

@@ -15,7 +15,7 @@ huggingface-hub = "^0.14.1"
 sentencepiece = "^0.1.99"
 bitsandbytes = "^0.38.1"
 accelerate = "^0.19.0"
-fschat = "0.2.
+fschat = "0.2.3"
 transformers = "4.28.1"
 
 
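The fschat pin matches the API surface app.py now uses: in fschat 0.2.3, load_model still lives in fastchat.serve.inference and accepts the keyword set the diff passes. A sketch of that call under those assumptions (the model path is a placeholder):

from fastchat.serve.inference import load_model

model, tokenizer = load_model(
    "some/llama-checkpoint",
    device="cuda",
    num_gpus=1,
    max_gpu_memory=None,
    load_8bit=True,  # the commit hard-codes 8-bit loading
    cpu_offloading=False,
    debug=False,
)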
|