# NOTE: removed non-Python residue from a scraped web view (git blame
# hashes, "Build error" banners, and a line-number gutter) that would
# otherwise be a syntax error at the top of this module.
# %%
# git clone https://huggingface.co/nyanko7/LLaMA-7B
# python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu117/torch2.00/index.html
# apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
from transformers import LlamaForCausalLM, LlamaTokenizer
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
from langchain.llms import LlamaCpp, HuggingFacePipeline
from langchain.vectorstores import Chroma
from transformers import pipeline
import torch
torch.backends.cuda.matmul.allow_tf32 = True
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import streamlit as st
import cloudpickle
import os
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from chromadb.config import Settings
import chromadb
import pathlib
# Anchor all model caches and the vector store relative to the directory
# this script lives in.
_script_dir = pathlib.Path(__file__).parent.resolve()
current_path = str(_script_dir)
print(current_path)
persist_directory = f"{current_path}/VectorStore"
# %%
@st.cache_resource
def load_cpu_model():
    """Load the local quantized LLaMA model for CPU inference via llama.cpp.

    Does not work atm, bc cpu model is not persisted (original author note).

    Returns:
        LlamaCpp: LangChain LLM wrapper around the local ggml model file.
    """
    model_path = "./llama.cpp/models/LLaMA-7B/ggml-model-q4_0.bin"
    # n_ctx bounds the llama.cpp context window; sampling matches the
    # GPU pipeline below (temperature=0.6, top_p=0.95).
    llm = LlamaCpp(
        model_path=model_path,
        n_ctx=6000,
        n_threads=16,
        temperature=0.6,
        top_p=0.95
    )
    # Removed dead code: an unused `device_map` local, and a
    # LlamaCppEmbeddings instance that loaded the same model a second
    # time only to be discarded.
    return llm
@st.cache_resource(max_entries=1)
def load_gpu_model(used_model):
    """Load a LLaMA causal LM as a HuggingFace text-generation pipeline.

    Args:
        used_model: HuggingFace model id or local path passed to
            `from_pretrained` for both tokenizer and model.

    Returns:
        HuggingFacePipeline: LangChain wrapper around the pipeline.
    """
    torch.cuda.empty_cache()
    tokenizer = LlamaTokenizer.from_pretrained(used_model)
    # 8-bit quantization with fp32 CPU offload allowed; the config was
    # identical in both branches, so it is hoisted out of the if/else.
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )
    if not torch.cuda.is_available():
        # Pin every module to CPU when no GPU is present.
        # Removed dead assignments: `torch_dtype` and `load_in_8bit`
        # were set here but never passed to `from_pretrained`.
        device_map = {"": "cpu"}
    else:
        device_map = "auto"  # atm no offload, bc device_map="auto"
    base_model = LlamaForCausalLM.from_pretrained(
        used_model,
        device_map=device_map,
        offload_folder=current_path + "/models_gpt/",
        low_cpu_mem_usage=True,
        quantization_config=quantization_config,
        cache_dir=current_path + "/mymodels/"
    )
    pipe = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=8000,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.2
    )
    return HuggingFacePipeline(pipeline=pipe)
#@st.cache_resource
def load_openai_model():
    """Return a fresh OpenAI chat/completion LLM (temperature 0.9)."""
    openai_llm = OpenAI(temperature=0.9)
    return openai_llm
@st.cache_resource
def load_openai_embedding():
    """Return an OpenAIEmbeddings instance, cached by Streamlit."""
    embedding = OpenAIEmbeddings()
    return embedding
#@st.cache_resource
def load_embedding(model_name):
    """Build HuggingFace instructor embeddings for retrieval queries.

    Args:
        model_name: HuggingFace model name for the instructor embeddings.

    Returns:
        HuggingFaceInstructEmbeddings cached under ./mymodels/.
    """
    instruct_embeddings = HuggingFaceInstructEmbeddings(
        model_name=model_name,
        query_instruction="Represent the query for retrieval: ",
        cache_folder=current_path + "/mymodels/",
    )
    return instruct_embeddings
def load_vectorstore(model_name, collection, metadata):
    """Open (or create) a persistent Chroma collection.

    Args:
        model_name: embedding model name, forwarded to load_embedding.
        collection: Chroma collection name.
        metadata: collection metadata passed through to Chroma.

    Returns:
        Chroma vector store backed by duckdb+parquet persistence.
    """
    settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
        anonymized_telemetry=False,
    )
    store = Chroma(
        collection_name=collection,
        embedding_function=load_embedding(model_name),
        client_settings=settings,
        persist_directory=persist_directory,
        collection_metadata=metadata,
    )
    return store
def create_chain(_llm, collection, model_name, metadata):
    """Wire an LLM and a Chroma retriever into a RetrievalQA chain.

    Args:
        _llm: LangChain LLM instance used to answer questions.
        collection: Chroma collection name to retrieve from.
        model_name: embedding model name for the vector store.
        metadata: collection metadata forwarded to Chroma.

    Returns:
        RetrievalQA chain ("stuff" type) that also returns source documents.
    """
    store = load_vectorstore(model_name, collection, metadata=metadata)
    doc_retriever = store.as_retriever(search_kwargs={"k": 4})
    return RetrievalQA.from_chain_type(
        llm=_llm,
        chain_type="stuff",
        retriever=doc_retriever,
        return_source_documents=True,
    )
# %%