File size: 4,049 Bytes
7009660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d717c1
7009660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f84a9a
7009660
 
 
 
 
 
 
 
 
 
 
1f84a9a
7009660
 
 
1f84a9a
 
7009660
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# %%
# git clone https://huggingface.co/nyanko7/LLaMA-7B
# python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu117/torch2.00/index.html
# apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
from transformers import LlamaForCausalLM, LlamaTokenizer
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
from langchain.llms import LlamaCpp, HuggingFacePipeline
from langchain.vectorstores import Chroma
from transformers import pipeline
import torch
torch.backends.cuda.matmul.allow_tf32 = True
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import streamlit as st
import cloudpickle
import os
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI

from chromadb.config import Settings
import chromadb

import pathlib

# Absolute directory that contains this file, kept as a plain string because
# the loaders below build their cache/offload paths by string concatenation.
current_path = str(pathlib.Path(__file__).parent.resolve())
print(current_path)
# Location of the persisted Chroma vector store, next to this file.
persist_directory = f"{current_path}/VectorStore"

# %%
@st.cache_resource
def load_cpu_model():
    """Create a llama.cpp-backed LLM from the local quantized LLaMA-7B file.

    Author's note carried over: does not work at the moment, because the CPU
    model is not persisted.

    Returns:
        A ``LlamaCpp`` instance configured with ``n_ctx=6000``,
        ``n_threads=16``, ``temperature=0.6`` and ``top_p=0.95``.
    """
    model_path = "./llama.cpp/models/LLaMA-7B/ggml-model-q4_0.bin"
    # Only the LLM is returned, so nothing else is constructed here: no
    # device map (LlamaCpp takes none) and no embeddings object, which would
    # be built from the same model file and then thrown away.
    return LlamaCpp(
        model_path=model_path,
        n_ctx=6000,
        n_threads=16,
        temperature=0.6,
        top_p=0.95,
    )

@st.cache_resource(max_entries=1)
def load_gpu_model(used_model):
    """Load a LLaMA checkpoint and wrap it as a LangChain text-generation LLM.

    With CUDA available the model is loaded with ``device_map="auto"`` and
    8-bit bitsandbytes quantization. Without CUDA it is placed entirely on
    the CPU in float32 and no quantization is requested.

    Args:
        used_model: Model identifier or path passed to
            ``LlamaTokenizer.from_pretrained`` and
            ``LlamaForCausalLM.from_pretrained``.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``"text-generation"`` pipeline
        with ``max_length=8000``, ``temperature=0.6``, ``top_p=0.95`` and
        ``repetition_penalty=1.2``.
    """
    torch.cuda.empty_cache()
    tokenizer = LlamaTokenizer.from_pretrained(used_model)

    # Arguments shared by both the GPU and the CPU loading path.
    load_kwargs = {
        "offload_folder": current_path + "/models_gpt/",
        "low_cpu_mem_usage": True,
        "cache_dir": current_path + "/mymodels/",
    }

    if torch.cuda.is_available():
        load_kwargs["device_map"] = "auto"
        # atm no offload, bc device_map="auto"
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=True,
        )
    else:
        # 8-bit bitsandbytes loading needs a GPU, so on a CPU-only host the
        # model is loaded unquantized in float32. Earlier code computed these
        # CPU settings but never handed them to from_pretrained.
        load_kwargs["device_map"] = {"": "cpu"}
        load_kwargs["torch_dtype"] = torch.float32

    base_model = LlamaForCausalLM.from_pretrained(used_model, **load_kwargs)

    pipe = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=8000,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.2,
    )
    return HuggingFacePipeline(pipeline=pipe)

#@st.cache_resource
def load_openai_model():
    """Return a LangChain ``OpenAI`` LLM created with ``temperature=0.9``.

    A new instance is built on every call.
    """
    openai_llm = OpenAI(temperature=0.9)
    return openai_llm

@st.cache_resource
def load_openai_embedding():
    """Return an ``OpenAIEmbeddings`` instance built with default arguments.

    The result is cached through ``st.cache_resource``.
    """
    openai_embeddings = OpenAIEmbeddings()
    return openai_embeddings

@st.cache_resource
def load_embedding(model_name):
    """Build a ``HuggingFaceInstructEmbeddings`` object for ``model_name``.

    Args:
        model_name: Name of the embedding model, forwarded unchanged.

    Returns:
        The embeddings object, using the ``mymodels`` folder next to this
        file as its cache folder and a fixed retrieval query instruction.
    """
    embedding_kwargs = {
        "query_instruction": "Represent the query for retrieval: ",
        "model_name": model_name,
        "cache_folder": current_path + "/mymodels/",
    }
    return HuggingFaceInstructEmbeddings(**embedding_kwargs)

def load_vectorstore(model_name, collection, metadata):
    """Open a Chroma collection stored under ``persist_directory``.

    Args:
        model_name: Embedding model name handed to ``load_embedding``.
        collection: Name of the Chroma collection.
        metadata: Passed through as the collection's metadata.

    Returns:
        A ``Chroma`` vector store using the "duckdb+parquet" implementation
        with anonymized telemetry switched off.
    """
    chroma_settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
        anonymized_telemetry=False,
    )
    return Chroma(
        collection_name=collection,
        embedding_function=load_embedding(model_name),
        client_settings=chroma_settings,
        persist_directory=persist_directory,
        collection_metadata=metadata,
    )

def create_chain(_llm, collection, model_name, metadata=None):
    """Assemble a ``RetrievalQA`` chain over a Chroma collection.

    Args:
        _llm: Language model used by the chain.
        collection: Name of the Chroma collection to retrieve from.
        model_name: Embedding model name used to open the vector store.
        metadata: Optional collection metadata forwarded to
            ``load_vectorstore``.

    Returns:
        A "stuff"-type ``RetrievalQA`` chain whose retriever is created with
        ``search_kwargs={"k": 4}`` and which returns its source documents.
    """
    store = load_vectorstore(model_name, collection, metadata=metadata)
    return RetrievalQA.from_chain_type(
        llm=_llm,
        chain_type="stuff",
        retriever=store.as_retriever(search_kwargs={"k": 4}),
        return_source_documents=True,
    )
# %%