Spaces:

heikowagner
/

GPT-Docker

Build error

App Files Files Community

heikowagner committed on Apr 29, 2023

Commit

fbb697c

•

1 Parent(s): 19b8811

add document upload

Browse files

Files changed (19) hide show

app/VectorStore/chroma-collections.parquet +2 -2
app/VectorStore/chroma-embeddings.parquet +2 -2
app/VectorStore/index/id_to_uuid_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl +3 -0
app/VectorStore/index/id_to_uuid_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl +2 -2
app/VectorStore/index/id_to_uuid_90530179-2196-4073-89e7-11f14538d27c.pkl +3 -0
app/VectorStore/index/index_0244568c-57df-4dab-9a52-e4703f31eeaa.bin +3 -0
app/VectorStore/index/index_52984ff2-d9c3-459b-acc0-0b0aa559d50f.bin +2 -2
app/VectorStore/index/index_90530179-2196-4073-89e7-11f14538d27c.bin +3 -0
app/VectorStore/index/index_metadata_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl +3 -0
app/VectorStore/index/index_metadata_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl +1 -1
app/VectorStore/index/index_metadata_90530179-2196-4073-89e7-11f14538d27c.pkl +3 -0
app/VectorStore/index/uuid_to_id_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl +3 -0
app/VectorStore/index/uuid_to_id_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl +2 -2
app/VectorStore/index/uuid_to_id_90530179-2196-4073-89e7-11f14538d27c.pkl +3 -0
app/app.py +38 -37
app/load_model.py +0 -3
app/load_vectors.py +23 -2
app/requirements.txt +1 -1
app/utils.py +51 -0

app/VectorStore/chroma-collections.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:26740390ba936629dcb106c9948b55752ac6c763915bf0e7ad4a1273ac9ba084
-size 745

 version https://git-lfs.github.com/spec/v1
+oid sha256:e65624a226acdd91b0686aede21cb17c270204829fcc86602f16a6352b877337
+size 943

app/VectorStore/chroma-embeddings.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b79fe220db8ba8a7a77617dd295bf51f5438257e676b00ddd28a1fcf62c757fb
-size 240218512

 version https://git-lfs.github.com/spec/v1
+oid sha256:514430ced16df82f6b5355cc14ed912c5af38661418efb691ea8e73e6333ffed
+size 5782971

app/VectorStore/index/id_to_uuid_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df8c9c0ad0e24164c8cdea96715e56553fc72fcb3dc7e7d7da60f0f9cf38ef1c
+size 1640

app/VectorStore/index/id_to_uuid_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f64ceca9dc08a1fa345c23f4012132ef11f0c472dd64c6f80e445a65f29f536e
-size 104759

 version https://git-lfs.github.com/spec/v1
+oid sha256:f39dc0dcfa56bb6584759d134c28bd53ac0165a2873cdd5b9e0ff70244840542
+size 131496

app/VectorStore/index/id_to_uuid_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7296f8de5fb49d35a4e1b00cdc056b260b4a57b3d320f72d1c20982acd6c5f37
+size 3371

app/VectorStore/index/index_0244568c-57df-4dab-9a52-e4703f31eeaa.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae356cfeea07ada1ba8342dfd128fdb76f212e37ba0e2876fccafe2b16bd95e3
+size 164384

app/VectorStore/index/index_52984ff2-d9c3-459b-acc0-0b0aa559d50f.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:518d2424133c568190196628f29226cca2e9c198616b03990347f2ce0c11ea7e
-size 10402668

 version https://git-lfs.github.com/spec/v1
+oid sha256:de4cc0ee24b85680520f48ee0ee673443133127e18533255c60528cfe2f925be
+size 13050028

app/VectorStore/index/index_90530179-2196-4073-89e7-11f14538d27c.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3015aa28224dce1211e2498699823bdee0958c6f024dd28ed317c0ec7e401556
+size 341400

app/VectorStore/index/index_metadata_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:757150b880764d792751d3d3675056820575f39244ff3401bbf602f213ba7df9
+size 73

app/VectorStore/index/index_metadata_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8e41a80750fa05ef9dd656d26239c0d9f06c1d278825090d6fd4f9645756d35
 size 74

 version https://git-lfs.github.com/spec/v1
+oid sha256:64da3bdfe4bc7727e421826a6459753a44eabcd37df7fe207fbde1014c0c2fe6
 size 74

app/VectorStore/index/index_metadata_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6d8aafc2b81de7e6a55297e5029654ee387b8774a6f91d5d702420e1ff80c78
+size 73

app/VectorStore/index/uuid_to_id_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3a5ae6784f1c41a78ce924bc4fd48d24083fb9df13ce10e271a25d00303f9e4
+size 1903

app/VectorStore/index/uuid_to_id_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90ccb948caa40144e512f5ab70fb9d8fe4b08dcf18b3d7e4c368bee1f0283a47
-size 122516

 version https://git-lfs.github.com/spec/v1
+oid sha256:b153d0a7649253a0b5b095f1d126ba5a36b6a650e177b03393bd76cf8b399896
+size 153763

app/VectorStore/index/uuid_to_id_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a97f316acd65482df9ed256c51cdb8c113d53e1d70346a666350ed625f2da76
+size 3938

app/app.py CHANGED Viewed

@@ -1,13 +1,9 @@
 import streamlit as st
-import langchain
 import load_model
 import utils as ut
-import chromadb
-from chromadb.config import Settings
 import os
 persist_directory = load_model.persist_directory
 st.title('myGPT')
 st.header('An GPT example brought to you by Heiko Wagner')
@@ -15,38 +11,43 @@ st.markdown('*\"Parametrised models are simply functions that depend on inputs a
 st.latex(r'''h(\boldsymbol x, \boldsymbol w)= \sum_{k=1}^{K}\boldsymbol w_{k} \phi_{k}(\boldsymbol x)''')
-import torch
-torch.cuda.empty_cache()
-model_type = st.selectbox(
-    'Select the Documents to be used to answer your question',
-    ('OpenAI', 'local_model') )
-if model_type=='OpenAI':
-    openai_key= st.text_area('OpenAI Key:', '')
-    os.environ["OPENAI_API_KEY"] = openai_key
-    llm= load_model.load_openai_model()
 else:
-    llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
-client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
-                                persist_directory=persist_directory
-                            ))
-collections = tuple( [collection.name for collection in client.list_collections()] )
-print(collections)
-option = st.selectbox(
-    'Select the Documents to be used to answer your question',
-    collections )
-st.write('You selected:', option)
-chain = load_model.create_chain(llm, collection=option)
-try:
-    query = st.text_area('Ask a question:', 'Hallo how are you today?')
-    result = chain({"query": query})
-    ut.format_result_set(result)
-finally:
-    del chain
-    torch.cuda.empty_cache()

 import streamlit as st
 import load_model
 import utils as ut
 import os
 persist_directory = load_model.persist_directory
 st.title('myGPT')
 st.header('An GPT example brought to you by Heiko Wagner')
 st.latex(r'''h(\boldsymbol x, \boldsymbol w)= \sum_{k=1}^{K}\boldsymbol w_{k} \phi_{k}(\boldsymbol x)''')
+# Top-level page flow: either manage the document collections or run the
+# question-answering chain against a selected collection.
+agree = st.checkbox('Load new Documents')
+if agree:
+    ut.load_files()
 else:
+    import torch
+    # Ask PyTorch to release unused cached GPU memory before loading a model.
+    torch.cuda.empty_cache()
+    # This selector chooses the model backend, so it gets its own label
+    # instead of reusing the label of the document selector below.
+    model_type = st.selectbox(
+        'Select the model to be used to answer your question',
+        ('OpenAI', 'local_model') )
+    if model_type=='OpenAI':
+        if 'openai_key' not in st.session_state:
+            openai_key= st.text_area('OpenAI Key:', '')
+            # Only accept input that is long enough to plausibly be a key.
+            if len(openai_key)>10:
+                st.session_state['openai_key'] = openai_key
+                os.environ["OPENAI_API_KEY"] = openai_key
+        else:
+            os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
+        # Without a key (typed in, remembered in the session, or already
+        # present in the environment) the OpenAI model cannot be created, so
+        # end this script run here and wait for the user to provide one.
+        if not os.environ.get("OPENAI_API_KEY"):
+            st.warning('Please enter your OpenAI key to continue.')
+            st.stop()
+        llm= load_model.load_openai_model()
+    else:
+        llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
+    collections = ut.retrieve_collections()
+    option = st.selectbox(
+        'Select the Documents to be used to answer your question',
+        collections )
+    st.write('You selected:', option)
+    chain = load_model.create_chain(llm, collection=option)
+    try:
+        query = st.text_area('Ask a question:', 'Hallo how are you today?')
+        result = chain({"query": query})
+        ut.format_result_set(result)
+    finally:
+        # Drop the chain and release cached GPU memory even if the query failed.
+        del chain
+        torch.cuda.empty_cache()

app/load_model.py CHANGED Viewed

@@ -27,9 +27,6 @@ print(current_path)
 persist_directory = current_path + "/VectorStore"
 # %%
-llm =OpenAI(temperature=0.9)
-llm
 @st.cache_resource
 def load_cpu_model():
     """Does not work atm, bc cpu model is not persisted"""

 persist_directory = current_path + "/VectorStore"
 # %%
 @st.cache_resource
 def load_cpu_model():
     """Does not work atm, bc cpu model is not persisted"""

app/load_vectors.py CHANGED Viewed

@@ -2,7 +2,7 @@
 import nltk
 from langchain.indexes import VectorstoreIndexCreator
 from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
-from langchain.document_loaders import OnlinePDFLoader
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
 from chromadb.config import Settings
@@ -16,6 +16,8 @@ from load_model import load_embedding
 import torch
 import re
 import pathlib
 current_path = str( pathlib.Path(__file__).parent.resolve() )
@@ -47,7 +49,7 @@ def create_and_add(collection_name, sub_docs, model_name):
     )
     client = chromadb.Client(client_settings)
-    collection_name = collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
     embeddings = load_embedding(model_name)
     logging.info(f"Adding documents to {collection_name}")
@@ -71,6 +73,25 @@ def create_and_add(collection_name, sub_docs, model_name):
     return vectorstore
 def load_from_web(urls, cache=True):
     docs_list = urls
     filename=f"./{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"

 import nltk
 from langchain.indexes import VectorstoreIndexCreator
 from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
+from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
 from chromadb.config import Settings
 import torch
 import re
 import pathlib
+import tempfile
 current_path = str( pathlib.Path(__file__).parent.resolve() )
     )
     client = chromadb.Client(client_settings)
+    collection_name = collection_name # + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
     embeddings = load_embedding(model_name)
     logging.info(f"Adding documents to {collection_name}")
     return vectorstore
+def load_from_file(files):
+    """Write uploaded files to a temporary directory and parse them as PDFs.
+
+    Args:
+        files: Iterable of uploaded file objects exposing ``name`` and
+            ``read()`` (presumably Streamlit uploads; confirm against the
+            caller). ``None`` or an empty iterable yields an empty list.
+
+    Returns:
+        A list with the documents produced by ``UnstructuredPDFLoader.load()``
+        for every file, in the order the files were given.
+    """
+    docs = []
+    if not files:
+        return docs
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        temp_dir = pathlib.Path(tmpdirname)
+        saved_files = []
+        for position, file in enumerate(files):
+            # The name is supplied by the client: keep only its final path
+            # component so it cannot point outside the temporary directory.
+            safe_name = pathlib.PurePath(file.name).name
+            if safe_name in ("", ".", ".."):
+                safe_name = f"upload_{position}"
+            target = temp_dir / safe_name
+            target.write_bytes(file.read())
+            saved_files.append(str(target))
+        logging.info(f"Saved uploaded files to {saved_files}")
+        # Parsing must happen inside the ``with`` block: the files are
+        # deleted as soon as the temporary directory is cleaned up.
+        for pdf in saved_files:
+            docs.extend(UnstructuredPDFLoader(pdf).load())
+    return docs
 def load_from_web(urls, cache=True):
     docs_list = urls
     filename=f"./{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"

app/requirements.txt CHANGED Viewed

@@ -9,4 +9,4 @@ streamlit
 requests==2.28.0
 latex2markdown
 openai
-unstructured

 requests==2.28.0
 latex2markdown
 openai
+unstructured[local-inference]

app/utils.py CHANGED Viewed

@@ -1,6 +1,11 @@
 import streamlit as st
 import latex2markdown
 from langchain.docstore.document import Document
 def format_document(document: Document):
     """TODO: Implement a nice style"""
@@ -16,4 +21,50 @@ def format_result_set(result):
         for document in source_documents:
             st.write(format_document(document))

 import streamlit as st
 import latex2markdown
 from langchain.docstore.document import Document
+import chromadb
+from chromadb.config import Settings
+import load_model
+from load_vectors import load_from_file, load_and_split, create_and_add
+persist_directory = load_model.persist_directory
 def format_document(document: Document):
     """TODO: Implement a nice style"""
         for document in source_documents:
             st.write(format_document(document))
+@st.cache_resource
+def get_chroma_client():
+    """Create the Chroma client for the persisted vector store.
+
+    The client uses the "duckdb+parquet" implementation and the module-level
+    ``persist_directory``. The function is wrapped in ``st.cache_resource``.
+    """
+    settings = Settings(
+        chroma_db_impl="duckdb+parquet",
+        persist_directory=persist_directory,
+    )
+    return chromadb.Client(settings)
+@st.cache_data
+def retrieve_collections():
+    """Return the names of all collections of the Chroma client as a tuple.
+
+    The function is wrapped in ``st.cache_data``.
+    """
+    names = (entry.name for entry in get_chroma_client().list_collections())
+    return tuple(names)
+def load_files():
+    """Render the Streamlit UI for managing document collections.
+
+    A radio button switches between two modes:
+
+    * "Add Documents": select an existing collection, optionally delete it,
+      or upload files that are split into chunks and added to it.
+    * "Start new collection": create an empty collection with a given name.
+
+    Invalid input (no collection available, no files chosen, a chunk size
+    that is not a positive whole number, a collection name that is too
+    short) is reported in the UI instead of raising or being ignored.
+
+    Returns:
+        None. All results are shown through Streamlit widgets.
+    """
+    client = get_chroma_client()
+    option = st.radio(
+        "",
+        options=["Add Documents", "Start new collection"],
+    )
+    if option == "Add Documents":
+        collections = retrieve_collections()
+        selected_collection = st.selectbox(
+            'Add to exsisting collection or create a new one',
+            collections )
+        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
+            if selected_collection is None:
+                st.warning('There is no collection to delete.')
+            else:
+                client.delete_collection(name=selected_collection)
+                # The cached list of names is stale now; drop it so that it
+                # is read again the next time it is requested.
+                retrieve_collections.clear()
+        st.write('Source Documents:')
+        uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
+        chunk_size = st.text_area('chunk Size:', 1000)
+        if st.button('Upload'):
+            # The chunk size comes from a free-text field, so it may not be
+            # a number at all; treat anything unparsable as invalid.
+            try:
+                parsed_chunk_size = int(chunk_size)
+            except ValueError:
+                parsed_chunk_size = 0
+            if selected_collection is None:
+                st.error('Please create a collection before uploading documents.')
+            elif not uploaded_files:
+                st.error('Please choose at least one file to upload.')
+            elif parsed_chunk_size <= 0:
+                st.error('chunk Size must be a positive whole number.')
+            else:
+                docs = load_from_file(uploaded_files)
+                sub_docs = load_and_split(docs, chunk_size=parsed_chunk_size)
+                create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
+    else:
+        collection = st.text_area('Name of your new collection:', '')
+        if st.button('Create'):
+            if len(collection)>3:
+                # The collection is created under the plain name. Encoding
+                # the embedding model in the name caused problems; collection
+                # metadata would be the better place for it.
+                client.create_collection(collection)
+                retrieve_collections.clear()
+                st.write("Collection " +collection+" succesfully created.")
+            else:
+                st.warning('The collection name must be longer than 3 characters.')