heikowagner committed on
Commit
fbb697c
1 Parent(s): 19b8811

add document upload

Browse files
app/VectorStore/chroma-collections.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26740390ba936629dcb106c9948b55752ac6c763915bf0e7ad4a1273ac9ba084
3
- size 745
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65624a226acdd91b0686aede21cb17c270204829fcc86602f16a6352b877337
3
+ size 943
app/VectorStore/chroma-embeddings.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b79fe220db8ba8a7a77617dd295bf51f5438257e676b00ddd28a1fcf62c757fb
3
- size 240218512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514430ced16df82f6b5355cc14ed912c5af38661418efb691ea8e73e6333ffed
3
+ size 5782971
app/VectorStore/index/id_to_uuid_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df8c9c0ad0e24164c8cdea96715e56553fc72fcb3dc7e7d7da60f0f9cf38ef1c
3
+ size 1640
app/VectorStore/index/id_to_uuid_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f64ceca9dc08a1fa345c23f4012132ef11f0c472dd64c6f80e445a65f29f536e
3
- size 104759
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f39dc0dcfa56bb6584759d134c28bd53ac0165a2873cdd5b9e0ff70244840542
3
+ size 131496
app/VectorStore/index/id_to_uuid_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7296f8de5fb49d35a4e1b00cdc056b260b4a57b3d320f72d1c20982acd6c5f37
3
+ size 3371
app/VectorStore/index/index_0244568c-57df-4dab-9a52-e4703f31eeaa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae356cfeea07ada1ba8342dfd128fdb76f212e37ba0e2876fccafe2b16bd95e3
3
+ size 164384
app/VectorStore/index/index_52984ff2-d9c3-459b-acc0-0b0aa559d50f.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:518d2424133c568190196628f29226cca2e9c198616b03990347f2ce0c11ea7e
3
- size 10402668
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de4cc0ee24b85680520f48ee0ee673443133127e18533255c60528cfe2f925be
3
+ size 13050028
app/VectorStore/index/index_90530179-2196-4073-89e7-11f14538d27c.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3015aa28224dce1211e2498699823bdee0958c6f024dd28ed317c0ec7e401556
3
+ size 341400
app/VectorStore/index/index_metadata_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:757150b880764d792751d3d3675056820575f39244ff3401bbf602f213ba7df9
3
+ size 73
app/VectorStore/index/index_metadata_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8e41a80750fa05ef9dd656d26239c0d9f06c1d278825090d6fd4f9645756d35
3
  size 74
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64da3bdfe4bc7727e421826a6459753a44eabcd37df7fe207fbde1014c0c2fe6
3
  size 74
app/VectorStore/index/index_metadata_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6d8aafc2b81de7e6a55297e5029654ee387b8774a6f91d5d702420e1ff80c78
3
+ size 73
app/VectorStore/index/uuid_to_id_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3a5ae6784f1c41a78ce924bc4fd48d24083fb9df13ce10e271a25d00303f9e4
3
+ size 1903
app/VectorStore/index/uuid_to_id_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90ccb948caa40144e512f5ab70fb9d8fe4b08dcf18b3d7e4c368bee1f0283a47
3
- size 122516
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b153d0a7649253a0b5b095f1d126ba5a36b6a650e177b03393bd76cf8b399896
3
+ size 153763
app/VectorStore/index/uuid_to_id_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a97f316acd65482df9ed256c51cdb8c113d53e1d70346a666350ed625f2da76
3
+ size 3938
app/app.py CHANGED
@@ -1,13 +1,9 @@
1
  import streamlit as st
2
- import langchain
3
  import load_model
4
  import utils as ut
5
- import chromadb
6
- from chromadb.config import Settings
7
  import os
8
 
9
  persist_directory = load_model.persist_directory
10
-
11
  st.title('myGPT')
12
  st.header('An GPT example brought to you by Heiko Wagner')
13
 
@@ -15,38 +11,43 @@ st.markdown('*\"Parametrised models are simply functions that depend on inputs a
15
 
16
  st.latex(r'''h(\boldsymbol x, \boldsymbol w)= \sum_{k=1}^{K}\boldsymbol w_{k} \phi_{k}(\boldsymbol x)''')
17
 
18
- import torch
19
- torch.cuda.empty_cache()
20
-
21
- model_type = st.selectbox(
22
- 'Select the Documents to be used to answer your question',
23
- ('OpenAI', 'local_model') )
24
-
25
- if model_type=='OpenAI':
26
- openai_key= st.text_area('OpenAI Key:', '')
27
- os.environ["OPENAI_API_KEY"] = openai_key
28
- llm= load_model.load_openai_model()
29
  else:
30
- llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
31
-
32
-
33
- client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
34
- persist_directory=persist_directory
35
- ))
36
-
37
- collections = tuple( [collection.name for collection in client.list_collections()] )
38
- print(collections)
39
- option = st.selectbox(
40
- 'Select the Documents to be used to answer your question',
41
- collections )
42
-
43
- st.write('You selected:', option)
44
 
45
- chain = load_model.create_chain(llm, collection=option)
46
- try:
47
- query = st.text_area('Ask a question:', 'Hallo how are you today?')
48
- result = chain({"query": query})
49
- ut.format_result_set(result)
50
- finally:
51
- del chain
52
- torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  import load_model
3
  import utils as ut
 
 
4
  import os
5
 
6
  persist_directory = load_model.persist_directory
 
7
  st.title('myGPT')
8
  st.header('An GPT example brought to you by Heiko Wagner')
9
 
 
11
 
12
  st.latex(r'''h(\boldsymbol x, \boldsymbol w)= \sum_{k=1}^{K}\boldsymbol w_{k} \phi_{k}(\boldsymbol x)''')
13
 
14
+ agree = st.checkbox('Load new Documents')
15
+ if agree:
16
+ ut.load_files()
 
 
 
 
 
 
 
 
17
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ import torch
20
+ torch.cuda.empty_cache()
21
+
22
+ model_type = st.selectbox(
23
+ 'Select the Documents to be used to answer your question',
24
+ ('OpenAI', 'local_model') )
25
+
26
+ if model_type=='OpenAI':
27
+ if 'openai_key' not in st.session_state:
28
+ openai_key= st.text_area('OpenAI Key:', '')
29
+ if len(openai_key)>10:
30
+ st.session_state['openai_key'] = openai_key
31
+ os.environ["OPENAI_API_KEY"] = openai_key
32
+ else:
33
+ os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
34
+ llm= load_model.load_openai_model()
35
+ else:
36
+ llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
37
+
38
+
39
+ collections = ut.retrieve_collections()
40
+ option = st.selectbox(
41
+ 'Select the Documents to be used to answer your question',
42
+ collections )
43
+
44
+ st.write('You selected:', option)
45
+
46
+ chain = load_model.create_chain(llm, collection=option)
47
+ try:
48
+ query = st.text_area('Ask a question:', 'Hallo how are you today?')
49
+ result = chain({"query": query})
50
+ ut.format_result_set(result)
51
+ finally:
52
+ del chain
53
+ torch.cuda.empty_cache()
app/load_model.py CHANGED
@@ -27,9 +27,6 @@ print(current_path)
27
  persist_directory = current_path + "/VectorStore"
28
 
29
  # %%
30
- llm =OpenAI(temperature=0.9)
31
- llm
32
-
33
  @st.cache_resource
34
  def load_cpu_model():
35
  """Does not work atm, bc cpu model is not persisted"""
 
27
  persist_directory = current_path + "/VectorStore"
28
 
29
  # %%
 
 
 
30
  @st.cache_resource
31
  def load_cpu_model():
32
  """Does not work atm, bc cpu model is not persisted"""
app/load_vectors.py CHANGED
@@ -2,7 +2,7 @@
2
  import nltk
3
  from langchain.indexes import VectorstoreIndexCreator
4
  from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
5
- from langchain.document_loaders import OnlinePDFLoader
6
  from langchain.vectorstores import Chroma
7
  from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
8
  from chromadb.config import Settings
@@ -16,6 +16,8 @@ from load_model import load_embedding
16
  import torch
17
  import re
18
  import pathlib
 
 
19
 
20
  current_path = str( pathlib.Path(__file__).parent.resolve() )
21
 
@@ -47,7 +49,7 @@ def create_and_add(collection_name, sub_docs, model_name):
47
  )
48
 
49
  client = chromadb.Client(client_settings)
50
- collection_name = collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
51
 
52
  embeddings = load_embedding(model_name)
53
  logging.info(f"Adding documents to {collection_name}")
@@ -71,6 +73,25 @@ def create_and_add(collection_name, sub_docs, model_name):
71
 
72
  return vectorstore
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def load_from_web(urls, cache=True):
75
  docs_list = urls
76
  filename=f"./{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
 
2
  import nltk
3
  from langchain.indexes import VectorstoreIndexCreator
4
  from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
5
+ from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
6
  from langchain.vectorstores import Chroma
7
  from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
8
  from chromadb.config import Settings
 
16
  import torch
17
  import re
18
  import pathlib
19
+ import tempfile
20
+
21
 
22
  current_path = str( pathlib.Path(__file__).parent.resolve() )
23
 
 
49
  )
50
 
51
  client = chromadb.Client(client_settings)
52
+ collection_name = collection_name # + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
53
 
54
  embeddings = load_embedding(model_name)
55
  logging.info(f"Adding documents to {collection_name}")
 
73
 
74
  return vectorstore
75
 
76
+ def load_from_file(files):
77
+
78
+ saved_files=[]
79
+ with tempfile.TemporaryDirectory() as tmpdirname:
80
+ for file in files:
81
+ temp_dir = pathlib.Path(tmpdirname)
82
+ file_name = os.path.join(temp_dir,file.name)
83
+ saved_files.append(file_name)
84
+ with open(file_name, mode='wb') as w:
85
+ w.write(file.read())
86
+
87
+ print(saved_files)
88
+ loaders=[UnstructuredPDFLoader(pdf) for pdf in saved_files]
89
+ docs = []
90
+ print(loaders)
91
+ for loader in loaders:
92
+ docs.extend(loader.load())
93
+ return docs
94
+
95
  def load_from_web(urls, cache=True):
96
  docs_list = urls
97
  filename=f"./{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
app/requirements.txt CHANGED
@@ -9,4 +9,4 @@ streamlit
9
  requests==2.28.0
10
  latex2markdown
11
  openai
12
- unstructured
 
9
  requests==2.28.0
10
  latex2markdown
11
  openai
12
+ unstructured[local-inference]
app/utils.py CHANGED
@@ -1,6 +1,11 @@
1
  import streamlit as st
2
  import latex2markdown
3
  from langchain.docstore.document import Document
 
 
 
 
 
4
 
5
  def format_document(document: Document):
6
  """TODO: Implement a nice style"""
@@ -16,4 +21,50 @@ def format_result_set(result):
16
  for document in source_documents:
17
  st.write(format_document(document))
18
 
 
 
 
 
 
 
 
 
 
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import latex2markdown
3
  from langchain.docstore.document import Document
4
+ import chromadb
5
+ from chromadb.config import Settings
6
+ import load_model
7
+ from load_vectors import load_from_file, load_and_split, create_and_add
8
+ persist_directory = load_model.persist_directory
9
 
10
  def format_document(document: Document):
11
  """TODO: Implement a nice style"""
 
21
  for document in source_documents:
22
  st.write(format_document(document))
23
 
24
+ @st.cache_resource
25
+ def get_chroma_client():
26
+ return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
27
+ persist_directory=persist_directory
28
+ ))
29
+ @st.cache_data
30
+ def retrieve_collections():
31
+ client = get_chroma_client()
32
+ collections = tuple( [collection.name for collection in client.list_collections()] )
33
+ return collections
34
 
35
+ def load_files():
36
+
37
+ client = get_chroma_client()
38
+
39
+ option = st.radio(
40
+ "",
41
+ options=["Add Documents", "Start new collection"],
42
+ )
43
+
44
+ collections = retrieve_collections()
45
+
46
+ if option == "Add Documents":
47
+ selected_collection = st.selectbox(
48
+ 'Add to exsisting collection or create a new one',
49
+ collections )
50
+ if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
51
+ client.delete_collection(name=selected_collection)
52
+ retrieve_collections.clear()
53
+ collections = retrieve_collections()
54
+
55
+ st.write('Source Documents:')
56
+ uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
57
+ chunk_size = st.text_area('chunk Size:', 1000)
58
+
59
+ if st.button('Upload'):
60
+ docs = load_from_file(uploaded_files)
61
+ sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
62
+ create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
63
+ uploaded_files=None
64
+ else:
65
+ collection = st.text_area('Name of your new collection:', '')
66
+ if st.button('Create'):
67
+ if len(collection)>3:
68
+ client.create_collection(collection) #collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name) --Problem i added the model to the name -> Better use Metadata :)
69
+ retrieve_collections.clear()
70
+ st.write("Collection " +collection+" succesfully created.")