File size: 4,018 Bytes
7009660
 
 
fbb697c
 
 
1f84a9a
4f0dc21
fbb697c
7009660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aeb550e
fbb697c
 
 
 
1f84a9a
fbb697c
 
1f84a9a
aeb550e
fbb697c
7009660
fbb697c
 
 
 
 
 
 
 
 
 
39b12fb
fbb697c
 
 
 
80fe2b7
 
fbb697c
 
39b12fb
 
4f0dc21
 
b45426b
4f0dc21
b45426b
4f0dc21
 
 
 
 
 
 
aeb550e
80fe2b7
4f0dc21
b45426b
4f0dc21
 
 
fbb697c
4f0dc21
 
 
aeb550e
80fe2b7
fbb697c
 
1f84a9a
fbb697c
 
1f84a9a
80fe2b7
 
 
fbb697c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import latex2markdown
from langchain.docstore.document import Document
import chromadb
from chromadb.config import Settings
import load_model
from load_model import load_embedding
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
# Directory handed to the Chroma client settings below; the value is taken
# from load_model rather than being defined a second time in this module.
persist_directory = load_model.persist_directory

def format_document(document: Document):
    """Turn a source document into the value that gets rendered.

    Currently this is just the result of ``document.dict()``.

    TODO: Implement a nice style.
    """
    rendered = document.dict()
    return rendered

def format_result_set(result):
    """Render a query result on the Streamlit page.

    The text under ``result["result"]`` is converted from LaTeX to markdown
    and written out. A checkbox lets the user additionally list every entry
    of ``result["source_documents"]``, each passed through
    ``format_document``.
    """
    answer_markdown = latex2markdown.LaTeX2Markdown(result["result"]).to_markdown()
    st.write(answer_markdown)

    show_sources = st.checkbox('Show source documents')
    # Looked up unconditionally, so a result without this key fails even
    # when the checkbox is not ticked.
    source_documents = result["source_documents"]
    if not show_sources:
        return

    st.write('Source Documents:')
    for source in source_documents:
        st.write(format_document(source))

@st.cache_resource
def get_chroma_client():
    """Build the Chroma client used by this module.

    The client is configured with the "duckdb+parquet" implementation and
    the module-level ``persist_directory``. The function is wrapped in
    ``st.cache_resource``.
    """
    client_settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
    )
    return chromadb.Client(client_settings)
#@st.cache_data
def retrieve_collections():
    """Summarise every collection reported by the Chroma client.

    Returns:
        A tuple with one dict per collection holding the keys ``'name'``,
        ``'model_name'`` (read from the collection metadata) and
        ``'metadata'`` (the metadata itself).
    """
    summaries = []
    for entry in get_chroma_client().list_collections():
        summaries.append({
            'name': entry.name,
            'model_name': entry.metadata['model_name'],
            "metadata": entry.metadata,
        })
    return tuple(summaries)

def _parse_urls(raw_urls):
    """Split a comma separated string into a list of clean URLs.

    Double quotes and surrounding whitespace are removed from every entry
    and empty entries are dropped.
    """
    candidates = (part.replace('"', "").strip() for part in raw_urls.split(","))
    return [url for url in candidates if url]


def _parse_chunk_size(raw_value):
    """Return ``raw_value`` as a positive int, or ``None`` if it is not one."""
    try:
        size = int(raw_value)
    except (TypeError, ValueError):
        return None
    return size if size > 0 else None


def _ingest_documents(collection_info, raw_chunk_size, sources, loader):
    """Validate the form input, then load, split and store the documents.

    Args:
        collection_info: Entry chosen in the collection select box (a dict
            with 'name', 'model_name' and 'metadata'), or ``None`` when
            nothing is selected.
        raw_chunk_size: Text typed into the chunk size field.
        sources: Uploaded files or URLs to ingest.
        loader: Callable turning ``sources`` into documents
            (``load_from_file`` or ``load_from_web``).
    """
    if not collection_info:
        st.warning("Please select a collection first.")
        return
    if not sources:
        st.warning("Nothing to upload.")
        return
    chunk_size = _parse_chunk_size(raw_chunk_size)
    if chunk_size is None:
        st.error("Chunk size must be a positive whole number.")
        return

    docs = loader(sources)
    sub_docs = load_and_split(docs, chunk_size=chunk_size)
    create_and_add(
        collection_info["name"],
        sub_docs,
        collection_info['model_name'],
        collection_info['metadata'],
    )
    st.write("Upload succesful")


def load_files():
    """Render the page for managing collections and adding documents.

    The user chooses between two modes:

    * "Add Documents": pick an existing collection, optionally delete it,
      and upload documents either from local files or from a comma
      separated list of URLs. The documents are split with the given chunk
      size and added to the selected collection.
    * "Start new collection": create a collection with the given name and
      embedding model.

    Invalid input (no collection selected, nothing to upload, a chunk size
    that is not a positive integer, a collection name that is too short) is
    reported on the page instead of raising.
    """
    client = get_chroma_client()

    option = st.radio(
        "",
        options=["Add Documents", "Start new collection"],
    )

    if option == "Add Documents":
        collections = retrieve_collections()
        # The select box yields None when there are no collections yet.
        selected_collection = st.selectbox(
            'Add to exsisting collection or create a new one',
            collections)
        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
            if selected_collection:
                client.delete_collection(name=selected_collection["name"])
                st.write("Collection " + selected_collection["name"] + " deleted.")
                # Forget the selection so the deleted collection is neither
                # displayed nor used further down in this run.
                selected_collection = None
            else:
                st.warning("Please select a collection first.")

        if selected_collection:
            st.write("Selected Vectorstore:", selected_collection)
        option = st.radio(
            "",
            options=["Upload Files from Local", "Upload Files from Web"],
        )
        if option == "Upload Files from Local":
            st.write('Source Documents:')
            uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
            chunk_size = st.text_area('chunk Size:', 1000)

            if st.button('Upload'):
                _ingest_documents(selected_collection, chunk_size, uploaded_files, load_from_file)
        else:
            st.write('Urls of Source Documents (Comma separated):')
            raw_urls = st.text_area('Urls:', '')
            chunk_size = st.text_area('chunk Size:', 1000)
            # Split first, clean afterwards: removing the commas before
            # splitting would glue all URLs into one.
            urls = _parse_urls(raw_urls)

            if st.button('Upload'):
                _ingest_documents(selected_collection, chunk_size, urls, load_from_web)
    else:
        collection = st.text_area('Name of your new collection:', '')
        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
        if st.button('Create'):
            if len(collection) > 3:
                ef = load_embedding(model_name)
                metadata = {"loaded_docs": [], "Subject": "Terms Example", "model_name": ef.model_name}
                client.create_collection(collection, embedding_function=ef, metadata=metadata)
                st.write("Collection " + collection + " succesfully created.")
            else:
                st.warning("Collection name must be longer than 3 characters.")