File size: 3,414 Bytes
7009660
 
 
fbb697c
 
 
4f0dc21
fbb697c
7009660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbb697c
 
 
 
 
 
 
 
 
 
7009660
fbb697c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f0dc21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbb697c
4f0dc21
 
 
 
 
fbb697c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import streamlit as st
import latex2markdown
from langchain.docstore.document import Document
import chromadb
from chromadb.config import Settings
import load_model
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
# Directory handed to Chroma as its persist_directory; the value is defined
# in the load_model module and re-exported here under a module-level name.
persist_directory = load_model.persist_directory

def format_document(document: Document):
    """Convert a source document into a plain dictionary for display.

    The conversion is delegated to the document's own ``dict()`` method.
    """
    # TODO: Implement a nice style instead of dumping the raw dictionary.
    as_mapping = document.dict()
    return as_mapping

def format_result_set(result):
    """Render a query result in the Streamlit page.

    The text under ``result["result"]`` is converted from LaTeX to Markdown
    and written out. A checkbox lets the user reveal the entries stored
    under ``result["source_documents"]``, each rendered via
    ``format_document``.
    """
    answer_markdown = latex2markdown.LaTeX2Markdown(result["result"]).to_markdown()
    st.write(answer_markdown)

    show_sources = st.checkbox('Show source documents')
    # Looked up unconditionally, exactly like the answer text above.
    source_documents = result["source_documents"]
    if not show_sources:
        return

    st.write('Source Documents:')
    for doc in source_documents:
        st.write(format_document(doc))

@st.cache_resource
def get_chroma_client():
    """Build the Chroma client used by this app.

    The client uses the "duckdb+parquet" implementation and the module-level
    ``persist_directory``. The function is wrapped in ``st.cache_resource``.
    """
    chroma_settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
    )
    return chromadb.Client(chroma_settings)
@st.cache_data
def retrieve_collections():
    """Return the names of all collections of the Chroma client as a tuple.

    The function is wrapped in ``st.cache_data``; callers invoke
    ``retrieve_collections.clear()`` after changing the set of collections.
    """
    chroma = get_chroma_client()
    return tuple(entry.name for entry in chroma.list_collections())

def _parse_urls(raw: str) -> list:
    """Turn the comma-separated text of the URL box into a list of URLs.

    Double quotes are removed, every entry is stripped of surrounding
    whitespace, and empty entries (blank input, trailing commas) are dropped.
    """
    entries = (entry.strip() for entry in raw.replace('"', "").split(","))
    return [entry for entry in entries if entry]


def _split_and_store(collection_name, docs, chunk_size):
    """Run ``docs`` through ``load_and_split`` and add the result to a collection.

    ``chunk_size`` is the raw widget value and is converted with ``int``;
    a non-numeric value therefore raises ``ValueError``.
    """
    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
    create_and_add(collection_name, sub_docs, "hkunlp/instructor-large")


def load_files():
    """Render the collection-management page.

    A radio button switches between two modes:

    * "Add Documents": pick an existing collection, optionally delete it,
      and add documents to it either from uploaded files or from a
      comma-separated list of URLs.
    * "Start new collection": create a collection from a typed-in name
      (the name must be longer than 3 characters).

    The cached collection list is cleared whenever a collection is created
    or deleted so that the next run shows the current state.
    """
    client = get_chroma_client()

    mode = st.radio(
        "",
        options=["Add Documents", "Start new collection"],
    )

    collections = retrieve_collections()

    if mode != "Add Documents":
        collection = st.text_area('Name of your new collection:', '')
        if st.button('Create'):
            if len(collection) > 3:
                # NOTE: the embedding model name used to be appended to the
                # collection name; storing it as collection metadata would
                # be the better approach.
                client.create_collection(collection)
                retrieve_collections.clear()
                st.write(f"Collection {collection} succesfully created.")
            else:
                # Previously a too-short name was ignored without feedback.
                st.warning("Collection name must be longer than 3 characters.")
        return

    selected_collection = st.selectbox(
        'Add to exsisting collection or create a new one',
        collections)
    # With an empty option list the selectbox yields None; every action
    # below needs a real collection name.
    has_collection = selected_collection is not None

    if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
        if has_collection:
            client.delete_collection(name=selected_collection)
            retrieve_collections.clear()
        else:
            st.warning("Please create a collection first.")

    source = st.radio(
        "",
        options=["Upload Files", "Download Files"],
    )
    if source == "Upload Files":
        st.write('Source Documents:')
        uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
        chunk_size = st.text_area('chunk Size:', 1000)

        if st.button('Upload'):
            if not has_collection:
                st.warning("Please create a collection first.")
            elif not uploaded_files:
                st.warning("Please choose at least one file.")
            else:
                docs = load_from_file(uploaded_files)
                _split_and_store(selected_collection, docs, chunk_size)
    else:
        st.write('Source Documents (Comma separated):')
        raw_urls = st.text_area('Urls:', '')
        chunk_size = st.text_area('chunk Size:', 1000)

        if st.button('Upload'):
            # Split first, then clean each entry. Removing the commas before
            # splitting would glue all URLs into a single string.
            urls = _parse_urls(raw_urls)
            if not has_collection:
                st.warning("Please create a collection first.")
            elif not urls:
                st.warning("Please enter at least one URL.")
            else:
                docs = load_from_web(urls)
                _split_and_store(selected_collection, docs, chunk_size)