Carlosito16 committed on
Commit bd2cf7b
1 Parent(s): 01eb801

Upload app.py

Files changed (1)
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
+ import streamlit as st
+ import pickle
+ import os
+ import torch
+ from tqdm.auto import tqdm
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+
+
+ from langchain import HuggingFacePipeline
+ from langchain.chains import RetrievalQA
+
+
+
+ st.set_page_config(
+     page_title='aitGPT',
+     page_icon='✅')
+
+
+ st.markdown("# Hello")
+
+
+ with open("/Users/carlosito/Library/CloudStorage/OneDrive-Personal/AIT material/99-AIT-thesis/aitGPT/ait-web-document", "rb") as fp:
+     ait_web_documents = pickle.load(fp)
+
+
+ text_splitter = RecursiveCharacterTextSplitter(
+     # Split the scraped documents into 500-character chunks with 100-character overlap.
+     chunk_size=500,
+     chunk_overlap=100,
+     length_function=len,
+ )
+
+ chunked_text = text_splitter.create_documents([doc for doc in tqdm(ait_web_documents)])
+
+
+ st.markdown(f"Number of documents: {len(ait_web_documents)}")
+ st.markdown(f"Number of chunked texts: {len(chunked_text)}")
+
+
+ embedding_model = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-base',
+                                                 model_kwargs={'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')})
+ persist_directory = 'db_chunk_500'
+ db_chunk_500 = Chroma.from_documents(documents=chunked_text,
+                                      embedding=embedding_model,
+                                      persist_directory=persist_directory)
+
+ print("load done")
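
This commit imports HuggingFacePipeline and RetrievalQA but does not use them yet; it only builds and persists the Chroma index under db_chunk_500. Below is a minimal sketch, not part of the commit, of how that persisted store could later be reloaded and wired into a RetrievalQA chain with the same embedding model. The LLM id (google/flan-t5-base) and the sample question are placeholder assumptions.

# Hypothetical follow-up: query the persisted db_chunk_500 index (not in this commit).
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

# Reuse the same embedding model so query vectors match the persisted index.
embedding_model = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-base')

# Reload the Chroma collection persisted by app.py.
vectordb = Chroma(persist_directory='db_chunk_500',
                  embedding_function=embedding_model)

# Placeholder LLM; any Hugging Face text2text-generation model could back the pipeline.
llm = HuggingFacePipeline.from_model_id(model_id='google/flan-t5-base',
                                        task='text2text-generation')

# Answer questions by stuffing the top-3 retrieved chunks into the prompt.
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type='stuff',
                                       retriever=vectordb.as_retriever(search_kwargs={'k': 3}))

print(qa_chain.run("What programs does AIT offer?"))  # example query, purely illustrative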