Spaces:
Sleeping
Sleeping
File size: 4,992 Bytes
fb7d447 95fdd56 fb7d447 95fdd56 fb7d447 95fdd56 fb7d447 95fdd56 9a7b6d5 95fdd56 9a7b6d5 95fdd56 fb7d447 95fdd56 fb7d447 2bf4887 fb7d447 95fdd56 fb7d447 95fdd56 5da87fd 7cf7b07 fb7d447 7cf7b07 d637584 fb7d447 95fdd56 fb7d447 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import streamlit as st
import os
import base64
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import textwrap
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from constants import CHROMA_SETTINGS
from streamlit_chat import message
st.set_page_config(layout="wide")
# Specify the device
device = torch.device('cpu')
checkpoint = "MBZUAI/LaMini-T5-738M"
print(f"Checkpoint path: {checkpoint}") # Add this line for debugging
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
persist_directory = "db"
@st.cache_resource
def data_ingestion():
for root, dirs, files in os.walk("docs"):
for file in files:
if file.endswith(".pdf"):
print(file)
loader = PDFMinerLoader(os.path.join(root, file))
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=500)
texts = text_splitter.split_documents(documents)
#create embeddings here
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
#create vector store here
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
db.persist()
db=None
@st.cache_resource
def llm_pipeline():
pipe = pipeline(
'text2text-generation',
model = base_model,
tokenizer = tokenizer,
max_length = 256,
do_sample = True,
temperature = 0.3,
top_p= 0.95,
device=device
)
local_llm = HuggingFacePipeline(pipeline=pipe)
return local_llm
@st.cache_resource
def qa_llm():
llm = llm_pipeline()
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(persist_directory="db", embedding_function = embeddings, client_settings=CHROMA_SETTINGS)
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
llm = llm,
chain_type = "stuff",
retriever = retriever,
return_source_documents=True
)
return qa
def process_answer(instruction):
response = ''
instruction = instruction
qa = qa_llm()
generated_text = qa(instruction)
answer = generated_text['result']
return answer
def get_file_size(file):
file.seek(0, os.SEEK_END)
file_size = file.tell()
file.seek(0)
return file_size
# Specify the path to your PDF document directly
filepath = "removed_null.pdf"
@st.cache_data
#function to display the PDF of a given file
def displayPDF(file):
# Opening file from file path
with open(file, "rb") as f:
base64_pdf = base64.b64encode(f.read()).decode('utf-8')
# Embedding PDF in HTML
pdf_display = F'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
# Displaying File
st.markdown(pdf_display, unsafe_allow_html=True)
# Display conversation history using Streamlit messages
def display_conversation(history):
for i in range(len(history["generated"])):
message(history["past"][i], is_user=True, key=str(i) + "_user")
message(history["generated"][i],key=str(i))
def main():
st.markdown("<h1 style='text-align: center; color: blue;'>JurioSync📄 </h1>", unsafe_allow_html=True)
st.markdown("<h3 style='text-align: center; color: grey;'>Ai Powered Legal Document Assistant</h3>", unsafe_allow_html=True)
st.markdown("<h4 style color:black;'>File details</h4>", unsafe_allow_html=True)
# You can display any additional file details here if needed
st.markdown("<h4 style color:black;'>File preview</h4>", unsafe_allow_html=True)
pdf_view = displayPDF(filepath)
ingested_data = data_ingestion()
st.success('Embeddings are created successfully!')
# Initialize session state for generated responses and past messages
if "generated" not in st.session_state:
st.session_state["generated"] = ["I am ready to help you"]
if "past" not in st.session_state:
st.session_state["past"] = ["Hey there!"]
# Search the database for a response based on user input and update session state
if user_input:
answer = process_answer({'query': user_input})
st.session_state["past"].append(user_input)
response = answer
st.session_state["generated"].append(response)
# Display conversation history using Streamlit messages
if st.session_state["generated"]:
display_conversation(st.session_state)
if __name__ == "__main__":
main()
|