# Summarize a directory of PDF documents with an LLM-based summarization chain.
import os
import sys
from timeit import default_timer as timer
from typing import List

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.chroma import Chroma
from langchain.vectorstores.faiss import FAISS

from app_modules.init import app_init, get_device_types
from app_modules.llm_summarize_chain import SummarizeChain


def load_documents(source_pdfs_path, urls) -> List:
    # Load every PDF page in the directory; if URLs are given, attach the URL
    # whose last path segment matches the PDF's filename to each page's metadata.
    loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
    documents = loader.load()
    if urls is not None and len(urls) > 0:
        for doc in documents:
            source = doc.metadata["source"]
            filename = source.split("/")[-1]
            for url in urls:
                if url.endswith(filename):
                    doc.metadata["url"] = url
                    break
    return documents


def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
    # Split the loaded pages into overlapping chunks for summarization.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)


llm_loader = app_init(False)[0]

# PDF directory comes from the first CLI argument, falling back to an env var.
source_pdfs_path = (
    sys.argv[1] if len(sys.argv) > 1 else os.environ.get("SOURCE_PDFS_PATH")
)
chunk_size = os.environ.get("CHUNCK_SIZE")
chunk_overlap = os.environ.get("CHUNK_OVERLAP")

sources = load_documents(source_pdfs_path, None)

print(f"Splitting {len(sources)} PDF pages into chunks ...")
chunks = split_chunks(
    sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
)

print(f"Summarizing {len(chunks)} chunks ...")
start = timer()

summarize_chain = SummarizeChain(llm_loader)
result = summarize_chain.call_chain(
    {"input_documents": chunks},
    None,
    None,
    True,
)

end = timer()
print(f"Completed in {end - start:.3f}s")

print("\n\n***Summary:")
print(result["output_text"])
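
# Example invocation (a sketch only: the script filename, directory path, and
# chunking values below are hypothetical; the environment variable names match
# the ones read above, including the repo's "CHUNCK_SIZE" spelling):
#
#   SOURCE_PDFS_PATH=./data/pdfs CHUNCK_SIZE=1024 CHUNK_OVERLAP=64 python summarize.py
#
# or pass the PDF directory as the first CLI argument instead of SOURCE_PDFS_PATH:
#
#   CHUNCK_SIZE=1024 CHUNK_OVERLAP=64 python summarize.py ./data/pdfs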