# bangkit-chatbot/extract_pdf.py
"""
Kode di bawah digunakan untuk mengextract teks dari file pdf yang ada di folder document
"""
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import re
from langchain.document_loaders import PyPDFLoader
paths = Path("document/").glob("**/*.pdf")
docs = []
for path in paths:
    path = str(path)
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    document = []
    for page in pages:
        content = page.page_content
        # Strip the "PRESIDEN REPUBLIK INDONESIA" page headers/footers, including
        # several OCR-mangled variants (FRES..., PRESIOEN, INOONESIA, REPUELIK).
        content = re.sub(r"PRES(.*)\n(.*)\n-.*-\n", " ", content)
        content = re.sub(r"FRES(.*)\n(.*)\n-.*-\n", " ", content)
        content = re.sub(r"PRESIDEN\nREPUBLIK INDONESIA\n.11-\n", " ", content)
        content = re.sub(r"PRESIDEN\nREPUBLIK INOONESIA\n_55_\n", " ", content)
        content = re.sub(r"PRESIDEN\nREPUBLIK INDONESIA\n.20 -\n", " ", content)
        content = re.sub(r"PRESIDEN\nREPUBLIK INDONESIA\n24-", " ", content)
        content = re.sub(r"PRESIOEN\nREPUBLIK INDONESIA\n-39\n", " ", content)
        content = re.sub(r"PRESIDEN\nREPUELIK INDONESIA\nL2-\n", " ", content)
        document.append({"content": content, "metadata": {"source": page.metadata["source"]}})
    # Join the content of every 3 consecutive pages into one document.
    for i in range(0, len(pages), 3):
        docs.append({
            "content": " ".join(page["content"] for page in document[i:i + 3]),
            "metadata": {"source": document[i]["metadata"]["source"]},
        })
# Here we create a vector store from the documents and save it to disk.
list_content = [doc["content"] for doc in docs]
list_metadata = [doc["metadata"] for doc in docs]
store = FAISS.from_texts(list_content, OpenAIEmbeddings(), metadatas=list_metadata)
# Persist the raw FAISS index separately, then detach it from the store so the
# store object itself can be serialized without the index.
faiss.write_index(store.index, "docs.index")
store.index = None
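
# The script imports pickle but never uses it above; the usual pattern after
# detaching the index is to pickle the remaining store object. The lines below
# are a minimal sketch under that assumption -- the filename "docs_store.pkl"
# is hypothetical, not taken from the original repository.
with open("docs_store.pkl", "wb") as f:
    pickle.dump(store, f)

# Loading it back later would mirror the save: unpickle the store and reattach
# the FAISS index that was written with faiss.write_index(), e.g.:
# with open("docs_store.pkl", "rb") as f:
#     store = pickle.load(f)
# store.index = faiss.read_index("docs.index")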