resume-ranker / pdf_loader.py
zhuolisam's picture
feat:streamlit
2c14023
import os
import PyPDF2
def load_single_document(file_path: str):
    """Load the text content of a single document from disk.

    The loader is chosen from the last four characters of ``file_path``,
    compared case-insensitively:

    * ``.txt`` / ``.csv`` -- the file is opened in text mode and returned
      verbatim (CSV is not parsed).
    * ``.pdf`` -- the file is parsed with PyPDF2 and the extracted text of
      every page is concatenated in page order.

    Args:
        file_path: Path to the document to read.

    Returns:
        The document's text as a single string.

    Raises:
        ValueError: With message ``'Invalid file type'`` when the suffix is
            not one of the supported ones. ``ValueError`` is a subclass of
            ``Exception``, so existing ``except Exception`` handlers still
            catch it.
    """
    suffix = file_path[-4:].lower()

    if suffix in ('.txt', '.csv'):
        with open(file_path, 'r') as text_file:
            return text_file.read()

    if suffix == '.pdf':
        # Extraction must happen while the file is still open; the context
        # manager guarantees the handle is released afterwards, even if
        # parsing raises.
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # `or ''` is defensive: a falsy result for a page is treated as
            # empty text so the join cannot fail.
            return ''.join(page.extract_text() or '' for page in reader.pages)

    raise ValueError('Invalid file type')
def load_documents(file_paths: list[str] = None, source_dir: str = None):
    """Load every supported document from a list of paths or a directory.

    ``file_paths`` takes precedence when it is non-empty. Otherwise the
    regular files directly inside ``source_dir`` are used (as absolute
    paths, in ``os.listdir`` order). Paths whose last four characters are
    not exactly ``.txt``, ``.pdf`` or ``.csv`` are silently skipped.

    Args:
        file_paths: Explicit document paths to load.
        source_dir: Directory to scan when ``file_paths`` is empty or None.

    Returns:
        A list of ``{'name': <file base name>, 'content': <text>}`` dicts,
        one per supported file, where the text comes from
        ``load_single_document``.

    Raises:
        Exception: When neither ``file_paths`` nor ``source_dir`` is given.
    """
    if file_paths:
        candidates = file_paths
    elif source_dir:
        candidates = []
        for entry in os.listdir(source_dir):
            joined = os.path.join(source_dir, entry)
            # Sub-directories and other non-file entries are ignored.
            if os.path.isfile(joined):
                candidates.append(os.path.abspath(joined))
    else:
        raise Exception('No file paths or source directory provided')

    supported_suffixes = ('.txt', '.pdf', '.csv')
    documents = []
    for path in candidates:
        if path[-4:] not in supported_suffixes:
            continue
        documents.append({
            'name': os.path.basename(path),
            'content': load_single_document(path),
        })
    return documents
def load_io(file_byte=None):
    """Extract text from an in-memory, file-like object.

    The object must expose a ``name`` attribute and, for text files, a
    ``read()`` method returning bytes. The loader is chosen from the last
    three characters of ``file_byte.name``, compared case-insensitively:

    * ``txt`` -- the bytes are read and decoded as UTF-8.
    * ``pdf`` -- the object is handed to PyPDF2 and the extracted text of
      every page is concatenated in page order.

    Args:
        file_byte: The file-like object to read. Despite the ``None``
            default, a real object is required: ``None`` has no ``name``
            attribute and raises ``AttributeError``.

    Returns:
        The document's text as a single string.

    Raises:
        ValueError: With message ``'Invalid file type'`` when the name does
            not end in ``txt`` or ``pdf``. ``ValueError`` is a subclass of
            ``Exception``, so existing ``except Exception`` handlers still
            catch it.
    """
    kind = file_byte.name[-3:].lower()

    if kind == 'txt':
        return file_byte.read().decode("utf-8")

    if kind == 'pdf':
        reader = PyPDF2.PdfReader(file_byte)
        # `or ''` is defensive: a falsy result for a page is treated as
        # empty text so the join cannot fail.
        return ''.join(page.extract_text() or '' for page in reader.pages)

    raise ValueError('Invalid file type')
def load_btyes_io(files=None):
    """Load every supported in-memory file from an iterable of uploads.

    Each item must expose a ``name`` attribute. Items whose name does not
    end in exactly ``txt`` or ``pdf`` are silently skipped; the rest are
    converted to text with ``load_io``.

    Args:
        files: Iterable of file-like objects, or ``None``.

    Returns:
        A list of ``{'name': <item name>, 'content': <text>}`` dicts, one
        per supported item, in input order. Returns an empty list when
        ``files`` is ``None``.
    """
    # The default argument is None; treat it as "nothing to load" instead
    # of failing while trying to iterate over None.
    if files is None:
        return []

    return [
        {
            'name': uploaded.name,
            'content': load_io(uploaded),
        }
        for uploaded in files
        if uploaded.name[-3:] in ('txt', 'pdf')
    ]