Spaces:

zhuolisam
/

resume-ranker

Runtime error

App Files Files Community

resume-ranker / pdf_loader.py

zhuolisam

feat:streamlit

2c14023 over 1 year ago

raw

history blame contribute delete

1.97 kB

	import os
	import PyPDF2

	def load_single_document(file_path: str):
	    """Load the text content of a single document from disk.

	    The loader is selected from the last four characters of ``file_path``
	    (compared case-insensitively): ``.txt`` and ``.csv`` files are returned
	    as raw text, ``.pdf`` files are parsed with PyPDF2 and the extracted
	    text of every page is concatenated in page order.

	    Args:
	        file_path: Path of the document to read.

	    Returns:
	        The document's text as a single string.

	    Raises:
	        Exception: If the extension is not ``.txt``, ``.pdf`` or ``.csv``.
	    """
	    extension = file_path[-4:].lower()

	    if extension in ('.txt', '.csv'):
	        # Decode explicitly as UTF-8 so the result does not depend on the
	        # platform's default locale encoding.
	        with open(file_path, 'r', encoding='utf-8') as text_file:
	            return text_file.read()

	    if extension == '.pdf':
	        # The context manager closes the handle even if parsing raises;
	        # extraction must finish before the file is closed.
	        with open(file_path, 'rb') as pdf_file:
	            reader = PyPDF2.PdfReader(pdf_file)
	            return ''.join(page.extract_text() for page in reader.pages)

	    raise Exception('Invalid file type')


	def load_documents(file_paths: list[str] = None, source_dir: str = None):
	    """Load every supported document from explicit paths or a directory.

	    ``file_paths`` takes precedence when it is non-empty; otherwise the
	    regular files directly inside ``source_dir`` are used (as absolute
	    paths). Paths whose last four characters are not ``.txt``, ``.pdf`` or
	    ``.csv`` are skipped.

	    Args:
	        file_paths: Paths of the documents to load.
	        source_dir: Directory to scan when ``file_paths`` is not given.

	    Returns:
	        A list of dicts, one per loaded document, with the keys ``'name'``
	        (the file's base name) and ``'content'`` (its text).

	    Raises:
	        Exception: If neither ``file_paths`` nor ``source_dir`` is provided.
	    """
	    if file_paths:
	        candidates = file_paths
	    elif source_dir:
	        candidates = []
	        for entry in os.listdir(source_dir):
	            joined = os.path.join(source_dir, entry)
	            # Only regular files are considered; sub-directories are ignored.
	            if os.path.isfile(joined):
	                candidates.append(os.path.abspath(joined))
	    else:
	        raise Exception('No file paths or source directory provided')

	    supported = ['.txt', '.pdf', '.csv']
	    documents = []
	    for path in candidates:
	        if path[-4:] not in supported:
	            continue
	        documents.append({
	            'name': os.path.basename(path),
	            'content': load_single_document(f"{path}"),
	        })
	    return documents

	def load_io(file_byte = None):
	    """Extract the text of one in-memory file object.

	    The file type is taken from the last three characters of
	    ``file_byte.name``: ``txt`` content is read and decoded as UTF-8,
	    ``pdf`` content is parsed with PyPDF2 and the text of all pages is
	    concatenated in page order.

	    Args:
	        file_byte: Binary file-like object exposing ``.name`` and ``.read()``.

	    Returns:
	        The file's text as a single string.

	    Raises:
	        Exception: If the name does not end in ``txt`` or ``pdf``.
	    """
	    suffix = file_byte.name[-3:]

	    if suffix == 'txt':
	        raw = file_byte.read()
	        return raw.decode("utf-8")

	    if suffix == 'pdf':
	        reader = PyPDF2.PdfReader(file_byte)
	        page_texts = [page.extract_text() for page in reader.pages]
	        return ''.join(page_texts)

	    raise Exception('Invalid file type')

	def load_btyes_io(files = None):
	    """Load the text of every supported in-memory file.

	    Files whose name does not end in ``txt`` or ``pdf`` are skipped.

	    Args:
	        files: Iterable of binary file-like objects exposing ``.name``.
	            ``None`` (the default) is treated as "no files".

	    Returns:
	        A list of dicts, one per loaded file, with the keys ``'name'`` (the
	        object's ``.name``) and ``'content'`` (its extracted text).
	    """
	    # The default argument used to crash inside enumerate(None); treat a
	    # missing upload the same as an empty one.
	    if files is None:
	        return []

	    return [
	        {
	            'name': uploaded.name,
	            'content': load_io(uploaded),
	        }
	        for uploaded in files
	        if uploaded.name[-3:] in ('txt', 'pdf')
	    ]