# Spaces: Paused  (page-scrape residue, not part of the program)
from enum import Enum

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import (
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)
# Separators handed to RecursiveCharacterTextSplitter, in the order listed.
# Besides whitespace and ASCII punctuation, the list carries the zero-width
# space and fullwidth/ideographic punctuation; the final empty string is kept
# as the last entry.
separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
]
class ChunkingStrategy(Enum):
    """Closed set of text-splitting strategies a caller can choose from.

    Each member's value is the lower-case string form of its name.
    """

    # Split using RecursiveCharacterTextSplitter.
    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
    # Split using NLTKTextSplitter.
    NLTK_TEXT_SPLITTER = "nltk_text_splitter"
    # Split using SpacyTextSplitter.
    SPACY_TEXT_SPLITTER = "spacy_text_splitter"
class TextLoaderAndSplitterWrapper:
    """Load a ``.pdf`` or ``.txt`` file and chunk it with a LangChain splitter.

    The splitter is chosen once, at construction time, from a
    ``ChunkingStrategy`` member. ``load_documents`` then picks the loader
    from the file extension and splits the loaded content with that splitter.
    """

    def __init__(self, strategy: ChunkingStrategy, file_path: str):
        """Select the splitter for *strategy* and remember *file_path*.

        Nothing is read from disk here; call ``load_documents`` for that.

        Args:
            strategy: Which text splitter to build.
            file_path: Path of the document to load later.

        Raises:
            ValueError: If *strategy* is not a known ``ChunkingStrategy``.
        """
        # Defaults
        self.splitter = None
        self.documents = []
        self.file_path = file_path

        # Determine which splitter strategy to use from the parameter.
        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
            self.splitter = RecursiveCharacterTextSplitter(separators=separators)
        elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
            self.splitter = NLTKTextSplitter()
        elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
            self.splitter = SpacyTextSplitter()
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

    def load_documents(self):
        """Load the file at ``self.file_path`` and split it into chunks.

        The loader is picked from the file extension, compared
        case-insensitively: ``.pdf`` uses ``PyPDFLoader`` and ``.txt`` uses
        ``TextLoader``. The result is also stored in ``self.documents``.

        Returns:
            The value returned by the loader's ``load_and_split`` call.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        # str() lets path-like values through; lower() accepts "REPORT.PDF".
        lowered_path = str(self.file_path).lower()
        if lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(self.file_path)
        elif lowered_path.endswith(".txt"):
            loader = TextLoader(self.file_path)
        else:
            raise ValueError(f"Unknown file type: {self.file_path}")

        # The configured splitter is always passed explicitly, so the
        # loader's own default splitter is never relied upon.
        self.documents = loader.load_and_split(text_splitter=self.splitter)
        return self.documents

    def split(self, text: str):
        """Split *text* into chunks with the configured splitter.

        LangChain text splitters expose ``split_text`` rather than ``split``,
        so that is the method invoked here.
        """
        return self.splitter.split_text(text)

    def join(self, chunks: list, separator: str = "\n\n"):
        """Concatenate *chunks* back into a single string.

        If the configured splitter happens to provide a ``join`` method it is
        used; otherwise the chunks are joined with *separator*.

        Args:
            chunks: Text chunks to concatenate.
            separator: String placed between chunks when the splitter has no
                ``join`` of its own. Defaults to a paragraph break.

        NOTE(review): if the chunks were produced with overlap, joining them
        will presumably repeat the overlapping text -- confirm with callers.
        """
        splitter_join = getattr(self.splitter, "join", None)
        if callable(splitter_join):
            return splitter_join(chunks)
        return separator.join(chunks)

    def __str__(self):
        """Return a short description naming the configured splitter."""
        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return str(self)