# LCELRag / Chunking.py
# jeevan
# Locally working LCEL RAG
# 4a0c158
from enum import Enum
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,NLTKTextSplitter,SpacyTextSplitter
# Separator strings handed to RecursiveCharacterTextSplitter (see
# TextLoaderAndSplitterWrapper.__init__). The list order is significant and
# must be preserved; the trailing empty string is the last entry.
separators = [
    # Paragraph break, line break, word boundary.
    "\n\n",
    "\n",
    " ",
    # ASCII sentence / clause punctuation.
    ".",
    ",",
    # Boundaries for scripts written without ASCII spaces or punctuation.
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    # Empty string: final entry of the list.
    "",
]
class ChunkingStrategy(Enum):
    """Identifiers for the text-splitting strategies this module supports.

    Each member's value is the lowercase string form of its name.
    """

    # Selects langchain's RecursiveCharacterTextSplitter.
    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"

    # Selects langchain's NLTKTextSplitter.
    NLTK_TEXT_SPLITTER = "nltk_text_splitter"

    # Selects langchain's SpacyTextSplitter.
    SPACY_TEXT_SPLITTER = "spacy_text_splitter"
class TextLoaderAndSplitterWrapper:
    """Load a ``.pdf`` or ``.txt`` file and chunk it with a chosen splitter.

    The splitter is selected once, at construction time, from a
    ``ChunkingStrategy`` member. ``load_documents`` then reads the file at
    ``file_path`` and splits it, while ``split`` / ``join`` operate on
    in-memory text.

    Attributes:
        splitter: The text splitter instance built for the given strategy.
        documents: Chunks produced by the most recent ``load_documents``
            call (empty until then).
        file_path: Path of the file to load.
    """

    def __init__(self, strategy: "ChunkingStrategy", file_path: str):
        """Build the splitter for *strategy* and remember *file_path*.

        Nothing is read from disk here; call ``load_documents`` for that.

        Args:
            strategy: Which splitter implementation to use.
            file_path: Path to a ``.pdf`` or ``.txt`` file.

        Raises:
            ValueError: If *strategy* is not a known ``ChunkingStrategy``.
        """
        # Defaults
        self.splitter = None
        self.documents = []
        self.file_path = file_path

        # Determine which splitter to use from the strategy parameter.
        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
            self.splitter = RecursiveCharacterTextSplitter(separators=separators)
        elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
            self.splitter = NLTKTextSplitter()
        elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
            self.splitter = SpacyTextSplitter()
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

    def load_documents(self):
        """Load the file at ``file_path`` and split it into chunks.

        The loader is picked from the file extension, compared
        case-insensitively so that e.g. ``REPORT.PDF`` is accepted.

        Returns:
            The chunks returned by the loader's ``load_and_split``; they
            are also stored on ``self.documents``.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        lowered_path = str(self.file_path).lower()
        if lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(self.file_path)
        elif lowered_path.endswith(".txt"):
            loader = TextLoader(self.file_path)
        else:
            raise ValueError(f"Unknown file type: {self.file_path}")

        # Always pass our own splitter so the chosen strategy is honoured;
        # per the original note, load_and_split would otherwise default to
        # RecursiveCharacterTextSplitter.
        self.documents = loader.load_and_split(text_splitter=self.splitter)
        return self.documents

    def split(self, text: str):
        """Split raw *text* into chunks using the configured splitter.

        Args:
            text: The text to chunk.

        Returns:
            Whatever the splitter's ``split_text`` returns for *text*.
        """
        # LangChain text splitters expose ``split_text``; the original code
        # called a ``split`` method they do not provide. A splitter-like
        # object that only offers ``split`` is still supported.
        split_text = getattr(self.splitter, "split_text", None)
        if callable(split_text):
            return split_text(text)
        return self.splitter.split(text)

    def join(self, chunks: list, separator: str = "\n\n"):
        """Join *chunks* back into a single string.

        If the splitter provides its own ``join`` it is used as-is.
        Otherwise the chunks are concatenated with *separator*. Items may
        be plain strings or objects carrying a ``page_content`` attribute
        (presumably the document objects ``load_documents`` returns —
        verify against the LangChain version in use).

        Args:
            chunks: Strings or document-like objects to join.
            separator: Text placed between chunks in the fallback path.

        Returns:
            The joined text.
        """
        native_join = getattr(self.splitter, "join", None)
        if callable(native_join):
            return native_join(chunks)
        return separator.join(
            getattr(chunk, "page_content", chunk) for chunk in chunks
        )

    def __str__(self):
        """Return a short description naming the configured splitter."""
        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return str(self)