ParisNeo
/

lollms-personalities-zoo

Model card Files Files and versions Community

lollms-personalities-zoo / english /data /chat_with_docs /scripts /processor.py

ParisNeo

All personalities are there

453b8b8 over 1 year ago

raw

history blame contribute delete

23.3 kB

	from lollms.config import TypedConfig, BaseConfig, ConfigTemplate, InstallOption
	from lollms.types import MSG_TYPE
	from lollms.personality import APScript, AIPersonality
	from lollms.helpers import ASCIIColors

	import numpy as np
	import json
	from pathlib import Path
	import numpy as np
	import json

	class TextVectorizer:
	def __init__(self, model_name, database_file: Path | str, visualize_data_at_startup=False, visualize_data_at_add_file=False, visualize_data_at_generate=False):
		"""Load the tokenizer/model pair and restore a previously saved index.

		Args:
			model_name: Identifier handed to ``AutoTokenizer.from_pretrained``
				and ``AutoModel.from_pretrained``.
			database_file: JSON file used to persist embeddings and chunk texts.
			visualize_data_at_startup: Call ``show_document`` once an existing
				database has been loaded.
			visualize_data_at_add_file: Stored on the instance; read by
				``index_document``.
			visualize_data_at_generate: Stored on the instance; read by
				``recover_text``.
		"""
		# Deferred import: transformers is only needed once a vectorizer is built.
		from transformers import AutoTokenizer, AutoModel

		self.tokenizer = AutoTokenizer.from_pretrained(model_name)
		self.model = AutoModel.from_pretrained(model_name)
		self.embeddings = {}
		self.texts = {}
		self.ready = False
		self.database_file = Path(database_file)
		self.visualize_data_at_startup = visualize_data_at_startup
		self.visualize_data_at_add_file = visualize_data_at_add_file
		self.visualize_data_at_generate = visualize_data_at_generate

		# Load previous state from the JSON file
		if self.database_file.exists():
			ASCIIColors.success(f"Database file found : {self.database_file}")
			self.load_from_json()
			# show_document stacks the stored embeddings, which fails on an
			# empty store; skip the plot instead of crashing the constructor.
			if visualize_data_at_startup and self.embeddings:
				self.show_document()
			self.ready = True
		else:
			ASCIIColors.info(f"No database file found : {self.database_file}")


	def show_document(self, query_text="What is the main idea of this text?", use_pca=True):
		"""Plot every stored chunk embedding plus a query embedding in 2D.

		The embeddings are L2-normalised, reduced to two dimensions with PCA
		or t-SNE, and drawn as a scatter plot. Hovering a point shows its
		text; clicking opens a Tk window with the text of the nearest point.

		Args:
			query_text: Text embedded with ``embed_query`` and drawn in red.
			use_pca: Use PCA when true, t-SNE otherwise.
		"""
		import textwrap
		import seaborn as sns
		import matplotlib.pyplot as plt
		import mplcursors
		from tkinter import Tk, Text, Scrollbar, Frame, Label, TOP, BOTH, RIGHT, Y, END

		from sklearn.manifold import TSNE
		from sklearn.decomposition import PCA
		import torch

		if use_pca:
			print("Showing pca representation :")
		else:
			print("Showing t-sne representation :")
		texts = list(self.texts.values())
		embeddings = torch.stack(list(self.embeddings.values())).detach().squeeze(1).numpy()
		# Normalize embeddings
		norms = np.linalg.norm(embeddings, axis=1)
		normalized_embeddings = embeddings / norms[:, np.newaxis]

		# Embed the query text
		query_embedding = self.embed_query(query_text)
		query_embedding = query_embedding.detach().squeeze().numpy()
		query_normalized_embedding = query_embedding / np.linalg.norm(query_embedding)

		# The query is appended last, so row -1 of the projection is the query.
		combined_embeddings = np.vstack((normalized_embeddings, query_normalized_embedding))

		if use_pca:
			# Use PCA for dimensionality reduction
			pca = PCA(n_components=2)
			embeddings_2d = pca.fit_transform(combined_embeddings)
		else:
			# Use t-SNE for dimensionality reduction; cap the perplexity by
			# the number of points.
			perplexity = min(30, combined_embeddings.shape[0] - 1)
			tsne = TSNE(n_components=2, perplexity=perplexity)
			embeddings_2d = tsne.fit_transform(combined_embeddings)

		# Documents and query are drawn as two separate artists.
		sns.scatterplot(x=embeddings_2d[:-1, 0], y=embeddings_2d[:-1, 1])  # Plot document embeddings
		query_artist = plt.scatter(embeddings_2d[-1, 0], embeddings_2d[-1, 1], color='red')  # Plot query embedding

		# Add labels to the scatter plot
		for i, (x, y) in enumerate(embeddings_2d[:-1]):
			plt.text(x, y, str(i), fontsize=8)

		plt.xlabel('Dimension 1')
		plt.ylabel('Dimension 2')
		if use_pca:
			plt.title('Embeddings Scatter Plot based on PCA')
		else:
			plt.title('Embeddings Scatter Plot based on t-SNE')
		# Enable mplcursors to show tooltips on hover
		cursor = mplcursors.cursor(hover=True)

		# Define the hover event handler
		@cursor.connect("add")
		def on_hover(sel):
			# The selection index is relative to the hovered artist, so the
			# query must be recognised by artist identity: index 0 of the
			# document scatter is the first document, not the query.
			if sel.artist is query_artist:
				sel.annotation.set_text("Query")
				return
			index = sel.target.index
			wrapped_text = textwrap.fill(texts[index], width=50)  # Wrap the text into multiple lines
			sel.annotation.set_text(f"Index: {index}\nText:\n{wrapped_text}")

		# Define the click event handler using matplotlib event handling mechanism
		def on_click(event):
			if event.xdata is not None and event.ydata is not None:
				x, y = event.xdata, event.ydata
				# Squared Euclidean distance from the click to every plotted point.
				distances = ((embeddings_2d[:, 0] - x) ** 2 + (embeddings_2d[:, 1] - y) ** 2)
				index = distances.argmin()
				# The last row is the query itself, which has no stored text.
				text = texts[index] if index < len(texts) else query_text

				# Open a new Tkinter window with the content of the text
				root = Tk()
				root.title(f"Text for Index {index}")
				frame = Frame(root)
				frame.pack(fill=BOTH, expand=True)

				label = Label(frame, text="Text:")
				label.pack(side=TOP, padx=5, pady=5)

				text_box = Text(frame)
				text_box.pack(side=TOP, padx=5, pady=5, fill=BOTH, expand=True)
				text_box.insert(END, text)

				scrollbar = Scrollbar(frame)
				scrollbar.pack(side=RIGHT, fill=Y)
				scrollbar.config(command=text_box.yview)
				text_box.config(yscrollcommand=scrollbar.set)

				text_box.config(state="disabled")

				root.mainloop()

		# Connect the click event handler to the figure
		plt.gcf().canvas.mpl_connect("button_press_event", on_click)
		plt.show()

	def index_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
		"""Split ``text`` into overlapping token windows and store their embeddings.

		Each window is embedded as the mean of the model's last hidden state
		and stored under the id ``f"{document_id}_chunk_{n}"`` (``n`` starts
		at 1) together with its decoded text. The index is then saved with
		``save_to_json``.

		Args:
			document_id: Identifier used as the prefix of every chunk id.
			text: Document content to index.
			chunk_size: Maximum number of tokens per window.
			overlap_size: Number of tokens shared by consecutive windows.
			force_vectorize: Re-index even if chunks for ``document_id`` exist.

		Raises:
			ValueError: If ``chunk_size`` is not positive or ``overlap_size``
				is not in ``[0, chunk_size)``.
		"""
		import torch

		if chunk_size <= 0:
			raise ValueError(f"chunk_size must be positive, got {chunk_size}")
		if not 0 <= overlap_size < chunk_size:
			raise ValueError(
				f"overlap_size must be in [0, chunk_size), got {overlap_size} for chunk_size {chunk_size}"
			)

		# Chunks are stored under "<document_id>_chunk_<n>", so the document
		# is known when any key carries that prefix; the bare document id is
		# never a key itself.
		chunk_prefix = f"{document_id}_chunk_"
		existing_ids = [key for key in self.embeddings if key.startswith(chunk_prefix)]
		if existing_ids and not force_vectorize:
			print(f"Document {document_id} already exists. Skipping vectorization.")
			return
		# Forced re-index: drop old chunks so a shorter document leaves no stale tail.
		for key in existing_ids:
			del self.embeddings[key]
			self.texts.pop(key, None)

		# Tokenize text
		tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_attention_mask=False)['input_ids']

		# Slide a window over the token stream until the last token is
		# covered. The window count follows from the token count, so no
		# tokens are dropped and no empty windows are produced.
		stride = chunk_size - overlap_size
		windows = []
		start = 0
		while start < len(tokens):
			end = min(start + chunk_size, len(tokens))
			windows.append(tokens[start:end])
			if end == len(tokens):
				break
			start += stride

		# Windows are embedded one at a time, so no padding is needed; pad
		# tokens would otherwise be averaged into the chunk embedding.
		self.model.eval()
		with torch.no_grad():
			for number, window in enumerate(windows, start=1):
				outputs = self.model(torch.tensor([window]))
				chunk_id = f"{chunk_prefix}{number}"
				self.embeddings[chunk_id] = outputs.last_hidden_state.mean(dim=1)
				self.texts[chunk_id] = self.tokenizer.decode(window, skip_special_tokens=True)

		self.save_to_json()
		self.ready = True
		if self.visualize_data_at_add_file:
			self.show_document()


	def embed_query(self, query_text):
		"""Embed ``query_text`` as the mean of the model's last hidden state.

		Args:
			query_text: The text to embed.

		Returns:
			The tensor produced by ``last_hidden_state.mean(dim=1)`` for a
			batch holding the single encoded query.
		"""
		import torch

		# truncation=True caps the query at the tokenizer's maximum length so
		# an over-long prompt cannot overflow the model's input limit.
		query_tokens = self.tokenizer.encode(query_text, truncation=True)

		# Convert input to PyTorch tensor (batch of one)
		query_input_tensor = torch.tensor([query_tokens])

		# Generate query embedding
		self.model.eval()
		with torch.no_grad():
			query_outputs = self.model(query_input_tensor)
			query_embedding = query_outputs.last_hidden_state.mean(dim=1)

		return query_embedding

	def recover_text(self, query_embedding, top_k=1):
		"""Return the texts of the ``top_k`` chunks most similar to the query.

		Args:
			query_embedding: Query tensor, compared to every stored chunk
				embedding with cosine similarity.
			top_k: Number of best-scoring chunks to return.

		Returns:
			A list of chunk texts ordered from most to least similar.
		"""
		from sklearn.metrics.pairwise import cosine_similarity

		# Score every stored chunk against the query.
		scored_chunks = [
			(chunk_id, cosine_similarity(query_embedding.numpy(), stored.numpy())[0][0])
			for chunk_id, stored in self.embeddings.items()
		]

		# Highest similarity first, then keep only the requested number.
		scored_chunks.sort(key=lambda pair: pair[1], reverse=True)
		best_texts = [self.texts[chunk_id] for chunk_id, _ in scored_chunks[:top_k]]

		if self.visualize_data_at_generate:
			self.show_document()

		return best_texts

	def save_to_json(self):
		"""Write the embeddings and chunk texts to ``self.database_file`` as JSON."""
		# Embeddings are converted with ``tolist()`` so that JSON can hold them.
		serialized_embeddings = {}
		for chunk_id, embedding in self.embeddings.items():
			serialized_embeddings[str(chunk_id)] = embedding.tolist()

		payload = {
			"embeddings": serialized_embeddings,
			"texts": self.texts,
		}
		with open(self.database_file, "w") as handle:
			handle.write(json.dumps(payload))

	def load_from_json(self):
		"""Restore embeddings and chunk texts from ``self.database_file``.

		Stored embedding lists are turned back into tensors, and the
		instance is marked ready once both mappings are loaded.
		"""
		import torch

		ASCIIColors.info("Loading vectorized documents")
		with open(self.database_file, "r") as handle:
			saved_state = json.load(handle)

		restored_embeddings = {}
		for chunk_id, values in saved_state["embeddings"].items():
			restored_embeddings[chunk_id] = torch.tensor(values)

		self.embeddings = restored_embeddings
		self.texts = saved_state["texts"]
		self.ready = True


	class Processor(APScript):
	"""
	A class that processes model inputs and outputs.

	Inherits from APScript.
	"""

	def __init__(self, personality: AIPersonality) -> None:
		"""Declare the personality settings and open the vector store.

		Args:
			personality: The personality this script belongs to; its name,
				``config["ctx_size"]`` and ``lollms_paths`` are read here.
		"""
		self.word_callback = None

		# Upper bound shared by every size-like setting.
		context_size = personality.config["ctx_size"]
		template_entries = [
			{"name":"database_path","type":"str","value":f"{personality.name}_db.json", "help":"Path to the database"},
			{"name":"max_chunk_size","type":"int","value":512, "min":10, "max":context_size,"help":"Maximum size of text chunks to vectorize"},
			{"name":"chunk_overlap","type":"int","value":20, "min":0, "max":context_size,"help":"Overlap between chunks"},
			{"name":"max_answer_size","type":"int","value":512, "min":10, "max":context_size,"help":"Maximum number of tokens to allow the generator to generate as an answer to your question"},
			{"name":"visualize_data_at_startup","type":"bool","value":False, "help":"If true, the database will be visualized at startup"},
			{"name":"visualize_data_at_add_file","type":"bool","value":False, "help":"If true, the database will be visualized when a new file is added"},
			{"name":"visualize_data_at_generate","type":"bool","value":False, "help":"If true, the database will be visualized at generation time"},
		]
		personality_config_template = ConfigTemplate(template_entries)
		personality_config = TypedConfig(
			personality_config_template,
			BaseConfig.from_template(personality_config_template),
		)
		super().__init__(personality, personality_config)

		self.state = 0
		self.ready = False
		self.personality = personality
		self.callback = None

		settings = self.personality_config
		self.vector_store = TextVectorizer(
			"bert-base-uncased",
			self.personality.lollms_paths.personal_data_path/settings["database_path"],
			visualize_data_at_startup=settings["visualize_data_at_startup"],
			visualize_data_at_add_file=settings["visualize_data_at_add_file"],
			visualize_data_at_generate=settings["visualize_data_at_generate"],
		)
		# A store that already holds embeddings can answer questions right away.
		if self.vector_store.embeddings:
			self.ready = True


	@staticmethod
	def read_pdf_file(file_path):
		"""Return the text of every page of the PDF at ``file_path``, concatenated."""
		import PyPDF2

		with open(file_path, 'rb') as pdf_stream:
			reader = PyPDF2.PdfReader(pdf_stream)
			page_texts = [page.extract_text() for page in reader.pages]
		return "".join(page_texts)

	@staticmethod
	def read_docx_file(file_path):
		"""Return the paragraphs of the .docx file at ``file_path``, one per line."""
		from docx import Document

		paragraphs = Document(file_path).paragraphs
		# Every paragraph, including the last one, is followed by a newline.
		return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)

	@staticmethod
	def read_json_file(file_path):
	with open(file_path, 'r') as file:
	data = json.load(file)
	return data

	@staticmethod
	def read_csv_file(file_path):
	import csv
	with open(file_path, 'r') as file:
	csv_reader = csv.reader(file)
	lines = [row for row in csv_reader]
	return lines

	@staticmethod
	def read_html_file(file_path):
		"""Return the text content of the HTML file at ``file_path``, without markup."""
		from bs4 import BeautifulSoup

		with open(file_path, 'r') as html_file:
			parsed_page = BeautifulSoup(html_file, 'html.parser')
		return parsed_page.get_text()
	@staticmethod
	def read_pptx_file(file_path):
		"""Return the text of every text frame in the presentation at ``file_path``.

		Each paragraph becomes one line of the result, so words from
		neighbouring paragraphs or shapes are not fused together (the same
		convention ``read_docx_file`` uses).
		"""
		from pptx import Presentation

		prs = Presentation(file_path)
		lines = []
		for slide in prs.slides:
			for shape in slide.shapes:
				# Pictures, charts and similar shapes carry no text frame.
				if not shape.has_text_frame:
					continue
				for paragraph in shape.text_frame.paragraphs:
					lines.append("".join(run.text for run in paragraph.runs) + "\n")
		return "".join(lines)
	@staticmethod
	def read_text_file(file_path):
	with open(file_path, 'r', encoding='utf-8') as file:
	content = file.read()
	return content

	@staticmethod
	def _content_to_text(content):
		"""Flatten the result of a file reader into one string for the vectorizer."""
		if isinstance(content, str):
			return content
		# CSV readers return a list of rows; keep one row per line.
		if isinstance(content, list) and all(isinstance(row, list) for row in content):
			return "\n".join(", ".join(str(cell) for cell in row) for row in content)
		# Parsed JSON (dict, list, scalar) is indexed in its serialised form.
		return json.dumps(content, ensure_ascii=False)

	def build_db(self):
		"""Vectorize every file in ``self.files`` into the vector store.

		The reader is chosen from the file suffix (case-insensitive); unknown
		suffixes are read as UTF-8 text. A failure on one file is reported
		and does not stop the remaining files. ``self.ready`` is set once a
		file has been indexed without error.
		"""
		ASCIIColors.info("-> Vectorizing the database"+ASCIIColors.color_orange)
		if self.callback is not None:
			self.callback("Vectorizing the database", MSG_TYPE.MSG_TYPE_CHUNK)

		# The settings do not change between files, so read them once. The
		# template declares the chunk size under "max_chunk_size".
		try:
			chunk_size = int(self.personality_config["max_chunk_size"])
		except Exception:
			ASCIIColors.warning("Couldn't read chunk size. Verify your configuration file")
			chunk_size = 512
		try:
			overlap_size = int(self.personality_config["chunk_overlap"])
		except Exception:
			ASCIIColors.warning("Couldn't read chunk overlap. Verify your configuration file")
			overlap_size = 50

		readers = {
			".pdf": Processor.read_pdf_file,
			".docx": Processor.read_docx_file,
			".pptx": Processor.read_pptx_file,
			".json": Processor.read_json_file,
			".csv": Processor.read_csv_file,
			".html": Processor.read_html_file,
		}

		for file in self.files:
			try:
				reader = readers.get(Path(file).suffix.lower(), Processor.read_text_file)
				# JSON and CSV readers return parsed data, not a string.
				text = Processor._content_to_text(reader(file))

				self.vector_store.index_document(file, text, chunk_size=chunk_size, overlap_size=overlap_size)

				print(ASCIIColors.color_reset)
				ASCIIColors.success(f"File {file} vectorized successfully")
				self.ready = True
			except Exception as ex:
				ASCIIColors.error(f"Couldn't vectorize {file}: The vectorizer threw this exception:{ex}")

	def add_file(self, path):
		"""Register ``path`` with the base class and rebuild the vector database.

		Args:
			path: Path of the file to add.

		Returns:
			True when the rebuild ran without raising, False otherwise.
		"""
		super().add_file(path)
		try:
			self.build_db()
		except Exception as ex:
			ASCIIColors.error(f"Couldn't vectorize the database: The vectorizer threw this exception: {ex}")
			return False
		# build_db reports per-file failures without raising, so readiness is
		# derived from what the store actually holds.
		self.ready = len(self.vector_store.embeddings) > 0
		return True

	def _build_vector_store(self):
		"""Create a TextVectorizer bound to the currently configured database file."""
		return TextVectorizer(
			"bert-base-uncased",
			self.personality.lollms_paths.personal_data_path/self.personality_config["database_path"],
			visualize_data_at_startup=self.personality_config["visualize_data_at_startup"],
			visualize_data_at_add_file=self.personality_config["visualize_data_at_add_file"],
			visualize_data_at_generate=self.personality_config["visualize_data_at_generate"]
		)

	def run_workflow(self, prompt, previous_discussion_text="", callback=None):
		"""Handle one user message: a command, a pending path, or a question.

		Recognised commands (case-insensitive, surrounding whitespace ignored):
		``send_file``, ``help``, ``show_database``, ``set_database`` and
		``clear_database``. After ``send_file`` or ``set_database`` the next
		message is taken as the requested path. Any other message is answered
		from the three most similar document chunks.

		Args:
			prompt (str): The user message.
			previous_discussion_text (str, optional): The text of the previous
				discussion. Not used by this workflow.
			callback (callable, optional): Called as ``callback(text, msg_type)``
				to report progress and results.

		Returns:
			str: The text produced for this message (may be empty).
		"""
		# State machine: 0 = idle, 1 = waiting for a file path,
		# 2 = waiting for a database path.
		output = ""
		self.callback = callback
		command = prompt.strip().lower()

		if command == "send_file":
			self.state = 1
			print("Please provide the file name")
			if callback is not None:
				callback("Please provide the file path", MSG_TYPE.MSG_TYPE_FULL)
			output = "Please provide the file name"
		elif command == "help":
			if callback:
				callback(self.personality.help, MSG_TYPE.MSG_TYPE_FULL)
			# Log the personality's help text, not the ``help`` builtin.
			ASCIIColors.info(self.personality.help)
			self.state = 0
		elif command == "show_database":
			try:
				self.vector_store.show_document()
			except Exception as ex:
				if callback is not None:
					callback(f"Couldn't show the database\nMake sure you have already uploaded a database.\nReceived exception is: {ex}", MSG_TYPE.MSG_TYPE_FULL)
			self.state = 0
		elif command == "set_database":
			print("Please provide the database file name")
			if callback is not None:
				callback("Please provide the database file path", MSG_TYPE.MSG_TYPE_FULL)
			output = "Please provide the database file name"
			self.state = 2
		elif command == "clear_database":
			database_full_path: Path = self.personality.lollms_paths.personal_data_path/self.personality_config["database_path"]
			if database_full_path.exists():
				database_full_path.unlink()
				self.vector_store = self._build_vector_store()
				# The fresh store is empty, so there is nothing left to discuss.
				self.ready = len(self.vector_store.embeddings) > 0
				if callback is not None:
					callback("Database file cleared successfully", MSG_TYPE.MSG_TYPE_FULL)
			else:
				if callback is not None:
					callback("The database file does not exist yet, so you can't clear it", MSG_TYPE.MSG_TYPE_FULL)
			self.state = 0
		elif self.state == 1:
			try:
				added = self.add_file(prompt)
			except Exception as ex:
				ASCIIColors.error(f"Exception: {ex}")
				if callback is not None:
					callback(f"Couldn't load file {prompt}.\nThe following exception was thrown: {ex}", MSG_TYPE.MSG_TYPE_FULL)
				output = str(ex)
			else:
				# add_file reports failure through its return value.
				if added:
					if callback is not None:
						callback(f"File {prompt} added successfully", MSG_TYPE.MSG_TYPE_FULL)
				else:
					output = f"Couldn't load file {prompt}."
					if callback is not None:
						callback(output, MSG_TYPE.MSG_TYPE_FULL)
			self.state = 0
		elif self.state == 2:
			try:
				# Check the same location the vector store will load from.
				new_db_path = self.personality.lollms_paths.personal_data_path/prompt
				if new_db_path.exists():
					self.personality_config["database_path"] = prompt
					self.personality_config.save()
					self.vector_store = self._build_vector_store()
					self.ready = len(self.vector_store.embeddings) > 0

					self.save_config_file(self.personality.lollms_paths.personal_configuration_path/f"personality_{self.personality.name}.yaml", self.personality_config)
				else:
					output = "Database file not found.\nGoing back to default state."
			except Exception as ex:
				ASCIIColors.error(f"Exception: {ex}")
				output = str(ex)
			self.state = 0
		else:
			if not self.ready:
				output = "No data to discuss. Please upload a document first"
				ASCIIColors.error(output)
				# Tell the user as well, not only the console.
				if callback is not None:
					callback(output, MSG_TYPE.MSG_TYPE_FULL)
			else:
				docs = self.vector_store.recover_text(self.vector_store.embed_query(prompt), top_k=3)
				docs = '\n'.join([f"Doc{i}:\n{v}" for i, v in enumerate(docs)])
				full_text = self.personality.personality_conditioning+"\n### Docs:\n"+docs+"\n### Question: "+prompt+"\n### Answer:"
				ASCIIColors.blue("-------------- Documentation -----------------------")
				ASCIIColors.blue(full_text)
				ASCIIColors.blue("----------------------------------------------------")
				ASCIIColors.blue("Thinking")
				if callback is not None:
					callback("Thinking", MSG_TYPE.MSG_TYPE_FULL)
				output = self.generate(full_text, self.personality_config["max_answer_size"])
				if callback is not None:
					callback(output, MSG_TYPE.MSG_TYPE_FULL)
		return output