File size: 23,264 Bytes
453b8b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
from lollms.config import TypedConfig, BaseConfig, ConfigTemplate, InstallOption
from lollms.types import MSG_TYPE
from lollms.personality import APScript, AIPersonality
from lollms.helpers import ASCIIColors

import numpy as np
import json
from pathlib import Path
import numpy as np
import json

class TextVectorizer:
    def __init__(self, model_name, database_file:Path|str, visualize_data_at_startup=False, visualize_data_at_add_file=False, visualize_data_at_generate=False):
        from transformers import AutoTokenizer, AutoModel

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.embeddings = {}
        self.texts = {}
        self.ready = False
        self.database_file = Path(database_file)
        self.visualize_data_at_startup  = visualize_data_at_startup
        self.visualize_data_at_add_file = visualize_data_at_add_file
        self.visualize_data_at_generate = visualize_data_at_generate

        # Load previous state from the JSON file
        if Path(self.database_file).exists():
            ASCIIColors.success(f"Database file found : {self.database_file}")
            self.load_from_json()
            if visualize_data_at_startup:
                self.show_document()
            self.ready = True
        else:
            ASCIIColors.info(f"No database file found : {self.database_file}")

                
    def show_document(self, query_text="What is the main idea of this text?", use_pca=True):
        import textwrap
        import seaborn as sns
        import matplotlib.pyplot as plt
        import mplcursors
        from tkinter import Tk, Text, Scrollbar, Frame, Label, TOP, BOTH, RIGHT, LEFT, Y, N, END

        
        from sklearn.manifold import TSNE
        from sklearn.decomposition import PCA
        import torch
        
        if use_pca:
            print("Showing pca representation :")
        else:
            print("Showing t-sne representation :")
        texts = list(self.texts.values())
        embeddings = torch.stack(list(self.embeddings.values())).detach().squeeze(1).numpy()
        # Normalize embeddings
        norms = np.linalg.norm(embeddings, axis=1)
        normalized_embeddings = embeddings / norms[:, np.newaxis]

        # Embed the query text
        query_embedding = self.embed_query(query_text)
        query_embedding = query_embedding.detach().squeeze().numpy()
        query_normalized_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Combine the query embedding with the document embeddings
        combined_embeddings = np.vstack((normalized_embeddings, query_normalized_embedding))

        if use_pca:
            # Use PCA for dimensionality reduction
            pca = PCA(n_components=2)
            embeddings_2d = pca.fit_transform(combined_embeddings)
        else:
            # Use t-SNE for dimensionality reduction
            # Adjust the perplexity value
            perplexity = min(30, combined_embeddings.shape[0] - 1)
            tsne = TSNE(n_components=2, perplexity=perplexity)
            embeddings_2d = tsne.fit_transform(combined_embeddings)


        # Create a scatter plot using Seaborn
        sns.scatterplot(x=embeddings_2d[:-1, 0], y=embeddings_2d[:-1, 1])  # Plot document embeddings
        plt.scatter(embeddings_2d[-1, 0], embeddings_2d[-1, 1], color='red')  # Plot query embedding

        # Add labels to the scatter plot
        for i, (x, y) in enumerate(embeddings_2d[:-1]):
            plt.text(x, y, str(i), fontsize=8)

        plt.xlabel('Dimension 1')
        plt.ylabel('Dimension 2')
        if use_pca:      
            plt.title('Embeddings Scatter Plot based on PCA')
        else:
            plt.title('Embeddings Scatter Plot based on t-SNE')
        # Enable mplcursors to show tooltips on hover
        cursor = mplcursors.cursor(hover=True)

        # Define the hover event handler
        @cursor.connect("add")
        def on_hover(sel):
            index = sel.target.index
            if index > 0:
                text = texts[index]
                wrapped_text = textwrap.fill(text, width=50)  # Wrap the text into multiple lines
                sel.annotation.set_text(f"Index: {index}\nText:\n{wrapped_text}")
            else:
                sel.annotation.set_text("Query")

        # Define the click event handler using matplotlib event handling mechanism
        def on_click(event):
            if event.xdata is not None and event.ydata is not None:
                x, y = event.xdata, event.ydata
                distances = ((embeddings_2d[:, 0] - x) ** 2 + (embeddings_2d[:, 1] - y) ** 2)
                index = distances.argmin()
                text = texts[index] if index < len(texts) else query_text

                # Open a new Tkinter window with the content of the text
                root = Tk()
                root.title(f"Text for Index {index}")
                frame = Frame(root)
                frame.pack(fill=BOTH, expand=True)

                label = Label(frame, text="Text:")
                label.pack(side=TOP, padx=5, pady=5)

                text_box = Text(frame)
                text_box.pack(side=TOP, padx=5, pady=5, fill=BOTH, expand=True)
                text_box.insert(END, text)

                scrollbar = Scrollbar(frame)
                scrollbar.pack(side=RIGHT, fill=Y)
                scrollbar.config(command=text_box.yview)
                text_box.config(yscrollcommand=scrollbar.set)

                text_box.config(state="disabled")

                root.mainloop()

        # Connect the click event handler to the figure
        plt.gcf().canvas.mpl_connect("button_press_event", on_click)
        plt.show()
        
    def index_document(self, document_id, text, chunk_size, overlap_size, force_vectorize=False):
        import torch

        if document_id in self.embeddings and not force_vectorize:
            print(f"Document {document_id} already exists. Skipping vectorization.")
            return

        # Tokenize text
        tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_attention_mask=False)['input_ids']

        # Split tokens into sentences
        sentences = self.tokenizer.decode(tokens).split('. ')

        # Generate chunks with overlap and sentence boundaries
        chunks = []
        current_chunk = []
        for sentence in sentences:
            sentence_tokens = self.tokenizer.encode_plus(sentence, add_special_tokens=False, return_attention_mask=False)['input_ids']
            if len(current_chunk) + len(sentence_tokens) <= chunk_size:
                current_chunk.extend(sentence_tokens)
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence_tokens

        if current_chunk:
            chunks.append(current_chunk)

        # Generate overlapping chunks
        overlapping_chunks = []
        for i in range(len(chunks)):
            chunk_start = i * (chunk_size - overlap_size)
            chunk_end = min(chunk_start + chunk_size, len(tokens))
            chunk = tokens[chunk_start:chunk_end]
            overlapping_chunks.append(chunk)

        # Generate embeddings for each chunk
        for i, chunk in enumerate(overlapping_chunks):
            # Pad the chunk if it is smaller than chunk_size
            if len(chunk) < chunk_size:
                padding = [self.tokenizer.pad_token_id] * (chunk_size - len(chunk))
                chunk.extend(padding)

            # Convert tokens to IDs
            input_ids = chunk[:chunk_size]

            # Convert input to PyTorch tensor
            input_tensor = torch.tensor([input_ids])

            # Generate chunk embedding
            with torch.no_grad():
                self.model.eval()
                outputs = self.model(input_tensor)
                embeddings = outputs.last_hidden_state.mean(dim=1)

            # Store chunk ID, embedding, and original text
            chunk_id = f"{document_id}_chunk_{i + 1}"
            self.embeddings[chunk_id] = embeddings
            self.texts[chunk_id] = self.tokenizer.decode(chunk[:chunk_size], skip_special_tokens=True)

        self.save_to_json()
        self.ready = True
        if self.visualize_data_at_add_file:
            self.show_document()


    def embed_query(self, query_text):
        import torch
      
        # Tokenize query text
        query_tokens = self.tokenizer.encode(query_text)

        # Convert input to PyTorch tensor
        query_input_tensor = torch.tensor([query_tokens])

        # Generate query embedding
        with torch.no_grad():
            self.model.eval()
            query_outputs = self.model(query_input_tensor)
            query_embedding = query_outputs.last_hidden_state.mean(dim=1)

        return query_embedding

    def recover_text(self, query_embedding, top_k=1):
        from sklearn.metrics.pairwise import cosine_similarity
        similarities = {}
        for chunk_id, chunk_embedding in self.embeddings.items():
            similarity = cosine_similarity(query_embedding.numpy(), chunk_embedding.numpy())[0][0]
            similarities[chunk_id] = similarity

        # Sort the similarities and retrieve the top-k most similar embeddings
        sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_k]

        # Retrieve the original text associated with the most similar embeddings
        texts = [self.texts[chunk_id] for chunk_id, _ in sorted_similarities]

        if self.visualize_data_at_generate:
            self.show_document()

        return texts

    def save_to_json(self):
        state = {
            "embeddings": {str(k): v.tolist() for k, v in self.embeddings.items()},
            "texts": self.texts,
        }
        with open(self.database_file, "w") as f:
            json.dump(state, f)

    def load_from_json(self):
        import torch

        ASCIIColors.info("Loading vectorized documents")
        with open(self.database_file, "r") as f:
            state = json.load(f)
            self.embeddings = {k: torch.tensor(v) for k, v in state["embeddings"].items()}
            self.texts = state["texts"]
            self.ready = True


class Processor(APScript):
    """
    Personality script that answers questions about user supplied documents.

    Documents are vectorized with a ``TextVectorizer``; for each question the most
    similar chunks are retrieved and injected into the prompt sent to the generator.

    Inherits from APScript.
    """

    def __init__(
                 self, 
                 personality: AIPersonality
                ) -> None:
        """Build the personality configuration and load the vector database."""
        self.word_callback = None    

        personality_config_template = ConfigTemplate(
            [
                {"name":"database_path","type":"str","value":f"{personality.name}_db.json", "help":"Path to the database"},
                {"name":"max_chunk_size","type":"int","value":512, "min":10, "max":personality.config["ctx_size"],"help":"Maximum size of text chunks to vectorize"},
                {"name":"chunk_overlap","type":"int","value":20, "min":0, "max":personality.config["ctx_size"],"help":"Overlap between chunks"},
                
                {"name":"max_answer_size","type":"int","value":512, "min":10, "max":personality.config["ctx_size"],"help":"Maximum number of tokens to allow the generator to generate as an answer to your question"},
                
                {"name":"visualize_data_at_startup","type":"bool","value":False, "help":"If true, the database will be visualized at startup"},
                {"name":"visualize_data_at_add_file","type":"bool","value":False, "help":"If true, the database will be visualized when a new file is added"},
                {"name":"visualize_data_at_generate","type":"bool","value":False, "help":"If true, the database will be visualized at generation time"},
            ]
            )
        personality_config_vals = BaseConfig.from_template(personality_config_template)

        personality_config = TypedConfig(
            personality_config_template,
            personality_config_vals
        )
        super().__init__(
                            personality,
                            personality_config
                        )
        # 0: answering questions, 1: waiting for a file path, 2: waiting for a database path
        self.state = 0
        self.personality = personality
        self.callback = None
        self.vector_store = self._build_vector_store()
        # Questions can only be answered once at least one chunk is available
        self.ready = len(self.vector_store.embeddings) > 0

    def _build_vector_store(self):
        """Create a TextVectorizer from the current personality configuration."""
        return TextVectorizer(
            "bert-base-uncased",
            self.personality.lollms_paths.personal_data_path/self.personality_config["database_path"],
            visualize_data_at_startup=self.personality_config["visualize_data_at_startup"],
            visualize_data_at_add_file=self.personality_config["visualize_data_at_add_file"],
            visualize_data_at_generate=self.personality_config["visualize_data_at_generate"]
        )

    def _read_int_setting(self, key, default, label):
        """Return configuration entry ``key`` as an int, or ``default`` (with a warning) when unreadable."""
        try:
            return int(self.personality_config[key])
        except Exception:
            ASCIIColors.warning(f"Couldn't read {label}. Verify your configuration file")
            return default

    @staticmethod
    def read_pdf_file(file_path):
        """Return the concatenated text of every page of a PDF file."""
        import PyPDF2
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text may yield nothing for a page without a text layer
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        return text

    @staticmethod
    def read_docx_file(file_path):
        """Return the paragraphs of a .docx file, one per line."""
        from docx import Document
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    @staticmethod
    def read_json_file(file_path):
        """Return the parsed content of a JSON file."""
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    
    @staticmethod
    def read_csv_file(file_path):
        """Return the rows of a CSV file as a list of lists of strings."""
        import csv
        # newline='' lets the csv module handle line endings embedded in quoted fields
        with open(file_path, 'r', newline='') as file:
            lines = list(csv.reader(file))
        return lines    

    @staticmethod
    def read_html_file(file_path):
        """Return the visible text of an HTML file."""
        from bs4 import BeautifulSoup
        # Binary mode lets BeautifulSoup detect the document encoding itself
        with open(file_path, 'rb') as file:
            soup = BeautifulSoup(file, 'html.parser')
            text = soup.get_text()
        return text

    @staticmethod
    def read_pptx_file(file_path):
        """Return the text of every text frame of a .pptx file, one paragraph per line."""
        from pptx import Presentation
        prs = Presentation(file_path)
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        # Runs belong to the same paragraph; paragraphs are kept apart
                        # so that words of different paragraphs are not glued together.
                        lines.append("".join(run.text for run in paragraph.runs) + "\n")
        return "".join(lines)

    @staticmethod
    def read_text_file(file_path):
        """Return the content of a UTF-8 text file."""
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        return content

    @staticmethod
    def _load_text(file):
        """Read ``file`` with the reader matching its extension and return its content as a string."""
        suffix = Path(file).suffix.lower()
        if suffix == ".pdf":
            return Processor.read_pdf_file(file)
        if suffix == ".docx":
            return Processor.read_docx_file(file)
        if suffix == ".pptx":
            return Processor.read_pptx_file(file)
        if suffix == ".json":
            # The tokenizer needs text, not parsed objects
            return json.dumps(Processor.read_json_file(file), ensure_ascii=False)
        if suffix == ".csv":
            return "\n".join(", ".join(row) for row in Processor.read_csv_file(file))
        if suffix == ".html":
            return Processor.read_html_file(file)
        return Processor.read_text_file(file)
    
    def build_db(self):
        """
        Vectorize every file of ``self.files`` into the vector store.

        A failure on one file is logged and does not prevent the other files from
        being processed.

        Returns:
            bool: True when every file was vectorized without error.
        """
        ASCIIColors.info("-> Vectorizing the database"+ASCIIColors.color_orange)
        if self.callback is not None:
            self.callback("Vectorizing the database", MSG_TYPE.MSG_TYPE_CHUNK)

        # The settings do not depend on the file, so they are read once
        chunk_size = self._read_int_setting("max_chunk_size", 512, "chunk size")
        overlap_size = self._read_int_setting("chunk_overlap", 50, "chunk overlap")

        all_vectorized = True
        for file in self.files:
            try:
                text = self._load_text(file)
                self.vector_store.index_document(file, text, chunk_size=chunk_size, overlap_size=overlap_size)
                
                print(ASCIIColors.color_reset)
                ASCIIColors.success(f"File {file} vectorized successfully")
                self.ready = True
            except Exception as ex:
                all_vectorized = False
                ASCIIColors.error(f"Couldn't vectorize {file}: The vectorizer threw this exception:{ex}")
        return all_vectorized

    def add_file(self, path):
        """
        Register ``path`` and vectorize the registered files.

        Returns:
            bool: True when every registered file was vectorized, False otherwise.
        """
        super().add_file(path)
        try:
            return bool(self.build_db())
        except Exception as ex:
            ASCIIColors.error(f"Couldn't vectorize the database: The vectorizer threw this exception: {ex}")
            return False        

    def run_workflow(self, prompt, previous_discussion_text="", callback=None):
        """
        Handle one user message: either a command or a question about the documents.

        Recognized commands are ``send_file``, ``help``, ``show_database``,
        ``set_database`` and ``clear_database``. After ``send_file`` (resp.
        ``set_database``) the next message is interpreted as a file path (resp. a
        database path). Any other message is answered from the vectorized documents.

        Args:
            prompt (str): The user message.
            previous_discussion_text (str, optional): The text of the previous discussion. Not used here.
            callback (function, optional): Called with ``(text, MSG_TYPE)`` to report progress and results.

        Returns:
            str: The generated answer or a status message; may be empty.
        """
        # State machine
        output =""
        self.callback = callback
        command = prompt.strip().lower()

        def notify(message):
            # Forward a full message to the caller when a callback was supplied
            if callback is not None:
                callback(message, MSG_TYPE.MSG_TYPE_FULL)

        if command=="send_file":
            self.state = 1
            print("Please provide the file name")
            notify("Please provide the file path")
            output = "Please provide the file name"
        elif command=="help":
            if callback:
                callback(self.personality.help,MSG_TYPE.MSG_TYPE_FULL)
                ASCIIColors.info(self.personality.help)
            self.state = 0   
        elif command=="show_database":
            try:
                self.vector_store.show_document()
            except Exception as ex:
                notify(f"Couldn't show the database\nMake sure you have already uploaded a database.\nReceived exception is: {ex}")

            self.state = 0
            
        elif command=="set_database":
            print("Please provide the database file name")
            notify("Please provide the database file path")
            output = "Please provide the database file name"
            self.state = 2
        elif command=="clear_database":
            database_full_path:Path = self.personality.lollms_paths.personal_data_path/self.personality_config["database_path"]
            if database_full_path.exists():
                database_full_path.unlink()
                self.vector_store = self._build_vector_store()
                # The fresh store is empty, so there is nothing left to discuss
                self.ready = False
                # NOTE(review): self.files is left untouched here, so the next add_file
                # will vectorize the previously registered files again - confirm this is intended.
                notify("Database file cleared successfully")
            else:
                notify("The database file does not exist yet, so you can't clear it")
            self.state = 0
        else:
            if self.state ==1:
                file_path = prompt.strip()
                try:
                    if self.add_file(file_path):
                        notify(f"File {file_path} added successfully")
                    else:
                        output = f"Couldn't vectorize file {file_path}.\nCheck the console for details."
                        notify(output)
                except Exception as ex:
                    ASCIIColors.error(f"Exception: {ex}")
                    notify(f"Couldn't load file {file_path}.\nThe following exception was thrown: {ex}")
                    output = str(ex)
                self.state=0
            elif self.state ==2:
                try:
                    database_name = prompt.strip()
                    # Check the very path the vector store is going to open
                    new_db_path = self.personality.lollms_paths.personal_data_path/database_name
                    if new_db_path.exists():
                        self.personality_config["database_path"] = database_name
                        self.personality_config.save()
                        self.vector_store = self._build_vector_store()
                        self.ready = len(self.vector_store.embeddings) > 0
                        
                        self.save_config_file(self.personality.lollms_paths.personal_configuration_path/f"personality_{self.personality.name}.yaml", self.personality_config)
                        output = f"Database set to {new_db_path}"
                    else:
                        output = "Database file not found.\nGoing back to default state."
                    notify(output)
                except Exception as ex:
                    ASCIIColors.error(f"Exception: {ex}")
                    output = str(ex)
                    notify(output)
                self.state=0
            else:
                if not self.ready:
                    output = "No data to discuss. Please upload a document first"
                    ASCIIColors.error(output)
                    notify(output)
                else:
                    docs = self.vector_store.recover_text(self.vector_store.embed_query(prompt), top_k=3)
                    docs = '\n'.join([f"Doc{i}:\n{v}" for i,v in enumerate(docs)])
                    full_text = self.personality.personality_conditioning+"\n### Docs:\n"+docs+"\n### Question: "+prompt+"\n### Answer:"
                    ASCIIColors.blue("-------------- Documentation -----------------------")
                    ASCIIColors.blue(full_text)
                    ASCIIColors.blue("----------------------------------------------------")
                    ASCIIColors.blue("Thinking")
                    notify("Thinking")
                    output = self.generate(full_text, self.personality_config["max_answer_size"])
                    notify(output)
        return output