File size: 5,281 Bytes
7f7b773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d872c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f7b773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import logging
from pathlib import Path
import cmd
import shlex
import hydra
from omegaconf import DictConfig, OmegaConf
from art import tprint
import utils

log = logging.getLogger(__name__)

class CLIApp(cmd.Cmd):
    """Interactive shell around an ``App`` instance.

    Recognized commands are handled by ``do_*`` methods; any other input is
    treated as a free-form question and forwarded to the app's QA model via
    :meth:`default`.
    """

    class CleanExit:
        """Context manager that turns a Ctrl-C inside the cmd loop into a clean exit."""

        def __init__(self, cliapp):
            self.cliapp = cliapp  # the CLIApp whose do_exit we invoke on interrupt

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, exc_tb):
            if exc_type is KeyboardInterrupt:
                print("\n", end="")
                self.cliapp.do_exit(None)
                return True  # suppress the KeyboardInterrupt
            # No exception -> True (nothing to suppress); any other exception -> False (propagate).
            return exc_type is None

    prompt = '> '
    intro = """Running in interactive mode:
Welcome to the LLM4SciLit shell. Type help or ? to list commands.\n"""

    def __init__(self, app, cfg : DictConfig) -> None:
        super().__init__()
        self.app = app  # owning App; provides qa_model and exit()
        self.cfg = cfg

    def do_exit(self, _):
        """Exit the shell."""
        # self.app.vector_store.save(self.cfg.storage_path.vector_store)
        print("\nLLM4SciLit: Bye!\n")
        self.app.exit()
        return True
    do_EOF = do_exit  # Ctrl-D behaves like `exit`

    def do_ask_paper(self, line):
        """Ask a question about a specific paper.

        Usage: ask_paper "<paper title>" <question ...>

        The title must be quoted if it contains spaces; the question may be
        quoted or left as free text. Malformed input is reported instead of
        crashing the command loop.
        """
        try:
            # First token is the paper title; everything after it is the question.
            paper, *rest = shlex.split(line)
        except ValueError as err:  # e.g. unbalanced quotes, or no tokens at all
            print(f"\nLLM4SciLit: could not parse input ({err}). "
                  "Usage: ask_paper \"<paper title>\" <question>\n")
            return
        if not rest:
            print("\nLLM4SciLit: missing question. "
                  "Usage: ask_paper \"<paper title>\" <question>\n")
            return
        question = " ".join(rest)
        filter_dict = {"paper_title": paper}
        print(f"\nLLM4SciLit: {self.app.qa_model.answer_question(question, filter_dict)['result']}\n")

    def default(self, line):
        """Treat any unrecognized input as a general question with no filter."""
        print(f"\nLLM4SciLit: {self.app.qa_model.answer_question(line, {})['result']}\n")


class App:
    """Wires together the retrieval/QA pipeline described by the Hydra config.

    Components (loader, splitter, embedding model, vector store, retriever,
    QA model) are instantiated from ``cfg`` in ``__init__``; heavy state
    (the vector store contents) is built or loaded lazily in ``_bootstrap``.
    """

    def __init__(self, cfg : DictConfig) -> None:
        self.cfg = cfg

        # Each component is constructed from its own config node; the later
        # ones receive the earlier ones as positional dependencies.
        log.info("Loading: Document Loader")
        self.loader = hydra.utils.instantiate(cfg.document_loader)
        log.info("Loading: Text Splitter")
        self.splitter = hydra.utils.instantiate(cfg.text_splitter)
        log.info("Loading: Text Embedding Model")
        self.text_embedding_model = hydra.utils.instantiate(cfg.text_embedding)
        log.info("Loading: Vector Store")
        self.vector_store = hydra.utils.instantiate(cfg.vector_store, self.text_embedding_model)
        log.info("Loading: Document Retriever")
        self.retriever = hydra.utils.instantiate(cfg.document_retriever, self.vector_store)
        log.info("Loading: Question Answering Model")
        self.qa_model = hydra.utils.instantiate(cfg.question_answering, self.retriever)

    def _bootstrap(self) -> None:
        """Build the vector store from documents, or load it from disk if present.

        A rebuild happens when the store file is missing or when
        ``cfg.debug.force_rebuild_storage`` is set. Afterwards the retriever
        and QA model are initialized.
        """
        if not Path(self.cfg.storage_path.vector_store).exists() or self.cfg.debug.force_rebuild_storage:
            message = (
                "Vector store not found at %s. Building storage from scratch"
                if not self.cfg.debug.force_rebuild_storage
                else "Forced to rebuild storage. Building storage from scratch"
            )
            log.info(message, self.cfg.storage_path.vector_store)

            docs = self.loader.load_documents(self.cfg.storage_path.documents)
            docs = self.splitter.split_documents(docs)
            # Persist the split documents so the processed corpus can be inspected/reused.
            utils.save_docs_to_jsonl(docs, self.cfg.storage_path.documents_processed)

            self.vector_store.initialize_from_documents(docs)
            self.vector_store.save(self.cfg.storage_path.vector_store)
        else:
            log.info("Vector store found at %s. Loading existing storage", self.cfg.storage_path.vector_store)
            self.vector_store.initialize_from_file(self.cfg.storage_path.vector_store)

        self.retriever.initialize()
        self.qa_model.initialize()
        print("Ready to answer your questions 🔥🔥\n")


    ##################################################################################################
    # App functionalities

    def ask_paper(self, line):
        """Ask a question about a specific paper.

        ``line`` starts with the (quote-delimited, if multi-word) paper title,
        followed by the question. Malformed input is reported instead of
        raising, consistent with the CLI front end.
        """
        try:
            paper, *rest = shlex.split(line)
        except ValueError as err:  # e.g. unbalanced quotes, or no tokens at all
            print(f"\nLLM4SciLit: could not parse input ({err})\n")
            return
        if not rest:
            print("\nLLM4SciLit: missing question after the paper title\n")
            return
        question = " ".join(rest)
        filter_dict = {"paper_title": paper}
        print(f"\nLLM4SciLit: {self.qa_model.answer_question(question, filter_dict)['result']}\n")

    def ask(self, line):
        """Ask a general question (no paper filter) and print the answer."""
        print(f"\nLLM4SciLit: {self.qa_model.answer_question(line, {})['result']}\n")

    def ask_chat(self, line, history):
        """Answer a question and return the result string.

        ``history`` is accepted for chat-UI callback compatibility but is not
        used by the underlying QA call.
        """
        return self.qa_model.answer_question(line, {})['result']


    ##################################################################################################
    # App modes

    def run_interactive(self) -> None:
        """Bootstrap the pipeline and run the interactive shell until exit."""
        self._bootstrap()
        cli = CLIApp(self, self.cfg)
        with CLIApp.CleanExit(cli):
            cli.cmdloop()

    def exit(self):
        """Hook for cleanup on shutdown; currently a deliberate no-op."""

@hydra.main(version_base=None, config_path="../config", config_name="config")
def main(cfg : DictConfig) -> None:
    """Program entry point: print the banner, optionally dump the resolved
    config, build the App, and dispatch on the configured mode."""
    tprint("LLM4SciLit")

    if cfg.debug.is_debug:
        print("Running with config:")
        print(OmegaConf.to_yaml(cfg))

    app = App(cfg)
    if cfg.mode == "interactive":
        app.run_interactive()
    else:
        raise ValueError(f"Unknown mode: {cfg.mode}")

# Hydra's decorator supplies `cfg` at call time; pylint cannot see that,
# hence the no-value-for-parameter disable.
if __name__ == "__main__":
    main() # pylint: disable=E1120:no-value-for-parameter