Spaces: Runtime error

add application files

- app.py +0 -0
- config.py +22 -0
- model/__init__.py +0 -0
- model/chat.py +23 -0
- model/controller.py +18 -0
- model/llm/llm.py +108 -0
- model/processor/case_crawler.py +113 -0
- model/processor/database_Chunker.ipynb +0 -0
- model/processor/law_provider.py +61 -0
- model/processor/pre_process.ipynb +0 -0
- model/processor/retrieval_rag_nlp_project.ipynb:Zone.Identifier +0 -0
- model/propmt/__init__.py +0 -0
- model/propmt/prompt_handler.py +16 -0
- model/rag/__init__.py +0 -0
- model/rag/rag_handler.py +102 -0
- requirements.txt +24 -0
app.py
ADDED
File without changes
config.py
ADDED
@@ -0,0 +1,22 @@

gpt_3_5 = "gpt-3.5-turbo-instruct"
gpt_mini = "gpt-4o-mini"

aval_ai = {
    "model": gpt_3_5,
    "base_url": "https://api.avalai.ir/v1",
}

GILAS_CONFIG = {
    "api_key": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwMzg5OTQ0NjgsImp0aSI6IjExNDg4MzAyMTE3NDA0MzY2ODc0NiIsImlhdCI6MTcyMzYzNDQ2OCwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyMzYzNDQ2OCwic3ViIjoiMTE0ODgzMDIxMTc0MDQzNjY4NzQ2In0.8hbh59BmwBcAfoH9nEB98_5BIuxzwUUb8fpHSKF1S_Q",
    "model": "gpt-4o-mini",
    "base_url": "https://api.gilas.io/v1",
}

OPENAI_CONFIG = {
    "model": gpt_mini,
}

LLM_CONFIG = aval_ai
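Since python-dotenv is already pinned in requirements.txt, a safer variant of this config would read the Gilas key from the environment instead of committing it. A minimal sketch, assuming a GILAS_API_KEY variable in a local .env file (the variable name is an assumption, not part of the original config):

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

GILAS_CONFIG = {
    # GILAS_API_KEY is an assumed variable name
    "api_key": os.environ["GILAS_API_KEY"],
    "model": "gpt-4o-mini",
    "base_url": "https://api.gilas.io/v1",
}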
model/__init__.py
ADDED
File without changes
model/chat.py
ADDED
@@ -0,0 +1,23 @@
from model.propmt.prompt_handler import *
from model.llm.llm import *
from model.rag.rag_handler import *
from config import *

class Chat:
    def __init__(self, chat_id, rag_handler) -> None:
        self.chat_id = chat_id
        self.message_history = []
        self.response_history = []
        self.prompt_handler = Prompt()
        self.llm = LLM_API_Call("gilas")
        self.rag_handler = rag_handler

    def response(self, message: str) -> str:
        self.message_history.append(message)

        info_list = self.rag_handler.get_information(message)
        prompt = self.prompt_handler.get_prompt(message, info_list)
        response = self.llm.get_LLM_response(prompt=prompt)

        self.response_history.append(response)
        return response
model/controller.py
ADDED
@@ -0,0 +1,18 @@
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from model.chat import *

class Controller:
    def __init__(self) -> None:
        self.chat_dic = {}
        self.rag_handler = RAG()

    def handle_message(self,
                       chat_id: int,
                       message: str) -> str:
        if chat_id not in self.chat_dic:
            self.chat_dic[chat_id] = Chat(chat_id=chat_id, rag_handler=self.rag_handler)
        chat = self.chat_dic[chat_id]
        return chat.response(message)
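app.py is added empty in this commit, and gradio is pinned in requirements.txt, so Controller is presumably meant to sit behind a Gradio UI. A minimal sketch of that wiring (an assumption, not the author's app.py):

import gradio as gr
from model.controller import Controller

controller = Controller()

def respond(message, history):
    # A single shared chat id; per-user ids would need session state
    return controller.handle_message(chat_id=0, message=message)

gr.ChatInterface(respond).launch()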
model/llm/llm.py
ADDED
@@ -0,0 +1,108 @@
from langchain_openai import OpenAI
import openai
import sys
import os
import requests


sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from config import *


class LLM_API_Call:

    def __init__(self, type) -> None:
        if type == "openai":
            # The config dict must define "api_key" for this branch
            # (the original read LLM_CONFIG[""])
            self.llm = OpenAI_API_Call(api_key=LLM_CONFIG["api_key"],
                                       model=LLM_CONFIG["model"])
        elif type == "gilas":
            self.llm = Gilas_API_Call(api_key=GILAS_CONFIG["api_key"],
                                      model=GILAS_CONFIG["model"],
                                      base_url=GILAS_CONFIG["base_url"])
        else:
            self.llm = OpenAI(**LLM_CONFIG)

    def get_LLM_response(self, prompt: str) -> str:
        return self.llm.invoke(prompt)


class OpenAI_API_Call:

    def __init__(self, api_key, model="gpt-4"):
        self.api_key = api_key
        openai.api_key = api_key
        self.model = model
        self.conversation = []

    def add_message(self, role, content):
        self.conversation.append({"role": role, "content": content})

    def get_response(self):
        response = openai.ChatCompletion.create(
            model=self.model,
            messages=self.conversation
        )
        return response['choices'][0]['message']['content']

    def invoke(self, user_input):
        self.add_message("user", user_input)
        response = self.get_response()
        self.add_message("assistant", response)
        return response


class Gilas_API_Call:
    def __init__(self, api_key, base_url, model="gpt-4o-mini"):
        self.api_key = api_key
        self.base_url = base_url
        self.model = model
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        self.conversation = []

    def add_message(self, role, content):
        self.conversation.append({"role": role, "content": content})

    def get_response(self):
        data = {
            "model": self.model,
            "messages": self.conversation
        }

        response = requests.post(
            url=f"{self.base_url}/chat/completions",
            headers=self.headers,
            json=data
        )

        # print(f"Response status code: {response.status_code}")
        # print(f"Response content: {response.text}")

        if response.status_code == 200:
            try:
                return response.json()['choices'][0]['message']['content']
            except (KeyError, IndexError, ValueError) as e:
                raise Exception(f"Unexpected API response format: {e}")
        else:
            raise Exception(f"Gilas API call failed: {response.status_code} - {response.text}")

    def invoke(self, user_input):
        self.add_message("user", user_input)
        response = self.get_response()
        self.add_message("assistant", response)
        return response


# test = LLM_API_Call(type="gilas")
# print(test.get_LLM_response("سلام"))
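Both wrapper classes keep the full message list in self.conversation, so repeated invoke calls behave as a stateful chat. A hypothetical round-trip, assuming a valid key in config.py (the question text is illustrative):

llm = LLM_API_Call("gilas")
print(llm.get_LLM_response(prompt="What does the rental law say about eviction?"))
# The reply above is stored as an "assistant" turn, so this follow-up
# is answered with the earlier exchange still in context:
print(llm.get_LLM_response(prompt="Summarize that in one sentence."))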
model/processor/case_crawler.py
ADDED
@@ -0,0 +1,113 @@
import requests
from bs4 import BeautifulSoup
import os
import warnings
from tqdm import tqdm

class Crawler:
    # Used to separate individual votes when they are concatenated into one string
    vote_splitter = " |split| "

    def __init__(self, base_url: str, list_url: str,
                 base_vote_url: str, models_path: str, result_path: str):
        if base_url == "":
            self.base_url = "https://ara.jri.ac.ir/"
        else:
            self.base_url = base_url

        if list_url == "":
            self.list_url = "https://ara.jri.ac.ir/Judge/Index"
        else:
            self.list_url = list_url

        if base_vote_url == "":
            self.base_vote_url = "https://ara.jri.ac.ir/Judge/Text/"
        else:
            self.base_vote_url = base_vote_url

        if models_path == "":
            self.models_path = "Models/"
        else:
            self.models_path = models_path
        self.pos_model_path = os.path.join(self.models_path, "postagger.model")
        self.chunker_path = os.path.join(self.models_path, "chunker.model")

        if result_path == "":
            self.result_path = "Resource/"
        else:
            self.result_path = result_path

        self.merges_vote_path = os.path.join(self.result_path, 'merged_vote.txt')
        self.clean_vote_path = os.path.join(self.result_path, 'clean_vote.txt')
        self.clean_vote_path_csv = os.path.join(self.result_path, 'clean_vote.csv')
        self.selected_vote_path = os.path.join(self.result_path, 'selected_vote.txt')
        self.law_list_path = os.path.join(self.result_path, 'law_list.txt')
        self.law_clean_list_path = os.path.join(self.result_path, 'law_clean_list.txt')
        self.vote_stop_path = os.path.join(self.result_path, "vote_stopwords.txt")
        self.law_stop_path = os.path.join(self.result_path, "law_stopwords.txt")

    @staticmethod
    def check_valid_vote(html_soup: BeautifulSoup) -> bool:
        # Extract the title to detect invalid votes
        h1_element = html_soup.find('h1', class_='Title3D')
        if h1_element is None:
            return False
        span_text = h1_element.find('span').text  # Text within the <span> tag
        full_text = h1_element.text  # Full text within the <h1> element
        text_after_span = full_text.split(span_text)[-1].strip()  # Text after the </span> tag
        return len(text_after_span) > 0

    @staticmethod
    def html_data_extractor(html_soup: BeautifulSoup, vote_splitter: str) -> str:
        vote_text = html_soup.find('div', id='treeText', class_='BackText')
        title = html_soup.find('h1', class_='Title3D')
        info = html_soup.find('td', valign="top", class_="font-size-small")
        # vote_splitter marks the boundary between votes in the output file
        vote_df = str(title) + str(info) + str(vote_text) + vote_splitter
        return vote_df

    def vote_crawler(self, start: int, end: int, separator: int):
        counter = 0  # Counts valid votes crawled so far
        result_list = []
        warnings.filterwarnings("ignore")
        # Request each vote page in the given id range
        for i in tqdm(range(start, end)):
            # Flush every `separator` records to a .txt file
            if (counter % separator == 0 and counter > 0) or i == end - 1:
                with open(os.path.join(self.result_path, f'vote{i}.txt'), "w", encoding='utf-8') as text_file:
                    text_file.write(''.join(result_list))
                result_list = []
            url = self.base_vote_url + f"{i}"
            response = requests.get(url, verify=False)
            # Force UTF-8 so the Persian text decodes correctly
            response.encoding = 'utf-8'
            resp_text = response.text
            html_soup = BeautifulSoup(resp_text, 'html.parser')
            if response.ok and self.check_valid_vote(html_soup):
                counter += 1
                vote_df = self.html_data_extractor(html_soup, self.vote_splitter)
                result_list.append(vote_df)

    def merge_out_txt(self) -> None:
        # Merge the per-batch vote .txt files into a single merged_vote.txt
        with open(self.merges_vote_path, 'w', encoding='utf-8') as outfile:
            for filename in os.listdir(self.result_path):
                if filename.startswith("vote") and filename.endswith('.txt'):  # Only merge vote .txt files
                    with open(os.path.join(self.result_path, filename), 'r', encoding='utf-8') as infile:
                        outfile.write(infile.read())

if __name__ == "__main__":
    models_path = input("Enter the models path (default = Models/): ")
    result_path = input("Enter the result path (default = Resource/): ")
    base_url = input("Enter the base URL (default = https://ara.jri.ac.ir/): ")
    list_url = input("Enter the list URL (default = https://ara.jri.ac.ir/Judge/Index): ")
    base_vote_url = input("Enter the base vote URL (default = https://ara.jri.ac.ir/Judge/Text/): ")

    crawler_instance = Crawler(models_path=models_path, result_path=result_path, base_url=base_url, list_url=list_url, base_vote_url=base_vote_url)
    start = int(input("Enter the start value for vote crawling: "))
    end = int(input("Enter the end value for vote crawling: "))
    separator = int(input("Enter the separator value for vote crawling: "))

    crawler_instance.vote_crawler(start=start, end=end, separator=separator)
    crawler_instance.merge_out_txt()
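A non-interactive equivalent of the __main__ block, passing empty strings to take every default; the vote id range and batch size below are illustrative only:

crawler = Crawler(base_url="", list_url="", base_vote_url="",
                  models_path="", result_path="")
# Fetch vote pages 1000-1099 from https://ara.jri.ac.ir/Judge/Text/<id>,
# flushing a vote<i>.txt batch to Resource/ every 25 valid votes
crawler.vote_crawler(start=1000, end=1100, separator=25)
crawler.merge_out_txt()  # concatenate the batches into merged_vote.txt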
model/processor/database_Chunker.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
model/processor/law_provider.py
ADDED
@@ -0,0 +1,61 @@
import pandas as pd
import re

class LawTxetPreProcessor():

    def __init__(self, law_texts: list) -> None:
        self._law_texets = law_texts
        self._law_name_df = pd.DataFrame(columns=["law_index", "law_name"])
        self._madeh_df = pd.DataFrame(columns=["law_index", "madeh_index", "madeh_text"])
        self._is_df = False

    def build_df(self):
        title_list = []
        madeh_list = []
        madeh_index = []
        law_index = []
        counter = 0
        for text in self._law_texets:
            title = self.title_extractor(text)
            title_list.append(title)
            # The Iranian Constitution labels its provisions "اصل" (principle)
            # rather than "ماده" (article), so it needs the alternate pattern
            temp_madeh_list = self.madeh_extractor(text, title == "قانون اساسی جمهوری اسلامی ایران")
            law_index.extend([counter for i in temp_madeh_list])
            madeh_index.extend([i + 1 for i in range(len(temp_madeh_list))])
            madeh_list.extend(temp_madeh_list)
            counter += 1
        law_index_list = [i for i in range(counter)]
        self._madeh_df = pd.DataFrame({"law_index": law_index,
                                       "madeh_index": madeh_index,
                                       "madeh_text": madeh_list})
        self._law_name_df = pd.DataFrame({"law_index": law_index_list,
                                          "law_name": title_list})
        self._is_df = True

    def title_extractor(self, law_text: str) -> str:
        first_newline_index = law_text.find('\n')
        return law_text[:first_newline_index]

    def madeh_extractor(self, law_text: str, is_asl: bool = False) -> list:
        result = []
        pattern = r"(^.{0,1}اصل )" if is_asl else r"(^.{0,1}ماده)"
        removed_regex = r"❯.*\n"
        notvalid_pattern = r"(^.{0,1}ماده.*مکرر\n)"
        cleaned_text = re.sub(removed_regex, "", law_text)
        matches = re.finditer(pattern, cleaned_text, flags=re.MULTILINE)
        not_valid_matches = re.finditer(notvalid_pattern, cleaned_text, flags=re.MULTILINE)
        indices = [match.start() for match in matches]
        not_valid_indices = [match.start() for match in not_valid_matches]
        valid_indices = [item for item in indices if item not in not_valid_indices]
        for i in range(len(valid_indices)):
            start = valid_indices[i]
            if i != len(valid_indices) - 1:
                end = valid_indices[i + 1]
                result.append(cleaned_text[start:end])
            else:
                result.append(cleaned_text[start:])
        return result

    def get_df(self) -> pd.DataFrame:
        if not self._is_df:
            self.build_df()
        return self._law_name_df, self._madeh_df
model/processor/pre_process.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
model/processor/retrieval_rag_nlp_project.ipynb:Zone.Identifier
ADDED
Binary file (27 Bytes).
model/propmt/__init__.py
ADDED
File without changes
model/propmt/prompt_handler.py
ADDED
@@ -0,0 +1,16 @@
from typing import List

class Prompt:

    def get_prompt(self, message: str, info_list: List) -> str:
        prompt = f"As a user, I want to ask you the following legal question:\n{message}\n\n"

        if info_list:
            prompt += "Here are some relevant legal cases and information you should consider:\n"
            for i, info in enumerate(info_list):
                prompt += f"case {i+1}:\n{info['title']}\n{info['text']}\n"

        prompt += ("\nBased on the provided information, please respond in Persian (Farsi) with a concise legal analysis. "
                   "Ensure that your response is as summarized and clear as possible. (one paragraph)")

        return prompt
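For reference, a hypothetical call with one retrieved case shows the prompt shape the LLM receives; the dict keys match what rag_handler.py returns, while the question and case content are illustrative:

p = Prompt()
info_list = [{"title": "Appeal ruling 123", "text": "…case body…"}]
print(p.get_prompt("Is a verbal lease agreement enforceable?", info_list))
# -> the question, then "case 1:" with title and text, then the instruction
#    to answer in Persian in one concise paragraph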
model/rag/__init__.py
ADDED
File without changes
model/rag/rag_handler.py
ADDED
@@ -0,0 +1,102 @@
from typing import List
import chromadb
from transformers import AutoTokenizer, AutoModel
from chromadb.config import Settings
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from hazm import *


class RAG:
    def __init__(self,
                 model_name: str = "HooshvareLab/bert-base-parsbert-uncased",
                 collection_name: str = "legal_cases",
                 persist_directory: str = "chromadb_collections/",
                 top_k: int = 2
                 ) -> None:

        self.cases_df = pd.read_csv('processed_cases.csv')

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.normalizer = Normalizer()
        self.top_k = top_k

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

        self.client = chromadb.PersistentClient(path=persist_directory)

        self.collection = self.client.get_collection(name=collection_name)

    def query_pre_process(self, query: str) -> str:
        return self.normalizer.normalize(query)

    def embed_single_text(self, text: str) -> np.ndarray:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool the last hidden state into a single sentence embedding
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    def extract_case_title_from_df(self, case_id: str) -> str:
        case_id_int = int(case_id.split("_")[1])

        try:
            case_title = self.cases_df.loc[case_id_int, 'title']
            return case_title
        except KeyError:
            return "Case ID not found in DataFrame."

    def extract_case_text_from_df(self, case_id: str) -> str:
        case_id_int = int(case_id.split("_")[1])

        try:
            case_text = self.cases_df.loc[case_id_int, 'text']
            return case_text
        except KeyError:
            return "Case ID not found in DataFrame."

    def retrieve_relevant_cases(self, query_text: str) -> List[str]:
        normalized_query_text = self.query_pre_process(query_text)

        query_embedding = self.embed_single_text(normalized_query_text)
        query_embedding_list = query_embedding.tolist()

        results = self.collection.query(
            query_embeddings=[query_embedding_list],
            n_results=self.top_k
        )

        retrieved_cases = []
        for i in range(len(results['metadatas'][0])):
            case_id = results['ids'][0][i]
            case_text = self.extract_case_text_from_df(case_id)
            case_title = self.extract_case_title_from_df(case_id)
            retrieved_cases.append({
                "text": case_text,
                "title": case_title
            })

        return retrieved_cases

    def get_information(self, query: str) -> List[str]:
        return self.retrieve_relevant_cases(query)


# NOTE: the stub below redefines RAG and shadows the full implementation
# above, so any code importing this module gets a no-op retriever.
class RAG:

    def __init__(self) -> None:
        pass

    def get_information(self, query: str) -> List[str]:
        return []

    def query_pre_process(self, query: str):
        return query
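If the stub redefinition at the end were removed, the intended retrieval flow would look like the sketch below; it assumes processed_cases.csv and a pre-built "legal_cases" Chroma collection exist, and the query string is illustrative:

rag = RAG(top_k=2)
for case in rag.get_information("sample tenancy dispute query"):
    print(case["title"])
# Each item is a {"title", "text"} dict looked up from processed_cases.csv
# by the id stored in the Chroma collection (e.g. "case_42" -> row 42)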
requirements.txt
ADDED
@@ -0,0 +1,24 @@
# dataset
datasets
pandas
numpy
indexed_gzip
# json
matrix-nio[e2e]
opsdroid
python-dotenv

BeautifulSoup4
requests
tqdm

hazm
spacy

rank_bm25
openai
gradio

langchain_openai
sentence-transformers
chromadb