# Legal_RAG/model/processor/law_provider.py
# mojtabaa4's picture
# add application files
# bc68b0b
# raw
# history blame
# 2.66 kB
import pandas as pd
import re
class LawTxetPreProcessor():
    """Split raw law texts into a law-title table and a per-article table.

    Each input text is expected to carry its title on the first line, followed
    by sections whose headings start a line with "ماده" (or with "اصل " for the
    text whose title equals ``_CONSTITUTION_TITLE``).

    The result is two DataFrames:

    * law names:  columns ``law_index``, ``law_name``
    * articles:   columns ``law_index``, ``madeh_index``, ``madeh_text``
    """

    # Title of the one law whose sections are headed "اصل " instead of "ماده".
    _CONSTITUTION_TITLE = "قانون اساسی جمهوری اسلامی ایران"

    # Section headings: the keyword at the start of a line, optionally
    # preceded by a single arbitrary character.
    _ASL_PATTERN = re.compile(r"(^.{0,1}اصل )", flags=re.MULTILINE)
    _MADEH_PATTERN = re.compile(r"(^.{0,1}ماده)", flags=re.MULTILINE)
    # A "ماده" heading line that ends with "مکرر" does not open a new section;
    # its text stays attached to the section that precedes it.
    _REPEATED_MADEH_PATTERN = re.compile(r"(^.{0,1}ماده.*مکرر\n)", flags=re.MULTILINE)
    # Everything from a "❯" marker to the end of its line is dropped before
    # the text is split.
    _REMOVED_LINE_PATTERN = re.compile(r"❯.*\n")

    def __init__(self, law_texts: list) -> None:
        """Store the raw texts; the DataFrames are built lazily by ``get_df``.

        Args:
            law_texts: Raw law texts, one string per law.
        """
        self._law_texets = law_texts
        self._law_name_df = pd.DataFrame(columns=["law_index", "law_name"])
        self._madeh_df = pd.DataFrame(columns=["law_index", "madeh_index", "madeh_text"])
        # True once build_df() has populated the two frames above.
        self._is_df = False

    def build_df(self):
        """Parse every stored law text and (re)build both DataFrames."""
        titles = []
        madeh_texts = []
        madeh_numbers = []
        law_numbers = []

        for law_number, text in enumerate(self._law_texets):
            title = self.title_extractor(text)
            titles.append(title)

            sections = self.madeh_extractor(text, title == self._CONSTITUTION_TITLE)
            law_numbers.extend([law_number] * len(sections))
            # Sections are numbered from 1 in order of appearance within a law.
            madeh_numbers.extend(range(1, len(sections) + 1))
            madeh_texts.extend(sections)

        self._madeh_df = pd.DataFrame({"law_index": law_numbers,
                                       "madeh_index": madeh_numbers,
                                       "madeh_text": madeh_texts})
        self._law_name_df = pd.DataFrame({"law_index": list(range(len(titles))),
                                          "law_name": titles})
        # Mark the cache as valid so get_df() does not re-parse on every call.
        self._is_df = True

    def title_extractor(self, law_text: str) -> str:
        """Return the first line of ``law_text`` (without the newline).

        A text that contains no newline is returned whole.
        """
        # partition() avoids the find() == -1 pitfall, where slicing with
        # [:-1] would cut off the last character of a single-line text.
        return law_text.partition('\n')[0]

    def madeh_extractor(self, law_text: str, is_asl: bool = False) -> list:
        """Split ``law_text`` into consecutive sections.

        Args:
            law_text: Full text of one law.
            is_asl: When true, sections are delimited by "اصل " headings;
                otherwise by "ماده" headings.

        Returns:
            A list of section strings. Each runs from its heading up to (but
            not including) the next heading; the last runs to the end of the
            text. Text before the first heading is not included. The list is
            empty when no heading is found.
        """
        cleaned_text = self._REMOVED_LINE_PATTERN.sub("", law_text)
        heading_pattern = self._ASL_PATTERN if is_asl else self._MADEH_PATTERN

        # Line starts that must not open a new section.
        skipped_starts = {
            match.start()
            for match in self._REPEATED_MADEH_PATTERN.finditer(cleaned_text)
        }
        starts = [
            match.start()
            for match in heading_pattern.finditer(cleaned_text)
            if match.start() not in skipped_starts
        ]

        # Each section ends where the next begins; the last ends at EOF.
        ends = starts[1:] + [len(cleaned_text)]
        return [cleaned_text[start:end] for start, end in zip(starts, ends)]

    def get_df(self) -> tuple:
        """Return ``(law_name_df, madeh_df)``, building them on first use."""
        if not self._is_df:
            self.build_df()
        return self._law_name_df, self._madeh_df