Spaces:
Runtime error
Runtime error
import pandas as pd | |
import re | |
class LawTxetPreProcessor():
    """Split raw law texts into a law-title table and a per-article table.

    Each input text is expected to carry its title on the first line. The
    body is cut into sections at every line that starts (optionally after
    one leading character) with the section keyword: "ماده" for ordinary
    laws, "اصل " for the text whose title equals ``_CONSTITUTION_TITLE``.
    """

    # A text with exactly this title is split on "اصل " instead of "ماده".
    _CONSTITUTION_TITLE = "قانون اساسی جمهوری اسلامی ایران"

    # Section-heading patterns; MULTILINE makes ``^`` match at every line start.
    _ASL_PATTERN = re.compile(r"(^.{0,1}اصل )", flags=re.MULTILINE)
    _MADEH_PATTERN = re.compile(r"(^.{0,1}ماده)", flags=re.MULTILINE)
    # "ماده ... مکرر" heading lines are not treated as section boundaries.
    _NOT_VALID_PATTERN = re.compile(r"(^.{0,1}ماده.*مکرر\n)", flags=re.MULTILINE)
    # Everything from a "❯" marker to the end of its line is dropped first.
    _REMOVED_PATTERN = re.compile(r"❯.*\n")

    def __init__(self, law_texts: list) -> None:
        """Store the raw law texts; the DataFrames are built lazily.

        Args:
            law_texts: List of law texts (strings), one entry per law.
        """
        self._law_texets = law_texts
        self._law_name_df = pd.DataFrame(columns=["law_index", "law_name"])
        self._madeh_df = pd.DataFrame(
            columns=["law_index", "madeh_index", "madeh_text"]
        )
        # True once build_df() has populated the two DataFrames.
        self._is_df = False

    def build_df(self):
        """(Re)build both DataFrames from the stored law texts.

        ``law_index`` is the 0-based position of the law in the input list;
        ``madeh_index`` is the 1-based position of the section inside its law.
        """
        titles = []
        law_indices = []
        madeh_indices = []
        madeh_texts = []

        for law_idx, text in enumerate(self._law_texets):
            title = self.title_extractor(text)
            titles.append(title)

            sections = self.madeh_extractor(
                text, title == self._CONSTITUTION_TITLE
            )
            law_indices.extend([law_idx] * len(sections))
            madeh_indices.extend(range(1, len(sections) + 1))
            madeh_texts.extend(sections)

        self._madeh_df = pd.DataFrame({"law_index": law_indices,
                                       "madeh_index": madeh_indices,
                                       "madeh_text": madeh_texts})
        self._law_name_df = pd.DataFrame({"law_index": list(range(len(titles))),
                                          "law_name": titles})
        # Mark the cache as valid so get_df() does not rebuild on every call.
        self._is_df = True

    def title_extractor(self, law_text: str) -> str:
        """Return the first line of ``law_text`` (without the newline).

        If the text contains no newline the whole text is returned;
        ``str.find`` would yield -1 there and slice off the last character.
        """
        return law_text.partition('\n')[0]

    def madeh_extractor(self, law_text: str, is_asl: bool = False) -> list:
        """Split ``law_text`` into its sections.

        Args:
            law_text: Full text of one law.
            is_asl: When true, split on lines starting with "اصل " instead
                of "ماده".

        Returns:
            List of section strings in document order. Each section runs
            from its heading line up to (not including) the next valid
            heading; the last one runs to the end of the text. Text before
            the first heading is discarded. Empty if no heading is found.
        """
        pattern = self._ASL_PATTERN if is_asl else self._MADEH_PATTERN
        cleaned_text = self._REMOVED_PATTERN.sub("", law_text)

        # A set gives O(1) membership tests when filtering heading offsets.
        not_valid_starts = {
            match.start()
            for match in self._NOT_VALID_PATTERN.finditer(cleaned_text)
        }
        valid_starts = [
            match.start()
            for match in pattern.finditer(cleaned_text)
            if match.start() not in not_valid_starts
        ]

        # Pair each start with the next start; ``None`` as the final end
        # makes the last slice run to the end of the text.
        ends = valid_starts[1:] + [None]
        return [cleaned_text[start:end] for start, end in zip(valid_starts, ends)]

    def get_df(self) -> tuple:
        """Return ``(law_name_df, madeh_df)``, building them on first use.

        Subsequent calls return the same cached DataFrame objects; call
        ``build_df()`` explicitly to force a rebuild.
        """
        if not self._is_df:
            self.build_df()
        return self._law_name_df, self._madeh_df