import os
import warnings

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class Crawler:
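    """Crawl court rulings ("votes") from ara.jri.ac.ir: fetch each vote page,
    extract its title, metadata, and body text, and write them to batched text files."""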
    # Marker inserted between votes when multiple vote records are concatenated into one string/file
    vote_splitter = " |split| "
    def __init__(self, base_url: str, list_url: str,
                 base_vote_url: str, models_path: str, result_path: str):
        # Empty strings fall back to the built-in defaults
        self.base_url = base_url or "https://ara.jri.ac.ir/"
        self.list_url = list_url or "https://ara.jri.ac.ir/Judge/Index"
        self.base_vote_url = base_vote_url or "https://ara.jri.ac.ir/Judge/Text/"
        self.models_path = models_path or "Models/"
        self.result_path = result_path or "Resource/"
        # Join against the resolved attributes (not the raw parameters),
        # so the defaults above are actually honored
        self.pos_model_path = os.path.join(self.models_path, "postagger.model")
        self.chunker_path = os.path.join(self.models_path, "chunker.model")
        self.merges_vote_path = os.path.join(self.result_path, 'merged_vote.txt')
        self.clean_vote_path = os.path.join(self.result_path, 'clean_vote.txt')
        self.clean_vote_path_csv = os.path.join(self.result_path, 'clean_vote.csv')
        self.selected_vote_path = os.path.join(self.result_path, 'selected_vote.txt')
        self.law_list_path = os.path.join(self.result_path, 'law_list.txt')
        self.law_clean_list_path = os.path.join(self.result_path, 'law_clean_list.txt')
        self.vote_stop_path = os.path.join(self.result_path, "vote_stopwords.txt")
        self.law_stop_path = os.path.join(self.result_path, "law_stopwords.txt")

    @staticmethod
    def check_valid_vote(html_soup: BeautifulSoup) -> bool:
        # A valid vote page has a Title3D heading with text after the <span> (the vote number)
        h1_element = html_soup.find('h1', class_='Title3D')
        if h1_element is None:
            return False
        span_element = h1_element.find('span')
        if span_element is None:
            return False
        span_text = span_element.text  # Text within the <span> tag
        full_text = h1_element.text  # Full text within the <h1> element
        text_after_span = full_text.split(span_text)[-1].strip()  # Text after the </span> tag
        return len(text_after_span) > 0

    @staticmethod
    def html_data_extractor(html_soup: BeautifulSoup, vote_splitter: str) -> str:
        # Pull the three pieces of a vote page: body text, title, and metadata cell
        vote_text = html_soup.find('div', id='treeText', class_='BackText')
        title = html_soup.find('h1', class_='Title3D')
        info = html_soup.find('td', valign="top", class_="font-size-small")
        # Append vote_splitter so individual votes can be separated later in the merged file
        return str(title) + str(info) + str(vote_text) + vote_splitter
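
    # A minimal sketch of what the two static methods above expect (the sample
    # HTML is an assumption, mirroring the selectors used in this class):
    #
    #   sample = ('<h1 class="Title3D"><span>12345</span> ruling title</h1>'
    #             '<td valign="top" class="font-size-small">branch / date</td>'
    #             '<div id="treeText" class="BackText">ruling body</div>')
    #   soup = BeautifulSoup(sample, 'html.parser')
    #   Crawler.check_valid_vote(soup)                            # -> True
    #   Crawler.html_data_extractor(soup, Crawler.vote_splitter)  # -> one vote record string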

    def vote_crawler(self, start: int, end: int, separator: int):
        counter = 0  # Counts valid votes crawled
        result_list = []
        # Suppress urllib3's InsecureRequestWarning triggered by verify=False below
        warnings.filterwarnings("ignore")
        # Request each vote page in the id range [start, end)
        for i in tqdm(range(start, end)):
            url = f"{self.base_vote_url}{i}"
            response = requests.get(url, verify=False)
            response.encoding = 'utf-8'  # Force UTF-8 so the Persian text decodes correctly
            html_soup = BeautifulSoup(response.text, 'html.parser')
            if response.ok and self.check_valid_vote(html_soup):
                counter += 1
                vote_record = self.html_data_extractor(html_soup, self.vote_splitter)
                result_list.append(vote_record)
            # Flush every `separator` valid votes, and once more at the end of the range,
            # so the final batch (including vote end - 1) is not lost
            if result_list and (counter % separator == 0 or i == end - 1):
                with open(os.path.join(self.result_path, f'vote{i}.txt'), "w", encoding='utf-8') as text_file:
                    text_file.write(''.join(result_list))
                result_list = []

    def merge_out_txt(self) -> None:
        # Concatenate all vote*.txt batch files from result_path into the merged output file
        with open(self.merges_vote_path, 'w', encoding='utf-8') as outfile:
            for filename in os.listdir(self.result_path):
                if filename.startswith("vote") and filename.endswith('.txt'):  # Only merge vote batch files
                    with open(os.path.join(self.result_path, filename), 'r', encoding='utf-8') as infile:
                        outfile.write(infile.read())
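
    # Hypothetical helper (not part of the original class): a minimal sketch of how
    # downstream code could split the merged file back into individual vote records,
    # using the same vote_splitter marker that html_data_extractor appends.
    @staticmethod
    def read_votes(merged_path: str) -> list:
        with open(merged_path, 'r', encoding='utf-8') as infile:
            content = infile.read()
        # Drop the empty trailing chunk left by the splitter at the end of the last record
        return [v for v in content.split(Crawler.vote_splitter) if v.strip()]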


if __name__ == "__main__":
    models_path = input("Enter the models path (initial value = Models/ ): ")
    result_path = input("Enter the result path (initial value = Resource/ ): ")
    base_url = input("Enter the base URL (initial value = https://ara.jri.ac.ir/ ): ")
    list_url = input("Enter the list URL (initial value = https://ara.jri.ac.ir/Judge/Index ): ")
    base_vote_url = input("Enter the base vote URL (initial value = https://ara.jri.ac.ir/Judge/Text/ ): ")
    crawler_instance = Crawler(models_path=models_path, result_path=result_path,
                               base_url=base_url, list_url=list_url, base_vote_url=base_vote_url)
    start = int(input("Enter the start value for vote crawling: "))
    end = int(input("Enter the end value for vote crawling: "))
    separator = int(input("Enter the separator value for vote crawling: "))
    crawler_instance.vote_crawler(start=start, end=end, separator=separator)
    crawler_instance.merge_out_txt()
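
# Non-interactive usage sketch (the id range and batch size below are assumptions;
# the interactive prompts above achieve the same thing):
#
#   crawler = Crawler(base_url="", list_url="", base_vote_url="",
#                     models_path="", result_path="")  # "" -> built-in defaults
#   crawler.vote_crawler(start=1, end=1000, separator=100)
#   crawler.merge_out_txt()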