# SearchDataLINKS / app.py
# layperson99's picture
# Update app.py
# cb1eab8 verified
import csv
import re
from collections import defaultdict

import gradio as gr
# Load and preprocess the data
# data: one {"legal_issue_area", "url"} dict per CSV row, in file order.
# categories: lowercase word token -> set of issue-area strings containing it.
data = []
categories = defaultdict(set)
# Loading stays at module level (outside any function) so the file is read once.
# newline="" is what the csv module expects so it can handle line endings and
# quoted embedded newlines itself.
with open("Legal issues.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header line; tolerate a completely empty file
    for row in reader:
        # Blank or single-column rows cannot be unpacked into (url, issue);
        # skip them instead of aborting the whole load.
        if len(row) < 2:
            continue
        url = row[0].strip()
        # Everything after the first column is the issue text. Re-joining keeps
        # unquoted commas inside the issue text, matching a split-on-first-comma.
        legal_issue_area = ",".join(row[1:]).strip()
        data.append({"legal_issue_area": legal_issue_area, "url": url})
        # Categorization and Tagging
        for word in re.findall(r'\w+', legal_issue_area.lower()):
            categories[word].add(legal_issue_area)
# Indexing
# Inverted index: each lowercase word token maps to the list of
# (legal_issue_area, url) pairs for every entry whose issue text contains it.
index = defaultdict(list)
for record in data:
    pair = (record["legal_issue_area"], record["url"])
    for token in re.findall(r'\w+', pair[0].lower()):
        index[token].append(pair)
def search(query, *, search_index=None, category_map=None):
    """Return a relevance-ranked, newline-separated listing of matching issues.

    The query is lowercased and split into word tokens (letters, digits and
    underscores). Every entry indexed under at least one query token is a
    candidate. A candidate's score is the number of distinct query tokens that
    appear as whole tokens in its issue text.

    Args:
        query: Free-text search string.
        search_index: Optional mapping of token -> iterable of
            ``(legal_issue_area, url)`` pairs. Defaults to the module-level
            ``index``.
        category_map: Optional container of known category tokens. Defaults to
            the module-level ``categories``.

    Returns:
        One ``- <issue> (<category tokens>): <url>`` line per match, highest
        score first and alphabetical within equal scores, joined by newlines.
        An empty string when nothing matches.
    """
    if search_index is None:
        search_index = index
    if category_map is None:
        category_map = categories

    # dict.fromkeys drops repeated query words (so "law law" does not count
    # twice) while keeping first-seen order.
    query_tokens = list(dict.fromkeys(re.findall(r'\w+', query.lower())))

    # A set removes entries reached through more than one query token.
    candidates = set()
    for token in query_tokens:
        # .get avoids inserting empty lists into a defaultdict index.
        candidates.update(search_index.get(token, ()))

    ranked = []
    for area, link in candidates:
        area_tokens = re.findall(r'\w+', area.lower())
        token_set = set(area_tokens)
        # Whole-token comparison: substring containment would let "art"
        # score against "Startup law".
        score = sum(1 for token in query_tokens if token in token_set)
        ranked.append((score, area, link, area_tokens))

    # Best score first; ties fall back to alphabetical issue text, then url,
    # so the output order is stable and readable.
    ranked.sort(key=lambda item: (-item[0], item[1], item[2]))

    lines = []
    for _score, area, link, area_tokens in ranked:
        category = ", ".join(t for t in area_tokens if t in category_map)
        lines.append(f"- {area} ({category}): {link}")
    return "\n".join(lines)
def search_interface(query):
    """Gradio callback: validate the query, run the search, return display text.

    Args:
        query: Raw textbox contents; may be ``None``, empty, or whitespace only.

    Returns:
        A prompt to enter a query when the input is blank, the formatted
        search results when there are matches, or a short notice when the
        search produced nothing.
    """
    # Whitespace-only input contains no searchable words; treat it as empty.
    if not query or not query.strip():
        return "Please enter a query."
    # search() yields an empty string when nothing matches; show a message
    # instead of leaving the output box blank.
    return search(query) or "No results found."
# UI wiring: a two-line query box feeds search_interface, whose text result is
# shown in a twenty-line output box.
query_box = gr.Textbox(lines=2, placeholder="Enter your search query here...")
results_box = gr.Textbox(lines=20)
iface = gr.Interface(
    fn=search_interface,
    inputs=query_box,
    outputs=results_box,
    title="Legal Issues Search",
    description="Enter a query to search for legal issues and their respective URLs.",
)
# Start the Gradio app.
iface.launch()