Spaces:
Sleeping
Sleeping
# Import required libraries | |
import os | |
import pandas as pd | |
import streamlit as st | |
from transformers import pipeline | |
from sentence_transformers import SentenceTransformer, util | |
import requests | |
import json | |
# Hugging Face API token, read from the environment so it is never stored in
# the source. os.getenv returns None when HF_API_KEY is unset.
api_key = os.getenv("HF_API_KEY")

# Load the CSV dataset (place the CSV in the same directory as app.py in
# Hugging Face Spaces).
try:
    data = pd.read_csv('genetic-Final.csv')  # Ensure the dataset filename is correct
except FileNotFoundError:
    st.error("Dataset file not found. Please upload it to this directory.")
    # `data` was never assigned, so every statement below would raise a
    # NameError; end this script run right after showing the message.
    st.stop()

# Initialize Sentence Transformer model for RAG-based retrieval.
retriever_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Preprocess the dataset by creating a combined description column: the seven
# text columns joined with single spaces, missing cells treated as "".
if 'combined_description' not in data.columns:
    data['combined_description'] = (
        data['Symptoms'].fillna('') + " " +
        data['Severity Level'].fillna('') + " " +
        data['Risk Assessment'].fillna('') + " " +
        data['Treatment Options'].fillna('') + " " +
        data['Suggested Medical Tests'].fillna('') + " " +
        data['Minimum Values for Medical Tests'].fillna('') + " " +
        data['Emergency Treatment'].fillna('')
    )

# Define weights for each column based on importance (the weights sum to 1.0).
column_weights = {
    'Symptoms': 0.4,
    'Severity Level': 0.2,
    'Risk Assessment': 0.1,
    'Treatment Options': 0.15,
    'Suggested Medical Tests': 0.05,
    'Minimum Values for Medical Tests': 0.05,
    'Emergency Treatment': 0.05,
}

# Precompute embeddings for each weighted column. Each cell of the new
# "<column>_embeddings" column holds that row's embedding as a plain list.
for col in column_weights:
    embedding_col = f"{col}_embeddings"
    if embedding_col not in data.columns:
        data[embedding_col] = data[col].fillna("").apply(
            lambda text: retriever_model.encode(text).tolist()
        )
# Function to retrieve relevant information with weighted scoring | |
def get_weighted_relevant_info(query, top_k=3):
    """Return the ``top_k`` rows of ``data`` that best match ``query``.

    Each row's score is the sum, over the columns in ``column_weights``, of
    the cosine similarity between the query embedding and that column's
    stored embedding, multiplied by the column's weight. Columns whose stored
    embedding is empty add nothing to the score.

    Args:
        query: Free-text user query.
        top_k: Maximum number of rows to return.

    Returns:
        The selected rows of ``data``, highest score first.
    """
    query_vec = retriever_model.encode(query)

    def _score(record):
        # Weighted sum of per-column similarities for one dataset row.
        total = 0
        for column, weight in column_weights.items():
            column_vec = record[f"{column}_embeddings"]
            if not column_vec:
                continue
            similarity = util.cos_sim(query_vec, column_vec)[0][0].item()
            total += similarity * weight
        return total

    scores = [_score(record) for _, record in data.iterrows()]
    # Positions ordered by descending score; ties keep dataset order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return data.iloc[ranked[:top_k]]
# Generate embeddings for the combined description if not already done.
def _embed_description(text):
    """Return the embedding of ``text`` as a list, or ``[]`` for empty text."""
    if not text:
        return []
    return retriever_model.encode(text).tolist()


if 'embeddings' not in data.columns:
    data['embeddings'] = data['combined_description'].apply(_embed_description)
# Function to retrieve relevant information based on user query (non-weighted) | |
def get_relevant_info(query, top_k=3):
    """Return the ``top_k`` rows of ``data`` most similar to ``query``.

    Non-weighted retrieval: the query embedding is compared against each
    row's ``embeddings`` value (the embedding of its combined description)
    using cosine similarity.

    Args:
        query: Free-text user query.
        top_k: Maximum number of rows to return.

    Returns:
        The selected rows of ``data``, most similar first.
    """
    query_embedding = retriever_model.encode(query)

    similarities = []
    for doc_emb in data['embeddings']:
        if doc_emb:
            similarities.append(util.cos_sim(query_embedding, doc_emb)[0][0].item())
        else:
            # Rows with empty text are stored with an empty embedding ([]).
            # cos_sim cannot compare against a zero-length vector, so rank
            # such rows last instead of letting the whole lookup fail.
            similarities.append(float("-inf"))

    top_indices = sorted(
        range(len(similarities)), key=similarities.__getitem__, reverse=True
    )[:top_k]
    return data.iloc[top_indices]
# Enhanced response generation function with debugging | |
# Import required libraries | |
import os | |
import pandas as pd | |
import streamlit as st | |
from transformers import pipeline | |
from sentence_transformers import SentenceTransformer, util | |
import requests | |
import json | |
# NOTE(review): this setup section repeats the one that starts near the top of
# the file; running both re-reads the CSV and rebuilds the model and all
# embeddings. One of the two copies can likely be removed.

# Hugging Face API token, read from the environment so it is never stored in
# the source. os.getenv returns None when HF_API_KEY is unset.
api_key = os.getenv("HF_API_KEY")

# Load the CSV dataset (place the CSV in the same directory as app.py in
# Hugging Face Spaces).
try:
    data = pd.read_csv('genetic-Final.csv')  # Ensure the dataset filename is correct
except FileNotFoundError:
    st.error("Dataset file not found. Please upload it to this directory.")
    # Do not continue with a missing dataset: the statements below all read
    # `data`, so end this script run right after showing the message.
    st.stop()

# Initialize Sentence Transformer model for RAG-based retrieval.
retriever_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Preprocess the dataset by creating a combined description column: the seven
# text columns joined with single spaces, missing cells treated as "".
if 'combined_description' not in data.columns:
    data['combined_description'] = (
        data['Symptoms'].fillna('') + " " +
        data['Severity Level'].fillna('') + " " +
        data['Risk Assessment'].fillna('') + " " +
        data['Treatment Options'].fillna('') + " " +
        data['Suggested Medical Tests'].fillna('') + " " +
        data['Minimum Values for Medical Tests'].fillna('') + " " +
        data['Emergency Treatment'].fillna('')
    )

# Define weights for each column based on importance (the weights sum to 1.0).
column_weights = {
    'Symptoms': 0.4,
    'Severity Level': 0.2,
    'Risk Assessment': 0.1,
    'Treatment Options': 0.15,
    'Suggested Medical Tests': 0.05,
    'Minimum Values for Medical Tests': 0.05,
    'Emergency Treatment': 0.05,
}

# Precompute embeddings for each weighted column. Each cell of the new
# "<column>_embeddings" column holds that row's embedding as a plain list.
for col in column_weights:
    embedding_col = f"{col}_embeddings"
    if embedding_col not in data.columns:
        data[embedding_col] = data[col].fillna("").apply(
            lambda text: retriever_model.encode(text).tolist()
        )
# Function to retrieve relevant information with weighted scoring | |
def get_weighted_relevant_info(query, top_k=3):
    """Return the ``top_k`` rows of ``data`` that best match ``query``.

    Each row's score is the sum, over the columns in ``column_weights``, of
    the cosine similarity between the query embedding and that column's
    stored embedding, multiplied by the column's weight. Columns whose stored
    embedding is empty add nothing to the score.

    Args:
        query: Free-text user query.
        top_k: Maximum number of rows to return.

    Returns:
        The selected rows of ``data``, highest score first.
    """
    # NOTE(review): another definition of this function appears earlier in
    # the file; being the later one, this is the definition callers get.
    query_vec = retriever_model.encode(query)

    def _score(record):
        # Weighted sum of per-column similarities for one dataset row.
        total = 0
        for column, weight in column_weights.items():
            column_vec = record[f"{column}_embeddings"]
            if not column_vec:
                continue
            similarity = util.cos_sim(query_vec, column_vec)[0][0].item()
            total += similarity * weight
        return total

    scores = [_score(record) for _, record in data.iterrows()]
    # Positions ordered by descending score; ties keep dataset order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return data.iloc[ranked[:top_k]]
# Generate embeddings for the combined description if not already done.
def _embed_description(text):
    """Return the embedding of ``text`` as a list, or ``[]`` for empty text."""
    if not text:
        return []
    return retriever_model.encode(text).tolist()


if 'embeddings' not in data.columns:
    data['embeddings'] = data['combined_description'].apply(_embed_description)
# Function to retrieve relevant information based on user query (non-weighted) | |
def get_relevant_info(query, top_k=3):
    """Return the ``top_k`` rows of ``data`` most similar to ``query``.

    Non-weighted retrieval: the query embedding is compared against each
    row's ``embeddings`` value (the embedding of its combined description)
    using cosine similarity.

    Args:
        query: Free-text user query.
        top_k: Maximum number of rows to return.

    Returns:
        The selected rows of ``data``, most similar first.
    """
    # NOTE(review): another definition of this function appears earlier in
    # the file; being the later one, this is the definition callers get.
    query_embedding = retriever_model.encode(query)

    similarities = []
    for doc_emb in data['embeddings']:
        if doc_emb:
            similarities.append(util.cos_sim(query_embedding, doc_emb)[0][0].item())
        else:
            # Rows with empty text are stored with an empty embedding ([]).
            # cos_sim cannot compare against a zero-length vector, so rank
            # such rows last instead of letting the whole lookup fail.
            similarities.append(float("-inf"))

    top_indices = sorted(
        range(len(similarities)), key=similarities.__getitem__, reverse=True
    )[:top_k]
    return data.iloc[top_indices]
# Enhanced response generation function with debugging | |
def generate_response(input_text, relevant_info, timeout=60):
    """Ask the hosted Med42 model to answer ``input_text`` with retrieved context.

    The ``combined_description`` values of ``relevant_info`` are joined with
    newlines and sent, together with the user query, to the Hugging Face
    Inference API. The raw API response body is also written to the page for
    debugging.

    Args:
        input_text: The user's question.
        relevant_info: DataFrame of retrieved rows; must contain a
            ``combined_description`` column of strings.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The generated text on success; otherwise a human-readable error
        string. Network and response-parsing failures are reported in the
        returned string rather than raised.
    """
    if not api_key:
        # Without a token the request would be sent as "Bearer None".
        return "Error: Hugging Face API token is not configured (HF_API_KEY is not set)."

    context = "\n".join(relevant_info['combined_description'].tolist())
    input_with_context = f"Context: {context}\n\nUser Query: {input_text}"
    api_url = "https://api-inference.huggingface.co/models/m42-health/Llama3-Med42-8B"
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {"inputs": input_with_context}

    try:
        # A timeout keeps a stalled endpoint from hanging the app forever.
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return f"Error during API request: {e}"

    st.write("API Raw Response:", response.text)  # Display raw response for debugging

    # Check response status
    if response.status_code != 200:
        # Error bodies are not always JSON (e.g. an HTML gateway page); fall
        # back to the raw text so the status code is still reported.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        return f"Error: API responded with status code {response.status_code}. Full response: {details}"

    # Parse and validate response
    try:
        response_data = response.json()
    except ValueError as e:
        return f"Error during API request: {e}"

    has_generated_text = (
        isinstance(response_data, list)
        and bool(response_data)  # an empty list has no first element
        and isinstance(response_data[0], dict)
        and "generated_text" in response_data[0]
    )
    if has_generated_text:
        return response_data[0]["generated_text"]
    return f"Unexpected response format from API. Full response: {response_data}"
# Streamlit UI for the Chatbot | |
def _show_query_results(user_query):
    """Write the retrieved dataset entries and the model's answer to the page."""
    st.write("### FAQ and Responses:")

    # Retrieve relevant information from the dataset and list each entry.
    matches = get_weighted_relevant_info(user_query)
    for description in matches['combined_description']:
        st.write(f"- {description}")

    # Generate a response from the model using the retrieved rows as context.
    answer = generate_response(user_query, matches)
    st.write("#### Model's Response:")
    st.write(answer)


def _show_report_placeholder():
    """Write the report-analysis section; the upload itself is not parsed."""
    st.write("### Uploaded Report Analysis:")
    report_text = "Extracted report content here"  # Placeholder for file processing
    st.write(report_text)


def main():
    """Render the chatbot page: sidebar inputs, then results for what was given."""
    st.title("Medical Report and Analysis Chatbot")

    sidebar = st.sidebar
    sidebar.header("Upload Medical Report or Enter Query")
    # Text input for user queries
    user_query = sidebar.text_input("Type your question or query")
    # File uploader for medical report
    uploaded_file = sidebar.file_uploader(
        "Upload a medical report (optional)",
        type=["txt", "pdf", "csv"],
    )

    # Each section is shown only when its input was supplied.
    if user_query:
        _show_query_results(user_query)
    if uploaded_file:
        _show_report_placeholder()


if __name__ == "__main__":
    main()