# Import required libraries
import json
import os

import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# Configure Hugging Face API token securely (ensure it's set in environment variables)
api_key = os.getenv("HF_API_KEY")

# Define weights for each column based on importance
column_weights = {
    'Symptoms': 0.4,
    'Severity Level': 0.2,
    'Risk Assessment': 0.1,
    'Treatment Options': 0.15,
    'Suggested Medical Tests': 0.05,
    'Minimum Values for Medical Tests': 0.05,
    'Emergency Treatment': 0.05,
}


@st.cache_resource
def _load_retriever_model():
    """Load the Sentence Transformer used for RAG-based retrieval.

    Streamlit re-executes this script on every user interaction; caching the
    model as a resource avoids reloading it on each rerun.
    """
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def _encode_texts(model, texts):
    """Encode *texts* in one batch and return one embedding list per text."""
    texts = list(texts)
    if not texts:
        return []
    return model.encode(texts).tolist()


@st.cache_data
def _load_dataset(csv_path):
    """Read the CSV dataset and attach every column the retrievers need.

    Adds, unless the CSV already provides them:
      * ``combined_description`` - the weighted text columns joined by spaces;
      * ``<column>_embeddings`` - one embedding per row for each weighted column;
      * ``embeddings`` - the embedding of ``combined_description`` (an empty
        list when that description is empty).

    The result is cached, so the CSV is read and encoded once rather than on
    every rerun; clear the Streamlit cache after replacing the file.

    Raises:
        FileNotFoundError: if *csv_path* does not exist.
    """
    frame = pd.read_csv(csv_path)
    model = _load_retriever_model()

    # Preprocess the dataset by creating a combined description column
    if 'combined_description' not in frame.columns:
        frame['combined_description'] = (
            frame['Symptoms'].fillna('') + " " +
            frame['Severity Level'].fillna('') + " " +
            frame['Risk Assessment'].fillna('') + " " +
            frame['Treatment Options'].fillna('') + " " +
            frame['Suggested Medical Tests'].fillna('') + " " +
            frame['Minimum Values for Medical Tests'].fillna('') + " " +
            frame['Emergency Treatment'].fillna('')
        )

    # Precompute embeddings for each weighted column (missing values are
    # encoded as the empty string).
    for col in column_weights:
        embedding_col = f"{col}_embeddings"
        if embedding_col not in frame.columns:
            frame[embedding_col] = pd.Series(
                _encode_texts(model, frame[col].fillna("")),
                index=frame.index,
                dtype=object,
            )

    # Generate embeddings for the combined description if not already done
    if 'embeddings' not in frame.columns:
        descriptions = frame['combined_description'].tolist()
        vectors = _encode_texts(model, descriptions)
        frame['embeddings'] = pd.Series(
            [vec if text else [] for text, vec in zip(descriptions, vectors)],
            index=frame.index,
            dtype=object,
        )

    return frame


# Initialize Sentence Transformer model for RAG-based retrieval
retriever_model = _load_retriever_model()

# Load the CSV dataset (place the CSV in the same directory as app.py in Hugging Face Spaces)
try:
    data = _load_dataset('genetic-Final.csv')  # Ensure the dataset filename is correct
except FileNotFoundError:
    st.error("Dataset file not found. Please upload it to this directory.")
    # Halt this script run: everything below needs `data`, and continuing
    # would only replace the message above with a NameError.
    st.stop()


# Function to retrieve relevant information with weighted scoring
def get_weighted_relevant_info(query, top_k=3):
    """Return the ``top_k`` dataset rows that best match *query*.

    Each row's score is the sum, over the columns in ``column_weights``, of
    the cosine similarity between the query embedding and that column's
    precomputed embedding, multiplied by the column's weight.

    Args:
        query: Free-text user query.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame slice of ``data`` ordered from highest to lowest score.
    """
    query_embedding = retriever_model.encode(query)
    weighted_similarities = [0.0] * len(data)

    # Accumulate column by column; each row still sums its terms in
    # ``column_weights`` order, so scores match a row-by-row computation
    # without paying for DataFrame.iterrows().
    for col, weight in column_weights.items():
        for position, col_embedding in enumerate(data[f"{col}_embeddings"]):
            if col_embedding:  # skip rows that carry no embedding for this column
                col_similarity = util.cos_sim(query_embedding, col_embedding)[0][0].item()
                weighted_similarities[position] += col_similarity * weight

    top_indices = sorted(
        range(len(weighted_similarities)),
        key=weighted_similarities.__getitem__,
        reverse=True,
    )[:top_k]
    return data.iloc[top_indices]
# Function to retrieve relevant information based on user query (non-weighted)
def get_relevant_info(query, top_k=3):
    """Return the ``top_k`` dataset rows whose combined description best matches *query*.

    Rows are ranked by cosine similarity between the query embedding and the
    row's entry in ``data['embeddings']``.

    Args:
        query: Free-text user query.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame slice of ``data`` ordered from most to least similar.
    """
    query_embedding = retriever_model.encode(query)
    # Rows with an empty description are stored with an empty embedding;
    # cos_sim cannot compare against an empty vector, so rank them last.
    similarities = [
        util.cos_sim(query_embedding, doc_emb)[0][0].item() if doc_emb else float("-inf")
        for doc_emb in data['embeddings']
    ]
    top_indices = sorted(
        range(len(similarities)),
        key=similarities.__getitem__,
        reverse=True,
    )[:top_k]
    return data.iloc[top_indices]


def _response_body(response):
    """Return the decoded JSON body of *response*, or its raw text if it is not JSON."""
    try:
        return response.json()
    except ValueError:  # requests' JSON decode errors are ValueError subclasses
        return response.text


# Enhanced response generation function with debugging
def generate_response(input_text, relevant_info, timeout=60):
    """Ask the hosted Med42 model to answer *input_text* using retrieved context.

    Args:
        input_text: The user's query.
        relevant_info: DataFrame of retrieved rows; its ``combined_description``
            column is joined line by line to form the prompt context.
        timeout: Seconds to wait for the inference API before giving up.

    Returns:
        The generated text on success, otherwise a human-readable error
        message. Request failures are reported in the returned string rather
        than raised.
    """
    context = "\n".join(relevant_info['combined_description'].astype(str).tolist())
    input_with_context = f"Context: {context}\n\nUser Query: {input_text}"
    api_url = "https://api-inference.huggingface.co/models/m42-health/Llama3-Med42-8B"
    # Only send credentials when a token is configured; "Bearer None" is never valid.
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    payload = {"inputs": input_with_context}

    try:
        # Without a timeout a stalled connection would block this session forever.
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return f"Error during API request: {e}"

    st.write("API Raw Response:", response.text)  # Display raw response for debugging

    # Error bodies are not guaranteed to be JSON, so decode defensively.
    response_data = _response_body(response)

    # Check response status
    if response.status_code != 200:
        return (
            f"Error: API responded with status code {response.status_code}. "
            f"Full response: {response_data}"
        )

    # Parse and validate response
    if (
        isinstance(response_data, list)
        and response_data
        and isinstance(response_data[0], dict)
        and "generated_text" in response_data[0]
    ):
        return response_data[0]["generated_text"]
    return f"Unexpected response format from API. Full response: {response_data}"


# Streamlit UI for the Chatbot
def main():
    """Render the chatbot page: query input, retrieved entries and model answer."""
    st.title("Medical Report and Analysis Chatbot")
    st.sidebar.header("Upload Medical Report or Enter Query")

    # Text input for user queries
    user_query = st.sidebar.text_input("Type your question or query")

    # File uploader for medical report
    uploaded_file = st.sidebar.file_uploader(
        "Upload a medical report (optional)", type=["txt", "pdf", "csv"]
    )

    # Process the query if provided (whitespace-only input counts as no query)
    user_query = user_query.strip() if user_query else ""
    if user_query:
        st.write("### FAQ and Responses:")

        # Retrieve relevant information from the dataset
        relevant_info = get_weighted_relevant_info(user_query)
        for description in relevant_info['combined_description']:
            st.write(f"- {description}")

        # Generate a response from the model
        response = generate_response(user_query, relevant_info)
        st.write("#### Model's Response:")
        st.write(response)

    # Process the uploaded file (if any)
    if uploaded_file:
        st.write("### Uploaded Report Analysis:")
        report_text = "Extracted report content here"  # Placeholder for file processing
        st.write(report_text)


if __name__ == "__main__":
    main()