"""Gradio frontend for semantic search over report chunks stored in pgvector.

A query is embedded with a SentenceTransformer model and the nearest chunks
are fetched from the ``items`` table of the PostgreSQL database referenced by
the ``DATABASE_URL`` environment variable.
"""

import os

import gradio as gr
import psycopg2
import torch
from sentence_transformers import SentenceTransformer

# Connection string for the PostgreSQL database; None if the variable is unset.
conn_string = os.environ.get("DATABASE_URL")

# The embedding model is loaded once at import time and reused for every query.
model = SentenceTransformer("multilingual-e5-small", device="cpu")
model.eval()

# Number of hits shown by default and used when the "Hits" field is unusable.
DEFAULT_TOP_K = 5

# `<=>` is a pgvector distance operator; the embedding is passed as a bound
# parameter in pgvector's text form ('[x, y, ...]') and cast explicitly.
_SEARCH_SQL = (
    "SELECT source_file, chunk FROM items "
    "ORDER BY embedding <=> %s::vector LIMIT %s;"
)


def _normalize_top_k(top_k):
    """Return ``top_k`` as a positive int, falling back to ``DEFAULT_TOP_K``."""
    try:
        limit = int(top_k)
    except (TypeError, ValueError, OverflowError):
        # None (cleared input field), NaN or infinity.
        return DEFAULT_TOP_K
    # PostgreSQL rejects a negative LIMIT, and 0 would never return anything.
    return max(1, limit)


def search(query, top_k):
    """Return the chunks most similar to ``query`` as a Markdown numbered list.

    Args:
        query: Free-text search string entered by the user.
        top_k: Maximum number of hits to return. Values that cannot be
            converted to an int fall back to ``DEFAULT_TOP_K``; values below 1
            are raised to 1.

    Returns:
        One Markdown line per hit in the form ``"N. <chunk> __(<source>)__"``,
        joined with newlines. An empty string if ``query`` is blank or no rows
        match.

    Raises:
        psycopg2.Error: If connecting to or querying the database fails.
    """
    if not query or not query.strip():
        return ""

    limit = _normalize_top_k(top_k)

    # The "query: " prefix is prepended before encoding, as in the original
    # code (E5-style models distinguish query and passage inputs this way).
    with torch.no_grad():
        query_embedding = model.encode("query: " + query)
    embedding_literal = str(query_embedding.tolist())

    conn = psycopg2.connect(conn_string)
    try:
        with conn.cursor() as cur:
            cur.execute(_SEARCH_SQL, (embedding_literal, limit))
            results = cur.fetchall()
    finally:
        # Always release the connection, even if the query raised.
        conn.close()

    return "\n".join(
        f"{i+1}. {text} __({file})__"
        for i, (file, text) in enumerate(results)
    )


with gr.Blocks() as demo:
    gr.Markdown(
        "# Spørgsmål og svar indenfor natur og miljø med semantisk søgning"
    )
    gr.Markdown("## Søgning")
    with gr.Row():
        textbox = gr.Textbox(
            placeholder="Skriv her...", lines=1, scale=9, label="Søgning"
        )
        num = gr.Number(DEFAULT_TOP_K, label="Hits", scale=1)
        btn = gr.Button("Søg!", size="sm", scale=1)
    gr.Markdown("## Resultater")
    output = gr.Markdown()
    gr.Markdown("## Om")
    gr.Markdown(
        "*Søgningen baseret på tekst i natur og miljø rapporter fra DCE "
        "(https://dce.au.dk/udgivelser/), se referencer til specifikke "
        "rapporter i parentes:\n"
        "SR = Videnskabelige rapporter, TR = Tekniske rapporter og "
        "MB = Miljøbiblioteksbøger*"
    )
    gr.Markdown("## Tech")
    gr.Markdown(
        "*PDF parsing using GROBID, preprocessing using stanza and chunkipy, "
        "SentenceTransformer embeddings (intfloat/multilingual-e5-small), "
        "PostgreSQL/pgvector database hosted at Neon Tech and Gradio frontend*"
    )
    # Both the button and pressing Enter in the textbox trigger the search.
    btn.click(search, [textbox, num], output)
    textbox.submit(search, [textbox, num], output)

demo.launch()