import json
import os
import pickle
import re
from datetime import datetime

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import Repository
from sentence_transformers import SentenceTransformer, util

############# Read in the data #############
access_token_1 = os.environ.get("HF_token")

dataset = load_dataset("acd424/tribunal_data", use_auth_token=access_token_1)
embeddings = dataset["train"]["embed"]
corpus = dataset["train"]["reason_text"]
files = dataset["train"]["file_name"]
all_cats = dataset["train"]["all_cats"]
print("The data has loaded")

# Query logging to the dataset repo is currently disabled. The original setup
# is kept here for reference:
#
# DATASET_REPO_URL = "https://huggingface.co/datasets/acd424/tribunal_data"
# DATA_FILENAME = "queries_and_responces.txt"
# DATA_FILE = os.path.join("data", DATA_FILENAME)
# repo = Repository(
#     local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=access_token_1
# )


################### Functions ##########################
def semantic_search(
    query, corpus=corpus, corpus_embeddings=embeddings, k=5, min_score=0.5
):
    """Find the corpus entries closest to ``query`` by cosine similarity.

    The query is encoded with the module-level ``embedder`` and compared
    against every row of ``corpus_embeddings``. The best ``k`` matches are
    rendered into a single text report.

    Parameters
    ----------
    query: str
        The query the user wants to search with
    corpus: list
        One entry per embedding; the entry of each match is appended to the
        gov.uk employment-tribunal-decisions URL in the report
    corpus_embeddings: list or tensor
        Embedding vectors, index-aligned with ``corpus`` and with the
        module-level ``all_cats`` (anything ``util.cos_sim`` accepts)
    k: int
        The number of results to include (default is 5)
    min_score: float
        If the best cosine similarity is below this value, no matches are
        listed and a "query not suitable" message is returned instead
        (default is 0.5)

    Returns
    -------
    str
        The formatted top-k matches (link, categories and score for each),
        or the fallback message when the best score is below ``min_score``
    """
    top_k = min(k, len(corpus_embeddings))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # cos_sim returns a (1, n_corpus) matrix for a single query; take row 0.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # topk returns values sorted in descending order, so values[0] is the
    # overall best score and there is no need to argsort the whole corpus.
    top_results = torch.topk(cos_scores, k=top_k)

    if top_results.values[0] < min_score:
        return "The query is either not detailed enough or is perhaps not an appropriate query"

    parts = []
    matches = zip(top_results.values, top_results.indices.tolist())
    for rank, (score, idx) in enumerate(matches, start=1):
        parts.append(f" Match: {rank} \n")
        parts.append(f"https://www.gov.uk/employment-tribunal-decisions/{corpus[idx]}")
        parts.append("\n ======== With Catergories ========= \n")
        parts.append(all_cats[idx])
        # score is deliberately left as a 0-d tensor so it renders exactly as before.
        parts.append(
            f"\n ============================================ {score} \n\n\n"
        )
    return "".join(parts)


def produce_tribunal_out(query, corpus=files, tribunal_embeddings=embeddings):
    """Run a search for ``query`` and wrap the report for the output textbox.

    Parameters
    ----------
    query: str
        The text typed by the user
    corpus: list
        Passed to ``semantic_search`` as ``corpus`` (defaults to the
        file names, which are used to build the result links)
    tribunal_embeddings: list or tensor
        Passed to ``semantic_search`` as ``corpus_embeddings``

    Returns
    -------
    The result of ``gr.update(value=...)`` carrying the search report
    """
    # get context
    context_string = semantic_search(
        query=query, corpus=corpus, corpus_embeddings=tribunal_embeddings
    )

    # Log the query with a timestamp to stdout.
    time_now = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print(f"{time_now}|{query}\n")

    # Persisting the log to the dataset repo is currently disabled:
    #
    # with open(DATA_FILE, "a") as f:
    #     f.write(f"{time_now}|{query}\n")
    # commit_url = repo.push_to_hub()
    # print(commit_url)

    return gr.update(value=context_string)


############### Specify models
# Looked up by semantic_search at call time, so defining it after the
# functions is fine as long as it exists before the first query.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

############### The Appp
with gr.Blocks() as demo:
    gr.Markdown("Employment tribunal demo")

    # text input from user
    inp = gr.Textbox(
        placeholder="Type your employment tribunal query here - describe your problem",
        label="Tribunal Query",
    )

    # initiate the functions
    process_btn = gr.Button("Search records from UK employment tribunals")
    Suggested_text = gr.Textbox(
        value="Suggestions will appear here", label="", lines=20
    )
    Suggested_text.style(show_copy_button=True)

    process_btn.click(
        fn=produce_tribunal_out,
        inputs=[inp],
        outputs=Suggested_text,
    )

# NOTE(review): credentials are hard-coded in source; consider reading them
# from environment variables / secrets instead.
demo.launch(auth=("admin", "dataisking"))
##########################################