"""Gaia Search — a Streamlit front-end for keyword search over large text corpora.

Sends the user's query to an HTTP search backend and renders the returned
hits (with PII placeholders redacted) as HTML in the main panel.
"""

import json
import os
import pprint
import re  # used by the (currently disabled) term-highlighting code below

import requests
import streamlit as st
import streamlit.components.v1 as components

pp = pprint.PrettyPrinter(indent=2)

# Backend search endpoint.
# NOTE(review): a hard-coded IP stored via os.environ is fragile — consider
# reading this from deployment configuration instead.
os.environ["address"] = "http://34.79.83.149:8080"

st.set_page_config(page_title="Gaia Search", layout="wide")

# Force the light theme by (re)writing the Streamlit config at startup.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')

st.sidebar.markdown(
    """
Gaia Search 🌖🌏
A search engine for the LAION large scale image caption corpora
""",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    """ """,
    unsafe_allow_html=True,
)

# --- Sidebar controls -------------------------------------------------------
query = st.sidebar.text_input(label="Search query", value="")
corpus = st.sidebar.selectbox(
    "Corpus",
    ("laion", "pile", "c4"),
    index=0,
)
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)

footer = """ """
st.sidebar.markdown(footer, unsafe_allow_html=True)


def scisearch(query, corpus, num_results=10):
    """POST *query* to the search backend and return ``(results, highlight_terms)``.

    Args:
        query: raw query string from the sidebar text input.
        corpus: one of "laion", "pile", "c4".
        num_results: maximum number of documents to request (backend key "k").

    Returns:
        A ``(results, highlight_terms)`` tuple on success, or ``None`` when the
        query is blank or the request/parsing fails.  Callers must check for
        ``None`` before unpacking.
    """
    try:
        print(query, corpus, num_results)
        query = query.strip()
        # query is a str here (Streamlit text_input never yields None), so an
        # empty check is sufficient.
        if not query:
            return None
        post_data = {"query": query, "corpus": corpus, "k": num_results}
        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        payload = json.loads(output.text)
        return payload["results"], payload["highlight_terms"]
    except Exception as e:
        # Best-effort error report; search failures must not crash the app.
        # NOTE(review): results_html is built but never displayed in the
        # original — presumably it was meant to be rendered via st.markdown.
        results_html = f"""Raised {type(e).__name__}
Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
"""
        print(e)
        return None


# Placeholder tags the PII-scrubbing pipeline embeds in documents, e.g. "PI:EMAIL".
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every PII placeholder token (e.g. ``PI:EMAIL``) with a redaction label."""
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            """REDACTED {}""".format(
                tag
            ),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Prepare a result paragraph for display: redact PII placeholders.

    ``highlight_terms`` is currently unused — term highlighting is disabled.
    """
    # TODO: re-enable query-term highlighting:
    # for term in highlight_terms:
    #     paragraph = re.sub(f"\\b{term}\\b", f"{term}", paragraph, flags=re.I)
    paragraph = process_pii(paragraph)
    return paragraph


def process_results(hits: list, highlight_terms: list) -> str:
    """Render the backend hits as a single HTML fragment.

    NOTE(review): reconstructed from a garbled source. The original referenced
    an undefined name ``subhit`` — fixed to ``hit``. LAION hits appear to carry
    the caption under the "TEXT" key while pile/c4 hits use "text"; confirm
    against the backend schema.
    """
    hit_list = []
    for i, hit in enumerate(hits):
        # Pick whichever text field this corpus provides.
        raw_text = hit["TEXT"] if "TEXT" in hit else hit["text"]
        hit_list.append(
            f"""{highlight_string(raw_text, highlight_terms)}
"""
        )
    return " ".join(hit_list)


# --- Main panel: run the search and render results --------------------------
# NOTE(review): reconstructed tail — the original source was truncated after
# the "About {max_results} results" / "{html_results}" fragments.
search_outcome = scisearch(query, corpus, max_results)
if search_outcome is not None:
    results, highlight_terms = search_outcome
    html_results = process_results(results, highlight_terms)
    st.markdown(
        f"""About {max_results} results
{html_results}
""",
        unsafe_allow_html=True,
    )