"""Gaia Search: a Streamlit front-end for querying the LAION/Pile/C4 search backend."""

import json
import os
import pprint
import re

import requests
import streamlit as st
import streamlit.components.v1 as components

pp = pprint.PrettyPrinter(indent=2)

# Address of the remote search service; kept in the environment so it can be
# read back anywhere via os.environ.get("address").
os.environ["address"] = "http://34.79.83.149:8080"

st.set_page_config(page_title="Gaia Search", layout="wide")

# Force the light theme by (re)writing a local Streamlit config file.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')

st.sidebar.markdown(
    """

Gaia Search 🌖🌏

A search engine for the LAION large scale image caption corpora

""",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    """

GitHub | Project Report

""",
    unsafe_allow_html=True,
)

query = st.sidebar.text_input(label="Search query", value="")
corpus = st.sidebar.selectbox(
    "Corpus",
    ("laion", "pile", "c4"),
    index=0,
)
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)
footer = """ """
st.sidebar.markdown(footer, unsafe_allow_html=True)


def scisearch(query, corpus, num_results=10):
    """Query the remote search service and return its hits.

    Parameters
    ----------
    query : str or None
        Free-text search query; surrounding whitespace is stripped.
    corpus : str
        Which corpus to search — one of "laion", "pile", "c4".
    num_results : int
        Maximum number of documents requested from the backend ("k").

    Returns
    -------
    tuple[list, list]
        ``(results, highlight_terms)`` as decoded from the backend's JSON
        response.  On a blank/None query or on ANY failure an empty pair is
        returned so the caller's tuple unpacking still works.  (The original
        implicitly returned ``None`` in those paths, which crashed the caller
        with ``TypeError: cannot unpack non-iterable NoneType``.)
    """
    try:
        print(query, corpus, num_results)
        # Guard against None BEFORE calling .strip(); the original stripped
        # first, which would raise AttributeError on a None query.
        if query is None or query.strip() == "":
            return [], []
        query = query.strip()
        post_data = {"query": query, "corpus": corpus, "k": num_results}
        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        payload = json.loads(output.text)
        return payload["results"], payload["highlight_terms"]
    except Exception as e:
        # NOTE(review): this HTML snippet is built but never rendered anywhere
        # visible in this file — presumably it was meant to be displayed to
        # the user on failure; confirm intent before removing it.
        results_html = f"""

Raised {type(e).__name__}

Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.

"""
        print(e)
        # Best-effort UI: degrade to "no results" rather than crashing.
        return [], []


# Personally-identifying-information tags that the preprocessing pipeline
# embeds in document text as "PI:<TAG>" markers.
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every "PI:<TAG>" marker in *text* with "REDACTED <TAG>"."""
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            """REDACTED {}""".format(
                tag
            ),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Prepare a paragraph for display.

    Currently this only redacts PII markers; query-term highlighting is
    intentionally disabled (see the TODO below), so *highlight_terms* is
    accepted but unused.
    """
    # TODO:
    # for term in highlight_terms:
    #     paragraph = re.sub(f"\\b{term}\\b", f"{term}", paragraph, flags=re.I)
    paragraph = process_pii(paragraph)
    return paragraph


def process_results(hits: list, highlight_terms: list) -> str:
    """Render a list of backend hits as one HTML string.

    Each hit is expected to carry 'docid', 'score' and 'text'; hits may also
    carry ``meta['docs']`` sub-documents with 'URL' and 'TEXT' fields
    (LAION-style image/caption pairs).  Returns "" for an empty hit list.
    """
    hit_list = []
    for i, hit in enumerate(hits):
        res_head = f"""

{i+1}. Document ID: {hit['docid']}

, Score: {round(hit['score'], 2)}

"""
        # Guard both the key's presence and a None value before iterating.
        if "meta" in hit:
            if hit["meta"] is not None and "docs" in hit["meta"]:
                for subhit in hit["meta"]["docs"]:
                    res_head += f"""

{subhit['URL']}

{highlight_string(subhit['TEXT'], highlight_terms)}

"""
        res_head += f"""

{highlight_string(hit['text'], highlight_terms)}


"""
        hit_list.append(res_head)
    return " ".join(hit_list)


if st.sidebar.button("Search"):
    # scisearch now always returns a (results, highlight_terms) pair, so this
    # unpacking is safe even on backend errors or a blank query.
    hits, highlight_terms = scisearch(query, corpus, max_results)
    html_results = process_results(hits, highlight_terms)
    rendered_results = f"""

About {max_results} results

{html_results}
"""
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )
    st.markdown(
        f"""
Gaia Search 🌖🌏
""",
        unsafe_allow_html=True,
    )
    components.html(
        """ """ + rendered_results,
        height=800,
        scrolling=True,
    )