"""Streamlit FAQ search app backed by a Haystack embedding-retrieval pipeline.

On first run the script starts Elasticsearch, downloads a small FAQ dataset,
embeds every FAQ question and indexes the result. Each user query is then
matched against the indexed questions and the stored answers are shown on
the page.
"""

import logging

import pandas as pd
import streamlit as st
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers

# Initialize logging
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

DOC_DIR = "data/basic_faq_pipeline"
S3_URL = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip"


def _cache_resource(func):
    """Cache *func*'s return value across Streamlit reruns.

    Uses ``st.cache_resource`` when the installed Streamlit provides it and
    falls back to the legacy ``st.cache`` otherwise, so the script keeps
    working on older Streamlit releases.
    """
    decorator = getattr(st, "cache_resource", None)
    if decorator is not None:
        return decorator(func)
    # Legacy API: the pipeline object is not hashable/immutable, so output
    # mutation checks have to be disabled.
    return st.cache(allow_output_mutation=True)(func)


@_cache_resource
def _build_faq_pipeline() -> Pipeline:
    """Start Elasticsearch, index the FAQ dataset and return the query pipeline.

    Streamlit re-executes the whole script on every widget interaction; keeping
    this setup inside a cached function means Elasticsearch is launched and the
    FAQ questions are embedded and indexed once per server process instead of
    once per query.
    """
    # Launch Elasticsearch
    launch_es()

    # Initialize the document store and retriever
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        embedding_field="question_emb",
        embedding_dim=384,
        excluded_meta_data=["question_emb"],
        similarity="cosine",
    )
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        use_gpu=True,
        scale_score=False,
    )

    # Download and load the FAQ dataset
    fetch_archive_from_http(url=S3_URL, output_dir=DOC_DIR)
    df = pd.read_csv(f"{DOC_DIR}/small_faq_covid.csv")

    # Minimal cleaning
    df = df.fillna(value="")
    df["question"] = df["question"].apply(lambda x: x.strip())

    # Get embeddings for our questions from the FAQs
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(queries=questions).tolist()
    df = df.rename(columns={"question": "content"})

    # Convert Dataframe to list of dicts and index them in our DocumentStore
    document_store.write_documents(df.to_dict(orient="records"))

    # Initialize a Pipeline (without a reader): retrieved FAQ documents are
    # converted directly into answers.
    faq_pipeline = Pipeline()
    faq_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    faq_pipeline.add_node(component=Docs2Answers(), name="Docs2Answers", inputs=["Retriever"])
    return faq_pipeline


def _render_answers(answers) -> None:
    """Show the pipeline's answers on the Streamlit page, best match first."""
    if not answers:
        st.info("No matching FAQ entries found.")
        return
    for rank, answer in enumerate(answers, start=1):
        # NOTE(review): Docs2Answers is expected to store the matched FAQ
        # question under meta["query"] - confirm against the installed
        # Haystack version. The lookup is tolerant of it being absent.
        matched_question = (getattr(answer, "meta", None) or {}).get("query")
        if matched_question:
            st.markdown(f"**{rank}. {matched_question}**")
        else:
            st.markdown(f"**{rank}.**")
        st.write(answer.answer)
        score = getattr(answer, "score", None)
        if score is not None:
            st.markdown(f"*Score: {score:.3f}*")


pipeline = _build_faq_pipeline()

# Create the Streamlit app
st.title("FAQ Search")
question = st.text_input("Ask a question:")

# Ignore empty and whitespace-only input instead of running a useless query.
if question and question.strip():
    params = {"Retriever": {"top_k": 10}}  # Modify parameters as needed
    prediction = pipeline.run(query=question.strip(), params=params)
    st.subheader("Answers:")
    _render_answers(prediction.get("answers", []))
    # print_answers writes to the server's stdout only; it is kept as a
    # console log, the page itself is filled by _render_answers above.
    print_answers(prediction, details="medium")