# https://huggingface.co/spaces/micknikolic/enron

# here are the imports

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os
import re

import pandas as pd
pd.set_option('display.max_columns', None,
              'display.max_rows', None,
              'display.max_colwidth', None)

import gradio

import warnings
warnings.filterwarnings('ignore')

# here is the code

# Data loading.
# Using a subset of the Enron dataset: working with the full 500k+ emails
# locally would be computationally very expensive.

data = pd.read_csv('subset_enron.csv', encoding='utf-8')
data = data.sample(frac=0.01, random_state=12)  # resulting shape: (5174, 2)
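
# Optional sanity check (an assumption: the standard Kaggle Enron export has
# "file" and "message" columns; only "message" is used below).
print(data.shape, list(data.columns))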

# Text pre-processing.

# Remove literal "\n" / "\\n" escape sequences left behind by the CSV export.
cleaned_message = data["message"].apply(lambda x: re.sub(r'\\{1,2}n', '', x))
content = cleaned_message.tolist()

# Minimal wrapper exposing the page_content/metadata attributes that
# LangChain's text splitters expect.
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

documents = [Document(page_content) for page_content in content]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
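
# Quick check that the split produced a sensible number of chunks
# (no API calls involved).
print(f"Split {len(documents)} emails into {len(texts)} chunks.")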

# Embed the chunks and index them in an in-memory Chroma vector store.
openAI_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("openai_api_key"))
vStore = Chroma.from_documents(documents=texts, embedding=openAI_embeddings)
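
# A minimal retrieval sanity check, left commented out because it spends
# embedding tokens; the query string is just an illustrative example.
# hits = vStore.similarity_search("energy trading schedules", k=3)
# print(hits[0].page_content[:200])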

# Retrieval QA.
# The "stuff" chain type pastes all retrieved chunks directly into the prompt,
# which works here because the chunks are capped at 1,000 characters.

model_retrieval = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=os.getenv("openai_api_key"),
                                                         temperature=0.2,
                                                         top_p=0.2,
                                                         max_tokens=2000),
                                              chain_type="stuff", retriever=vStore.as_retriever())

# Building the Gradio-based app around the retrieval model.

def get_answer(question):
    """
    Returns the answer to a given question.

    Args:
        question (string): the end-user's input.

    Returns:
        The model's answer, based on the Enron emails subset.
    """
    response = model_retrieval.run(question)
    return response
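
# Example call (hypothetical question; commented out to avoid an API call at
# import time):
# print(get_answer("What topics do these emails discuss?"))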

iface = gradio.Interface(
    fn=get_answer,
    inputs=gradio.Textbox(label="Enter your question here"),
    outputs=gradio.Textbox(label="Answer"),
    title="Retrieval QA for the subset of the Enron dataset",
    examples=[
        "Who are the recipients of the emails in this corpus?",
        "What's at the center of these emails?"
    ]
)

iface.launch()