micknikolic committed on
Commit
fd57f99
1 Parent(s): b1ff6f8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# https://huggingface.co/spaces/micknikolic/enron

# here are the imports

# Standard library (duplicate `import os` from the original removed).
import io
import os
import re
import warnings

# Third-party.
import gradio
import nltk
import numpy as np
import pandas as pd
import pytesseract

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain import OpenAI, VectorDBQA
from langchain.document_loaders import DirectoryLoader

# Show whole frames when inspecting the dataset interactively
# (set_option accepts several option/value pairs in one call).
pd.set_option('display.max_columns', None,
              'display.max_rows', None,
              'display.max_colwidth', None
              )

# Silence third-party deprecation chatter in the hosted demo.
warnings.filterwarnings('ignore')
# here is the code

# data loading.
# i am using a subset of the enron dataset, as it would be computationally very expensive to work with over 500k observations locally.

# Relies on 'subset_enron.csv' sitting next to this script; the frame must
# expose a "message" column (read below).
data = pd.read_csv('subset_enron.csv',encoding='utf-8')
# Fixed seed keeps the 1% sample — and therefore the vector store built from
# it — reproducible across restarts of the Space.
data = data.sample(frac=0.01,random_state=12) #(5174, 2)

# Text pre-processing.

# Remove literal "\n" / "\\n" escape sequences embedded in the message text.
# NOTE(review): the pattern matches backslash-escaped newlines, not actual
# newline characters — presumably the CSV stores them escaped; confirm.
cleaned_message = data["message"].apply(lambda x: re.sub(r'\\{1,2}n', '', x))
# Plain list of message strings, handed to the Document wrapper below.
content = cleaned_message.tolist()
45
class Document:
    """Minimal stand-in for a LangChain document.

    Wraps one raw e-mail body so that ``RecursiveCharacterTextSplitter``
    (which expects objects exposing ``page_content`` and ``metadata``)
    can split the plain strings loaded from the CSV.

    Args:
        page_content (str): the document text.
        metadata (dict | None): optional metadata; each instance gets its
            own empty dict when omitted (no shared mutable default).
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

    def __repr__(self):
        # Debug-friendly representation, truncated so long e-mails don't flood logs.
        return f"{type(self).__name__}(page_content={self.page_content[:40]!r}..., metadata={self.metadata!r})"
# Wrap each cleaned e-mail body, then chunk it so no piece exceeds 1000
# characters (no overlap between consecutive chunks).
raw_docs = [Document(body) for body in content]
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_documents(raw_docs)

# Embed every chunk with OpenAI and index the vectors in a Chroma store.
# The API key is read from the Space's "openai_api_key" secret.
embedder = OpenAIEmbeddings(openai_api_key=os.getenv("openai_api_key"))
vStore = Chroma.from_documents(documents=chunks, embedding=embedder)
# Retrieval QA

# Low temperature/top_p keeps answers grounded in the retrieved e-mails;
# the "stuff" chain concatenates retrieved chunks straight into the prompt.
_qa_llm = OpenAI(
    openai_api_key=os.getenv("openai_api_key"),
    temperature=0.2,
    top_p=0.2,
    max_tokens=2000,
)
model_retrieval = RetrievalQA.from_chain_type(
    llm=_qa_llm,
    chain_type="stuff",
    retriever=vStore.as_retriever(),
)
# Building Gradio based app. The Retrieval model.

def get_answer(question):
    """Answer a question against the indexed Enron e-mail subset.

    Args:
        question (str): the end-user's free-text question.

    Returns:
        str: the retrieval-QA model's answer, grounded in the retrieved
        e-mail chunks.
    """
    return model_retrieval.run(question)
# Single-textbox UI: question in, model answer out.
question_box = gradio.Textbox(label="Enter your question here")
answer_box = gradio.Textbox(label="Answer")

iface = gradio.Interface(
    fn=get_answer,
    inputs=question_box,
    outputs=[answer_box],
    title="Retrieval QA for the subset of the Enron dataset",
    examples=[
        "Who are the senders of these emails?",
        "What's at the center of these emails?"
    ]
)

# Blocks until the Gradio server is shut down.
iface.launch()