ToddLLM committed on
Commit
a44b0d7
·
1 Parent(s): 95b96e9

fix pdf handling

Browse files
Files changed (1) hide show
  1. app.py +42 -48
app.py CHANGED
@@ -46,72 +46,65 @@ prompt = ChatPromptTemplate.from_messages(messages)
46
  chain_type_kwargs = {"prompt": prompt}
47
 
48
 
49
- def process_file(file: cl.AskFileMessage):
50
- import tempfile
 
51
 
52
- with tempfile.NamedTemporaryFile(mode="w", delete=False) as tempfile:
53
- with open(tempfile.name, "wb") as f:
54
- f.write(file.content)
55
-
56
- pypdf_loader = PyPDFLoader(tempfile.name)
57
- texts = pypdf_loader.load_and_split()
58
- texts = [text.page_content for text in texts]
59
  return texts
60
 
61
 
62
 
 
63
  @cl.on_chat_start
64
  async def on_chat_start():
65
- files = None
66
 
67
- # Wait for the user to upload a file
68
- while files is None:
69
- # Note: This now accepts both text/plain and application/pdf files
70
  files = await cl.AskFileMessage(
71
  content="Please upload a text or PDF file to begin!",
72
- accept=["text/plain", "application/pdf"],
73
- max_size_mb=20, # Assuming PDFs might be larger
74
  timeout=180,
75
  ).send()
 
 
76
 
77
- file = files[0]
78
-
79
- # Notify the user that their file is being processed
80
- msg = cl.Message(content=f"Processing `{file.name}`...")
81
- await msg.send()
82
 
83
- # Initialize an empty list for texts, this will be populated based on file type
84
  texts = []
85
 
86
- # Check the file type and process accordingly
87
- if file.content_type == "text/plain":
88
- # Handle text file
89
  with open(file.path, "r", encoding="utf-8") as f:
90
  text = f.read()
91
- texts.append(text) # Add the text to the texts list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- # Update the user about the text file
94
- await cl.Message(
95
- content=f"`{file.name}` uploaded, it contains {len(text)} characters!"
96
- ).send()
97
-
98
- elif file.content_type == "application/pdf":
99
- # Handle PDF file
100
- texts = process_file(file) # Assuming process_file() is a function you've defined to extract text from PDF
101
-
102
- # Create metadata for each chunk
103
- metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
104
-
105
- # Create a Chroma vector store
106
- embeddings = OpenAIEmbeddings()
107
- docsearch = await cl.make_async(Chroma.from_texts)(
108
- texts, embeddings, metadatas=metadatas
109
- )
110
-
111
- # The rest of your setup, like creating the chain, goes here
112
- # This part is unchanged from your second snippet
113
  message_history = ChatMessageHistory()
114
-
115
  memory = ConversationBufferMemory(
116
  memory_key="chat_history",
117
  output_key="answer",
@@ -119,6 +112,7 @@ async def on_chat_start():
119
  return_messages=True,
120
  )
121
 
 
122
  chain = ConversationalRetrievalChain.from_llm(
123
  ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
124
  chain_type="stuff",
@@ -128,9 +122,9 @@ async def on_chat_start():
128
  )
129
 
130
  # Let the user know that the system is ready
131
- msg.content = f"Processing `{file.name}` done. You can now ask questions!"
132
- await msg.update()
133
 
 
134
  cl.user_session.set("chain", chain)
135
 
136
 
 
46
  chain_type_kwargs = {"prompt": prompt}
47
 
48
 
49
def process_file(file_path: str) -> list:
    """Extract the text of a PDF file, one string per page.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        A list of page texts in page order. Pages from which no text could
        be extracted are omitted, so the list may be shorter than the
        document's page count (and may be empty).
    """
    # Kept as a function-scope import, as in the original, so PyPDF2 is only
    # required when a PDF is actually processed.
    from PyPDF2 import PdfReader

    reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in reader.pages)

    # Pages without a text layer (e.g. scanned images) produce empty or
    # whitespace-only text; skip them so the caller never receives blank
    # chunks to embed.
    return [text for text in page_texts if text and text.strip()]
60
 
61
 
62
 
63
+
64
  @cl.on_chat_start
65
  async def on_chat_start():
66
+ file = None
67
 
68
+ # Prompt users to upload either a text or PDF file
69
+ while file is None:
 
70
  files = await cl.AskFileMessage(
71
  content="Please upload a text or PDF file to begin!",
72
+ accept=["text/plain", "application/pdf"], # This line is for UI guidance
73
+ max_size_mb=20,
74
  timeout=180,
75
  ).send()
76
+ if files:
77
+ file = files[0] # Assuming the user uploads one file at a time
78
 
79
+ filename = file.name
 
 
 
 
80
 
81
+ # Initialize an empty list for texts, which will be populated based on the file type
82
  texts = []
83
 
84
+ # Process the file based on its extension
85
+ if filename.endswith('.txt'):
86
+ # Handle as text file
87
  with open(file.path, "r", encoding="utf-8") as f:
88
  text = f.read()
89
+ texts.append(text)
90
+ await cl.Message(content=f"`{filename}` uploaded, it contains {len(text)} characters!").send()
91
+ elif filename.endswith('.pdf'):
92
+ # Handle as PDF
93
+ texts = process_file(file.path) # Adjust this call according to your PDF processing implementation
94
+ else:
95
+ await cl.Message(content="Unsupported file type uploaded. Please upload a text or PDF file.").send()
96
+ return # Exit if the file type is not supported
97
+
98
+ # Process texts for conversational retrieval or other purposes here
99
+ # For demonstration, we'll just set up a simple Chroma vector store and conversational retrieval chain
100
+
101
+ # Create a Chroma vector store
102
+ embeddings = OpenAIEmbeddings()
103
+ docsearch = await cl.make_async(Chroma.from_texts)(
104
+ texts, embeddings, metadatas=[{"source": f"{i}-pl"} for i in range(len(texts))]
105
+ )
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  message_history = ChatMessageHistory()
 
108
  memory = ConversationBufferMemory(
109
  memory_key="chat_history",
110
  output_key="answer",
 
112
  return_messages=True,
113
  )
114
 
115
+ # Set up the conversational retrieval chain
116
  chain = ConversationalRetrievalChain.from_llm(
117
  ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
118
  chain_type="stuff",
 
122
  )
123
 
124
  # Let the user know that the system is ready
125
+ await cl.Message(content=f"Your file `{filename}` is now ready for questions!").send()
 
126
 
127
+ # Save the chain in the user session for later use
128
  cl.user_session.set("chain", chain)
129
 
130