HUANG-Stephanie committed on
Commit
4ae29c1
1 Parent(s): 1b183b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -23
app.py CHANGED
@@ -74,28 +74,7 @@ async def index(files: List[UploadFile] = File(...)):
74
 
75
  return {"message": f"Uploaded and converted {len(images)} pages"}
76
 
77
- @app.get("/search")
78
- async def search(query: str, k: int):
79
- qs = []
80
- with torch.no_grad():
81
- batch_query = process_queries(processor, [query], mock_image)
82
- batch_query = {k: v.to(device) for k, v in batch_query.items()}
83
- embeddings_query = model(**batch_query)
84
- qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
85
-
86
- retriever_evaluator = CustomEvaluator(is_multi_vector=True)
87
- scores = retriever_evaluator.evaluate(qs, ds)
88
-
89
- top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]
90
-
91
- results = []
92
- for idx in top_k_indices:
93
- img_byte_arr = BytesIO()
94
- images[idx].save(img_byte_arr, format='PNG')
95
- img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
96
- results.append({"image": img_base64, "page": f"Page {idx}"})
97
-
98
- # Generate PDF
99
  pdf_buffer = BytesIO()
100
  c = canvas.Canvas(pdf_buffer, pagesize=letter)
101
  width, height = letter
@@ -118,10 +97,78 @@ async def search(query: str, k: int):
118
 
119
  c.save()
120
  pdf_buffer.seek(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  # Use StreamingResponse to handle in-memory file
123
  response = StreamingResponse(pdf_buffer, media_type='application/pdf')
124
- response.headers['Content-Disposition'] = 'attachment; filename="search_results.pdf"'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  return response
127
 
 
74
 
75
  return {"message": f"Uploaded and converted {len(images)} pages"}
76
 
77
+ def generate_pdf(results):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  pdf_buffer = BytesIO()
79
  c = canvas.Canvas(pdf_buffer, pagesize=letter)
80
  width, height = letter
 
97
 
98
  c.save()
99
  pdf_buffer.seek(0)
100
+ return pdf_buffer
101
+
102
+ @app.get("/search")
103
+ async def search(query: str, k: int = 1):
104
+ qs = []
105
+ with torch.no_grad():
106
+ batch_query = process_queries(processor, [query], mock_image)
107
+ batch_query = {k: v.to(device) for k, v in batch_query.items()}
108
+ embeddings_query = model(**batch_query)
109
+ qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
110
+
111
+ retriever_evaluator = CustomEvaluator(is_multi_vector=True)
112
+ scores = retriever_evaluator.evaluate(qs, ds)
113
+
114
+ top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]
115
+
116
+ results = []
117
+ for idx in top_k_indices:
118
+ img_byte_arr = BytesIO()
119
+ images[idx].save(img_byte_arr, format='PNG')
120
+ img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
121
+ results.append({"image": img_base64, "page": f"Page {idx}"})
122
+
123
+ pdf_buffer = generate_pdf(results)
124
 
125
  # Use StreamingResponse to handle in-memory file
126
  response = StreamingResponse(pdf_buffer, media_type='application/pdf')
127
+ response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'
128
+
129
+ return response
130
+
131
+ @app.get("/search_by_cv")
132
+ async def search_by_cv(file: UploadFile = File(...), k: int = 10):
133
+ # Lire le fichier PDF uploadé
134
+ content = await file.read()
135
+ pdf_image_list = convert_from_bytes(content)
136
+
137
+ # Générer les embeddings pour les pages du PDF uploadé
138
+ qs = []
139
+ dataloader = DataLoader(
140
+ pdf_image_list,
141
+ batch_size=4,
142
+ shuffle=False,
143
+ collate_fn=lambda x: process_images(processor, x),
144
+ )
145
+ for batch_query in dataloader:
146
+ with torch.no_grad():
147
+ batch_query = {k: v.to(device) for k, v in batch_query.items()}
148
+ embeddings_query = model(**batch_query)
149
+ qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
150
+
151
+ # Comparer les embeddings du CV uploadé avec ceux déjà indexés
152
+ retriever_evaluator = CustomEvaluator(is_multi_vector=True)
153
+ scores = retriever_evaluator.evaluate(qs, ds)
154
+
155
+ # Trouver les indices des résultats les plus pertinents
156
+ top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]
157
+
158
+ # Préparer les résultats sous forme d'images
159
+ results = []
160
+ for idx in top_k_indices:
161
+ img_byte_arr = BytesIO()
162
+ images[idx].save(img_byte_arr, format='PNG')
163
+ img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
164
+ results.append({"image": img_base64, "page": f"Page {idx}"})
165
+
166
+ # Générer le PDF des résultats
167
+ pdf_buffer = generate_pdf(results)
168
+
169
+ # Utiliser StreamingResponse pour renvoyer le fichier PDF généré
170
+ response = StreamingResponse(pdf_buffer, media_type='application/pdf')
171
+ response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'
172
 
173
  return response
174