LVKinyanjui committed
Commit 5effe6a
1 Parent(s): 00876e7

Added RAPTOR example code

examples/OLMoE_1B_7B.ipynb ADDED
@@ -0,0 +1,1162 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ },
15
+ "widgets": {
16
+ "application/vnd.jupyter.widget-state+json": {
17
+ "9074c90ec0f74bf29bf36d65ebba1b96": {
18
+ "model_module": "@jupyter-widgets/controls",
19
+ "model_name": "HBoxModel",
20
+ "model_module_version": "1.5.0",
21
+ "state": {
22
+ "_dom_classes": [],
23
+ "_model_module": "@jupyter-widgets/controls",
24
+ "_model_module_version": "1.5.0",
25
+ "_model_name": "HBoxModel",
26
+ "_view_count": null,
27
+ "_view_module": "@jupyter-widgets/controls",
28
+ "_view_module_version": "1.5.0",
29
+ "_view_name": "HBoxView",
30
+ "box_style": "",
31
+ "children": [
32
+ "IPY_MODEL_39038e915d234800bcaaa9b3895f1b68",
33
+ "IPY_MODEL_6d3b0f9e6c494cbea42456b60b011564",
34
+ "IPY_MODEL_831274f9a0ec40668ec7fcc5ac2771c3"
35
+ ],
36
+ "layout": "IPY_MODEL_8fa5eaaf1c8d4bc18ee076d33d563a0a"
37
+ }
38
+ },
39
+ "39038e915d234800bcaaa9b3895f1b68": {
40
+ "model_module": "@jupyter-widgets/controls",
41
+ "model_name": "HTMLModel",
42
+ "model_module_version": "1.5.0",
43
+ "state": {
44
+ "_dom_classes": [],
45
+ "_model_module": "@jupyter-widgets/controls",
46
+ "_model_module_version": "1.5.0",
47
+ "_model_name": "HTMLModel",
48
+ "_view_count": null,
49
+ "_view_module": "@jupyter-widgets/controls",
50
+ "_view_module_version": "1.5.0",
51
+ "_view_name": "HTMLView",
52
+ "description": "",
53
+ "description_tooltip": null,
54
+ "layout": "IPY_MODEL_4a614b527a2a49789d41df08c3466764",
55
+ "placeholder": "​",
56
+ "style": "IPY_MODEL_30e7f062bc1b44a3a99363cb1a06fea9",
57
+ "value": "Loading checkpoint shards:  33%"
58
+ }
59
+ },
60
+ "6d3b0f9e6c494cbea42456b60b011564": {
61
+ "model_module": "@jupyter-widgets/controls",
62
+ "model_name": "FloatProgressModel",
63
+ "model_module_version": "1.5.0",
64
+ "state": {
65
+ "_dom_classes": [],
66
+ "_model_module": "@jupyter-widgets/controls",
67
+ "_model_module_version": "1.5.0",
68
+ "_model_name": "FloatProgressModel",
69
+ "_view_count": null,
70
+ "_view_module": "@jupyter-widgets/controls",
71
+ "_view_module_version": "1.5.0",
72
+ "_view_name": "ProgressView",
73
+ "bar_style": "",
74
+ "description": "",
75
+ "description_tooltip": null,
76
+ "layout": "IPY_MODEL_528eadb5cd604eb78b99b3781ad8a357",
77
+ "max": 3,
78
+ "min": 0,
79
+ "orientation": "horizontal",
80
+ "style": "IPY_MODEL_4f5a0839fd414dbaa5a2f6306cfa5ecc",
81
+ "value": 1
82
+ }
83
+ },
84
+ "831274f9a0ec40668ec7fcc5ac2771c3": {
85
+ "model_module": "@jupyter-widgets/controls",
86
+ "model_name": "HTMLModel",
87
+ "model_module_version": "1.5.0",
88
+ "state": {
89
+ "_dom_classes": [],
90
+ "_model_module": "@jupyter-widgets/controls",
91
+ "_model_module_version": "1.5.0",
92
+ "_model_name": "HTMLModel",
93
+ "_view_count": null,
94
+ "_view_module": "@jupyter-widgets/controls",
95
+ "_view_module_version": "1.5.0",
96
+ "_view_name": "HTMLView",
97
+ "description": "",
98
+ "description_tooltip": null,
99
+ "layout": "IPY_MODEL_d44816c885194cef98b86893263b9b10",
100
+ "placeholder": "​",
101
+ "style": "IPY_MODEL_9c8da3b512bf414684a22b77d7fe48c6",
102
+ "value": " 1/3 [00:33<01:06, 33.42s/it]"
103
+ }
104
+ },
105
+ "8fa5eaaf1c8d4bc18ee076d33d563a0a": {
106
+ "model_module": "@jupyter-widgets/base",
107
+ "model_name": "LayoutModel",
108
+ "model_module_version": "1.2.0",
109
+ "state": {
110
+ "_model_module": "@jupyter-widgets/base",
111
+ "_model_module_version": "1.2.0",
112
+ "_model_name": "LayoutModel",
113
+ "_view_count": null,
114
+ "_view_module": "@jupyter-widgets/base",
115
+ "_view_module_version": "1.2.0",
116
+ "_view_name": "LayoutView",
117
+ "align_content": null,
118
+ "align_items": null,
119
+ "align_self": null,
120
+ "border": null,
121
+ "bottom": null,
122
+ "display": null,
123
+ "flex": null,
124
+ "flex_flow": null,
125
+ "grid_area": null,
126
+ "grid_auto_columns": null,
127
+ "grid_auto_flow": null,
128
+ "grid_auto_rows": null,
129
+ "grid_column": null,
130
+ "grid_gap": null,
131
+ "grid_row": null,
132
+ "grid_template_areas": null,
133
+ "grid_template_columns": null,
134
+ "grid_template_rows": null,
135
+ "height": null,
136
+ "justify_content": null,
137
+ "justify_items": null,
138
+ "left": null,
139
+ "margin": null,
140
+ "max_height": null,
141
+ "max_width": null,
142
+ "min_height": null,
143
+ "min_width": null,
144
+ "object_fit": null,
145
+ "object_position": null,
146
+ "order": null,
147
+ "overflow": null,
148
+ "overflow_x": null,
149
+ "overflow_y": null,
150
+ "padding": null,
151
+ "right": null,
152
+ "top": null,
153
+ "visibility": null,
154
+ "width": null
155
+ }
156
+ },
157
+ "4a614b527a2a49789d41df08c3466764": {
158
+ "model_module": "@jupyter-widgets/base",
159
+ "model_name": "LayoutModel",
160
+ "model_module_version": "1.2.0",
161
+ "state": {
162
+ "_model_module": "@jupyter-widgets/base",
163
+ "_model_module_version": "1.2.0",
164
+ "_model_name": "LayoutModel",
165
+ "_view_count": null,
166
+ "_view_module": "@jupyter-widgets/base",
167
+ "_view_module_version": "1.2.0",
168
+ "_view_name": "LayoutView",
169
+ "align_content": null,
170
+ "align_items": null,
171
+ "align_self": null,
172
+ "border": null,
173
+ "bottom": null,
174
+ "display": null,
175
+ "flex": null,
176
+ "flex_flow": null,
177
+ "grid_area": null,
178
+ "grid_auto_columns": null,
179
+ "grid_auto_flow": null,
180
+ "grid_auto_rows": null,
181
+ "grid_column": null,
182
+ "grid_gap": null,
183
+ "grid_row": null,
184
+ "grid_template_areas": null,
185
+ "grid_template_columns": null,
186
+ "grid_template_rows": null,
187
+ "height": null,
188
+ "justify_content": null,
189
+ "justify_items": null,
190
+ "left": null,
191
+ "margin": null,
192
+ "max_height": null,
193
+ "max_width": null,
194
+ "min_height": null,
195
+ "min_width": null,
196
+ "object_fit": null,
197
+ "object_position": null,
198
+ "order": null,
199
+ "overflow": null,
200
+ "overflow_x": null,
201
+ "overflow_y": null,
202
+ "padding": null,
203
+ "right": null,
204
+ "top": null,
205
+ "visibility": null,
206
+ "width": null
207
+ }
208
+ },
209
+ "30e7f062bc1b44a3a99363cb1a06fea9": {
210
+ "model_module": "@jupyter-widgets/controls",
211
+ "model_name": "DescriptionStyleModel",
212
+ "model_module_version": "1.5.0",
213
+ "state": {
214
+ "_model_module": "@jupyter-widgets/controls",
215
+ "_model_module_version": "1.5.0",
216
+ "_model_name": "DescriptionStyleModel",
217
+ "_view_count": null,
218
+ "_view_module": "@jupyter-widgets/base",
219
+ "_view_module_version": "1.2.0",
220
+ "_view_name": "StyleView",
221
+ "description_width": ""
222
+ }
223
+ },
224
+ "528eadb5cd604eb78b99b3781ad8a357": {
225
+ "model_module": "@jupyter-widgets/base",
226
+ "model_name": "LayoutModel",
227
+ "model_module_version": "1.2.0",
228
+ "state": {
229
+ "_model_module": "@jupyter-widgets/base",
230
+ "_model_module_version": "1.2.0",
231
+ "_model_name": "LayoutModel",
232
+ "_view_count": null,
233
+ "_view_module": "@jupyter-widgets/base",
234
+ "_view_module_version": "1.2.0",
235
+ "_view_name": "LayoutView",
236
+ "align_content": null,
237
+ "align_items": null,
238
+ "align_self": null,
239
+ "border": null,
240
+ "bottom": null,
241
+ "display": null,
242
+ "flex": null,
243
+ "flex_flow": null,
244
+ "grid_area": null,
245
+ "grid_auto_columns": null,
246
+ "grid_auto_flow": null,
247
+ "grid_auto_rows": null,
248
+ "grid_column": null,
249
+ "grid_gap": null,
250
+ "grid_row": null,
251
+ "grid_template_areas": null,
252
+ "grid_template_columns": null,
253
+ "grid_template_rows": null,
254
+ "height": null,
255
+ "justify_content": null,
256
+ "justify_items": null,
257
+ "left": null,
258
+ "margin": null,
259
+ "max_height": null,
260
+ "max_width": null,
261
+ "min_height": null,
262
+ "min_width": null,
263
+ "object_fit": null,
264
+ "object_position": null,
265
+ "order": null,
266
+ "overflow": null,
267
+ "overflow_x": null,
268
+ "overflow_y": null,
269
+ "padding": null,
270
+ "right": null,
271
+ "top": null,
272
+ "visibility": null,
273
+ "width": null
274
+ }
275
+ },
276
+ "4f5a0839fd414dbaa5a2f6306cfa5ecc": {
277
+ "model_module": "@jupyter-widgets/controls",
278
+ "model_name": "ProgressStyleModel",
279
+ "model_module_version": "1.5.0",
280
+ "state": {
281
+ "_model_module": "@jupyter-widgets/controls",
282
+ "_model_module_version": "1.5.0",
283
+ "_model_name": "ProgressStyleModel",
284
+ "_view_count": null,
285
+ "_view_module": "@jupyter-widgets/base",
286
+ "_view_module_version": "1.2.0",
287
+ "_view_name": "StyleView",
288
+ "bar_color": null,
289
+ "description_width": ""
290
+ }
291
+ },
292
+ "d44816c885194cef98b86893263b9b10": {
293
+ "model_module": "@jupyter-widgets/base",
294
+ "model_name": "LayoutModel",
295
+ "model_module_version": "1.2.0",
296
+ "state": {
297
+ "_model_module": "@jupyter-widgets/base",
298
+ "_model_module_version": "1.2.0",
299
+ "_model_name": "LayoutModel",
300
+ "_view_count": null,
301
+ "_view_module": "@jupyter-widgets/base",
302
+ "_view_module_version": "1.2.0",
303
+ "_view_name": "LayoutView",
304
+ "align_content": null,
305
+ "align_items": null,
306
+ "align_self": null,
307
+ "border": null,
308
+ "bottom": null,
309
+ "display": null,
310
+ "flex": null,
311
+ "flex_flow": null,
312
+ "grid_area": null,
313
+ "grid_auto_columns": null,
314
+ "grid_auto_flow": null,
315
+ "grid_auto_rows": null,
316
+ "grid_column": null,
317
+ "grid_gap": null,
318
+ "grid_row": null,
319
+ "grid_template_areas": null,
320
+ "grid_template_columns": null,
321
+ "grid_template_rows": null,
322
+ "height": null,
323
+ "justify_content": null,
324
+ "justify_items": null,
325
+ "left": null,
326
+ "margin": null,
327
+ "max_height": null,
328
+ "max_width": null,
329
+ "min_height": null,
330
+ "min_width": null,
331
+ "object_fit": null,
332
+ "object_position": null,
333
+ "order": null,
334
+ "overflow": null,
335
+ "overflow_x": null,
336
+ "overflow_y": null,
337
+ "padding": null,
338
+ "right": null,
339
+ "top": null,
340
+ "visibility": null,
341
+ "width": null
342
+ }
343
+ },
344
+ "9c8da3b512bf414684a22b77d7fe48c6": {
345
+ "model_module": "@jupyter-widgets/controls",
346
+ "model_name": "DescriptionStyleModel",
347
+ "model_module_version": "1.5.0",
348
+ "state": {
349
+ "_model_module": "@jupyter-widgets/controls",
350
+ "_model_module_version": "1.5.0",
351
+ "_model_name": "DescriptionStyleModel",
352
+ "_view_count": null,
353
+ "_view_module": "@jupyter-widgets/base",
354
+ "_view_module_version": "1.2.0",
355
+ "_view_name": "StyleView",
356
+ "description_width": ""
357
+ }
358
+ }
359
+ }
360
+ }
361
+ },
362
+ "cells": [
363
+ {
364
+ "cell_type": "markdown",
365
+ "source": [
366
+ "<h2 align=center> OLMoE-1B-7B </h2>"
367
+ ],
368
+ "metadata": {
369
+ "id": "exGk3x7MxVMy"
370
+ }
371
+ },
372
+ {
373
+ "cell_type": "markdown",
374
+ "source": [
375
+ "The transformer library requires special install steps because the code that supports the OMOE models was only introduced in this [PR](https://github.com/huggingface/transformers/pull/32406).\n",
376
+ "This code has not been updated on PYPI so we have to install it from source."
377
+ ],
378
+ "metadata": {
379
+ "id": "t6IQl5rxrZDp"
380
+ }
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "source": [
385
+ "!git init\n",
386
+ "!git branch -m main"
387
+ ],
388
+ "metadata": {
389
+ "colab": {
390
+ "base_uri": "https://localhost:8080/"
391
+ },
392
+ "id": "2AziwR7vq2bq",
393
+ "outputId": "c2d1effa-0a73-420c-fffd-962424a9cefb"
394
+ },
395
+ "execution_count": null,
396
+ "outputs": [
397
+ {
398
+ "output_type": "stream",
399
+ "name": "stdout",
400
+ "text": [
401
+ "\u001b[33mhint: Using 'master' as the name for the initial branch. This default branch name\u001b[m\n",
402
+ "\u001b[33mhint: is subject to change. To configure the initial branch name to use in all\u001b[m\n",
403
+ "\u001b[33mhint: of your new repositories, which will suppress this warning, call:\u001b[m\n",
404
+ "\u001b[33mhint: \u001b[m\n",
405
+ "\u001b[33mhint: \tgit config --global init.defaultBranch <name>\u001b[m\n",
406
+ "\u001b[33mhint: \u001b[m\n",
407
+ "\u001b[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and\u001b[m\n",
408
+ "\u001b[33mhint: 'development'. The just-created branch can be renamed via this command:\u001b[m\n",
409
+ "\u001b[33mhint: \u001b[m\n",
410
+ "\u001b[33mhint: \tgit branch -m <name>\u001b[m\n",
411
+ "Initialized empty Git repository in /content/.git/\n"
412
+ ]
413
+ }
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "source": [
419
+ "!export PR_NUMBER=32406"
420
+ ],
421
+ "metadata": {
422
+ "colab": {
423
+ "base_uri": "https://localhost:8080/"
424
+ },
425
+ "id": "bqhf8TGhxvB9",
426
+ "outputId": "101d77d0-5957-4534-b9a6-f3c794af86f2"
427
+ },
428
+ "execution_count": 3,
429
+ "outputs": [
430
+ {
431
+ "output_type": "stream",
432
+ "name": "stdout",
433
+ "text": [
434
+ "\n"
435
+ ]
436
+ }
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "source": [
442
+ "!git remote add origin https://github.com/huggingface/transformers.git"
443
+ ],
444
+ "metadata": {
445
+ "id": "PL3UBh06qo_C"
446
+ },
447
+ "execution_count": null,
448
+ "outputs": []
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": null,
453
+ "metadata": {
454
+ "colab": {
455
+ "base_uri": "https://localhost:8080/"
456
+ },
457
+ "id": "7bitxdKMptwl",
458
+ "outputId": "9c5016c2-b7ca-4477-e1f4-c896e5ca6f5a",
459
+ "collapsed": true
460
+ },
461
+ "outputs": [
462
+ {
463
+ "output_type": "stream",
464
+ "name": "stdout",
465
+ "text": [
466
+ "remote: Enumerating objects: 188166, done.\u001b[K\n",
467
+ "remote: Counting objects: 2% (1/47)\u001b[K\rremote: Counting objects: 4% (2/47)\u001b[K\rremote: Counting objects: 6% (3/47)\u001b[K\rremote: Counting objects: 8% (4/47)\u001b[K\rremote: Counting objects: 10% (5/47)\u001b[K\rremote: Counting objects: 12% (6/47)\u001b[K\rremote: Counting objects: 14% (7/47)\u001b[K\rremote: Counting objects: 17% (8/47)\u001b[K\rremote: Counting objects: 19% (9/47)\u001b[K\rremote: Counting objects: 21% (10/47)\u001b[K\rremote: Counting objects: 23% (11/47)\u001b[K\rremote: Counting objects: 25% (12/47)\u001b[K\rremote: Counting objects: 27% (13/47)\u001b[K\rremote: Counting objects: 29% (14/47)\u001b[K\rremote: Counting objects: 31% (15/47)\u001b[K\rremote: Counting objects: 34% (16/47)\u001b[K\rremote: Counting objects: 36% (17/47)\u001b[K\rremote: Counting objects: 38% (18/47)\u001b[K\rremote: Counting objects: 40% (19/47)\u001b[K\rremote: Counting objects: 42% (20/47)\u001b[K\rremote: Counting objects: 44% (21/47)\u001b[K\rremote: Counting objects: 46% (22/47)\u001b[K\rremote: Counting objects: 48% (23/47)\u001b[K\rremote: Counting objects: 51% (24/47)\u001b[K\rremote: Counting objects: 53% (25/47)\u001b[K\rremote: Counting objects: 55% (26/47)\u001b[K\rremote: Counting objects: 57% (27/47)\u001b[K\rremote: Counting objects: 59% (28/47)\u001b[K\rremote: Counting objects: 61% (29/47)\u001b[K\rremote: Counting objects: 63% (30/47)\u001b[K\rremote: Counting objects: 65% (31/47)\u001b[K\rremote: Counting objects: 68% (32/47)\u001b[K\rremote: Counting objects: 70% (33/47)\u001b[K\rremote: Counting objects: 72% (34/47)\u001b[K\rremote: Counting objects: 74% (35/47)\u001b[K\rremote: Counting objects: 76% (36/47)\u001b[K\rremote: Counting objects: 78% (37/47)\u001b[K\rremote: Counting objects: 80% (38/47)\u001b[K\rremote: Counting objects: 82% (39/47)\u001b[K\rremote: Counting objects: 85% (40/47)\u001b[K\rremote: Counting objects: 87% (41/47)\u001b[K\rremote: Counting objects: 89% (42/47)\u001b[K\rremote: Counting objects: 91% (43/47)\u001b[K\rremote: Counting objects: 93% (44/47)\u001b[K\rremote: Counting objects: 95% (45/47)\u001b[K\rremote: Counting objects: 97% (46/47)\u001b[K\rremote: Counting objects: 100% (47/47)\u001b[K\rremote: Counting objects: 100% (47/47), done.\u001b[K\n",
468
+ "remote: Compressing objects: 100% (46/46), done.\u001b[K\n",
469
+ "remote: Total 188166 (delta 1), reused 47 (delta 1), pack-reused 188119 (from 1)\u001b[K\n",
470
+ "Receiving objects: 100% (188166/188166), 214.64 MiB | 20.89 MiB/s, done.\n",
471
+ "Resolving deltas: 100% (134665/134665), done.\n",
472
+ "From https://github.com/huggingface/transformers\n",
473
+ " * [new ref] refs/pull/32406/head -> pr-32406\n",
474
+ " * [new tag] 0.1.2 -> 0.1.2\n",
475
+ " * [new tag] 0.5.0 -> 0.5.0\n",
476
+ " * [new tag] 1.0 -> 1.0\n",
477
+ " * [new tag] 1.1.0 -> 1.1.0\n",
478
+ " * [new tag] 1.2.0 -> 1.2.0\n",
479
+ " * [new tag] 3.0.1 -> 3.0.1\n",
480
+ " * [new tag] 4.3.0.rc1 -> 4.3.0.rc1\n",
481
+ " * [new tag] v0.1.2 -> v0.1.2\n",
482
+ " * [new tag] v0.2.0 -> v0.2.0\n",
483
+ " * [new tag] v0.3.0 -> v0.3.0\n",
484
+ " * [new tag] v0.4.0 -> v0.4.0\n",
485
+ " * [new tag] v0.5.0 -> v0.5.0\n",
486
+ " * [new tag] v0.5.1 -> v0.5.1\n",
487
+ " * [new tag] v0.6.0 -> v0.6.0\n",
488
+ " * [new tag] v0.6.1 -> v0.6.1\n",
489
+ " * [new tag] v0.6.2 -> v0.6.2\n",
490
+ " * [new tag] v1.0.0 -> v1.0.0\n",
491
+ " * [new tag] v2.0.0 -> v2.0.0\n",
492
+ " * [new tag] v2.1.0 -> v2.1.0\n",
493
+ " * [new tag] v2.1.1 -> v2.1.1\n",
494
+ " * [new tag] v2.10.0 -> v2.10.0\n",
495
+ " * [new tag] v2.11.0 -> v2.11.0\n",
496
+ " * [new tag] v2.2.0 -> v2.2.0\n",
497
+ " * [new tag] v2.2.1 -> v2.2.1\n",
498
+ " * [new tag] v2.2.2 -> v2.2.2\n",
499
+ " * [new tag] v2.3.0 -> v2.3.0\n",
500
+ " * [new tag] v2.4.0 -> v2.4.0\n",
501
+ " * [new tag] v2.4.1 -> v2.4.1\n",
502
+ " * [new tag] v2.5.0 -> v2.5.0\n",
503
+ " * [new tag] v2.5.1 -> v2.5.1\n",
504
+ " * [new tag] v2.6.0 -> v2.6.0\n",
505
+ " * [new tag] v2.7.0 -> v2.7.0\n",
506
+ " * [new tag] v2.8.0 -> v2.8.0\n",
507
+ " * [new tag] v2.9.0 -> v2.9.0\n",
508
+ " * [new tag] v2.9.1 -> v2.9.1\n",
509
+ " * [new tag] v3.0.0 -> v3.0.0\n",
510
+ " * [new tag] v3.0.1 -> v3.0.1\n",
511
+ " * [new tag] v3.0.2 -> v3.0.2\n",
512
+ " * [new tag] v3.1.0 -> v3.1.0\n",
513
+ " * [new tag] v3.2.0 -> v3.2.0\n",
514
+ " * [new tag] v3.3.0 -> v3.3.0\n",
515
+ " * [new tag] v3.3.1 -> v3.3.1\n",
516
+ " * [new tag] v3.4.0 -> v3.4.0\n",
517
+ " * [new tag] v3.5.0 -> v3.5.0\n",
518
+ " * [new tag] v4.0.0-rc-1 -> v4.0.0-rc-1\n",
519
+ " * [new tag] v4.1.0 -> v4.1.0\n",
520
+ " * [new tag] v4.1.1 -> v4.1.1\n",
521
+ " * [new tag] v4.10.0 -> v4.10.0\n",
522
+ " * [new tag] v4.11.0 -> v4.11.0\n",
523
+ " * [new tag] v4.12.0 -> v4.12.0\n",
524
+ " * [new tag] v4.13.0 -> v4.13.0\n",
525
+ " * [new tag] v4.14.0 -> v4.14.0\n",
526
+ " * [new tag] v4.15.0 -> v4.15.0\n",
527
+ " * [new tag] v4.16.0 -> v4.16.0\n",
528
+ " * [new tag] v4.2.0 -> v4.2.0\n",
529
+ " * [new tag] v4.3.0.rc1 -> v4.3.0.rc1\n",
530
+ " * [new tag] v4.33.1 -> v4.33.1\n",
531
+ " * [new tag] v4.4.0 -> v4.4.0\n",
532
+ " * [new tag] v4.5.0 -> v4.5.0\n",
533
+ " * [new tag] v4.6.0 -> v4.6.0\n",
534
+ " * [new tag] v4.7.0 -> v4.7.0\n",
535
+ " * [new tag] v4.8.0 -> v4.8.0\n",
536
+ " * [new tag] v4.9.0 -> v4.9.0\n"
537
+ ]
538
+ }
539
+ ],
540
+ "source": [
541
+ "!git fetch origin pull/32406/head:pr-32406"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "source": [
547
+ "!git checkout pr-32406"
548
+ ],
549
+ "metadata": {
550
+ "colab": {
551
+ "base_uri": "https://localhost:8080/"
552
+ },
553
+ "id": "_s3Mt90Hqbbd",
554
+ "outputId": "b8ac5e14-c28c-41f4-ba61-8e936497b125"
555
+ },
556
+ "execution_count": null,
557
+ "outputs": [
558
+ {
559
+ "output_type": "stream",
560
+ "name": "stdout",
561
+ "text": [
562
+ "Updating files: 76% (3390/4452)\rUpdating files: 77% (3429/4452)\rUpdating files: 78% (3473/4452)\rUpdating files: 79% (3518/4452)\rUpdating files: 80% (3562/4452)\rUpdating files: 81% (3607/4452)\rUpdating files: 82% (3651/4452)\rUpdating files: 83% (3696/4452)\rUpdating files: 84% (3740/4452)\rUpdating files: 85% (3785/4452)\rUpdating files: 86% (3829/4452)\rUpdating files: 87% (3874/4452)\rUpdating files: 88% (3918/4452)\rUpdating files: 89% (3963/4452)\rUpdating files: 90% (4007/4452)\rUpdating files: 91% (4052/4452)\rUpdating files: 92% (4096/4452)\rUpdating files: 93% (4141/4452)\rUpdating files: 94% (4185/4452)\rUpdating files: 95% (4230/4452)\rUpdating files: 96% (4274/4452)\rUpdating files: 97% (4319/4452)\rUpdating files: 98% (4363/4452)\rUpdating files: 99% (4408/4452)\rUpdating files: 100% (4452/4452)\rUpdating files: 100% (4452/4452), done.\n",
563
+ "Switched to branch 'pr-32406'\n"
564
+ ]
565
+ }
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "source": [
571
+ "!pip install -e .[all]"
572
+ ],
573
+ "metadata": {
574
+ "id": "cq0YWuyxrQbf",
575
+ "colab": {
576
+ "base_uri": "https://localhost:8080/",
577
+ "height": 1000
578
+ },
579
+ "collapsed": true,
580
+ "outputId": "9c6ea2e5-74f5-4f61-9043-215b7de2572e"
581
+ },
582
+ "execution_count": 9,
583
+ "outputs": [
584
+ {
585
+ "output_type": "stream",
586
+ "name": "stdout",
587
+ "text": [
588
+ "Obtaining file:///content\n",
589
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
590
+ " Checking if build backend supports build_editable ... \u001b[?25l\u001b[?25hdone\n",
591
+ " Getting requirements to build editable ... \u001b[?25l\u001b[?25hdone\n",
592
+ " Preparing editable metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
593
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (3.16.0)\n",
594
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.24.6)\n",
595
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (1.26.4)\n",
596
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (24.1)\n",
597
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (6.0.2)\n",
598
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (2024.5.15)\n",
599
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (2.32.3)\n",
600
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.19.1)\n",
601
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.4.5)\n",
602
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (4.66.5)\n",
603
+ "Collecting tensorflow<2.16,>2.9 (from transformers==4.45.0.dev0)\n",
604
+ " Downloading tensorflow-2.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)\n",
605
+ "Collecting onnxconverter-common (from transformers==4.45.0.dev0)\n",
606
+ " Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl.metadata (4.2 kB)\n",
607
+ "Collecting tf2onnx (from transformers==4.45.0.dev0)\n",
608
+ " Downloading tf2onnx-1.16.1-py3-none-any.whl.metadata (1.3 kB)\n",
609
+ "Collecting tensorflow-text<2.16 (from transformers==4.45.0.dev0)\n",
610
+ " Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)\n",
611
+ "Collecting keras-nlp<0.14.0,>=0.3.1 (from transformers==4.45.0.dev0)\n",
612
+ " Downloading keras_nlp-0.12.1-py3-none-any.whl.metadata (6.8 kB)\n",
613
+ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (2.4.0+cu121)\n",
614
+ "Requirement already satisfied: accelerate>=0.26.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.34.2)\n",
615
+ "Collecting jax<=0.4.13,>=0.4.1 (from transformers==4.45.0.dev0)\n",
616
+ " Downloading jax-0.4.13.tar.gz (1.3 MB)\n",
617
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
618
+ "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
619
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
620
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
621
+ "Collecting jaxlib<=0.4.13,>=0.4.1 (from transformers==4.45.0.dev0)\n",
622
+ " Downloading jaxlib-0.4.13-cp310-cp310-manylinux2014_x86_64.whl.metadata (2.1 kB)\n",
623
+ "Collecting flax<=0.7.0,>=0.4.1 (from transformers==4.45.0.dev0)\n",
624
+ " Downloading flax-0.7.0-py3-none-any.whl.metadata (9.9 kB)\n",
625
+ "Collecting optax<=0.1.4,>=0.0.8 (from transformers==4.45.0.dev0)\n",
626
+ " Downloading optax-0.1.4-py3-none-any.whl.metadata (12 kB)\n",
627
+ "Collecting scipy<1.13.0 (from transformers==4.45.0.dev0)\n",
628
+ " Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
629
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
630
+ "\u001b[?25hRequirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.1.99)\n",
631
+ "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (3.20.3)\n",
632
+ "Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (2.4.0+cu121)\n",
633
+ "Requirement already satisfied: librosa in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.10.2.post1)\n",
634
+ "Collecting pyctcdecode>=0.4.0 (from transformers==4.45.0.dev0)\n",
635
+ " Downloading pyctcdecode-0.5.0-py2.py3-none-any.whl.metadata (20 kB)\n",
636
+ "Collecting phonemizer (from transformers==4.45.0.dev0)\n",
637
+ " Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)\n",
638
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.2/48.2 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
639
+ "\u001b[?25hCollecting kenlm (from transformers==4.45.0.dev0)\n",
640
+ " Downloading kenlm-0.2.0.tar.gz (427 kB)\n",
641
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m427.4/427.4 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
642
+ "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
643
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
644
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
645
+ "Collecting Pillow<=15.0,>=10.0.1 (from transformers==4.45.0.dev0)\n",
646
+ " Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n",
647
+ "Collecting optuna (from transformers==4.45.0.dev0)\n",
648
+ " Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)\n",
649
+ "Collecting ray>=2.7.0 (from ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0)\n",
650
+ " Downloading ray-2.35.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (16 kB)\n",
651
+ "Collecting sigopt (from transformers==4.45.0.dev0)\n",
652
+ " Downloading sigopt-8.8.3-py2.py3-none-any.whl.metadata (2.7 kB)\n",
653
+ "Collecting timm<=0.9.16 (from transformers==4.45.0.dev0)\n",
654
+ " Downloading timm-0.9.16-py3-none-any.whl.metadata (38 kB)\n",
655
+ "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from transformers==4.45.0.dev0) (0.19.0+cu121)\n",
656
+ "Collecting codecarbon==1.2.0 (from transformers==4.45.0.dev0)\n",
657
+ " Downloading codecarbon-1.2.0-py3-none-any.whl.metadata (13 kB)\n",
658
+ "Collecting decord==0.6.0 (from transformers==4.45.0.dev0)\n",
659
+ " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)\n",
660
+ "Collecting av==9.2.0 (from transformers==4.45.0.dev0)\n",
661
+ " Downloading av-9.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)\n",
662
+ "Collecting APScheduler (from codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
663
+ " Downloading APScheduler-3.10.4-py3-none-any.whl.metadata (5.7 kB)\n",
664
+ "Collecting dash (from codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
665
+ " Downloading dash-2.18.1-py3-none-any.whl.metadata (10 kB)\n",
666
+ "Collecting dash-bootstrap-components (from codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
667
+ " Downloading dash_bootstrap_components-1.6.0-py3-none-any.whl.metadata (5.2 kB)\n",
668
+ "Collecting fire (from codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
669
+ " Downloading fire-0.6.0.tar.gz (88 kB)\n",
670
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.4/88.4 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
671
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
672
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from codecarbon==1.2.0->transformers==4.45.0.dev0) (2.1.4)\n",
673
+ "Collecting pynvml (from codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
674
+ " Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)\n",
675
+ "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon==1.2.0->transformers==4.45.0.dev0) (9.0.0)\n",
676
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.26.0->transformers==4.45.0.dev0) (5.9.5)\n",
677
+ "Requirement already satisfied: msgpack in /usr/local/lib/python3.10/dist-packages (from flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (1.0.8)\n",
678
+ "Requirement already satisfied: orbax-checkpoint in /usr/local/lib/python3.10/dist-packages (from flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (0.6.3)\n",
679
+ "Requirement already satisfied: tensorstore in /usr/local/lib/python3.10/dist-packages (from flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (0.1.65)\n",
680
+ "Requirement already satisfied: rich>=11.1 in /usr/local/lib/python3.10/dist-packages (from flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (13.8.1)\n",
681
+ "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (4.12.2)\n",
682
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers==4.45.0.dev0) (2024.6.1)\n",
683
+ "Requirement already satisfied: ml-dtypes>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from jax<=0.4.13,>=0.4.1->transformers==4.45.0.dev0) (0.4.0)\n",
684
+ "Requirement already satisfied: opt-einsum in /usr/local/lib/python3.10/dist-packages (from jax<=0.4.13,>=0.4.1->transformers==4.45.0.dev0) (3.3.0)\n",
685
+ "Collecting keras-core (from keras-nlp<0.14.0,>=0.3.1->transformers==4.45.0.dev0)\n",
686
+ " Downloading keras_core-0.1.7-py3-none-any.whl.metadata (4.3 kB)\n",
687
+ "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from keras-nlp<0.14.0,>=0.3.1->transformers==4.45.0.dev0) (1.4.0)\n",
688
+ "Requirement already satisfied: dm-tree in /usr/local/lib/python3.10/dist-packages (from keras-nlp<0.14.0,>=0.3.1->transformers==4.45.0.dev0) (0.1.8)\n",
689
+ "Requirement already satisfied: kagglehub in /usr/local/lib/python3.10/dist-packages (from keras-nlp<0.14.0,>=0.3.1->transformers==4.45.0.dev0) (0.2.9)\n",
690
+ "Requirement already satisfied: chex>=0.1.5 in /usr/local/lib/python3.10/dist-packages (from optax<=0.1.4,>=0.0.8->transformers==4.45.0.dev0) (0.1.86)\n",
691
+ "Collecting pygtrie<3.0,>=2.1 (from pyctcdecode>=0.4.0->transformers==4.45.0.dev0)\n",
692
+ " Downloading pygtrie-2.5.0-py3-none-any.whl.metadata (7.5 kB)\n",
693
+ "Collecting hypothesis<7,>=6.14 (from pyctcdecode>=0.4.0->transformers==4.45.0.dev0)\n",
694
+ " Downloading hypothesis-6.112.1-py3-none-any.whl.metadata (6.2 kB)\n",
695
+ "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.10/dist-packages (from ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (8.1.7)\n",
696
+ "Requirement already satisfied: jsonschema in /usr/local/lib/python3.10/dist-packages (from ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (4.23.0)\n",
697
+ "Requirement already satisfied: aiosignal in /usr/local/lib/python3.10/dist-packages (from ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (1.3.1)\n",
698
+ "Requirement already satisfied: frozenlist in /usr/local/lib/python3.10/dist-packages (from ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (1.4.1)\n",
699
+ "Collecting tensorboardX>=1.9 (from ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0)\n",
700
+ " Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)\n",
701
+ "Requirement already satisfied: pyarrow>=6.0.1 in /usr/local/lib/python3.10/dist-packages (from ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (14.0.2)\n",
702
+ "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (1.6.3)\n",
703
+ "Requirement already satisfied: flatbuffers>=23.5.26 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (24.3.25)\n",
704
+ "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.6.0)\n",
705
+ "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.2.0)\n",
706
+ "Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (3.11.0)\n",
707
+ "Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (18.1.1)\n",
708
+ "Collecting ml-dtypes>=0.1.0 (from jax<=0.4.13,>=0.4.1->transformers==4.45.0.dev0)\n",
709
+ " Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
710
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (71.0.4)\n",
711
+ "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (1.16.0)\n",
712
+ "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (2.4.0)\n",
713
+ "Collecting wrapt<1.15,>=1.11.0 (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0)\n",
714
+ " Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
715
+ "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.37.1)\n",
716
+ "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (1.64.1)\n",
717
+ "Collecting tensorboard<2.16,>=2.15 (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0)\n",
718
+ " Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)\n",
719
+ "Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0)\n",
720
+ " Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)\n",
721
+ "Collecting keras<2.16,>=2.15.0 (from tensorflow<2.16,>2.9->transformers==4.45.0.dev0)\n",
722
+ " Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)\n",
723
+ "Requirement already satisfied: tensorflow-hub>=0.13.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow-text<2.16->transformers==4.45.0.dev0) (0.16.1)\n",
724
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->transformers==4.45.0.dev0) (1.13.2)\n",
725
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->transformers==4.45.0.dev0) (3.3)\n",
726
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->transformers==4.45.0.dev0) (3.1.4)\n",
727
+ "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (3.0.1)\n",
728
+ "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (1.3.2)\n",
729
+ "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (1.4.2)\n",
730
+ "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (4.4.2)\n",
731
+ "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (0.60.0)\n",
732
+ "Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (0.12.1)\n",
733
+ "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (1.8.2)\n",
734
+ "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (0.5.0.post1)\n",
735
+ "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa->transformers==4.45.0.dev0) (0.4)\n",
736
+ "Collecting onnx (from onnxconverter-common->transformers==4.45.0.dev0)\n",
737
+ " Downloading onnx-1.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
738
+ "INFO: pip is looking at multiple versions of onnxconverter-common to determine which version is compatible with other requirements. This could take a while.\n",
739
+ "Collecting onnxconverter-common (from transformers==4.45.0.dev0)\n",
740
+ " Downloading onnxconverter_common-1.13.0-py2.py3-none-any.whl.metadata (2.6 kB)\n",
741
+ "Collecting alembic>=1.5.0 (from optuna->transformers==4.45.0.dev0)\n",
742
+ " Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)\n",
743
+ "Collecting colorlog (from optuna->transformers==4.45.0.dev0)\n",
744
+ " Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n",
745
+ "Requirement already satisfied: sqlalchemy>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from optuna->transformers==4.45.0.dev0) (2.0.34)\n",
746
+ "Collecting segments (from phonemizer->transformers==4.45.0.dev0)\n",
747
+ " Downloading segments-2.2.1-py2.py3-none-any.whl.metadata (3.3 kB)\n",
748
+ "Requirement already satisfied: attrs>=18.1 in /usr/local/lib/python3.10/dist-packages (from phonemizer->transformers==4.45.0.dev0) (24.2.0)\n",
749
+ "Collecting dlinfo (from phonemizer->transformers==4.45.0.dev0)\n",
750
+ " Downloading dlinfo-1.2.1-py3-none-any.whl.metadata (1.1 kB)\n",
751
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.45.0.dev0) (3.3.2)\n",
752
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.45.0.dev0) (3.8)\n",
753
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.45.0.dev0) (2.0.7)\n",
754
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.45.0.dev0) (2024.8.30)\n",
755
+ "Collecting backoff<2.0.0,>=1.10.0 (from sigopt->transformers==4.45.0.dev0)\n",
756
+ " Downloading backoff-1.11.1-py2.py3-none-any.whl.metadata (12 kB)\n",
757
+ "Collecting GitPython>=2.0.0 (from sigopt->transformers==4.45.0.dev0)\n",
758
+ " Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)\n",
759
+ "Collecting pypng>=0.0.20 (from sigopt->transformers==4.45.0.dev0)\n",
760
+ " Downloading pypng-0.20220715.0-py3-none-any.whl.metadata (13 kB)\n",
761
+ "Collecting urllib3<3,>=1.21.1 (from requests->transformers==4.45.0.dev0)\n",
762
+ " Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)\n",
763
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.1/50.1 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
764
+ "\u001b[?25hCollecting Mako (from alembic>=1.5.0->optuna->transformers==4.45.0.dev0)\n",
765
+ " Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)\n",
766
+ "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.44.0)\n",
767
+ "INFO: pip is looking at multiple versions of chex to determine which version is compatible with other requirements. This could take a while.\n",
768
+ "Collecting chex>=0.1.5 (from optax<=0.1.4,>=0.0.8->transformers==4.45.0.dev0)\n",
769
+ " Downloading chex-0.1.85-py3-none-any.whl.metadata (17 kB)\n",
770
+ " Downloading chex-0.1.84-py3-none-any.whl.metadata (17 kB)\n",
771
+ " Downloading chex-0.1.83-py3-none-any.whl.metadata (17 kB)\n",
772
+ " Downloading chex-0.1.82-py3-none-any.whl.metadata (17 kB)\n",
773
+ "Requirement already satisfied: toolz>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from chex>=0.1.5->optax<=0.1.4,>=0.0.8->transformers==4.45.0.dev0) (0.12.1)\n",
774
+ "Collecting gitdb<5,>=4.0.1 (from GitPython>=2.0.0->sigopt->transformers==4.45.0.dev0)\n",
775
+ " Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n",
776
+ "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from hypothesis<7,>=6.14->pyctcdecode>=0.4.0->transformers==4.45.0.dev0) (2.4.0)\n",
777
+ "Requirement already satisfied: exceptiongroup>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from hypothesis<7,>=6.14->pyctcdecode>=0.4.0->transformers==4.45.0.dev0) (1.2.2)\n",
778
+ "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa->transformers==4.45.0.dev0) (0.43.0)\n",
779
+ "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa->transformers==4.45.0.dev0) (4.3.2)\n",
780
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1->flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (3.0.0)\n",
781
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1->flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (2.16.1)\n",
782
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa->transformers==4.45.0.dev0) (3.5.0)\n",
783
+ "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.1->librosa->transformers==4.45.0.dev0) (1.17.1)\n",
784
+ "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.3.0->optuna->transformers==4.45.0.dev0) (3.1.0)\n",
785
+ "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (2.27.0)\n",
786
+ "Requirement already satisfied: google-auth-oauthlib<2,>=0.5 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (1.2.1)\n",
787
+ "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (3.7)\n",
788
+ "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.7.2)\n",
789
+ "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (3.0.4)\n",
790
+ "Requirement already satisfied: tf-keras>=2.14.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow-hub>=0.13.0->tensorflow-text<2.16->transformers==4.45.0.dev0) (2.17.0)\n",
791
+ "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from APScheduler->codecarbon==1.2.0->transformers==4.45.0.dev0) (2024.2)\n",
792
+ "Requirement already satisfied: tzlocal!=3.*,>=2.0 in /usr/local/lib/python3.10/dist-packages (from APScheduler->codecarbon==1.2.0->transformers==4.45.0.dev0) (5.2)\n",
793
+ "Requirement already satisfied: Flask<3.1,>=1.0.4 in /usr/local/lib/python3.10/dist-packages (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0) (2.2.5)\n",
794
+ "Requirement already satisfied: plotly>=5.0.0 in /usr/local/lib/python3.10/dist-packages (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0) (5.15.0)\n",
795
+ "Collecting dash-html-components==2.0.0 (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
796
+ " Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)\n",
797
+ "Collecting dash-core-components==2.0.0 (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
798
+ " Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)\n",
799
+ "Collecting dash-table==5.0.0 (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
800
+ " Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)\n",
801
+ "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.10/dist-packages (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0) (8.5.0)\n",
802
+ "Collecting retrying (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0)\n",
803
+ " Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)\n",
804
+ "Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.10/dist-packages (from dash->codecarbon==1.2.0->transformers==4.45.0.dev0) (1.6.0)\n",
805
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->transformers==4.45.0.dev0) (2.1.5)\n",
806
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema->ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (2023.12.1)\n",
807
+ "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema->ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (0.35.1)\n",
808
+ "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema->ray>=2.7.0->ray[tune]>=2.7.0; extra == \"all\"->transformers==4.45.0.dev0) (0.20.0)\n",
809
+ "Requirement already satisfied: namex in /usr/local/lib/python3.10/dist-packages (from keras-core->keras-nlp<0.14.0,>=0.3.1->transformers==4.45.0.dev0) (0.0.8)\n",
810
+ "Requirement already satisfied: etils[epath,epy] in /usr/local/lib/python3.10/dist-packages (from orbax-checkpoint->flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (1.9.4)\n",
811
+ "INFO: pip is looking at multiple versions of orbax-checkpoint to determine which version is compatible with other requirements. This could take a while.\n",
812
+ "Collecting orbax-checkpoint (from flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0)\n",
813
+ " Downloading orbax_checkpoint-0.6.2-py3-none-any.whl.metadata (1.8 kB)\n",
814
+ " Downloading orbax_checkpoint-0.6.1-py3-none-any.whl.metadata (1.8 kB)\n",
815
+ " Downloading orbax_checkpoint-0.6.0-py3-none-any.whl.metadata (1.8 kB)\n",
816
+ " Downloading orbax_checkpoint-0.5.23-py3-none-any.whl.metadata (1.8 kB)\n",
817
+ " Downloading orbax_checkpoint-0.5.22-py3-none-any.whl.metadata (1.8 kB)\n",
818
+ " Downloading orbax_checkpoint-0.5.21-py3-none-any.whl.metadata (1.8 kB)\n",
819
+ " Downloading orbax_checkpoint-0.5.20-py3-none-any.whl.metadata (1.8 kB)\n",
820
+ "INFO: pip is still looking at multiple versions of orbax-checkpoint to determine which version is compatible with other requirements. This could take a while.\n",
821
+ " Downloading orbax_checkpoint-0.5.19-py3-none-any.whl.metadata (1.8 kB)\n",
822
+ " Downloading orbax_checkpoint-0.5.18-py3-none-any.whl.metadata (1.8 kB)\n",
823
+ " Downloading orbax_checkpoint-0.5.17-py3-none-any.whl.metadata (1.8 kB)\n",
824
+ " Downloading orbax_checkpoint-0.5.16-py3-none-any.whl.metadata (1.8 kB)\n",
825
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->codecarbon==1.2.0->transformers==4.45.0.dev0) (2.8.2)\n",
826
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->codecarbon==1.2.0->transformers==4.45.0.dev0) (2024.1)\n",
827
+ "Collecting clldutils>=1.7.3 (from segments->phonemizer->transformers==4.45.0.dev0)\n",
828
+ " Downloading clldutils-3.22.2-py2.py3-none-any.whl.metadata (3.0 kB)\n",
829
+ "Collecting csvw>=1.5.6 (from segments->phonemizer->transformers==4.45.0.dev0)\n",
830
+ " Downloading csvw-3.3.1-py2.py3-none-any.whl.metadata (10 kB)\n",
831
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->transformers==4.45.0.dev0) (1.3.0)\n",
832
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.1->librosa->transformers==4.45.0.dev0) (2.22)\n",
833
+ "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer->transformers==4.45.0.dev0) (0.9.0)\n",
834
+ "Collecting bibtexparser>=2.0.0b4 (from clldutils>=1.7.3->segments->phonemizer->transformers==4.45.0.dev0)\n",
835
+ " Downloading bibtexparser-2.0.0b7-py3-none-any.whl.metadata (5.6 kB)\n",
836
+ "Collecting pylatexenc (from clldutils>=1.7.3->segments->phonemizer->transformers==4.45.0.dev0)\n",
837
+ " Downloading pylatexenc-2.10.tar.gz (162 kB)\n",
838
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.6/162.6 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
839
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
840
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer->transformers==4.45.0.dev0) (4.9.4)\n",
841
+ "Requirement already satisfied: babel in /usr/local/lib/python3.10/dist-packages (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0) (2.16.0)\n",
842
+ "Collecting colorama (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0)\n",
843
+ " Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)\n",
844
+ "Collecting isodate (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0)\n",
845
+ " Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)\n",
846
+ "Collecting language-tags (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0)\n",
847
+ " Downloading language_tags-1.2.0-py3-none-any.whl.metadata (2.1 kB)\n",
848
+ "Collecting rdflib (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0)\n",
849
+ " Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)\n",
850
+ "Collecting rfc3986<2 (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0)\n",
851
+ " Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)\n",
852
+ "Requirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0) (4.1.1)\n",
853
+ "Requirement already satisfied: itsdangerous>=2.0 in /usr/local/lib/python3.10/dist-packages (from Flask<3.1,>=1.0.4->dash->codecarbon==1.2.0->transformers==4.45.0.dev0) (2.2.0)\n",
854
+ "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython>=2.0.0->sigopt->transformers==4.45.0.dev0)\n",
855
+ " Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n",
856
+ "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (5.5.0)\n",
857
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.4.1)\n",
858
+ "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (4.9)\n",
859
+ "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (1.3.1)\n",
860
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=11.1->flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (0.1.2)\n",
861
+ "Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly>=5.0.0->dash->codecarbon==1.2.0->transformers==4.45.0.dev0) (9.0.0)\n",
862
+ "INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.\n",
863
+ "Collecting tf-keras>=2.14.1 (from tensorflow-hub>=0.13.0->tensorflow-text<2.16->transformers==4.45.0.dev0)\n",
864
+ " Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)\n",
865
+ " Downloading tf_keras-2.15.1-py3-none-any.whl.metadata (1.7 kB)\n",
866
+ "Requirement already satisfied: importlib_resources in /usr/local/lib/python3.10/dist-packages (from etils[epath,epy]->orbax-checkpoint->flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (6.4.5)\n",
867
+ "Requirement already satisfied: zipp in /usr/local/lib/python3.10/dist-packages (from etils[epath,epy]->orbax-checkpoint->flax<=0.7.0,>=0.4.1->transformers==4.45.0.dev0) (3.20.1)\n",
868
+ "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (0.6.1)\n",
869
+ "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow<2.16,>2.9->transformers==4.45.0.dev0) (3.2.2)\n",
870
+ "Requirement already satisfied: pyparsing<4,>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer->transformers==4.45.0.dev0) (3.1.4)\n",
871
+ "Downloading av-9.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)\n",
872
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m28.8/28.8 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
873
+ "\u001b[?25hDownloading codecarbon-1.2.0-py3-none-any.whl (135 kB)\n",
874
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m135.0/135.0 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
875
+ "\u001b[?25hDownloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n",
876
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m75.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
877
+ "\u001b[?25hDownloading flax-0.7.0-py3-none-any.whl (225 kB)\n",
878
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.9/225.9 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
879
+ "\u001b[?25hDownloading jaxlib-0.4.13-cp310-cp310-manylinux2014_x86_64.whl (71.6 MB)\n",
880
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.6/71.6 MB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
881
+ "\u001b[?25hDownloading keras_nlp-0.12.1-py3-none-any.whl (570 kB)\n",
882
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m570.5/570.5 kB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
883
+ "\u001b[?25hDownloading optax-0.1.4-py3-none-any.whl (154 kB)\n",
884
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.9/154.9 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
885
+ "\u001b[?25hDownloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)\n",
886
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m83.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
887
+ "\u001b[?25hDownloading pyctcdecode-0.5.0-py2.py3-none-any.whl (39 kB)\n",
888
+ "Downloading ray-2.35.0-cp310-cp310-manylinux2014_x86_64.whl (65.0 MB)\n",
889
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.0/65.0 MB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
890
+ "\u001b[?25hDownloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)\n",
891
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.4/38.4 MB\u001b[0m \u001b[31m18.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
892
+ "\u001b[?25hDownloading tensorflow-2.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)\n",
893
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
894
+ "\u001b[?25hDownloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)\n",
895
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.2/5.2 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
896
+ "\u001b[?25hDownloading timm-0.9.16-py3-none-any.whl (2.2 MB)\n",
897
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m49.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
898
+ "\u001b[?25hDownloading onnxconverter_common-1.13.0-py2.py3-none-any.whl (83 kB)\n",
899
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
900
+ "\u001b[?25hDownloading optuna-4.0.0-py3-none-any.whl (362 kB)\n",
901
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m362.8/362.8 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
902
+ "\u001b[?25hDownloading phonemizer-3.3.0-py3-none-any.whl (103 kB)\n",
903
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.8/103.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
904
+ "\u001b[?25hDownloading sigopt-8.8.3-py2.py3-none-any.whl (198 kB)\n",
905
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.8/198.8 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
906
+ "\u001b[?25hDownloading tf2onnx-1.16.1-py3-none-any.whl (455 kB)\n",
907
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
908
+ "\u001b[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)\n",
909
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.0/233.0 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
910
+ "\u001b[?25hDownloading backoff-1.11.1-py2.py3-none-any.whl (13 kB)\n",
911
+ "Downloading chex-0.1.82-py3-none-any.whl (94 kB)\n",
912
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
913
+ "\u001b[?25hDownloading GitPython-3.1.43-py3-none-any.whl (207 kB)\n",
914
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.3/207.3 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
915
+ "\u001b[?25hDownloading hypothesis-6.112.1-py3-none-any.whl (467 kB)\n",
916
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m467.5/467.5 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
917
+ "\u001b[?25hDownloading keras-2.15.0-py3-none-any.whl (1.7 MB)\n",
918
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m46.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
919
+ "\u001b[?25hDownloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)\n",
920
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
921
+ "\u001b[?25hDownloading onnx-1.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)\n",
922
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m84.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
923
+ "\u001b[?25hDownloading pygtrie-2.5.0-py3-none-any.whl (25 kB)\n",
924
+ "Downloading pypng-0.20220715.0-py3-none-any.whl (58 kB)\n",
925
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.1/58.1 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
926
+ "\u001b[?25hDownloading tensorboard-2.15.2-py3-none-any.whl (5.5 MB)\n",
927
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m95.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
928
+ "\u001b[?25hDownloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)\n",
929
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.7/101.7 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
930
+ "\u001b[?25hDownloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)\n",
931
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m442.0/442.0 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
932
+ "\u001b[?25hDownloading urllib3-1.26.20-py2.py3-none-any.whl (144 kB)\n",
933
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.2/144.2 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
934
+ "\u001b[?25hDownloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (77 kB)\n",
935
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
936
+ "\u001b[?25hDownloading APScheduler-3.10.4-py3-none-any.whl (59 kB)\n",
937
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.3/59.3 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
938
+ "\u001b[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n",
939
+ "Downloading dash-2.18.1-py3-none-any.whl (7.5 MB)\n",
940
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.5/7.5 MB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
941
+ "\u001b[?25hDownloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)\n",
942
+ "Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)\n",
943
+ "Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)\n",
944
+ "Downloading dash_bootstrap_components-1.6.0-py3-none-any.whl (222 kB)\n",
945
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.5/222.5 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
946
+ "\u001b[?25hDownloading dlinfo-1.2.1-py3-none-any.whl (3.6 kB)\n",
947
+ "Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)\n",
948
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m950.8/950.8 kB\u001b[0m \u001b[31m36.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
949
+ "\u001b[?25hDownloading orbax_checkpoint-0.5.16-py3-none-any.whl (217 kB)\n",
950
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m217.0/217.0 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
951
+ "\u001b[?25hDownloading pynvml-11.5.3-py3-none-any.whl (53 kB)\n",
952
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
953
+ "\u001b[?25hDownloading segments-2.2.1-py2.py3-none-any.whl (15 kB)\n",
954
+ "Downloading clldutils-3.22.2-py2.py3-none-any.whl (1.7 MB)\n",
955
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m45.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
956
+ "\u001b[?25hDownloading csvw-3.3.1-py2.py3-none-any.whl (57 kB)\n",
957
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
958
+ "\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
959
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
960
+ "\u001b[?25hDownloading tf_keras-2.15.1-py3-none-any.whl (1.7 MB)\n",
961
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
962
+ "\u001b[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)\n",
963
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
964
+ "\u001b[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)\n",
965
+ "Downloading bibtexparser-2.0.0b7-py3-none-any.whl (38 kB)\n",
966
+ "Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)\n",
967
+ "Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
968
+ "Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
969
+ "Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n",
970
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
971
+ "\u001b[?25hDownloading language_tags-1.2.0-py3-none-any.whl (213 kB)\n",
972
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.4/213.4 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
973
+ "\u001b[?25hDownloading rdflib-7.0.0-py3-none-any.whl (531 kB)\n",
974
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m531.9/531.9 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
975
+ "\u001b[?25hBuilding wheels for collected packages: transformers, jax, kenlm, fire, pylatexenc\n",
976
+ " Building editable for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
977
+ " Created wheel for transformers: filename=transformers-4.45.0.dev0-0.editable-py3-none-any.whl size=17235 sha256=47f8d7fd33fbf68a16dcdf9eb478de0330868e96b41dddcaa2e90ce820af3d0c\n",
978
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-4a6k3c9o/wheels/e8/d3/96/0e8c7135806cbda4db28d12fc8d710e5e4f66ced1411163e67\n",
979
+ " Building wheel for jax (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
980
+ " Created wheel for jax: filename=jax-0.4.13-py3-none-any.whl size=1518703 sha256=696a457ee4a211a88e694e2ebf08a4f507a8a30771990bfddfe138813b2c9365\n",
981
+ " Stored in directory: /root/.cache/pip/wheels/f3/7a/25/f297f69029b5e4064e4736a0c4b3996a44cc27781c120bcb99\n",
982
+ " Building wheel for kenlm (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
983
+ " Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=3184429 sha256=13f3f18abd6e36cd73d41e00a8f62353ba919d0dc6283b26d233ce3d7a001263\n",
984
+ " Stored in directory: /root/.cache/pip/wheels/fd/80/e0/18f4148e863fb137bd87e21ee2bf423b81b3ed6989dab95135\n",
985
+ " Building wheel for fire (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
986
+ " Created wheel for fire: filename=fire-0.6.0-py2.py3-none-any.whl size=117030 sha256=663b80270e4bec0b4b2a2170c8587b95783f5bb0683d5e2d919836b362b8e838\n",
987
+ " Stored in directory: /root/.cache/pip/wheels/d6/6d/5d/5b73fa0f46d01a793713f8859201361e9e581ced8c75e5c6a3\n",
988
+ " Building wheel for pylatexenc (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
989
+ " Created wheel for pylatexenc: filename=pylatexenc-2.10-py3-none-any.whl size=136817 sha256=ac148679a22a494d8e13b2f1d3c8edf4753a07b4a509bdc91ede1fac56069ade\n",
990
+ " Stored in directory: /root/.cache/pip/wheels/d3/31/8b/e09b0386afd80cfc556c00408c9aeea5c35c4d484a9c762fd5\n",
991
+ "Successfully built transformers jax kenlm fire pylatexenc\n",
992
+ "Installing collected packages: rfc3986, pypng, pylatexenc, pygtrie, language-tags, kenlm, dlinfo, dash-table, dash-html-components, dash-core-components, av, wrapt, urllib3, tensorflow-estimator, tensorboardX, smmap, scipy, retrying, pynvml, Pillow, onnx, ml-dtypes, Mako, keras, isodate, hypothesis, fire, decord, colorlog, colorama, bibtexparser, backoff, APScheduler, rdflib, pyctcdecode, onnxconverter-common, jaxlib, jax, gitdb, clldutils, alembic, tf2onnx, orbax-checkpoint, optuna, keras-core, GitPython, dash, chex, timm, sigopt, ray, optax, dash-bootstrap-components, csvw, transformers, tensorboard, segments, flax, codecarbon, tensorflow, phonemizer, tf-keras, tensorflow-text, keras-nlp\n",
993
+ " Attempting uninstall: wrapt\n",
994
+ " Found existing installation: wrapt 1.16.0\n",
995
+ " Uninstalling wrapt-1.16.0:\n",
996
+ " Successfully uninstalled wrapt-1.16.0\n",
997
+ " Attempting uninstall: urllib3\n",
998
+ " Found existing installation: urllib3 2.0.7\n",
999
+ " Uninstalling urllib3-2.0.7:\n",
1000
+ " Successfully uninstalled urllib3-2.0.7\n",
1001
+ " Attempting uninstall: scipy\n",
1002
+ " Found existing installation: scipy 1.13.1\n",
1003
+ " Uninstalling scipy-1.13.1:\n",
1004
+ " Successfully uninstalled scipy-1.13.1\n",
1005
+ " Attempting uninstall: Pillow\n",
1006
+ " Found existing installation: Pillow 9.4.0\n",
1007
+ " Uninstalling Pillow-9.4.0:\n",
1008
+ " Successfully uninstalled Pillow-9.4.0\n",
1009
+ " Attempting uninstall: ml-dtypes\n",
1010
+ " Found existing installation: ml-dtypes 0.4.0\n",
1011
+ " Uninstalling ml-dtypes-0.4.0:\n",
1012
+ " Successfully uninstalled ml-dtypes-0.4.0\n",
1013
+ " Attempting uninstall: keras\n",
1014
+ " Found existing installation: keras 3.4.1\n",
1015
+ " Uninstalling keras-3.4.1:\n",
1016
+ " Successfully uninstalled keras-3.4.1\n",
1017
+ " Attempting uninstall: jaxlib\n",
1018
+ " Found existing installation: jaxlib 0.4.26+cuda12.cudnn89\n",
1019
+ " Uninstalling jaxlib-0.4.26+cuda12.cudnn89:\n",
1020
+ " Successfully uninstalled jaxlib-0.4.26+cuda12.cudnn89\n",
1021
+ " Attempting uninstall: jax\n",
1022
+ " Found existing installation: jax 0.4.26\n",
1023
+ " Uninstalling jax-0.4.26:\n",
1024
+ " Successfully uninstalled jax-0.4.26\n",
1025
+ " Attempting uninstall: orbax-checkpoint\n",
1026
+ " Found existing installation: orbax-checkpoint 0.6.3\n",
1027
+ " Uninstalling orbax-checkpoint-0.6.3:\n",
1028
+ " Successfully uninstalled orbax-checkpoint-0.6.3\n",
1029
+ " Attempting uninstall: chex\n",
1030
+ " Found existing installation: chex 0.1.86\n",
1031
+ " Uninstalling chex-0.1.86:\n",
1032
+ " Successfully uninstalled chex-0.1.86\n",
1033
+ " Attempting uninstall: optax\n",
1034
+ " Found existing installation: optax 0.2.2\n",
1035
+ " Uninstalling optax-0.2.2:\n",
1036
+ " Successfully uninstalled optax-0.2.2\n",
1037
+ " Attempting uninstall: transformers\n",
1038
+ " Found existing installation: transformers 4.44.2\n",
1039
+ " Uninstalling transformers-4.44.2:\n",
1040
+ " Successfully uninstalled transformers-4.44.2\n",
1041
+ " Attempting uninstall: tensorboard\n",
1042
+ " Found existing installation: tensorboard 2.17.0\n",
1043
+ " Uninstalling tensorboard-2.17.0:\n",
1044
+ " Successfully uninstalled tensorboard-2.17.0\n",
1045
+ " Attempting uninstall: flax\n",
1046
+ " Found existing installation: flax 0.8.4\n",
1047
+ " Uninstalling flax-0.8.4:\n",
1048
+ " Successfully uninstalled flax-0.8.4\n",
1049
+ " Attempting uninstall: tensorflow\n",
1050
+ " Found existing installation: tensorflow 2.17.0\n",
1051
+ " Uninstalling tensorflow-2.17.0:\n",
1052
+ " Successfully uninstalled tensorflow-2.17.0\n",
1053
+ " Attempting uninstall: tf-keras\n",
1054
+ " Found existing installation: tf_keras 2.17.0\n",
1055
+ " Uninstalling tf_keras-2.17.0:\n",
1056
+ " Successfully uninstalled tf_keras-2.17.0\n",
1057
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
1058
+ "osqp 0.6.7.post0 requires scipy!=1.12.0,>=0.13.2, but you have scipy 1.12.0 which is incompatible.\u001b[0m\u001b[31m\n",
1059
+ "\u001b[0mSuccessfully installed APScheduler-3.10.4 GitPython-3.1.43 Mako-1.3.5 Pillow-10.4.0 alembic-1.13.2 av-9.2.0 backoff-1.11.1 bibtexparser-2.0.0b7 chex-0.1.82 clldutils-3.22.2 codecarbon-1.2.0 colorama-0.4.6 colorlog-6.8.2 csvw-3.3.1 dash-2.18.1 dash-bootstrap-components-1.6.0 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 decord-0.6.0 dlinfo-1.2.1 fire-0.6.0 flax-0.7.0 gitdb-4.0.11 hypothesis-6.112.1 isodate-0.6.1 jax-0.4.13 jaxlib-0.4.13 kenlm-0.2.0 keras-2.15.0 keras-core-0.1.7 keras-nlp-0.12.1 language-tags-1.2.0 ml-dtypes-0.3.2 onnx-1.16.2 onnxconverter-common-1.13.0 optax-0.1.4 optuna-4.0.0 orbax-checkpoint-0.5.16 phonemizer-3.3.0 pyctcdecode-0.5.0 pygtrie-2.5.0 pylatexenc-2.10 pynvml-11.5.3 pypng-0.20220715.0 ray-2.35.0 rdflib-7.0.0 retrying-1.3.4 rfc3986-1.5.0 scipy-1.12.0 segments-2.2.1 sigopt-8.8.3 smmap-5.0.1 tensorboard-2.15.2 tensorboardX-2.6.2.2 tensorflow-2.15.1 tensorflow-estimator-2.15.0 tensorflow-text-2.15.0 tf-keras-2.15.1 tf2onnx-1.16.1 timm-0.9.16 transformers-4.45.0.dev0 urllib3-1.26.20 wrapt-1.14.1\n"
1060
+ ]
1061
+ },
1062
+ {
1063
+ "output_type": "display_data",
1064
+ "data": {
1065
+ "application/vnd.colab-display-data+json": {
1066
+ "pip_warning": {
1067
+ "packages": [
1068
+ "PIL"
1069
+ ]
1070
+ },
1071
+ "id": "13f986199d30470480155a0a26153ae8"
1072
+ }
1073
+ },
1074
+ "metadata": {}
1075
+ }
1076
+ ]
1077
+ },
1078
+ {
1079
+ "cell_type": "code",
1080
+ "source": [
1081
+ "from transformers import OlmoeForCausalLM, AutoTokenizer"
1082
+ ],
1083
+ "metadata": {
1084
+ "id": "ijEEXUXztzXD"
1085
+ },
1086
+ "execution_count": 6,
1087
+ "outputs": []
1088
+ },
1089
+ {
1090
+ "cell_type": "code",
1091
+ "source": [
1092
+ "import torch"
1093
+ ],
1094
+ "metadata": {
1095
+ "id": "3sUL_mpAvywU"
1096
+ },
1097
+ "execution_count": 5,
1098
+ "outputs": []
1099
+ },
1100
+ {
1101
+ "cell_type": "code",
1102
+ "source": [
1103
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
1104
+ "\n",
1105
+ "# Load different ckpts via passing e.g. `revision=step10000-tokens41B`\n",
1106
+ "model = OlmoeForCausalLM.from_pretrained(\"allenai/OLMoE-1B-7B-0924\").to(DEVICE)\n",
1107
+ "tokenizer = AutoTokenizer.from_pretrained(\"allenai/OLMoE-1B-7B-0924\")\n",
1108
+ "inputs = tokenizer(\"Bitcoin is\", return_tensors=\"pt\")\n",
1109
+ "inputs = {k: v.to(DEVICE) for k, v in inputs.items()}\n",
1110
+ "out = model.generate(**inputs, max_length=64)\n",
1111
+ "print(tokenizer.decode(out[0]))"
1112
+ ],
1113
+ "metadata": {
1114
+ "colab": {
1115
+ "base_uri": "https://localhost:8080/",
1116
+ "height": 49,
1117
+ "referenced_widgets": [
1118
+ "9074c90ec0f74bf29bf36d65ebba1b96",
1119
+ "39038e915d234800bcaaa9b3895f1b68",
1120
+ "6d3b0f9e6c494cbea42456b60b011564",
1121
+ "831274f9a0ec40668ec7fcc5ac2771c3",
1122
+ "8fa5eaaf1c8d4bc18ee076d33d563a0a",
1123
+ "4a614b527a2a49789d41df08c3466764",
1124
+ "30e7f062bc1b44a3a99363cb1a06fea9",
1125
+ "528eadb5cd604eb78b99b3781ad8a357",
1126
+ "4f5a0839fd414dbaa5a2f6306cfa5ecc",
1127
+ "d44816c885194cef98b86893263b9b10",
1128
+ "9c8da3b512bf414684a22b77d7fe48c6"
1129
+ ]
1130
+ },
1131
+ "id": "Kj8Ti_I_wcvc",
1132
+ "outputId": "4419620d-f678-4f1c-9e27-593298f9c0ed"
1133
+ },
1134
+ "execution_count": null,
1135
+ "outputs": [
1136
+ {
1137
+ "output_type": "display_data",
1138
+ "data": {
1139
+ "text/plain": [
1140
+ "Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
1141
+ ],
1142
+ "application/vnd.jupyter.widget-view+json": {
1143
+ "version_major": 2,
1144
+ "version_minor": 0,
1145
+ "model_id": "9074c90ec0f74bf29bf36d65ebba1b96"
1146
+ }
1147
+ },
1148
+ "metadata": {}
1149
+ }
1150
+ ]
1151
+ },
1152
+ {
1153
+ "cell_type": "code",
1154
+ "source": [],
1155
+ "metadata": {
1156
+ "id": "R0VMDS_Hwm83"
1157
+ },
1158
+ "execution_count": null,
1159
+ "outputs": []
1160
+ }
1161
+ ]
1162
+ }
examples/RAPTOR.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/RAPTOR_llama_index.ipynb ADDED
@@ -0,0 +1,379 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval\n",
8
+ "\n",
9
+ "This notebook shows how to use an implementation of RAPTOR with llama-index, leveraging the RAPTOR llama-pack.\n",
10
+ "\n",
11
+ "RAPTOR works by recursively clustering and summarizing clusters in layers for retrieval.\n",
12
+ "\n",
13
+ "There two retrieval modes:\n",
14
+ "- tree_traversal -- traversing the tree of clusters, performing top-k at each level in the tree.\n",
15
+ "- collapsed -- treat the entire tree as a giant pile of nodes, perform simple top-k.\n",
16
+ "\n",
17
+ "See [the paper](https://arxiv.org/abs/2401.18059) for full algorithm details."
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "## Setup"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "!pip install llama-index llama-index-packs-raptor llama-index-vector-stores-qdrant"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "from llama_index.packs.raptor import RaptorPack\n",
43
+ "\n",
44
+ "# optionally download the pack to inspect/modify it yourself!\n",
45
+ "# from llama_index.core.llama_pack import download_llama_pack\n",
46
+ "# RaptorPack = download_llama_pack(\"RaptorPack\", \"./raptor_pack\")"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.\n",
59
+ "ERROR: could not open HSTS store at '/home/loganm/.wget-hsts'. HSTS will be disabled.\n",
60
+ "--2024-02-29 22:16:11-- https://arxiv.org/pdf/2401.18059.pdf\n",
61
+ "Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.195.42, 151.101.131.42, ...\n",
62
+ "Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.\n",
63
+ "HTTP request sent, awaiting response... 200 OK\n",
64
+ "Length: 2547113 (2.4M) [application/pdf]\n",
65
+ "Saving to: ‘./raptor_paper.pdf’\n",
66
+ "\n",
67
+ "./raptor_paper.pdf 100%[===================>] 2.43M 12.5MB/s in 0.2s \n",
68
+ "\n",
69
+ "2024-02-29 22:16:12 (12.5 MB/s) - ‘./raptor_paper.pdf’ saved [2547113/2547113]\n",
70
+ "\n"
71
+ ]
72
+ }
73
+ ],
74
+ "source": [
75
+ "!wget https://arxiv.org/pdf/2401.18059.pdf -O ./raptor_paper.pdf"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "import os\n",
85
+ "\n",
86
+ "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "markdown",
91
+ "metadata": {},
92
+ "source": [
93
+ "## Constructing the Clusters/Hierarchy Tree"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "import nest_asyncio\n",
103
+ "\n",
104
+ "nest_asyncio.apply()"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "from llama_index.core import SimpleDirectoryReader\n",
114
+ "\n",
115
+ "documents = SimpleDirectoryReader(input_files=[\"./raptor_paper.pdf\"]).load_data()"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "metadata": {},
122
+ "outputs": [
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "Generating embeddings for level 0.\n",
128
+ "Performing clustering for level 0.\n",
129
+ "Generating summaries for level 0 with 10 clusters.\n",
130
+ "Level 0 created summaries/clusters: 10\n",
131
+ "Generating embeddings for level 1.\n",
132
+ "Performing clustering for level 1.\n",
133
+ "Generating summaries for level 1 with 1 clusters.\n",
134
+ "Level 1 created summaries/clusters: 1\n",
135
+ "Generating embeddings for level 2.\n",
136
+ "Performing clustering for level 2.\n",
137
+ "Generating summaries for level 2 with 1 clusters.\n",
138
+ "Level 2 created summaries/clusters: 1\n"
139
+ ]
140
+ }
141
+ ],
142
+ "source": [
143
+ "from llama_index.core.node_parser import SentenceSplitter\n",
144
+ "from llama_index.llms.openai import OpenAI\n",
145
+ "from llama_index.embeddings.openai import OpenAIEmbedding\n",
146
+ "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
147
+ "import chromadb\n",
148
+ "\n",
149
+ "client = chromadb.PersistentClient(path=\"./raptor_paper_db\")\n",
150
+ "collection = client.get_or_create_collection(\"raptor\")\n",
151
+ "\n",
152
+ "vector_store = ChromaVectorStore(chroma_collection=collection)\n",
153
+ "\n",
154
+ "raptor_pack = RaptorPack(\n",
155
+ " documents,\n",
156
+ " embed_model=OpenAIEmbedding(\n",
157
+ " model=\"text-embedding-3-small\"\n",
158
+ " ), # used for embedding clusters\n",
159
+ " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1), # used for generating summaries\n",
160
+ " vector_store=vector_store, # used for storage\n",
161
+ " similarity_top_k=2, # top k for each layer, or overall top-k for collapsed\n",
162
+ " mode=\"collapsed\", # sets default mode\n",
163
+ " transformations=[\n",
164
+ " SentenceSplitter(chunk_size=400, chunk_overlap=50)\n",
165
+ " ], # transformations applied for ingestion\n",
166
+ ")"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "metadata": {},
172
+ "source": [
173
+ "## Retrieval"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "2\n",
186
+ "Specifically, RAPTOR’s F-1 scores are at least 1.8% points higher than DPR and at least 5.3% points\n",
187
+ "higher than BM25.\n",
188
+ "Retriever GPT-3 F-1 Match GPT-4 F-1 Match UnifiedQA F-1 Match\n",
189
+ "Title + Abstract 25.2 22.2 17.5\n",
190
+ "BM25 46.6 50.2 26.4\n",
191
+ "DPR 51.3 53.0 32.1\n",
192
+ "RAPTOR 53.1 55.7 36.6\n",
193
+ "Table 4: Comparison of accuracies on the QuAL-\n",
194
+ "ITY dev dataset for two different language mod-\n",
195
+ "els (GPT-3, UnifiedQA 3B) using various retrieval\n",
196
+ "methods. RAPTOR outperforms the baselines of\n",
197
+ "BM25 and DPR by at least 2.0% in accuracy.\n",
198
+ "Model GPT-3 Acc. UnifiedQA Acc.\n",
199
+ "BM25 57.3 49.9\n",
200
+ "DPR 60.4 53.9\n",
201
+ "RAPTOR 62.4 56.6\n",
202
+ "Table 5: Results on F-1 Match scores of various\n",
203
+ "models on the QASPER dataset.\n",
204
+ "Model F-1 Match\n",
205
+ "LongT5 XL (Guo et al., 2022) 53.1\n",
206
+ "CoLT5 XL (Ainslie et al., 2023) 53.9\n",
207
+ "RAPTOR + GPT-4 55.7Comparison to State-of-the-art Systems\n",
208
+ "Building upon our controlled comparisons,\n",
209
+ "we examine RAPTOR’s performance relative\n",
210
+ "to other state-of-the-art models.\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "nodes = raptor_pack.run(\"What baselines is raptor compared against?\", mode=\"collapsed\")\n",
216
+ "print(len(nodes))\n",
217
+ "print(nodes[0].text)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": null,
223
+ "metadata": {},
224
+ "outputs": [
225
+ {
226
+ "name": "stdout",
227
+ "output_type": "stream",
228
+ "text": [
229
+ "Retrieved parent IDs from level 2: ['cc3b3f41-f4ca-4020-b11f-be7e0ce04c4f']\n",
230
+ "Retrieved 1 from parents at level 2.\n",
231
+ "Retrieved parent IDs from level 1: ['a4ca9426-a312-4a01-813a-c9b02aefc7e8']\n",
232
+ "Retrieved 2 from parents at level 1.\n",
233
+ "Retrieved parent IDs from level 0: ['63126782-2778-449f-99c0-1e8fd90caa36', 'd8f68d31-d878-41f1-aeb6-a7dde8ed5143']\n",
234
+ "Retrieved 4 from parents at level 0.\n",
235
+ "4\n",
236
+ "Specifically, RAPTOR’s F-1 scores are at least 1.8% points higher than DPR and at least 5.3% points\n",
237
+ "higher than BM25.\n",
238
+ "Retriever GPT-3 F-1 Match GPT-4 F-1 Match UnifiedQA F-1 Match\n",
239
+ "Title + Abstract 25.2 22.2 17.5\n",
240
+ "BM25 46.6 50.2 26.4\n",
241
+ "DPR 51.3 53.0 32.1\n",
242
+ "RAPTOR 53.1 55.7 36.6\n",
243
+ "Table 4: Comparison of accuracies on the QuAL-\n",
244
+ "ITY dev dataset for two different language mod-\n",
245
+ "els (GPT-3, UnifiedQA 3B) using various retrieval\n",
246
+ "methods. RAPTOR outperforms the baselines of\n",
247
+ "BM25 and DPR by at least 2.0% in accuracy.\n",
248
+ "Model GPT-3 Acc. UnifiedQA Acc.\n",
249
+ "BM25 57.3 49.9\n",
250
+ "DPR 60.4 53.9\n",
251
+ "RAPTOR 62.4 56.6\n",
252
+ "Table 5: Results on F-1 Match scores of various\n",
253
+ "models on the QASPER dataset.\n",
254
+ "Model F-1 Match\n",
255
+ "LongT5 XL (Guo et al., 2022) 53.1\n",
256
+ "CoLT5 XL (Ainslie et al., 2023) 53.9\n",
257
+ "RAPTOR + GPT-4 55.7Comparison to State-of-the-art Systems\n",
258
+ "Building upon our controlled comparisons,\n",
259
+ "we examine RAPTOR’s performance relative\n",
260
+ "to other state-of-the-art models.\n"
261
+ ]
262
+ }
263
+ ],
264
+ "source": [
265
+ "nodes = raptor_pack.run(\n",
266
+ " \"What baselines is raptor compared against?\", mode=\"tree_traversal\"\n",
267
+ ")\n",
268
+ "print(len(nodes))\n",
269
+ "print(nodes[0].text)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "markdown",
274
+ "metadata": {},
275
+ "source": [
276
+ "## Loading\n",
277
+ "\n",
278
+ "Since we saved to a vector store, we can also use it again! (For local vector stores, there is a `persist` and `from_persist_dir` method on the retriever)"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "from llama_index.packs.raptor import RaptorRetriever\n",
288
+ "\n",
289
+ "retriever = RaptorRetriever(\n",
290
+ " [],\n",
291
+ " embed_model=OpenAIEmbedding(\n",
292
+ " model=\"text-embedding-3-small\"\n",
293
+ " ), # used for embedding clusters\n",
294
+ " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1), # used for generating summaries\n",
295
+ " vector_store=vector_store, # used for storage\n",
296
+ " similarity_top_k=2, # top k for each layer, or overall top-k for collapsed\n",
297
+ " mode=\"tree_traversal\", # sets default mode\n",
298
+ ")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "metadata": {},
305
+ "outputs": [],
306
+ "source": [
307
+ "# if using a default vector store\n",
308
+ "# retriever.persist(\"./persist\")\n",
309
+ "# retriever = RaptorRetriever.from_persist_dir(\"./persist\", ...)"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "markdown",
314
+ "metadata": {},
315
+ "source": [
316
+ "## Query Engine"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "from llama_index.core.query_engine import RetrieverQueryEngine\n",
326
+ "\n",
327
+ "query_engine = RetrieverQueryEngine.from_args(\n",
328
+ " retriever, llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)\n",
329
+ ")"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": null,
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "response = query_engine.query(\"What baselines was RAPTOR compared against?\")"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "name": "stdout",
348
+ "output_type": "stream",
349
+ "text": [
350
+ "BM25 and DPR\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "print(str(response))"
356
+ ]
357
+ }
358
+ ],
359
+ "metadata": {
360
+ "kernelspec": {
361
+ "display_name": "llama-index-4aB9_5sa-py3.10",
362
+ "language": "python",
363
+ "name": "python3"
364
+ },
365
+ "language_info": {
366
+ "codemirror_mode": {
367
+ "name": "ipython",
368
+ "version": 3
369
+ },
370
+ "file_extension": ".py",
371
+ "mimetype": "text/x-python",
372
+ "name": "python",
373
+ "nbconvert_exporter": "python",
374
+ "pygments_lexer": "ipython3"
375
+ }
376
+ },
377
+ "nbformat": 4,
378
+ "nbformat_minor": 2
379
+ }
examples/Zamba_2_1_2B.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/upsert_RAPTOR.ipynb ADDED
@@ -0,0 +1,834 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import asyncio\n",
11
+ "import tiktoken\n",
12
+ "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "NAMESPACE = 'world_bank' # A relevant namespace to store our documents under."
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "<h3 align=center> Prepare Documents </h3>\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "01HWZ9TCDDZAXHY6MQ2EM2P0WV",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "with open('data/world_bank_articles.txt', encoding='utf-8') as f:\n",
39
+ " texts = f.read()"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "01HWZ9R350F1RF5Z59XDZ18DSG",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "separator = \"-\" * 150 # Defined earlier during webscraping\n",
50
+ "\n",
51
+ "# Necessary to limit the payload to and avoid a\n",
52
+ "# 400: 'Request payload size exceeds the limit: 10000 bytes.'\n",
53
+ "\n",
54
+ "text_splitter = RecursiveCharacterTextSplitter(separators=[separator, \"\\n\\n\\n\", \"\\n\\n\", \"\\n\"], \n",
55
+ " chunk_size=7000, # Empirically set from the output of CharacterTextSplitter\n",
56
+ " chunk_overlap=0)\n",
57
+ "docs = text_splitter.split_text(texts)\n",
58
+ "len(docs)"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "01HYB6KQ7GA77RW1D4YRJQTNHM",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "# TODO\n",
69
+ "# Remove the separator to avoid filling retrieved context with distracting delimiters"
70
+ ]
71
+ },
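+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A minimal sketch of the TODO above (an illustration added for clarity, not part of the original run):\n",
+ "# strip the web-scraping separator out of each chunk before embedding, so the delimiter\n",
+ "# lines never end up in the retrieved context. Reuses `docs` and `separator` from the cells above.\n",
+ "docs = [d.replace(separator, \"\").strip() for d in docs]"
+ ]
+ },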
72
+ {
73
+ "cell_type": "markdown",
74
+ "metadata": {},
75
+ "source": [
76
+ "<h3 align=center> Exploring Docs </h3>"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": null,
82
+ "id": "01HWZA60JDH7EZE8ZFCNVTPA1J",
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
87
+ " \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
88
+ " encoding = tiktoken.get_encoding(encoding_name)\n",
89
+ " num_tokens = len(encoding.encode(string))\n",
90
+ " return num_tokens"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "id": "01HWZCET07B0AVHDV6RJX374JR",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "import matplotlib.pyplot as plt\n",
101
+ "\n",
102
+ "# Calculate the number of tokens for each document\n",
103
+ "counts = [num_tokens_from_string(d, \"cl100k_base\") for d in docs]\n",
104
+ "\n",
105
+ "# Plotting the histogram of token counts\n",
106
+ "plt.figure(figsize=(10, 6))\n",
107
+ "plt.hist(counts, bins=30, color=\"blue\", edgecolor=\"black\", alpha=0.7)\n",
108
+ "plt.title(\"Histogram of Token Counts\")\n",
109
+ "plt.xlabel(\"Token Count\")\n",
110
+ "plt.ylabel(\"Frequency\")\n",
111
+ "plt.grid(axis=\"y\", alpha=0.75)\n",
112
+ "\n",
113
+ "# Display the histogram\n",
114
+ "plt.show"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "01HWZCXXCRE80ZQDPY4SY0DH9N",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "# Doc texts concat\n",
125
+ "concatenated_content = \"\\n\\n\\n --- \\n\\n\\n\".join(docs)\n",
126
+ "print(\n",
127
+ " \"Num tokens in all context: %s\"\n",
128
+ " % num_tokens_from_string(concatenated_content, \"cl100k_base\")\n",
129
+ ")"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "markdown",
134
+ "metadata": {},
135
+ "source": [
136
+ "### Remarks\n",
137
+ "Most of the documents are small ( < 2000 tokens ) though some are considerably longer at around 8000 tokens. The whole corpus is just shy of 90000 tokens. This is too large to fit in standard 32k context windows."
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "markdown",
142
+ "metadata": {},
143
+ "source": [
144
+ "<h3 align=center> Define Model </h3>"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "from langchain_google_genai import ChatGoogleGenerativeAI"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "model = ChatGoogleGenerativeAI(google_api_key=os.getenv('GOOGLE_API_KEY'), model='gemini-pro')"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "markdown",
167
+ "metadata": {},
168
+ "source": [
169
+ "<h2 align=center> Clustering </h2>\n",
170
+ "\n",
171
+ "Now onto step two. Given the embeddings we have gotten we now cluster the texts. There are various well-researched techniques available to us."
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "from custom.gemini_async import async_embed # Custom module"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "id": "01HWZCSYAH3C5ZV69EFX4938GC",
187
+ "metadata": {},
188
+ "outputs": [],
189
+ "source": [
190
+ "from typing import Dict, List, Optional, Tuple\n",
191
+ "\n",
192
+ "import numpy as np\n",
193
+ "import pandas as pd\n",
194
+ "import umap\n",
195
+ "from langchain.prompts import ChatPromptTemplate\n",
196
+ "from langchain_core.output_parsers import StrOutputParser\n",
197
+ "from sklearn.mixture import GaussianMixture\n",
198
+ "\n",
199
+ "RANDOM_SEED = 224 # Fixed seed for reproducibility\n",
200
+ "\n",
201
+ "\n",
202
+ "def global_cluster_embeddings(\n",
203
+ " embeddings: np.ndarray,\n",
204
+ " dim: int,\n",
205
+ " n_neighbors: Optional[int] = None,\n",
206
+ " metric: str = \"cosine\",\n",
207
+ ") -> np.ndarray:\n",
208
+ " \"\"\"\n",
209
+ " Perform global dimensionality reduction on the embeddings using UMAP.\n",
210
+ "\n",
211
+ " Parameters:\n",
212
+ " - embeddings: The input embeddings as a numpy array.\n",
213
+ " - dim: The target dimensionality for the reduced space.\n",
214
+ " - n_neighbors: Optional; the number of neighbors to consider for each point.\n",
215
+ " If not provided, it defaults to the square root of the number of embeddings.\n",
216
+ " - metric: The distance metric to use for UMAP.\n",
217
+ "\n",
218
+ " Returns:\n",
219
+ " - A numpy array of the embeddings reduced to the specified dimensionality.\n",
220
+ " \"\"\"\n",
221
+ " if n_neighbors is None:\n",
222
+ " n_neighbors = int((len(embeddings) - 1) ** 0.5)\n",
223
+ " return umap.UMAP(\n",
224
+ " n_neighbors=n_neighbors, n_components=dim, metric=metric\n",
225
+ " ).fit_transform(embeddings)\n",
226
+ "\n",
227
+ "\n",
228
+ "def local_cluster_embeddings(\n",
229
+ " embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = \"cosine\"\n",
230
+ ") -> np.ndarray:\n",
231
+ " \"\"\"\n",
232
+ " Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.\n",
233
+ "\n",
234
+ " Parameters:\n",
235
+ " - embeddings: The input embeddings as a numpy array.\n",
236
+ " - dim: The target dimensionality for the reduced space.\n",
237
+ " - num_neighbors: The number of neighbors to consider for each point.\n",
238
+ " - metric: The distance metric to use for UMAP.\n",
239
+ "\n",
240
+ " Returns:\n",
241
+ " - A numpy array of the embeddings reduced to the specified dimensionality.\n",
242
+ " \"\"\"\n",
243
+ " return umap.UMAP(\n",
244
+ " n_neighbors=num_neighbors, n_components=dim, metric=metric\n",
245
+ " ).fit_transform(embeddings)\n",
246
+ "\n",
247
+ "\n",
248
+ "def get_optimal_clusters(\n",
249
+ " embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED\n",
250
+ ") -> int:\n",
251
+ " \"\"\"\n",
252
+ " Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.\n",
253
+ "\n",
254
+ " Parameters:\n",
255
+ " - embeddings: The input embeddings as a numpy array.\n",
256
+ " - max_clusters: The maximum number of clusters to consider.\n",
257
+ " - random_state: Seed for reproducibility.\n",
258
+ "\n",
259
+ " Returns:\n",
260
+ " - An integer representing the optimal number of clusters found.\n",
261
+ " \"\"\"\n",
262
+ " max_clusters = min(max_clusters, len(embeddings))\n",
263
+ " n_clusters = np.arange(1, max_clusters)\n",
264
+ " bics = []\n",
265
+ " for n in n_clusters:\n",
266
+ " gm = GaussianMixture(n_components=n, random_state=random_state)\n",
267
+ " gm.fit(embeddings)\n",
268
+ " bics.append(gm.bic(embeddings))\n",
269
+ " return n_clusters[np.argmin(bics)]\n",
270
+ "\n",
271
+ "\n",
272
+ "def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):\n",
273
+ " \"\"\"\n",
274
+ " Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.\n",
275
+ "\n",
276
+ " Parameters:\n",
277
+ " - embeddings: The input embeddings as a numpy array.\n",
278
+ " - threshold: The probability threshold for assigning an embedding to a cluster.\n",
279
+ " - random_state: Seed for reproducibility.\n",
280
+ "\n",
281
+ " Returns:\n",
282
+ " - A tuple containing the cluster labels and the number of clusters determined.\n",
283
+ " \"\"\"\n",
284
+ " n_clusters = get_optimal_clusters(embeddings)\n",
285
+ " gm = GaussianMixture(n_components=n_clusters, random_state=random_state)\n",
286
+ " gm.fit(embeddings)\n",
287
+ " probs = gm.predict_proba(embeddings)\n",
288
+ " labels = [np.where(prob > threshold)[0] for prob in probs]\n",
289
+ " return labels, n_clusters\n",
290
+ "\n",
291
+ "\n",
292
+ "def perform_clustering(\n",
293
+ " embeddings: np.ndarray,\n",
294
+ " dim: int,\n",
295
+ " threshold: float,\n",
296
+ ") -> List[np.ndarray]:\n",
297
+ " \"\"\"\n",
298
+ " Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering\n",
299
+ " using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.\n",
300
+ "\n",
301
+ " Parameters:\n",
302
+ " - embeddings: The input embeddings as a numpy array.\n",
303
+ " - dim: The target dimensionality for UMAP reduction.\n",
304
+ " - threshold: The probability threshold for assigning an embedding to a cluster in GMM.\n",
305
+ "\n",
306
+ " Returns:\n",
307
+ " - A list of numpy arrays, where each array contains the cluster IDs for each embedding.\n",
308
+ " \"\"\"\n",
309
+ " if len(embeddings) <= dim + 1:\n",
310
+ " # Avoid clustering when there's insufficient data\n",
311
+ " return [np.array([0]) for _ in range(len(embeddings))]\n",
312
+ "\n",
313
+ " # Global dimensionality reduction\n",
314
+ " reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)\n",
315
+ " # Global clustering\n",
316
+ " global_clusters, n_global_clusters = GMM_cluster(\n",
317
+ " reduced_embeddings_global, threshold\n",
318
+ " )\n",
319
+ "\n",
320
+ " all_local_clusters = [np.array([]) for _ in range(len(embeddings))]\n",
321
+ " total_clusters = 0\n",
322
+ "\n",
323
+ " # Iterate through each global cluster to perform local clustering\n",
324
+ " for i in range(n_global_clusters):\n",
325
+ " # Extract embeddings belonging to the current global cluster\n",
326
+ " global_cluster_embeddings_ = embeddings[\n",
327
+ " np.array([i in gc for gc in global_clusters])\n",
328
+ " ]\n",
329
+ "\n",
330
+ " if len(global_cluster_embeddings_) == 0:\n",
331
+ " continue\n",
332
+ " if len(global_cluster_embeddings_) <= dim + 1:\n",
333
+ " # Handle small clusters with direct assignment\n",
334
+ " local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]\n",
335
+ " n_local_clusters = 1\n",
336
+ " else:\n",
337
+ " # Local dimensionality reduction and clustering\n",
338
+ " reduced_embeddings_local = local_cluster_embeddings(\n",
339
+ " global_cluster_embeddings_, dim\n",
340
+ " )\n",
341
+ " local_clusters, n_local_clusters = GMM_cluster(\n",
342
+ " reduced_embeddings_local, threshold\n",
343
+ " )\n",
344
+ "\n",
345
+ " # Assign local cluster IDs, adjusting for total clusters already processed\n",
346
+ " for j in range(n_local_clusters):\n",
347
+ " local_cluster_embeddings_ = global_cluster_embeddings_[\n",
348
+ " np.array([j in lc for lc in local_clusters])\n",
349
+ " ]\n",
350
+ " indices = np.where(\n",
351
+ " (embeddings == local_cluster_embeddings_[:, None]).all(-1)\n",
352
+ " )[1]\n",
353
+ " for idx in indices:\n",
354
+ " all_local_clusters[idx] = np.append(\n",
355
+ " all_local_clusters[idx], j + total_clusters\n",
356
+ " )\n",
357
+ "\n",
358
+ " total_clusters += n_local_clusters\n",
359
+ "\n",
360
+ " return all_local_clusters\n"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": null,
366
+ "id": "01HWZF1JG63PCW32XKGWBB0813",
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "async def async_embed_with_postprocess(texts: list[str]) -> list[float]:\n",
371
+ " \"\"\"Call custom async embedder and return embeddings with an (informal) interface acceptable to downstream operations\"\"\"\n",
372
+ "\n",
373
+ " # await the asynchronous embedding function\n",
374
+ " # In a jupyter notebook trying to start an event loop with asyncio.run witll result in an error\n",
375
+ " all_embeddings = await async_embed(texts)\n",
376
+ "\n",
377
+ " try:\n",
378
+ " raw_embeddings = [sub_embedding['embedding']['values'] \n",
379
+ " for sub_embedding in [embedding['embeddings'] \n",
380
+ " for embedding in all_embeddings]]\n",
381
+ " except KeyError:\n",
382
+ " raise Exception(\"Possible HTTP CODE 400: 'Request payload size exceeds the limit: 10000 bytes. Documents may be too large\")\n",
383
+ " return raw_embeddings"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": null,
389
+ "metadata": {},
390
+ "outputs": [],
391
+ "source": [
392
+ "# TODO\n",
393
+ "# The summary prompts HAVE to be changed to make them generalized.\n",
394
+ "# They are currently too specific to the use case of the authors.\n",
395
+ "# Perhaps the user can specify the general topic to better guide the LLM."
396
+ ]
397
+ },
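+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A minimal sketch of the TODO above (an illustration, not part of the original notebook):\n",
+ "# generalize the summary prompt by letting the user supply the corpus topic.\n",
+ "# CORPUS_TOPIC is a hypothetical name introduced here purely for illustration; the cell\n",
+ "# reuses ChatPromptTemplate, StrOutputParser, and `model` from the cells above.\n",
+ "CORPUS_TOPIC = \"World Bank news articles\"\n",
+ "\n",
+ "generalized_template = \"\"\"\n",
+ "You are summarizing documents drawn from a corpus about: {topic}\n",
+ "Write a descriptive but concise summary of roughly 20 words, like an abstract of the whole document.\n",
+ "Document:\n",
+ "{context}\n",
+ "\"\"\"\n",
+ "generalized_prompt = ChatPromptTemplate.from_template(generalized_template)\n",
+ "generalized_chain = generalized_prompt.partial(topic=CORPUS_TOPIC) | model | StrOutputParser()"
+ ]
+ },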
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": null,
401
+ "id": "01HWZDX8HD0G37BGK4KQH5712P",
402
+ "metadata": {},
403
+ "outputs": [],
404
+ "source": [
405
+ "async def embed_cluster_texts(texts: list[str]) -> pd.DataFrame:\n",
406
+ " \"\"\"\n",
407
+ " Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.\n",
408
+ "\n",
409
+ " This function combines embedding generation and clustering into a single step. It assumes the existence\n",
410
+ " of a previously defined `perform_clustering` function that performs clustering on the embeddings.\n",
411
+ "\n",
412
+ " Parameters:\n",
413
+ " - texts: List[str], a list of text documents to be processed.\n",
414
+ "\n",
415
+ " Returns:\n",
416
+ " - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.\n",
417
+ " \"\"\"\n",
418
+ " embed_list_2d = await async_embed_with_postprocess(texts) # Generate embeddings\n",
419
+ " text_embeddings_np = np.array(embed_list_2d)\n",
420
+ " \n",
421
+ " cluster_labels = perform_clustering(\n",
422
+ " text_embeddings_np, 10, 0.1\n",
423
+ " ) # Perform clustering on the embeddings\n",
424
+ " df = pd.DataFrame() # Initialize a DataFrame to store the results\n",
425
+ " df[\"text\"] = texts # Store original texts\n",
426
+ " df[\"embd\"] = list(text_embeddings_np) # Store embeddings as a list in the DataFrame\n",
427
+ " df[\"cluster\"] = cluster_labels # Store cluster labels\n",
428
+ " return df\n",
429
+ "\n",
430
+ "def fmt_txt(df: pd.DataFrame) -> str:\n",
431
+ " \"\"\"\n",
432
+ " Formats the text documents in a DataFrame into a single string.\n",
433
+ "\n",
434
+ " Parameters:\n",
435
+ " - df: DataFrame containing the 'text' column with text documents to format.\n",
436
+ "\n",
437
+ " Returns:\n",
438
+ " - A single string where all text documents are joined by a specific delimiter.\n",
439
+ " \"\"\"\n",
440
+ " unique_txt = df[\"text\"].tolist()\n",
441
+ " return \"--- --- \\n --- --- \".join(unique_txt)\n",
442
+ "\n",
443
+ "async def embed_cluster_summarize_texts(\n",
444
+ " texts: List[str], level: int\n",
445
+ ") -> Tuple[pd.DataFrame, pd.DataFrame]:\n",
446
+ " \"\"\"\n",
447
+ " Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts,\n",
448
+ " clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes\n",
449
+ " the content within each cluster.\n",
450
+ "\n",
451
+ " Parameters:\n",
452
+ " - texts: A list of text documents to be processed.\n",
453
+ " - level: An integer parameter that could define the depth or detail of processing.\n",
454
+ "\n",
455
+ " Returns:\n",
456
+ " - Tuple containing two DataFrames:\n",
457
+ " 1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.\n",
458
+ " 2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail,\n",
459
+ " and the cluster identifiers.\n",
460
+ " \"\"\"\n",
461
+ "\n",
462
+ " # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns\n",
463
+ " df_clusters = await embed_cluster_texts(texts)\n",
464
+ "\n",
465
+ " # Prepare to expand the DataFrame for easier manipulation of clusters\n",
466
+ " expanded_list = []\n",
467
+ "\n",
468
+ " # Expand DataFrame entries to document-cluster pairings for straightforward processing\n",
469
+ " for index, row in df_clusters.iterrows():\n",
470
+ " for cluster in row[\"cluster\"]:\n",
471
+ " expanded_list.append(\n",
472
+ " {\"text\": row[\"text\"], \"embd\": row[\"embd\"], \"cluster\": cluster}\n",
473
+ " )\n",
474
+ "\n",
475
+ " # Create a new DataFrame from the expanded list\n",
476
+ " expanded_df = pd.DataFrame(expanded_list)\n",
477
+ "\n",
478
+ " # Retrieve unique cluster identifiers for processing\n",
479
+ " all_clusters = expanded_df[\"cluster\"].unique()\n",
480
+ "\n",
481
+ " print(f\"--Generated {len(all_clusters)} clusters--\")\n",
482
+ "\n",
483
+ " # Summarization\n",
484
+ " template = \"\"\"\n",
485
+ " This is a summarization task in 20 or so words\n",
486
+ " Your goal is to be descriptive but concise.\n",
487
+ " Create something like an abstract; a fitting summarization of the whole document.\n",
488
+ " You are expected to summarize the following document:\n",
489
+ " ```\n",
490
+ " {context}\n",
491
+ " ```\n",
492
+ " \n",
493
+ " \"\"\"\n",
494
+ " prompt = ChatPromptTemplate.from_template(template)\n",
495
+ " chain = prompt | model | StrOutputParser()\n",
496
+ "\n",
497
+ " # Format text within each cluster for summarization\n",
498
+ " summaries = []\n",
499
+ " for i in all_clusters:\n",
500
+ " df_cluster = expanded_df[expanded_df[\"cluster\"] == i]\n",
501
+ " formatted_txt = fmt_txt(df_cluster)\n",
502
+ " summaries.append(chain.invoke({\"context\": formatted_txt}))\n",
503
+ " await asyncio.sleep(2)\n",
504
+ "\n",
505
+ " # Create a DataFrame to store summaries with their corresponding cluster and level\n",
506
+ " df_summary = pd.DataFrame(\n",
507
+ " {\n",
508
+ " \"summaries\": summaries,\n",
509
+ " \"level\": [level] * len(summaries),\n",
510
+ " \"cluster\": list(all_clusters),\n",
511
+ " }\n",
512
+ " )\n",
513
+ "\n",
514
+ " return df_clusters, df_summary\n",
515
+ "\n",
516
+ "async def recursive_embed_cluster_summarize(\n",
517
+ " texts: List[str], level: int = 1, n_levels: int = 3\n",
518
+ ") -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:\n",
519
+ " \"\"\"\n",
520
+ " Recursively embeds, clusters, and summarizes texts up to a specified level or until\n",
521
+ " the number of unique clusters becomes 1, storing the results at each level.\n",
522
+ "\n",
523
+ " Parameters:\n",
524
+ " - texts: List[str], texts to be processed.\n",
525
+ " - level: int, current recursion level (starts at 1).\n",
526
+ " - n_levels: int, maximum depth of recursion.\n",
527
+ "\n",
528
+ " Returns:\n",
529
+ " - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion\n",
530
+ " levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.\n",
531
+ " \"\"\"\n",
532
+ " results = {} # Dictionary to store results at each level\n",
533
+ "\n",
534
+ " # Perform embedding, clustering, and summarization for the current level\n",
535
+ " df_clusters, df_summary = await embed_cluster_summarize_texts(texts, level)\n",
536
+ "\n",
537
+ " # Store the results of the current level\n",
538
+ " results[level] = (df_clusters, df_summary)\n",
539
+ "\n",
540
+ " # Determine if further recursion is possible and meaningful\n",
541
+ " unique_clusters = df_summary[\"cluster\"].nunique()\n",
542
+ " if level < n_levels and unique_clusters > 1:\n",
543
+ " # Use summaries as the input texts for the next level of recursion\n",
544
+ " new_texts = df_summary[\"summaries\"].tolist()\n",
545
+ " next_level_results = await recursive_embed_cluster_summarize(\n",
546
+ " new_texts, level + 1, n_levels\n",
547
+ " )\n",
548
+ "\n",
549
+ " # Merge the results from the next level into the current results dictionary\n",
550
+ " results.update(next_level_results)\n",
551
+ "\n",
552
+ " return results"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": null,
558
+ "id": "01HWZEDB0A0G7GT84R21S6RTVV",
559
+ "metadata": {},
560
+ "outputs": [],
561
+ "source": [
562
+ "# Build tree\n",
563
+ "results = await recursive_embed_cluster_summarize(docs, # Leaf texts\n",
564
+ " level=1, \n",
565
+ " n_levels=3)\n"
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "execution_count": null,
571
+ "id": "01HX44B793B2NBGPN2QSCT4A8W",
572
+ "metadata": {},
573
+ "outputs": [],
574
+ "source": [
575
+ "len(results)"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": null,
581
+ "id": "01HX44G8CT47PESNW3TR1B294T",
582
+ "metadata": {},
583
+ "outputs": [],
584
+ "source": [
585
+ "results[1][0]"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": null,
591
+ "id": "01HX44PP50EZ5T7W9PAX00HYAC",
592
+ "metadata": {},
593
+ "outputs": [],
594
+ "source": [
595
+ "results[1][0]['text'].tolist()\n",
596
+ "results[1][0]['embd'].tolist()"
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": null,
602
+ "id": "01HX445HS59D0WW7SE1604YB54",
603
+ "metadata": {},
604
+ "outputs": [],
605
+ "source": [
606
+ "# Extracting all summaries\n",
607
+ "summaries: list[str] = []\n",
608
+ "for level in sorted(results.keys()):\n",
609
+ " summaries.extend(results[1][1]['summaries'].tolist())\n",
610
+ "len(summaries)"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": null,
616
+ "id": "01HX44YEHJF6KHJTWPKE4A1QXA",
617
+ "metadata": {},
618
+ "outputs": [],
619
+ "source": []
620
+ },
621
+ {
622
+ "cell_type": "markdown",
623
+ "metadata": {},
624
+ "source": [
625
+ "#### Remarks\n",
626
+ "This mortly ensemble works! I have modified the original code to fit my use of asynchronous embeddings and I am glad they play well together.\n",
627
+ "\n",
628
+ "However, because of the payload limits imposed on us by the Gmeini API, we have had to resort to chunking our documents. This we inteded not to do, because the whole idea was to embed entire documents as they are and perform tree based RAG/ But the limitations of practical tools have forced a compromise upon us. We must make the best of it."
629
+ ]
630
+ },
631
+ {
632
+ "cell_type": "markdown",
633
+ "metadata": {},
634
+ "source": [
635
+ "<h3 align=center> Collapsed Tree Retrieval </h3>\n",
636
+ "\n",
637
+ "> This involves flattening the tree structure into a single layer and then applying a k-nearest neighbors (kNN) search across all nodes simultaneously.\n",
638
+ "\n",
639
+ "It is reported to have the best performance.\n",
640
+ "\n",
641
+ "### Strategy\n",
642
+ "\n",
643
+ "We will have a two pronged strategy: upsert the texts and the summaries separately. They are flattened but we already have embeddings for the texts already. We got them during the clustering operation. There is no need to get them anew, that would be inefficient. We don't have the embeddings for the summaries though, these we get. Then we use the pinecone client to upsert them sequentially."
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "markdown",
648
+ "metadata": {},
649
+ "source": [
650
+ "<h3 align=center> Pinecone CRUD Operations </h3>\n",
651
+ "\n",
652
+ "We are going to go our own way in this section. Instead of using the absractions langchain provides us to interact with vectorstores, we will perform our operations using the `pinecone` client. This gives us finer control."
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "code",
657
+ "execution_count": null,
658
+ "metadata": {},
659
+ "outputs": [],
660
+ "source": [
661
+ "from pinecone import Pinecone\n",
662
+ "import os, uuid"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": null,
668
+ "metadata": {},
669
+ "outputs": [],
670
+ "source": [
671
+ "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'), environment='gcp-starter')\n",
672
+ "\n",
673
+ "# Pick an index at random\n",
674
+ "index_ = pc.list_indexes()[0]\n",
675
+ "index = pc.Index(index_['name'])\n",
676
+ "\n",
677
+ "# Check whether index matches our embedding dimension\n",
678
+ "dim_a = index_['dimension']\n",
679
+ "dim_b = len(results[1][0]['embd'][0]) # Pick any random embedding vector in our results\n",
680
+ "\n",
681
+ "if dim_a != dim_b:\n",
682
+ " raise Exception(f\"Pinecone Index dimension: {dim_a} does not match Vector Embedding dimension {dim_b}\")\n",
683
+ "\n",
684
+ "# Delete namespace if found\n",
685
+ "# Will be created anew when we upsert to it. Avoids duplication\n",
686
+ "if NAMESPACE in index.describe_index_stats()['namespaces'].keys():\n",
687
+ " index.delete(delete_all=True, namespace=NAMESPACE)\n",
688
+ " index.describe_index_stats()"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": null,
694
+ "metadata": {},
695
+ "outputs": [],
696
+ "source": [
697
+ "def pinecone_upsert(embeddings: list[float], texts: list[str], index: Pinecone.Index, namespace: str):\n",
698
+ " \"\"\"Store embeddings and their corresponding text metadata in the pinecone vectorstore\"\"\"\n",
699
+ " records = []\n",
700
+ "\n",
701
+ " for embedding, text in zip(embeddings, texts):\n",
702
+ " records.append({\n",
703
+ " 'id': str(uuid.uuid4().int),\n",
704
+ " 'values': embedding,\n",
705
+ " 'metadata': {\n",
706
+ " 'text': text\n",
707
+ " }\n",
708
+ " })\n",
709
+ "\n",
710
+ " # Asynchronous upsert: Faster\n",
711
+ " def chunker(seq, batch_size):\n",
712
+ " return (seq[pos:pos + batch_size] for pos in range(0, len(seq), batch_size))\n",
713
+ "\n",
714
+ " async_results = [\n",
715
+ " index.upsert(vectors=chunk, namespace=namespace, async_req=True)\n",
716
+ " for chunk in chunker(records, batch_size=100)\n",
717
+ " ]\n"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "execution_count": null,
723
+ "id": "01HX4546ANV2QE9C934782AB8W",
724
+ "metadata": {},
725
+ "outputs": [],
726
+ "source": [
727
+ "# Iterate through the results to extract summaries from each level\n",
728
+ "summaries: list[str] = []\n",
729
+ "for level in sorted(results.keys()):\n",
730
+ " summaries.extend(results[level][1]['summaries'].tolist())\n",
731
+ "len(summaries)"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": null,
737
+ "id": "01HX457G9ZDARP0HZJ8V1PMHSD",
738
+ "metadata": {},
739
+ "outputs": [],
740
+ "source": [
741
+ "\n",
742
+ "# Get summary embeddigs\n",
743
+ "summary_embeddings = await async_embed(summaries)"
744
+ ]
745
+ },
746
+ {
747
+ "cell_type": "code",
748
+ "execution_count": null,
749
+ "metadata": {},
750
+ "outputs": [],
751
+ "source": [
752
+ "# Upsering summaries\n",
753
+ "pinecone_upsert([vect['embeddings']['embedding']['values'] for vect in summary_embeddings],\n",
754
+ " [txt['text_metadata'] for txt in summary_embeddings],\n",
755
+ " index, \n",
756
+ " NAMESPACE) "
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "code",
761
+ "execution_count": null,
762
+ "metadata": {},
763
+ "outputs": [],
764
+ "source": [
765
+ "# Upserting all texts\n",
766
+ "pinecone_upsert(results[1][0]['embd'].tolist(),\n",
767
+ " results[1][0]['text'].tolist(),\n",
768
+ " index, \n",
769
+ " NAMESPACE) "
770
+ ]
771
+ },
772
+ {
773
+ "cell_type": "markdown",
774
+ "metadata": {},
775
+ "source": [
776
+ "#### Remarks\n",
777
+ "After a long process, we have been able to upsert our documments succesfully to pinecone. The moving parts don't fit very well and the construction is brittle. We move on but we will return to refactor."
778
+ ]
779
+ },
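+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Collapsed-tree query (sketch)\n",
+ "With the texts and summaries flattened into a single namespace, collapsed-tree retrieval reduces to a plain kNN query over that namespace, as described above. The cell below is only a minimal sketch: the `question` string is an illustrative placeholder, and it assumes `async_embed` returns query vectors in the same nested format we unpacked for the summaries."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of collapsed-tree retrieval: embed the question, then run kNN over the flat namespace.\n",
+ "# Assumptions: `index` and `NAMESPACE` are the handles used above, and `async_embed` returns\n",
+ "# the same nested structure we unpacked when embedding the summaries.\n",
+ "question = \"What do the articles say about the economy?\"  # illustrative placeholder\n",
+ "\n",
+ "query_response = await async_embed([question])\n",
+ "query_vector = query_response[0]['embeddings']['embedding']['values']\n",
+ "\n",
+ "matches = index.query(\n",
+ "    vector=query_vector,\n",
+ "    top_k=5,\n",
+ "    namespace=NAMESPACE,\n",
+ "    include_metadata=True,  # return the stored 'text' alongside each match\n",
+ ")\n",
+ "\n",
+ "for match in matches['matches']:\n",
+ "    print(round(match['score'], 3), match['metadata']['text'][:120])"
+ ]
+ },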
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": null,
783
+ "metadata": {},
784
+ "outputs": [],
785
+ "source": [
786
+ "import json\n",
787
+ "\n",
788
+ "with open('data/sample_embeddings.json', \"w\") as file:\n",
789
+ " json.dump(summary_embeddings, file, indent=4) # indent=4 for pretty printing"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": null,
795
+ "id": "01HX4AB9X4ZRQ6Q39PYEX8W052",
796
+ "metadata": {},
797
+ "outputs": [],
798
+ "source": [
799
+ "# TODO\n",
800
+ "# Scrape the whole page rather than the article only to include date and time and other references.\n",
801
+ "# this may help with citation and grounding in time"
802
+ ]
803
+ },
804
+ {
805
+ "cell_type": "code",
806
+ "execution_count": null,
807
+ "id": "01HYB6K630M45KV1KWWQHJ4V1K",
808
+ "metadata": {},
809
+ "outputs": [],
810
+ "source": []
811
+ }
812
+ ],
813
+ "metadata": {
814
+ "kernelspec": {
815
+ "display_name": "Python 3",
816
+ "language": "python",
817
+ "name": "python3"
818
+ },
819
+ "language_info": {
820
+ "codemirror_mode": {
821
+ "name": "ipython",
822
+ "version": 3
823
+ },
824
+ "file_extension": ".py",
825
+ "mimetype": "text/x-python",
826
+ "name": "python",
827
+ "nbconvert_exporter": "python",
828
+ "pygments_lexer": "ipython3",
829
+ "version": "3.10.4"
830
+ }
831
+ },
832
+ "nbformat": 4,
833
+ "nbformat_minor": 2
834
+ }