diff --git "a/Capstone_1_SubjQATransformer.ipynb" "b/Capstone_1_SubjQATransformer.ipynb" new file mode 100644--- /dev/null +++ "b/Capstone_1_SubjQATransformer.ipynb" @@ -0,0 +1,5368 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "3238b837634a47db899e9151534dbde8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_017feac2ec2c414a9d9fed06a0236450", + "IPY_MODEL_f33a070eaa7a461799076726260c2810", + "IPY_MODEL_d6b917dc13614bb5b98e352ba7af0ded", + "IPY_MODEL_ef71f921a16541b0b5e29f6c0cddd1da" + ], + "layout": "IPY_MODEL_f28d5164aedb452db1c9e649b564ca06" + } + }, + "b0e8e35e9046496aaa5a7d2b1edf1958": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9a00148607964bc6ab902316c6a94b3d", + "placeholder": "​", + "style": "IPY_MODEL_59f55019601f4a898ec0f094686916ff", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "0892f90286544cc8ac9de26eda52b897": { + "model_module": "@jupyter-widgets/controls", + "model_name": "PasswordModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_b0c6fbf0c0454b8db44050c207752cb4", + "placeholder": "​", + "style": "IPY_MODEL_4996420f16784d53b3bdc02bc36cbe9e", + "value": "" + } + }, + "fdc7e5d873af4530bf7598665bb66517": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_66c75939b38f4f4b907abd5699f7cee4", + "style": "IPY_MODEL_40f69a6162954f55839021035bce8863", + "value": true + } + }, + "dec50c0ecd64421aa55077bb56a034dc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": 
"IPY_MODEL_1d03693f32af4e81a252a3ccfc3ec8c3", + "style": "IPY_MODEL_fd3e2731e3494babb7d9d73c4178954a", + "tooltip": "" + } + }, + "a98b2edf76074df9bd3c7a25705af31a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_342c460a4df64397b6c754fb99262932", + "placeholder": "​", + "style": "IPY_MODEL_703cd77af78b47fc90e6ea357d67abf3", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "f28d5164aedb452db1c9e649b564ca06": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "9a00148607964bc6ab902316c6a94b3d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59f55019601f4a898ec0f094686916ff": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b0c6fbf0c0454b8db44050c207752cb4": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4996420f16784d53b3bdc02bc36cbe9e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "66c75939b38f4f4b907abd5699f7cee4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "40f69a6162954f55839021035bce8863": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1d03693f32af4e81a252a3ccfc3ec8c3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd3e2731e3494babb7d9d73c4178954a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "342c460a4df64397b6c754fb99262932": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "703cd77af78b47fc90e6ea357d67abf3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d8ffc35a02894ca183d4ed89b8bd3626": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4913579fe7344f358604de28627841d3", + "placeholder": "​", + "style": "IPY_MODEL_03478eb3a7414966b67e054d8f0b13ec", + "value": "Connecting..." 
+ } + }, + "4913579fe7344f358604de28627841d3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03478eb3a7414966b67e054d8f0b13ec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "017feac2ec2c414a9d9fed06a0236450": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2339c46035e54936a11239cda588df52", + "placeholder": "​", + "style": "IPY_MODEL_317307d87d924722b1cca58c7bfefb62", + "value": "Token is valid (permission: read)." + } + }, + "f33a070eaa7a461799076726260c2810": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6ad25282973c4066a1bee09122582eb4", + "placeholder": "​", + "style": "IPY_MODEL_a9cf45485257457f82d79198464e9ba2", + "value": "Your token has been saved in your configured git credential helpers (store)." 
+ } + }, + "d6b917dc13614bb5b98e352ba7af0ded": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb04e3f6fa7c4344b2187a8325f6b2cf", + "placeholder": "​", + "style": "IPY_MODEL_1b644e3182564dd5bef2627e93ff03a2", + "value": "Your token has been saved to /root/.cache/huggingface/token" + } + }, + "ef71f921a16541b0b5e29f6c0cddd1da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e09e10fcc0a04a0d956f61a980b30dd6", + "placeholder": "​", + "style": "IPY_MODEL_3a692ba3aaee44a19ae981ad2d0798ce", + "value": "Login successful" + } + }, + "2339c46035e54936a11239cda588df52": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "317307d87d924722b1cca58c7bfefb62": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6ad25282973c4066a1bee09122582eb4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9cf45485257457f82d79198464e9ba2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fb04e3f6fa7c4344b2187a8325f6b2cf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "1b644e3182564dd5bef2627e93ff03a2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e09e10fcc0a04a0d956f61a980b30dd6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a692ba3aaee44a19ae981ad2d0798ce": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cce630e45be646b190c6999a28733168": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_48c4b23d0c244633adab4180f74ab773", + "IPY_MODEL_cb33700a1e5c4a3a9cdd98884e48f5d6", + "IPY_MODEL_bd0d37d5c6dd4a4e9997715376a773ef" + ], + "layout": "IPY_MODEL_0589ff01b9df477fbad5ea797e96a4f0" + } + }, + "48c4b23d0c244633adab4180f74ab773": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_038bd2fc97074a8aa75a6f92c964c0e7", + "placeholder": "​", + "style": "IPY_MODEL_8fa04fcf0bf44b2ab38555aa1c6dc162", + "value": "tokenizer_config.json: 100%" + } + }, + "cb33700a1e5c4a3a9cdd98884e48f5d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7f282cf480d548a3992e1f7c97afe9db", + "max": 79, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_acf1b501048d42828f9fa50214ec1e5f", + "value": 79 + } + }, + "bd0d37d5c6dd4a4e9997715376a773ef": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_04d406e9202544559bcde33ebd2f7062", + "placeholder": "​", + "style": "IPY_MODEL_7e3170a7e1464ffc8a2ffce8a86e8c22", + "value": " 79.0/79.0 [00:00<00:00, 3.06kB/s]" + } + }, + "0589ff01b9df477fbad5ea797e96a4f0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "038bd2fc97074a8aa75a6f92c964c0e7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8fa04fcf0bf44b2ab38555aa1c6dc162": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"7f282cf480d548a3992e1f7c97afe9db": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "acf1b501048d42828f9fa50214ec1e5f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "04d406e9202544559bcde33ebd2f7062": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e3170a7e1464ffc8a2ffce8a86e8c22": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "40007c7ac41d4fa7bd1a1dfc8b797a28": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_25dc2ea3b0f34450897566ba285a5ef5", + "IPY_MODEL_75106df5712d4790af9c7f38e09689ff", 
+ "IPY_MODEL_a251a0150af44299a9fe01ce885ba237" + ], + "layout": "IPY_MODEL_61402404f69a4135a2d78d1876afeb7f" + } + }, + "25dc2ea3b0f34450897566ba285a5ef5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_093e227b15034d5c9f92592eb9b26e3e", + "placeholder": "​", + "style": "IPY_MODEL_616f5dc3758c459cb9dc3aeda0454602", + "value": "config.json: 100%" + } + }, + "75106df5712d4790af9c7f38e09689ff": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e18f8bba554d4789aa126b14ea259a6e", + "max": 571, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e9d96aa69c9f4678b09c61d9d5fa1c4e", + "value": 571 + } + }, + "a251a0150af44299a9fe01ce885ba237": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + 
"layout": "IPY_MODEL_ed3d3d7ea5094206a614533bb7a52397", + "placeholder": "​", + "style": "IPY_MODEL_65be10d75afe45b6948b677cc7d92f89", + "value": " 571/571 [00:00<00:00, 6.15kB/s]" + } + }, + "61402404f69a4135a2d78d1876afeb7f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "093e227b15034d5c9f92592eb9b26e3e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "616f5dc3758c459cb9dc3aeda0454602": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e18f8bba554d4789aa126b14ea259a6e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e9d96aa69c9f4678b09c61d9d5fa1c4e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ed3d3d7ea5094206a614533bb7a52397": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": 
null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "65be10d75afe45b6948b677cc7d92f89": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4cdd68881f934afab33c0ec54db86e91": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_765dc3a4f874418db757ee837848155d", + "IPY_MODEL_9521e2f37eee45f2aa641ddbc57c5635", + "IPY_MODEL_4d6b1531658143bab74c8d8fcdf1962d" + ], + "layout": "IPY_MODEL_8b36b1d06ec44d6a8bec1a540547eea8" + } + }, + "765dc3a4f874418db757ee837848155d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0a56916da88948f7a538a8f66b6d6430", + "placeholder": "​", + "style": "IPY_MODEL_e681565ca6754ef8b4f7fdfe2fb6a360", + "value": "vocab.json: 100%" + } + }, + "9521e2f37eee45f2aa641ddbc57c5635": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bebc95e489aa4036b6012637912b70b4", + "max": 898822, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0b7a9cf3b56c402583531ee7d1844a43", + "value": 898822 + } + }, + "4d6b1531658143bab74c8d8fcdf1962d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5bf79491604245bea82d61edd4d13560", + "placeholder": "​", + "style": "IPY_MODEL_0cdbbed010b44a43a4d8e457585d2db9", + "value": " 899k/899k [00:00<00:00, 8.17MB/s]" + } + }, + "8b36b1d06ec44d6a8bec1a540547eea8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, 
+ "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0a56916da88948f7a538a8f66b6d6430": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e681565ca6754ef8b4f7fdfe2fb6a360": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": 
"1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bebc95e489aa4036b6012637912b70b4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0b7a9cf3b56c402583531ee7d1844a43": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": 
null, + "description_width": "" + } + }, + "5bf79491604245bea82d61edd4d13560": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0cdbbed010b44a43a4d8e457585d2db9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ead7ac25cabe4c919ac9ee9744eeecba": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_43d01cb496374b57ba7c1e426d9c029a", + "IPY_MODEL_691e362ce7814878a8e35dd1a2fcd7e8", + "IPY_MODEL_b2df15de3b5e4456ab95456fa4fdd857" + ], + "layout": "IPY_MODEL_34e97833f77a4cecaa4d46c992490afc" + } + }, + "43d01cb496374b57ba7c1e426d9c029a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59142ab104424e8fbd6459b9a3a0bd1e", + "placeholder": "​", + "style": "IPY_MODEL_d69195cd3bfd4cfd88e7d521eb46f4fb", + "value": "merges.txt: 100%" + } + }, + "691e362ce7814878a8e35dd1a2fcd7e8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_929c8bbed3da465c989349f22c93cdb8", + "max": 456318, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_69049fc2d6e345e58023fbf4f4dc1d81", + "value": 456318 + } + }, + "b2df15de3b5e4456ab95456fa4fdd857": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_03125a0b697d4acebdb3068fcf3352c4", + "placeholder": "​", + "style": "IPY_MODEL_a964666eb9a2428d9cdc87bf9ad9ca27", + "value": " 456k/456k [00:00<00:00, 6.73MB/s]" + } + }, + "34e97833f77a4cecaa4d46c992490afc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59142ab104424e8fbd6459b9a3a0bd1e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d69195cd3bfd4cfd88e7d521eb46f4fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "929c8bbed3da465c989349f22c93cdb8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "69049fc2d6e345e58023fbf4f4dc1d81": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "03125a0b697d4acebdb3068fcf3352c4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a964666eb9a2428d9cdc87bf9ad9ca27": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e01af6c7bc014394a7a1438c66be1f36": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_51ecf681ef614164b97e2ce057b6763e", + "IPY_MODEL_8450b81bca3f41969564d7352c783d07", + "IPY_MODEL_333e7ed0a5a24e289a2775315336e1f7" + ], + "layout": "IPY_MODEL_8044dbc3b7f04c888b1e3a547c4f6eac" + } + }, + "51ecf681ef614164b97e2ce057b6763e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_156b5a82fdb14777819a290c1f638c3c", + "placeholder": "​", + "style": "IPY_MODEL_c7c412609a104adcbc1a3fcee55a3205", + "value": "special_tokens_map.json: 100%" + } + }, + "8450b81bca3f41969564d7352c783d07": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59afc8dd721849e8a06b9959ad1bc0fe", + "max": 772, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_28133bfb9e1846239979aad0b0b1a912", + "value": 772 + } + }, + "333e7ed0a5a24e289a2775315336e1f7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_593946596d1448f3b1a485bac9d583df", + "placeholder": "​", + "style": "IPY_MODEL_e91a2991b3bc4a4e90d3928780c4eb34", + "value": " 772/772 [00:00<00:00, 14.2kB/s]" + } + }, + "8044dbc3b7f04c888b1e3a547c4f6eac": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "156b5a82fdb14777819a290c1f638c3c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c7c412609a104adcbc1a3fcee55a3205": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "59afc8dd721849e8a06b9959ad1bc0fe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "28133bfb9e1846239979aad0b0b1a912": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "593946596d1448f3b1a485bac9d583df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e91a2991b3bc4a4e90d3928780c4eb34": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "33ba4c1c6ba046b78c6c0e5ef987de2a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3dd07c7fd25f4a0cac9c6d79bf312c5b", + "IPY_MODEL_68d6df7a72b84174a216d6804a422c00", + "IPY_MODEL_02857e625d664d748f0411dbd9307285" + ], + "layout": "IPY_MODEL_adfc7c2afddb4979bd6e30d450d4efb6" + } + }, + "3dd07c7fd25f4a0cac9c6d79bf312c5b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cbf6d52e02bb47c8a4c73208c8cf658a", + "placeholder": "​", + "style": "IPY_MODEL_20b683d9910440f882ca10ba0837c545", + "value": "Map: 100%" + } + }, + "68d6df7a72b84174a216d6804a422c00": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_a39c78323e05487aa6c94e3c67b1d136", + "max": 2501, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_27ee050eee1f479b8cd328158e52e497", + "value": 2501 + } + }, + "02857e625d664d748f0411dbd9307285": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_50478538cf9947c5a760c538eaeb6f46", + "placeholder": "​", + "style": "IPY_MODEL_735906f0d31049e4af3fae4f2462cc05", + "value": " 2501/2501 [00:10<00:00, 260.48 examples/s]" + } + }, + "adfc7c2afddb4979bd6e30d450d4efb6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cbf6d52e02bb47c8a4c73208c8cf658a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "20b683d9910440f882ca10ba0837c545": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a39c78323e05487aa6c94e3c67b1d136": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + 
"model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27ee050eee1f479b8cd328158e52e497": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "50478538cf9947c5a760c538eaeb6f46": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "735906f0d31049e4af3fae4f2462cc05": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## Data Collection 🛠\n", + "\n", + "The subjQA dataset is constructed based on publicly available review datasets. Specifically, the movies, books, electronics, and grocery categories are constructed using reviews from the Amazon Review dataset. The TripAdvisor category, as the name suggests, is constructed using reviews from TripAdvisor which can be found [here](link). Finally, the restaurants category is constructed using the Yelp Dataset which is also publicly available.\n", + "\n", + "The process of constructing SubjQA is discussed in detail in our paper. 
In a nutshell, the dataset construction consists of the following steps:\n", + "\n", + "1. First, all opinions expressed in reviews are extracted. In the pipeline, each opinion is modeled as a (modifier, aspect) pair, which is a pair of spans where the former describes the latter *(e.g., \"good, hotel\" and \"terrible, acting\" are two extracted opinions)*.\n", + "2. Using matrix factorization techniques, implication relationships between different expressed opinions are mined. For instance, the system mines that \"responsive keys\" implies \"good keyboard\". In our pipeline, we refer to the conclusion of an implication (i.e., \"good keyboard\" in this example) as the query opinion, and we refer to the premise (i.e., \"responsive keys\") as its neighboring opinion.\n", + "3. Annotators are then asked to write a question based on query opinions. For instance, given \"good keyboard\" as the query opinion, they might write \"Is this keyboard any good?\"\n", + "4. Each question written based on a query opinion is then paired with a review that mentions its neighboring opinion. In our example, that would be a review that mentions \"responsive keys\".\n", + "5. The question and review pairs are presented to annotators to select the correct answer span, and rate the subjectivity level of the question as well as the subjectivity level of the highlighted answer span." 
+ ], + "metadata": { + "id": "r32wNE8C5FeQ" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Data Format 📊\n", + "\n", + "All files are in standard CSV format, and they consist of the following columns:\n", + "\n", + "- **domain**: The category/domain of the review (e.g., hotels, books, ...).\n", + "- **question**: The question (written based on a query opinion).\n", + "- **review**: The review (that mentions the neighboring opinion).\n", + "- **human_ans_spans**: The span labeled by annotators as the answer.\n", + "- **human_ans_indices**: The (character-level) start and end indices of the answer span highlighted by annotators.\n", + "- **question_subj_level**: The subjectivity level of the question (on a 1 to 5 scale with 1 being the most subjective).\n", + "- **ques_subj_score**: The subjectivity score of the question computed using the TextBlob package.\n", + "- **is_ques_subjective**: A boolean subjectivity label derived from question_subj_level (i.e., scores below 4 are considered subjective).\n", + "- **answer_subj_level**: The subjectivity level of the answer span (on a 1 to 5 scale with 1 being the most subjective).\n", + "- **ans_subj_score**: The subjectivity score of the answer span computed using the TextBlob package.\n", + "- **is_ans_subjective**: A boolean subjectivity label derived from answer_subj_level (i.e., scores below 4 are considered subjective).\n", + "- **nn_mod**: The modifier of the neighboring opinion (which appears in the review).\n", + "- **nn_asp**: The aspect of the neighboring opinion (which appears in the review).\n", + "- **query_mod**: The modifier of the query opinion (around which a question is manually written).\n", + "- **query_asp**: The aspect of the query opinion (around which a question is manually written).\n", + "- **item_id**: The id of the item/business discussed in the review.\n", + "- **review_id**: A unique id associated with the review.\n", + "- **q_review_id**: A unique id assigned to the 
question-review pair.\n", + "- **q_reviews_id**: A unique id assigned to all question-review pairs with a shared question." + ], + "metadata": { + "id": "GM783wxV4XQa" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Citation\n", + "Johannes Bjerva, Nikita Bhutani, Behzad Golshan, Wang-Chiew Tan, and Isabelle Augenstein. (2020). SubjQA: A Dataset for Subjectivity and Review Comprehension. In *Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing*. Association for Computational Linguistics." + ], + "metadata": { + "id": "1-kEwC-R5a14" + } + }, + { + "cell_type": "code", + "source": [ + "from google.colab import userdata\n", + "userdata.get('HuggingFace')\n", + "\n", + "# Retrieve secret name\n", + "secret_name = userdata.get('HuggingFace')\n", + "\n", + "# Set up Git configuration\n", + "!git config --global user.email \"kagantimur@icloud.com\"\n", + "!git config --global user.name \"kgntmr\"\n", + "\n", + "# Log in to the Hugging Face Hub\n", + "!huggingface-cli login" + ], + "metadata": { + "id": "f5HXB1xKmjIE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "3238b837634a47db899e9151534dbde8", + "b0e8e35e9046496aaa5a7d2b1edf1958", + "0892f90286544cc8ac9de26eda52b897", + "fdc7e5d873af4530bf7598665bb66517", + "dec50c0ecd64421aa55077bb56a034dc", + "a98b2edf76074df9bd3c7a25705af31a", + "f28d5164aedb452db1c9e649b564ca06", + "9a00148607964bc6ab902316c6a94b3d", + "59f55019601f4a898ec0f094686916ff", + "b0c6fbf0c0454b8db44050c207752cb4", + "4996420f16784d53b3bdc02bc36cbe9e", + "66c75939b38f4f4b907abd5699f7cee4", + "40f69a6162954f55839021035bce8863", + "1d03693f32af4e81a252a3ccfc3ec8c3", + "fd3e2731e3494babb7d9d73c4178954a", + "342c460a4df64397b6c754fb99262932", + 
"703cd77af78b47fc90e6ea357d67abf3", + "d8ffc35a02894ca183d4ed89b8bd3626", + "4913579fe7344f358604de28627841d3", + "03478eb3a7414966b67e054d8f0b13ec", + "017feac2ec2c414a9d9fed06a0236450", + "f33a070eaa7a461799076726260c2810", + "d6b917dc13614bb5b98e352ba7af0ded", + "ef71f921a16541b0b5e29f6c0cddd1da", + "2339c46035e54936a11239cda588df52", + "317307d87d924722b1cca58c7bfefb62", + "6ad25282973c4066a1bee09122582eb4", + "a9cf45485257457f82d79198464e9ba2", + "fb04e3f6fa7c4344b2187a8325f6b2cf", + "1b644e3182564dd5bef2627e93ff03a2", + "e09e10fcc0a04a0d956f61a980b30dd6", + "3a692ba3aaee44a19ae981ad2d0798ce" + ] + }, + "id": "yaTEJggLuZaK", + "outputId": "7c00023e-a2c1-4579-d095-efbf04f3e266" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "VBox(children=(HTML(value='
start_char or offset[context_end][1] < end_char:\n", + "# start_positions.append(0)\n", + "# end_positions.append(0)\n", + "# else:\n", + "# idx = context_start\n", + "# while idx <= context_end and offset[idx][0] <= start_char:\n", + "# idx += 1\n", + "# start_positions.append(idx - 1)\n", + "\n", + "# idx = context_end\n", + "# while idx >= context_start and offset[idx][1] >= end_char:\n", + "# idx -= 1\n", + "# end_positions.append(idx + 1)\n", + "\n", + "# # Add start and end positions to the inputs dictionary\n", + "# inputs[\"start_positions\"] = start_positions\n", + "# inputs[\"end_positions\"] = end_positions\n", + "\n", + "# # Return the modified inputs dictionary\n", + "# return inputs" + ], + "metadata": { + "id": "WqjndZe6vKwB" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "df_train=pd.read_csv('/content/drive/MyDrive/subjqa-train.csv')\n", + "df_test=pd.read_csv('/content/drive/MyDrive/subjqa-test.csv')" + ], + "metadata": { + "id": "O3WTknCj1qru" + }, + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Define the maximum length and stride parameters for tokenization\n", + "max_length = 384 # Maximum length of tokenized sequences, commonly used for a balance between context and memory usage\n", + "stride = 128 # Stride determines overlap between tokenized sequences, providing context while avoiding redundancy\n", + "\n", + "# Define a function to preprocess training samples\n", + "def preprocess_training_samples(samples, batch_size=32):\n", + " # Get the total number of samples\n", + " num_samples = len(samples[\"question\"])\n", + " processed_samples = []\n", + "\n", + " # Process samples in batches to optimize memory usage\n", + " for i in range(0, num_samples, batch_size):\n", + " # Extract questions, contexts, and answers for the current batch\n", + " batch_questions = [q.strip() for q in samples[\"question\"][i:i+batch_size]]\n", + " 
batch_contexts = samples[\"context\"][i:i+batch_size]\n", + " batch_answers = samples[\"answers\"][i:i+batch_size]\n", + "\n", + " # Tokenize questions and contexts using the tokenizer\n", + " inputs = tokenizer(\n", + " batch_questions,\n", + " batch_contexts,\n", + " max_length=max_length,\n", + " truncation=\"only_second\",\n", + " stride=stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\",\n", + " )\n", + "\n", + " # Extract offset_mapping and sample_map from tokenized inputs\n", + " offset_mapping = inputs.pop(\"offset_mapping\")\n", + " sample_map = inputs.pop(\"overflow_to_sample_mapping\")\n", + "\n", + " # Process each tokenized input in the batch\n", + " for j, offset in enumerate(offset_mapping):\n", + " # Get the sample index for the current tokenized input\n", + " sample_idx = sample_map[j]\n", + " answer = batch_answers[j]\n", + " start_char = answer[\"answer_start\"][0]\n", + " end_char = start_char + len(answer[\"text\"][0])\n", + " sequence_ids = inputs.sequence_ids(j)\n", + "\n", + " # Find the start and end token positions of the context\n", + " context_start = next(idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1)\n", + " context_end = next(idx for idx, seq_id in enumerate(sequence_ids[::-1]) if seq_id == 1)\n", + "\n", + " # Calculate start and end positions of the answer within the tokenized sequence\n", + " start_position = max(0, context_start - 1)\n", + " end_position = min(len(offset), context_end + 1)\n", + "\n", + " # Initialize lists to store start and end positions of answers\n", + " start_positions = [0] * len(offset)\n", + " end_positions = [0] * len(offset)\n", + "\n", + " # Set start and end positions if answer is fully contained within the context\n", + " if offset[start_position][0] <= start_char and offset[end_position][1] >= end_char:\n", + " start_positions[start_position] = 1\n", + " end_positions[end_position] = 1\n", + "\n", + " # Add tokenized 
input and corresponding start/end positions to processed samples\n", + " processed_samples.append({\n", + " \"inputs\": inputs,\n", + " \"start_positions\": start_positions,\n", + " \"end_positions\": end_positions\n", + " })\n", + "\n", + " return processed_samples" + ], + "metadata": { + "id": "eisYCm-TvyL1" + }, + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_train.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 608 + }, + "id": "HzhybUfc1BxM", + "outputId": "74d0ebc6-d59e-4ac8-a7d0-febb2080bf1b" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " item_id domain nn_mod nn_asp query_mod \\\n", + "0 B00BVMXBDO movies addictive show full \n", + "1 1404918051 movies enough simple film charming \n", + "2 B0000633ZP movies weak plot bad \n", + "3 B0000AQS0F movies outstanding show wonderful \n", + "4 B003Y5H5FG movies great production design great \n", + "\n", + " query_asp q_review_id \\\n", + "0 series d9a9615d45df2f6e6108db4ca46bfded \n", + "1 movie 06ffe37a8023636a3ce00b020a517e87 \n", + "2 one 3b625c68e91b9e6987a08b84a9a9d234 \n", + "3 series f3abfa98b011127e7cb49bcd07f8deeb \n", + "4 costume design 1b03744e764b257592c2c768345c14bc \n", + "\n", + " q_reviews_id \\\n", + "0 399f1046fe6bd97990107f9d7aa86f4a \n", + "1 42d9dd5b0c67150cac1e13308811cbb5 \n", + "2 32d06ccf2132cda644aea791fa688c53 \n", + "3 e546636f0bb9f93d5f24b4ade9ebab45 \n", + "4 a0a97e460a194bcb3286fe68d20aadc2 \n", + "\n", + " question question_subj_level \\\n", + "0 Who is the author of this series? 1 \n", + "1 Can we enjoy the movie along with our family ? 1 \n", + "2 Does this one good? 5 \n", + "3 Is this series good and excelent? 1 \n", + "4 How is the costume design? 
1 \n", + "\n", + " ques_subj_score is_ques_subjective review_id \\\n", + "0 0.0 False 090671369dddfeb02db9bf7125a47c79 \n", + "1 0.5 False a29821121e74d319cb93f77101e99c88 \n", + "2 0.6 True 12a1b821f761bd19a75be7b16cef4a7c \n", + "3 0.6 True cd0f92322e67cc9d70de6674caace78c \n", + "4 0.0 False f6b5024393ebc70287befdaf47a50b75 \n", + "\n", + " review \\\n", + "0 Whether it be in her portrayal of a nerdy lesb... \n", + "1 An outstanding romantic comedy, 13 Going on 30... \n", + "2 To let the truth be known, I watched this movi... \n", + "3 At the time of my review, there had been 910 c... \n", + "4 \"Fright Night\" is great! This is how the story... \n", + "\n", + " human_ans_spans human_ans_indices \\\n", + "0 ANSWERNOTFOUND (251, 265) \n", + "1 ANSWERNOTFOUND (1195, 1209) \n", + "2 ANSWERNOTFOUND (1476, 1490) \n", + "3 this show is OUTSTANDING (296, 320) \n", + "4 The costume design by Susan Matheson is great (1254, 1299) \n", + "\n", + " answer_subj_level ans_subj_score is_ans_subjective \n", + "0 1 0.000 False \n", + "1 1 0.000 False \n", + "2 5 0.000 False \n", + "3 1 0.875 True \n", + "4 1 0.750 True " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_iddomainnn_modnn_aspquery_modquery_aspq_review_idq_reviews_idquestionquestion_subj_levelques_subj_scoreis_ques_subjectivereview_idreviewhuman_ans_spanshuman_ans_indicesanswer_subj_levelans_subj_scoreis_ans_subjective
0B00BVMXBDOmoviesaddictiveshowfullseriesd9a9615d45df2f6e6108db4ca46bfded399f1046fe6bd97990107f9d7aa86f4aWho is the author of this series?10.0False090671369dddfeb02db9bf7125a47c79Whether it be in her portrayal of a nerdy lesb...ANSWERNOTFOUND(251, 265)10.000False
11404918051moviesenough simplefilmcharmingmovie06ffe37a8023636a3ce00b020a517e8742d9dd5b0c67150cac1e13308811cbb5Can we enjoy the movie along with our family ?10.5Falsea29821121e74d319cb93f77101e99c88An outstanding romantic comedy, 13 Going on 30...ANSWERNOTFOUND(1195, 1209)10.000False
2B0000633ZPmoviesweakplotbadone3b625c68e91b9e6987a08b84a9a9d23432d06ccf2132cda644aea791fa688c53Does this one good?50.6True12a1b821f761bd19a75be7b16cef4a7cTo let the truth be known, I watched this movi...ANSWERNOTFOUND(1476, 1490)50.000False
3B0000AQS0Fmoviesoutstandingshowwonderfulseriesf3abfa98b011127e7cb49bcd07f8deebe546636f0bb9f93d5f24b4ade9ebab45Is this series good and excelent?10.6Truecd0f92322e67cc9d70de6674caace78cAt the time of my review, there had been 910 c...this show is OUTSTANDING(296, 320)10.875True
4B003Y5H5FGmoviesgreatproduction designgreatcostume design1b03744e764b257592c2c768345c14bca0a97e460a194bcb3286fe68d20aadc2How is the costume design?10.0Falsef6b5024393ebc70287befdaf47a50b75\"Fright Night\" is great! This is how the story...The costume design by Susan Matheson is great(1254, 1299)10.750True
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_train", + "summary": "{\n \"name\": \"df_train\",\n \"rows\": 2501,\n \"fields\": [\n {\n \"column\": \"item_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 503,\n \"samples\": [\n \"B00005JNTI\",\n \"B000EQ5PUA\",\n \"B0000BWVCJ\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"domain\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"movies\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"nn_mod\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 277,\n \"samples\": [\n \"watchable\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"nn_asp\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 188,\n \"samples\": [\n \"emotion\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query_mod\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 314,\n \"samples\": [\n \"believable\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query_asp\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 181,\n \"samples\": [\n \"thing\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"q_review_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1369,\n \"samples\": [\n \"8563d7cf4c4f5432ec5cd62d06d436c1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"q_reviews_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1265,\n \"samples\": [\n \"c8c5527688a2acd208ba8d62e2dad23b\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"question\",\n \"properties\": {\n \"dtype\": 
\"category\",\n \"num_unique_values\": 798,\n \"samples\": [\n \"How can I read the set story at this hotel?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"question_subj_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ques_subj_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.29416550579019396,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 50,\n \"samples\": [\n 0.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"is_ques_subjective\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n true\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1330,\n \"samples\": [\n \"24ad2780d2984f734476f57a5d412537\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1330,\n \"samples\": [\n \"An outstanding animated romantic comedy, Shrek, brings to the screen the love story between, an ogre (Shrek), and a beautiful princess (Fiona), with all the ups and downs that that entails! In addition, the couple finds itself in the company of adorable characters from classic fairy tales, with the cherry on the cake being the hilarious talking donkey.It is a film about human relations, hope and second chances, but most importantly about trust, love, and inner strength.Mike Myers, Eddie Murphy, Cameron Diaz, and the rest of the cast, have truly outdone themselves with their performances, which are exceptional to say the least! All the actors, without exceptions, give it their 100% and it really shows (the animation does come ALIVE)! 
Very well written and very well presented, the movie is without a doubt guaranteed to provide more than just a few laughs. The film is simple enough, but does a great job of describing people's every day lives and the problems they face. It just goes to show that simplicity is often far better than complexity, when trying to present issues of a human nature.The setting, the plot, the dialogues, the HUMOR (!!!) and the music are all wonderful!In short, Shrek is a movie definitely worth watching and one to seriously consider adding to your movie collection! Strongly recommended along with Shrek 2. ANSWERNOTFOUND\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"human_ans_spans\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 906,\n \"samples\": [\n \"my review of Christopher Nolan 's \\\" Memento \\\"\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"human_ans_indices\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1744,\n \"samples\": [\n \"(985, 999)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer_subj_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ans_subj_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.33359523274248176,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 148,\n \"samples\": [\n 0.6749999999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"is_ans_subjective\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n true\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "markdown", + "source": 
[], + "metadata": { + "id": "GMK9xMMf4V3p" + } + }, + { + "cell_type": "code", + "source": [ + "df_train.info" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "PRe8FNYw133V", + "outputId": "75d5d576-55fd-48e0-d87a-0338ff0e41aa" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
\n", + "
pandas.core.frame.DataFrame.info
def info(verbose: bool | None=None, buf: WriteBuffer[str] | None=None, max_cols: int | None=None, memory_usage: bool | str | None=None, show_counts: bool | None=None, null_counts: bool | None=None) -> None
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.pyPrint a concise summary of a DataFrame.\n",
+              "\n",
+              "This method prints information about a DataFrame including\n",
+              "the index dtype and columns, non-null values and memory usage.\n",
+              "\n",
+              "Parameters\n",
+              "----------\n",
+              "verbose : bool, optional\n",
+              "    Whether to print the full summary. By default, the setting in\n",
+              "    ``pandas.options.display.max_info_columns`` is followed.\n",
+              "buf : writable buffer, defaults to sys.stdout\n",
+              "    Where to send the output. By default, the output is printed to\n",
+              "    sys.stdout. Pass a writable buffer if you need to further process\n",
+              "    the output.    max_cols : int, optional\n",
+              "    When to switch from the verbose to the truncated output. If the\n",
+              "    DataFrame has more than `max_cols` columns, the truncated output\n",
+              "    is used. By default, the setting in\n",
+              "    ``pandas.options.display.max_info_columns`` is used.\n",
+              "memory_usage : bool, str, optional\n",
+              "    Specifies whether total memory usage of the DataFrame\n",
+              "    elements (including the index) should be displayed. By default,\n",
+              "    this follows the ``pandas.options.display.memory_usage`` setting.\n",
+              "\n",
+              "    True always show memory usage. False never shows memory usage.\n",
+              "    A value of 'deep' is equivalent to "True with deep introspection".\n",
+              "    Memory usage is shown in human-readable units (base-2\n",
+              "    representation). Without deep introspection a memory estimation is\n",
+              "    made based in column dtype and number of rows assuming values\n",
+              "    consume the same memory amount for corresponding dtypes. With deep\n",
+              "    memory introspection, a real memory usage calculation is performed\n",
+              "    at the cost of computational resources. See the\n",
+              "    :ref:`Frequently Asked Questions <df-memory-usage>` for more\n",
+              "    details.\n",
+              "show_counts : bool, optional\n",
+              "    Whether to show the non-null counts. By default, this is shown\n",
+              "    only if the DataFrame is smaller than\n",
+              "    ``pandas.options.display.max_info_rows`` and\n",
+              "    ``pandas.options.display.max_info_columns``. A value of True always\n",
+              "    shows the counts, and False never shows the counts.\n",
+              "null_counts : bool, optional\n",
+              "    .. deprecated:: 1.2.0\n",
+              "        Use show_counts instead.\n",
+              "\n",
+              "Returns\n",
+              "-------\n",
+              "None\n",
+              "    This method prints a summary of a DataFrame and returns None.\n",
+              "\n",
+              "See Also\n",
+              "--------\n",
+              "DataFrame.describe: Generate descriptive statistics of DataFrame\n",
+              "    columns.\n",
+              "DataFrame.memory_usage: Memory usage of DataFrame columns.\n",
+              "\n",
+              "Examples\n",
+              "--------\n",
+              ">>> int_values = [1, 2, 3, 4, 5]\n",
+              ">>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']\n",
+              ">>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]\n",
+              ">>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,\n",
+              "...                   "float_col": float_values})\n",
+              ">>> df\n",
+              "    int_col text_col  float_col\n",
+              "0        1    alpha       0.00\n",
+              "1        2     beta       0.25\n",
+              "2        3    gamma       0.50\n",
+              "3        4    delta       0.75\n",
+              "4        5  epsilon       1.00\n",
+              "\n",
+              "Prints information of all columns:\n",
+              "\n",
+              ">>> df.info(verbose=True)\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 5 entries, 0 to 4\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column     Non-Null Count  Dtype\n",
+              "---  ------     --------------  -----\n",
+              " 0   int_col    5 non-null      int64\n",
+              " 1   text_col   5 non-null      object\n",
+              " 2   float_col  5 non-null      float64\n",
+              "dtypes: float64(1), int64(1), object(1)\n",
+              "memory usage: 248.0+ bytes\n",
+              "\n",
+              "Prints a summary of columns count and its dtypes but not per column\n",
+              "information:\n",
+              "\n",
+              ">>> df.info(verbose=False)\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 5 entries, 0 to 4\n",
+              "Columns: 3 entries, int_col to float_col\n",
+              "dtypes: float64(1), int64(1), object(1)\n",
+              "memory usage: 248.0+ bytes\n",
+              "\n",
+              "Pipe output of DataFrame.info to buffer instead of sys.stdout, get\n",
+              "buffer content and writes to a text file:\n",
+              "\n",
+              ">>> import io\n",
+              ">>> buffer = io.StringIO()\n",
+              ">>> df.info(buf=buffer)\n",
+              ">>> s = buffer.getvalue()\n",
+              ">>> with open("df_info.txt", "w",\n",
+              "...           encoding="utf-8") as f:  # doctest: +SKIP\n",
+              "...     f.write(s)\n",
+              "260\n",
+              "\n",
+              "The `memory_usage` parameter allows deep introspection mode, specially\n",
+              "useful for big DataFrames and fine-tune memory optimization:\n",
+              "\n",
+              ">>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
+              ">>> df = pd.DataFrame({\n",
+              "...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
+              "...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
+              "...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
+              "... })\n",
+              ">>> df.info()\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 1000000 entries, 0 to 999999\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column    Non-Null Count    Dtype\n",
+              "---  ------    --------------    -----\n",
+              " 0   column_1  1000000 non-null  object\n",
+              " 1   column_2  1000000 non-null  object\n",
+              " 2   column_3  1000000 non-null  object\n",
+              "dtypes: object(3)\n",
+              "memory usage: 22.9+ MB\n",
+              "\n",
+              ">>> df.info(memory_usage='deep')\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 1000000 entries, 0 to 999999\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column    Non-Null Count    Dtype\n",
+              "---  ------    --------------    -----\n",
+              " 0   column_1  1000000 non-null  object\n",
+              " 1   column_2  1000000 non-null  object\n",
+              " 2   column_3  1000000 non-null  object\n",
+              "dtypes: object(3)\n",
+              "memory usage: 165.9 MB
\n", + " \n", + "
" + ] + }, + "metadata": {}, + "execution_count": 29 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_train.columns" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5X6yTrJ-2Fmo", + "outputId": "ca4ad115-d9d0-4ef9-dd94-37c0d926036c" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['item_id', 'domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp',\n", + " 'q_review_id', 'q_reviews_id', 'question', 'question_subj_level',\n", + " 'ques_subj_score', 'is_ques_subjective', 'review_id', 'review',\n", + " 'human_ans_spans', 'human_ans_indices', 'answer_subj_level',\n", + " 'ans_subj_score', 'is_ans_subjective'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_train.iloc[0:10].question" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6nnoYQyK2MBe", + "outputId": "cd521ee7-69b9-47b3-83c0-5b36947aeb37" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 Who is the author of this series?\n", + "1 Can we enjoy the movie along with our family ?\n", + "2 Does this one good?\n", + "3 Is this series good and excelent?\n", + "4 How is the costume design?\n", + "5 How are the special effects?\n", + "6 Do you have any credit?\n", + "7 How do you like the story?\n", + "8 What criticism deserves the movie Passion of C...\n", + "9 How much is missing from the collection?\n", + "Name: question, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_train.iloc[2].review" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 178 + }, + "id": "0lpkVFjm26FR", + "outputId": "79204717-1a1f-4f8d-cb5b-5795cf5e4e9d" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": 
"execute_result", + "data": { + "text/plain": [ + "\"To let the truth be known, I watched this movie with a mix of anticipation and fear. Being an avid Star Wars fan, I was excited to see any Star Wars movie, but I suspected this would be as disappointing as the Phantom Menace. WRONG! Although this doesn't even come close to the great casting and story lines and sheer art of the first three Star Wars series, it was WAY better than Phantom Menace for the following reasons: 1) This movie included LESS Jar-Jar, which, despite initial heavy marketing for the first movie, the character was found by the general consensus to be REALLY annoying. 2) This movie demonstrated some of the political turmoil behind the original Star Wars movies. 3) You get to see some of what led Anakin to turn over to the Dark Side. Finally, the special effects were really good!It was not 4 or 5 stars because the actors that were cast in this movie (as well as The Phantom Menace) are all well known for other cinematic accomplishments, and it was hard to believe that they were supposed to be these other characters. They should have casted lesser-known actors, in my opinion. Also, the plot about the clones was weak, to me.But, note well- the fight-scene with Yoda by itself makes the movie worth watching. It was action packed, entertaining, and even a little bit funny. I do recommend this movie to any Star Wars fan, way way better than the Phantom Menace, but do not go into it expecting it to be as good as the first Star Wars series. 
ANSWERNOTFOUND\"" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_train.iloc[2].human_ans_indices" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "CNxpn9lv3QRx", + "outputId": "0072592a-3de1-4d67-ca7a-57437419618e" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'(1476, 1490)'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 57 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_train.iloc[2].review[1476:1490]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "mi0231lB3cjJ", + "outputId": "e3145726-3527-48b0-dc07-63d1ebee6da0" + }, + "execution_count": 59, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'ANSWERNOTFOUND'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 59 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Picking the necessary columns for further analysis\n", + "df_train=df_train[['question','human_ans_indices','review','human_ans_spans']]\n", + "df_test=df_test[['question','human_ans_indices','review','human_ans_spans']]" + ], + "metadata": { + "id": "GYTknC6D3zZQ" + }, + "execution_count": 60, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Generate a sequence evenly spaced numbers\n", + "import numpy as np\n", + "df_train['id']=np.linspace(0,len(df_train)-1,len(df_train)) # Generates a sequence of IDs from 0 to the length of the training data minus 1\n", + "df_test['id']=np.linspace(0,len(df_test)-1,len(df_test)) # Same\n", + "\n", + "# Convert to strings\n", + 
"df_train['id']=df_train['id'].astype(str)\n", + "df_test['id']=df_test['id'].astype(str)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ginf6f7C330b", + "outputId": "5bb1df99-c9d1-47aa-c417-98163f3e7e5f" + }, + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_train['id']=np.linspace(0,len(df_train)-1,len(df_train))\n", + ":5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_train['id']=df_train['id'].astype(str)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_train.info" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "IA1O56Rt5m7m", + "outputId": "1c782044-f9f5-4833-c907-24595fb5ad3d" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
\n", + "
pandas.core.frame.DataFrame.info
def info(verbose: bool | None=None, buf: WriteBuffer[str] | None=None, max_cols: int | None=None, memory_usage: bool | str | None=None, show_counts: bool | None=None, null_counts: bool | None=None) -> None
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.pyPrint a concise summary of a DataFrame.\n",
+              "\n",
+              "This method prints information about a DataFrame including\n",
+              "the index dtype and columns, non-null values and memory usage.\n",
+              "\n",
+              "Parameters\n",
+              "----------\n",
+              "verbose : bool, optional\n",
+              "    Whether to print the full summary. By default, the setting in\n",
+              "    ``pandas.options.display.max_info_columns`` is followed.\n",
+              "buf : writable buffer, defaults to sys.stdout\n",
+              "    Where to send the output. By default, the output is printed to\n",
+              "    sys.stdout. Pass a writable buffer if you need to further process\n",
+              "    the output.    max_cols : int, optional\n",
+              "    When to switch from the verbose to the truncated output. If the\n",
+              "    DataFrame has more than `max_cols` columns, the truncated output\n",
+              "    is used. By default, the setting in\n",
+              "    ``pandas.options.display.max_info_columns`` is used.\n",
+              "memory_usage : bool, str, optional\n",
+              "    Specifies whether total memory usage of the DataFrame\n",
+              "    elements (including the index) should be displayed. By default,\n",
+              "    this follows the ``pandas.options.display.memory_usage`` setting.\n",
+              "\n",
+              "    True always show memory usage. False never shows memory usage.\n",
+              "    A value of 'deep' is equivalent to "True with deep introspection".\n",
+              "    Memory usage is shown in human-readable units (base-2\n",
+              "    representation). Without deep introspection a memory estimation is\n",
+              "    made based in column dtype and number of rows assuming values\n",
+              "    consume the same memory amount for corresponding dtypes. With deep\n",
+              "    memory introspection, a real memory usage calculation is performed\n",
+              "    at the cost of computational resources. See the\n",
+              "    :ref:`Frequently Asked Questions <df-memory-usage>` for more\n",
+              "    details.\n",
+              "show_counts : bool, optional\n",
+              "    Whether to show the non-null counts. By default, this is shown\n",
+              "    only if the DataFrame is smaller than\n",
+              "    ``pandas.options.display.max_info_rows`` and\n",
+              "    ``pandas.options.display.max_info_columns``. A value of True always\n",
+              "    shows the counts, and False never shows the counts.\n",
+              "null_counts : bool, optional\n",
+              "    .. deprecated:: 1.2.0\n",
+              "        Use show_counts instead.\n",
+              "\n",
+              "Returns\n",
+              "-------\n",
+              "None\n",
+              "    This method prints a summary of a DataFrame and returns None.\n",
+              "\n",
+              "See Also\n",
+              "--------\n",
+              "DataFrame.describe: Generate descriptive statistics of DataFrame\n",
+              "    columns.\n",
+              "DataFrame.memory_usage: Memory usage of DataFrame columns.\n",
+              "\n",
+              "Examples\n",
+              "--------\n",
+              ">>> int_values = [1, 2, 3, 4, 5]\n",
+              ">>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']\n",
+              ">>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]\n",
+              ">>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,\n",
+              "...                   "float_col": float_values})\n",
+              ">>> df\n",
+              "    int_col text_col  float_col\n",
+              "0        1    alpha       0.00\n",
+              "1        2     beta       0.25\n",
+              "2        3    gamma       0.50\n",
+              "3        4    delta       0.75\n",
+              "4        5  epsilon       1.00\n",
+              "\n",
+              "Prints information of all columns:\n",
+              "\n",
+              ">>> df.info(verbose=True)\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 5 entries, 0 to 4\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column     Non-Null Count  Dtype\n",
+              "---  ------     --------------  -----\n",
+              " 0   int_col    5 non-null      int64\n",
+              " 1   text_col   5 non-null      object\n",
+              " 2   float_col  5 non-null      float64\n",
+              "dtypes: float64(1), int64(1), object(1)\n",
+              "memory usage: 248.0+ bytes\n",
+              "\n",
+              "Prints a summary of columns count and its dtypes but not per column\n",
+              "information:\n",
+              "\n",
+              ">>> df.info(verbose=False)\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 5 entries, 0 to 4\n",
+              "Columns: 3 entries, int_col to float_col\n",
+              "dtypes: float64(1), int64(1), object(1)\n",
+              "memory usage: 248.0+ bytes\n",
+              "\n",
+              "Pipe output of DataFrame.info to buffer instead of sys.stdout, get\n",
+              "buffer content and writes to a text file:\n",
+              "\n",
+              ">>> import io\n",
+              ">>> buffer = io.StringIO()\n",
+              ">>> df.info(buf=buffer)\n",
+              ">>> s = buffer.getvalue()\n",
+              ">>> with open("df_info.txt", "w",\n",
+              "...           encoding="utf-8") as f:  # doctest: +SKIP\n",
+              "...     f.write(s)\n",
+              "260\n",
+              "\n",
+              "The `memory_usage` parameter allows deep introspection mode, specially\n",
+              "useful for big DataFrames and fine-tune memory optimization:\n",
+              "\n",
+              ">>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
+              ">>> df = pd.DataFrame({\n",
+              "...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
+              "...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
+              "...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
+              "... })\n",
+              ">>> df.info()\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 1000000 entries, 0 to 999999\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column    Non-Null Count    Dtype\n",
+              "---  ------    --------------    -----\n",
+              " 0   column_1  1000000 non-null  object\n",
+              " 1   column_2  1000000 non-null  object\n",
+              " 2   column_3  1000000 non-null  object\n",
+              "dtypes: object(3)\n",
+              "memory usage: 22.9+ MB\n",
+              "\n",
+              ">>> df.info(memory_usage='deep')\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 1000000 entries, 0 to 999999\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column    Non-Null Count    Dtype\n",
+              "---  ------    --------------    -----\n",
+              " 0   column_1  1000000 non-null  object\n",
+              " 1   column_2  1000000 non-null  object\n",
+              " 2   column_3  1000000 non-null  object\n",
+              "dtypes: object(3)\n",
+              "memory usage: 165.9 MB
\n", + " \n", + "
" + ] + }, + "metadata": {}, + "execution_count": 63 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_test.info" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "6PuUHxJ75tI9", + "outputId": "231377e9-d714-4b80-97d5-79ad13b7df0a" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
\n", + "
pandas.core.frame.DataFrame.info
def info(verbose: bool | None=None, buf: WriteBuffer[str] | None=None, max_cols: int | None=None, memory_usage: bool | str | None=None, show_counts: bool | None=None, null_counts: bool | None=None) -> None
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.pyPrint a concise summary of a DataFrame.\n",
+              "\n",
+              "This method prints information about a DataFrame including\n",
+              "the index dtype and columns, non-null values and memory usage.\n",
+              "\n",
+              "Parameters\n",
+              "----------\n",
+              "verbose : bool, optional\n",
+              "    Whether to print the full summary. By default, the setting in\n",
+              "    ``pandas.options.display.max_info_columns`` is followed.\n",
+              "buf : writable buffer, defaults to sys.stdout\n",
+              "    Where to send the output. By default, the output is printed to\n",
+              "    sys.stdout. Pass a writable buffer if you need to further process\n",
+              "    the output.    max_cols : int, optional\n",
+              "    When to switch from the verbose to the truncated output. If the\n",
+              "    DataFrame has more than `max_cols` columns, the truncated output\n",
+              "    is used. By default, the setting in\n",
+              "    ``pandas.options.display.max_info_columns`` is used.\n",
+              "memory_usage : bool, str, optional\n",
+              "    Specifies whether total memory usage of the DataFrame\n",
+              "    elements (including the index) should be displayed. By default,\n",
+              "    this follows the ``pandas.options.display.memory_usage`` setting.\n",
+              "\n",
+              "    True always show memory usage. False never shows memory usage.\n",
+              "    A value of 'deep' is equivalent to "True with deep introspection".\n",
+              "    Memory usage is shown in human-readable units (base-2\n",
+              "    representation). Without deep introspection a memory estimation is\n",
+              "    made based in column dtype and number of rows assuming values\n",
+              "    consume the same memory amount for corresponding dtypes. With deep\n",
+              "    memory introspection, a real memory usage calculation is performed\n",
+              "    at the cost of computational resources. See the\n",
+              "    :ref:`Frequently Asked Questions <df-memory-usage>` for more\n",
+              "    details.\n",
+              "show_counts : bool, optional\n",
+              "    Whether to show the non-null counts. By default, this is shown\n",
+              "    only if the DataFrame is smaller than\n",
+              "    ``pandas.options.display.max_info_rows`` and\n",
+              "    ``pandas.options.display.max_info_columns``. A value of True always\n",
+              "    shows the counts, and False never shows the counts.\n",
+              "null_counts : bool, optional\n",
+              "    .. deprecated:: 1.2.0\n",
+              "        Use show_counts instead.\n",
+              "\n",
+              "Returns\n",
+              "-------\n",
+              "None\n",
+              "    This method prints a summary of a DataFrame and returns None.\n",
+              "\n",
+              "See Also\n",
+              "--------\n",
+              "DataFrame.describe: Generate descriptive statistics of DataFrame\n",
+              "    columns.\n",
+              "DataFrame.memory_usage: Memory usage of DataFrame columns.\n",
+              "\n",
+              "Examples\n",
+              "--------\n",
+              ">>> int_values = [1, 2, 3, 4, 5]\n",
+              ">>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']\n",
+              ">>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]\n",
+              ">>> df = pd.DataFrame({\"int_col\": int_values, \"text_col\": text_values,\n",
+              "...                   \"float_col\": float_values})\n",
+              ">>> df\n",
+              "    int_col text_col  float_col\n",
+              "0        1    alpha       0.00\n",
+              "1        2     beta       0.25\n",
+              "2        3    gamma       0.50\n",
+              "3        4    delta       0.75\n",
+              "4        5  epsilon       1.00\n",
+              "\n",
+              "Prints information of all columns:\n",
+              "\n",
+              ">>> df.info(verbose=True)\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 5 entries, 0 to 4\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column     Non-Null Count  Dtype\n",
+              "---  ------     --------------  -----\n",
+              " 0   int_col    5 non-null      int64\n",
+              " 1   text_col   5 non-null      object\n",
+              " 2   float_col  5 non-null      float64\n",
+              "dtypes: float64(1), int64(1), object(1)\n",
+              "memory usage: 248.0+ bytes\n",
+              "\n",
+              "Prints a summary of columns count and its dtypes but not per column\n",
+              "information:\n",
+              "\n",
+              ">>> df.info(verbose=False)\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 5 entries, 0 to 4\n",
+              "Columns: 3 entries, int_col to float_col\n",
+              "dtypes: float64(1), int64(1), object(1)\n",
+              "memory usage: 248.0+ bytes\n",
+              "\n",
+              "Pipe output of DataFrame.info to buffer instead of sys.stdout, get\n",
+              "buffer content and writes to a text file:\n",
+              "\n",
+              ">>> import io\n",
+              ">>> buffer = io.StringIO()\n",
+              ">>> df.info(buf=buffer)\n",
+              ">>> s = buffer.getvalue()\n",
+              ">>> with open(\"df_info.txt\", \"w\",\n",
+              "...           encoding=\"utf-8\") as f:  # doctest: +SKIP\n",
+              "...     f.write(s)\n",
+              "260\n",
+              "\n",
+              "The `memory_usage` parameter allows deep introspection mode, specially\n",
+              "useful for big DataFrames and fine-tune memory optimization:\n",
+              "\n",
+              ">>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
+              ">>> df = pd.DataFrame({\n",
+              "...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
+              "...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
+              "...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
+              "... })\n",
+              ">>> df.info()\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 1000000 entries, 0 to 999999\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column    Non-Null Count    Dtype\n",
+              "---  ------    --------------    -----\n",
+              " 0   column_1  1000000 non-null  object\n",
+              " 1   column_2  1000000 non-null  object\n",
+              " 2   column_3  1000000 non-null  object\n",
+              "dtypes: object(3)\n",
+              "memory usage: 22.9+ MB\n",
+              "\n",
+              ">>> df.info(memory_usage='deep')\n",
+              "<class 'pandas.core.frame.DataFrame'>\n",
+              "RangeIndex: 1000000 entries, 0 to 999999\n",
+              "Data columns (total 3 columns):\n",
+              " #   Column    Non-Null Count    Dtype\n",
+              "---  ------    --------------    -----\n",
+              " 0   column_1  1000000 non-null  object\n",
+              " 1   column_2  1000000 non-null  object\n",
+              " 2   column_3  1000000 non-null  object\n",
+              "dtypes: object(3)\n",
+              "memory usage: 165.9 MB
\n", + " \n", + "
" + ] + }, + "metadata": {}, + "execution_count": 64 + } + ] + }, + { + "cell_type": "code", + "source": [ + "int(df_train.iloc[0].human_ans_indices.split('(')[1].split(',')[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dDGXgO925wRW", + "outputId": "96b23f6e-6300-4c92-9aac-74bc7377ad7e" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "251" + ] + }, + "metadata": {}, + "execution_count": 68 + } + ] + }, + { + "cell_type": "code", + "source": [ + "float(df_train.iloc[0].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NCXUJLI09hjC", + "outputId": "9f08eadf-f05c-4be8-b9c6-d2630b824462" + }, + "execution_count": 67, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "265.0" + ] + }, + "metadata": {}, + "execution_count": 67 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Where the answers are\n", + "df_train['answers']=df_train['human_ans_spans']\n", + "# Actual answer text itself, right answer where should be\n", + "df_test['answers']=df_test['human_ans_spans']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_2AIdMOg_E9_", + "outputId": "39e7aa3d-fe13-47d6-9312-ab26048a9cdb" + }, + "execution_count": 70, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_train['answers']=df_train['human_ans_spans']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Extract answer data and adds it to a new 
column\n", + "for i in range(0,len(df_train)):\n", + " answer1={}\n", + " si=int(df_train.iloc[i].human_ans_indices.split('(')[1].split(',')[0])\n", + " ei=int(df_train.iloc[i].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])\n", + " answer1['text']=[df_train.iloc[i].review[si:ei]]\n", + " answer1['answer_start']=[si]\n", + " df_train.at[i, 'answers']=answer1" + ], + "metadata": { + "id": "NBY1eCL9-wNR" + }, + "execution_count": 71, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(df_train.iloc[i].answers,df_train.iloc[i].human_ans_spans)" + ], + "metadata": { + "id": "XEPxIoLK-yg_" + }, + "execution_count": 72, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Same for the test data\n", + "for i in range(0,len(df_test)):\n", + " answer1={}\n", + " si=int(df_test.iloc[i].human_ans_indices.split('(')[1].split(',')[0])\n", + " ei=int(df_test.iloc[i].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])\n", + " answer1['text']=[df_test.iloc[i].review[si:ei]]\n", + " answer1['answer_start']=[si]\n", + " df_test.at[i, 'answers']=answer1" + ], + "metadata": { + "id": "M5vNLyBe_Oac" + }, + "execution_count": 73, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(df_train.iloc[i].answers,df_train.iloc[i].human_ans_spans)" + ], + "metadata": { + "id": "FDsJrxf5_SPi" + }, + "execution_count": 74, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_train.columns" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QZMKSb7d_UG1", + "outputId": "f71e43ec-6df8-43e9-a5c3-1f1f04bd5a64" + }, + "execution_count": 75, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['question', 'human_ans_indices', 'review', 'human_ans_spans', 'id',\n", + " 'answers'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 75 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# 
Standardizing the columns for clarity (context)\n", + "df_train.columns=['question', 'human_ans_indices', 'context', 'human_ans_spans', 'id',\n", + " 'answers']\n", + "\n", + "df_test.columns=['question', 'human_ans_indices', 'context', 'human_ans_spans','id',\n", + " 'answers']" + ], + "metadata": { + "id": "dAWno84T_zo0" + }, + "execution_count": 76, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Creating Datasets from Pandas DataFrames for Validation and Training\n", + "val_dataset2 = datasets.Dataset.from_pandas(df_test)\n", + "train_dataset2 = datasets.Dataset.from_pandas(df_train)" + ], + "metadata": { + "id": "f_m7XTH4_1ez" + }, + "execution_count": 78, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Preprocess the training examples .map() function on training dataset with the preprocessing function\n", + "train_dataset = train_dataset2.map(\n", + " preprocess_training_examples,\n", + " batched=True,\n", + " remove_columns=train_dataset2.column_names,\n", + ")\n", + "len(train_dataset2), len(train_dataset) # compare the lengths of the original dataset (train_dataset2) and the preprocessed dataset (train_dataset)." + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67, + "referenced_widgets": [ + "33ba4c1c6ba046b78c6c0e5ef987de2a", + "3dd07c7fd25f4a0cac9c6d79bf312c5b", + "68d6df7a72b84174a216d6804a422c00", + "02857e625d664d748f0411dbd9307285", + "adfc7c2afddb4979bd6e30d450d4efb6", + "cbf6d52e02bb47c8a4c73208c8cf658a", + "20b683d9910440f882ca10ba0837c545", + "a39c78323e05487aa6c94e3c67b1d136", + "27ee050eee1f479b8cd328158e52e497", + "50478538cf9947c5a760c538eaeb6f46", + "735906f0d31049e4af3fae4f2462cc05" + ] + }, + "id": "ShRJOEGxAgU0", + "outputId": "8db75a97-189a-4bdc-b64a-5f52ab020729" + }, + "execution_count": 81, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Map: 0%| | 0/2501 [00:00