From c42cbe74e57e7ead3275ccf5aadebdbef70fc2ab Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Mon, 3 Jun 2024 22:54:45 -0500 Subject: [PATCH] Fetch Surrounding Chunks commit of Fetch Surrounding Chunks python notebook --- .../fetch-surrounding-chunks.ipynb | 4048 +++++++++++++++++ 1 file changed, 4048 insertions(+) create mode 100644 notebooks/document-chunking/fetch-surrounding-chunks.ipynb diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb new file mode 100644 index 00000000..9bd766c9 --- /dev/null +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -0,0 +1,4048 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "9c160e35cf414c528b5bffe05725a7d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", + "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", + "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" + ], + "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" + } + }, + "e87bc6913a7747728aed4b60a645bc2c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d93a6e3ddd364921a7c2a24451d27ffc", + "placeholder": "​", + "style": "IPY_MODEL_2155cf3c7b2043d0a41fc011bf4f0e04", + "value": "tokenizer_config.json: 100%" + } + }, + "9fa94c466004402bb293e4aa0bdc82f4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_500a70f25097484bbec10c0ffd402595", + "max": 48, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9d43995246744e26a8053c21e2c5fcfa", + "value": 48 + } + }, + "95316b2f654a4ddc99c92d7c60c2f417": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ed85f5360ba4feda6469aabd0324e7a", + "placeholder": "​", + "style": "IPY_MODEL_808af1e1f2464a928ee23398c837ff48", + "value": " 48.0/48.0 [00:00<00:00, 1.58kB/s]" + } + }, + "0fc0b516e82941dc934c26eba22d9e01": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d93a6e3ddd364921a7c2a24451d27ffc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": 
null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2155cf3c7b2043d0a41fc011bf4f0e04": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "500a70f25097484bbec10c0ffd402595": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d43995246744e26a8053c21e2c5fcfa": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2ed85f5360ba4feda6469aabd0324e7a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "808af1e1f2464a928ee23398c837ff48": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "31906527169a4c08801dc6b21936188d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a6570ce51dfc46f383d855e28534bf73", + "IPY_MODEL_41cc49a71a164065bc833d080027e4d2", + "IPY_MODEL_748e7f3c8da243e9b5320654ec8e8146" + ], + "layout": "IPY_MODEL_a88953429ab6436fb4f01b6b1e2cf6ff" + } + }, + "a6570ce51dfc46f383d855e28534bf73": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0a2671c90a048548314c2e3d21e19e7", + "placeholder": "​", + "style": "IPY_MODEL_5f2080a5d12241638447a5851d0c8db3", + "value": "vocab.txt: 100%" + } + }, + "41cc49a71a164065bc833d080027e4d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ce44d2f323d45838633a750f2386525", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_406f3564a217478d8f60dee5e1fb6dbf", + "value": 231508 + } + }, + "748e7f3c8da243e9b5320654ec8e8146": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_61ae734ac8d441fd9b3ea198aff3f2c7", + "placeholder": "​", + "style": "IPY_MODEL_bc52c57fa6464ab39823cd3ddb9d7d78", + "value": " 232k/232k [00:00<00:00, 2.88MB/s]" + } + }, + "a88953429ab6436fb4f01b6b1e2cf6ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0a2671c90a048548314c2e3d21e19e7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f2080a5d12241638447a5851d0c8db3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7ce44d2f323d45838633a750f2386525": { + "model_module": "@jupyter-widgets/base", + "model_name": 
"LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "406f3564a217478d8f60dee5e1fb6dbf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "61ae734ac8d441fd9b3ea198aff3f2c7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc52c57fa6464ab39823cd3ddb9d7d78": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "02f735a438bf4058a9cfacf8d2b8660f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_331b178397164de49408dc50ce417a36", + "IPY_MODEL_07ee43d2a1684fb0b1445755802b6ea5", + "IPY_MODEL_c867bce7e34b4800903eb9ec99f34784" + ], + "layout": 
"IPY_MODEL_8169e16a9b0146f5a57a015601c2ebcb" + } + }, + "331b178397164de49408dc50ce417a36": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35ca86faebfd43faaef0202389d958fd", + "placeholder": "​", + "style": "IPY_MODEL_f04f37ba10e9498ea61acdce637431ee", + "value": "tokenizer.json: 100%" + } + }, + "07ee43d2a1684fb0b1445755802b6ea5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_527bfa6067c84b94a1e70dfadfd4b78e", + "max": 466062, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_312e85864e074b958d86325b6417a0fa", + "value": 466062 + } + }, + "c867bce7e34b4800903eb9ec99f34784": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b0fc37739334025b037a5270c9515bf", + 
"placeholder": "​", + "style": "IPY_MODEL_0dfb7f264674449b92a390324d17c4cf", + "value": " 466k/466k [00:00<00:00, 6.88MB/s]" + } + }, + "8169e16a9b0146f5a57a015601c2ebcb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35ca86faebfd43faaef0202389d958fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f04f37ba10e9498ea61acdce637431ee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "527bfa6067c84b94a1e70dfadfd4b78e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "312e85864e074b958d86325b6417a0fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3b0fc37739334025b037a5270c9515bf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": 
null, + "top": null, + "visibility": null, + "width": null + } + }, + "0dfb7f264674449b92a390324d17c4cf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7e58bf25549d4b428f231d528e8fef54": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_461ca08f677a4cba9ec2a388c2e346f3", + "IPY_MODEL_90d31fb52af949b0a2b41e3613827233", + "IPY_MODEL_1afbe347ab364b28b887f49dad54f5d7" + ], + "layout": "IPY_MODEL_8483a759cc0e4e12834fc7d08dab3b7e" + } + }, + "461ca08f677a4cba9ec2a388c2e346f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59d8ffb31bb340eba7e0dcebfbbdd977", + "placeholder": "​", + "style": "IPY_MODEL_2b62b542c091466cbae559e29ec797bd", + "value": "config.json: 100%" + } + }, + "90d31fb52af949b0a2b41e3613827233": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_092a4de220ba4ca2a23a0f273aba601b", + "max": 570, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e0c46565371f437a85a26d44c5b20c5b", + "value": 570 + } + }, + "1afbe347ab364b28b887f49dad54f5d7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4831ce9114e5437ea8a24919557c40e2", + "placeholder": "​", + "style": "IPY_MODEL_53344cac458d4d5ebdc504744c18b7de", + "value": " 570/570 [00:00<00:00, 19.9kB/s]" + } + }, + "8483a759cc0e4e12834fc7d08dab3b7e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59d8ffb31bb340eba7e0dcebfbbdd977": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2b62b542c091466cbae559e29ec797bd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "092a4de220ba4ca2a23a0f273aba601b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0c46565371f437a85a26d44c5b20c5b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"4831ce9114e5437ea8a24919557c40e2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53344cac458d4d5ebdc504744c18b7de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f34bf7b0bb424a8e8c00ff75309bbe6f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d604bc170c02491fae573c702e790893", + "IPY_MODEL_d8a3bdb8be354365944ab587738280d3", + "IPY_MODEL_89ad2dee66324ae896eec71924aee670" + ], + "layout": "IPY_MODEL_31c0c3d684564d5fb87d2e25e6de96eb" + } + }, + "d604bc170c02491fae573c702e790893": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b7bef190ebed494eb8773ec21d9b7160", + "placeholder": "​", + "style": "IPY_MODEL_6c822f08434c4212931dcf097a80b7d4", + "value": "tokenizer_config.json: 100%" + } + }, + "d8a3bdb8be354365944ab587738280d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f8d4b5000174234bded5d4e017aa4e9", + "max": 418, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_243f1e7de5414b82aaa4b50482dd964d", + "value": 418 + } + }, + "89ad2dee66324ae896eec71924aee670": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0961f276155348f98c12d1be4ad78e62", + "placeholder": "​", + "style": "IPY_MODEL_95c6cffafe1b4345a905be485b787728", + "value": " 418/418 [00:00<00:00, 13.9kB/s]" + } + }, + "31c0c3d684564d5fb87d2e25e6de96eb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b7bef190ebed494eb8773ec21d9b7160": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c822f08434c4212931dcf097a80b7d4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f8d4b5000174234bded5d4e017aa4e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "243f1e7de5414b82aaa4b50482dd964d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0961f276155348f98c12d1be4ad78e62": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95c6cffafe1b4345a905be485b787728": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "43072f923bd24566ae0e20ca9aa3cdc5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5a4f80526c2c4b53a1bce182e9b3e5fa", + "IPY_MODEL_400b0a8ef7c64477bf4f02a16b5508a0", + "IPY_MODEL_b03347c3201849778ea3129314ac340c" + ], + "layout": "IPY_MODEL_3542b02e36ce4e03850b37c28d88da30" + } + }, + "5a4f80526c2c4b53a1bce182e9b3e5fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7b2abc768054422f8af3d21837400b4a", + "placeholder": "​", + "style": "IPY_MODEL_d8e4ceae237d4381aa5e44b020d7564e", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "400b0a8ef7c64477bf4f02a16b5508a0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5ec838bb84644b6a27e3eaec9d7ac74", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30ba8c556cc34fde96b530ed66ac376d", + "value": 5069051 + } + }, + "b03347c3201849778ea3129314ac340c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_711439e7dcab4c10ab4300bdbe6b86aa", + "placeholder": "​", + "style": "IPY_MODEL_15887401b0814d9386fb4d02d6279412", + "value": " 5.07M/5.07M [00:00<00:00, 19.7MB/s]" + } + }, + "3542b02e36ce4e03850b37c28d88da30": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b2abc768054422f8af3d21837400b4a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d8e4ceae237d4381aa5e44b020d7564e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e5ec838bb84644b6a27e3eaec9d7ac74": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "30ba8c556cc34fde96b530ed66ac376d": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "711439e7dcab4c10ab4300bdbe6b86aa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "15887401b0814d9386fb4d02d6279412": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bec0cae37feb48a4add318d970d8ef96": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_feb8127671424fa68b9b93a7547e40eb", + "IPY_MODEL_94c89aa435e44c8d9369305c21ca028c", + "IPY_MODEL_7abe3bc7884e4afeb9995f7d7acc8c0f" + ], + "layout": "IPY_MODEL_7edccbff1ca145eabd4af6f9da32442a" + } + }, + "feb8127671424fa68b9b93a7547e40eb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c11fce2ab1f4811a3d98f9154818825", + "placeholder": "​", + "style": "IPY_MODEL_2cf5d1a84ed947ddb16e5f8e6984b01e", + "value": "tokenizer.json: 100%" + } + }, + "94c89aa435e44c8d9369305c21ca028c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_d5de868032e640d5a07c34c9917190c3", + "max": 17082660, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d6295baf48b24c929c3ab4a317356e2b", + "value": 17082660 + } + }, + "7abe3bc7884e4afeb9995f7d7acc8c0f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df2ed1f8e3754f3a8f30be35935e82f3", + "placeholder": "​", + "style": "IPY_MODEL_2f125504e41344c088231f0307d7cb92", + "value": " 17.1M/17.1M [00:00<00:00, 76.7MB/s]" + } + }, + "7edccbff1ca145eabd4af6f9da32442a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, 
+ "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9c11fce2ab1f4811a3d98f9154818825": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2cf5d1a84ed947ddb16e5f8e6984b01e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d5de868032e640d5a07c34c9917190c3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d6295baf48b24c929c3ab4a317356e2b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "df2ed1f8e3754f3a8f30be35935e82f3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f125504e41344c088231f0307d7cb92": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a6665e93675459394536fd9f846fbea": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_aed65947759c47c58abe86f1ee279b86", + "IPY_MODEL_b5d7fb93223c4c458e7c80e59daea4d2", + "IPY_MODEL_540140c9c2f541b3a82f7a59e4f0b867" + ], + "layout": "IPY_MODEL_d5212aa2a4f74de1970c07e282f0e2bc" + } + }, + "aed65947759c47c58abe86f1ee279b86": 
{ + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a75d002a4ae4fc99625420ec6e580ee", + "placeholder": "​", + "style": "IPY_MODEL_695f77c019db487ea60171277073efe6", + "value": "special_tokens_map.json: 100%" + } + }, + "b5d7fb93223c4c458e7c80e59daea4d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bc3fad5fe0194399add875a6d78907bd", + "max": 280, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_42c418329198400bb77cdd3a654a96de", + "value": 280 + } + }, + "540140c9c2f541b3a82f7a59e4f0b867": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + "placeholder": "​", + "style": "IPY_MODEL_735b9a74223f4941a2837a1108889f63", + "value": " 
280/280 [00:00<00:00, 15.7kB/s]" + } + }, + "d5212aa2a4f74de1970c07e282f0e2bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a75d002a4ae4fc99625420ec6e580ee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "695f77c019db487ea60171277073efe6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bc3fad5fe0194399add875a6d78907bd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "42c418329198400bb77cdd3a654a96de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"735b9a74223f4941a2837a1108889f63": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Fetch surronding chucks (N-1, N+1)\n", + "\n", + "This notebook is designed to handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", + "\n", + "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. 
Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", + "\n" + ], + "metadata": { + "id": "aAUkwshINwV7" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install required python libraries\n" + ], + "metadata": { + "id": "MUEpppV7SeLu" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nXuL8wsQNq8G", + "outputId": "80261fea-a44b-429b-e55d-5947e7ac8b6c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.13.2)\n", + "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.1)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2.0.7)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2024.2.2)\n" + ] + } + ], + "source": [ + "!pip install elasticsearch\n", + "import json\n", + "import time\n", + "import urllib.request\n", + "import re\n", + "import pandas as pd\n", + "from transformers import AutoTokenizer, BertTokenizer\n", + "from elasticsearch import Elasticsearch, helpers\n", + "from google.colab import userdata\n", + "import textwrap" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Elasticsearch and Tokenizer Configuration\n" + ], + "metadata": { + "id": "_d4RWjNAN6Q9" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Elasticsearch and Tokenizer Configuration\n", + "\n", + "This section sets up the necessary configurations for connecting to Elasticsearch and initializing the tokenizers used for text processing.\n", + "\n", + "### Configuration Details:\n", + "1. 
**Elasticsearch Credentials**:\n", + " - `es_username`: The username for Elasticsearch authentication.\n", + " - `es_password`: The password for Elasticsearch authentication, securely fetched using Google Colab's `userdata` module.\n", + " - `es_cloudid`: The Cloud ID for the Elasticsearch cluster.\n", + "\n", + "2. **Index Settings**:\n", + " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", + " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", + " - `delete_raw_source_index`: A boolean flag indicating whether the raw data index should be deleted before ingesting new data.\n", + "\n", + "3. **Embedding Model**:\n", + " - `dense_embedding_model`: Specifies the ID of the model used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`, the identifier under which `sentence-transformers/all-minilm-l6-v2` is referenced in the ingest pipeline).\n", + "\n", + "4. **Tokenizer Initialization**:\n", + " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", + " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", + "\n", + "5. 
**Chunking Parameters**:\n", + " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", + " - `ELSER_TOKEN_OVERLAP`: Defines the overlap ratio between chunks (default is 0%, customizable for context continuity).\n", + "\n", + "These configurations ensure that the necessary components are properly set up for efficient text processing, indexing, and search operations in Elasticsearch.\n" + ], + "metadata": { + "id": "2w7uTCYdQ0m6" + } + }, + { + "cell_type": "code", + "source": [ + "# Elasticsearch and Tokenizer Configuration\n", + "es_username = \"elastic\"\n", + "es_password = userdata.get(\"es_password\")\n", + "es_cloudid = userdata.get(\"es_cloudid\")\n", + "\n", + "raw_source_index = \"harry_potter_dataset-raw\"\n", + "index_name = \"harry_potter_dataset_enriched\"\n", + "\n", + "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", + "elser_model_id = \".elser_model_2_linux-x86_64\"\n", + "\n", + "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", + "\n", + "\n", + "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", + "ELSER_TOKEN_OVERLAP = 0.0" + ], + "metadata": { + "id": "LQzCw0pgN4ll", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 328, + "referenced_widgets": [ + "9c160e35cf414c528b5bffe05725a7d9", + "e87bc6913a7747728aed4b60a645bc2c", + "9fa94c466004402bb293e4aa0bdc82f4", + "95316b2f654a4ddc99c92d7c60c2f417", + "0fc0b516e82941dc934c26eba22d9e01", + "d93a6e3ddd364921a7c2a24451d27ffc", + "2155cf3c7b2043d0a41fc011bf4f0e04", + "500a70f25097484bbec10c0ffd402595", + "9d43995246744e26a8053c21e2c5fcfa", + "2ed85f5360ba4feda6469aabd0324e7a", + "808af1e1f2464a928ee23398c837ff48", + "31906527169a4c08801dc6b21936188d", + "a6570ce51dfc46f383d855e28534bf73", + "41cc49a71a164065bc833d080027e4d2", + "748e7f3c8da243e9b5320654ec8e8146", + 
"a88953429ab6436fb4f01b6b1e2cf6ff", + "b0a2671c90a048548314c2e3d21e19e7", + "5f2080a5d12241638447a5851d0c8db3", + "7ce44d2f323d45838633a750f2386525", + "406f3564a217478d8f60dee5e1fb6dbf", + "61ae734ac8d441fd9b3ea198aff3f2c7", + "bc52c57fa6464ab39823cd3ddb9d7d78", + "02f735a438bf4058a9cfacf8d2b8660f", + "331b178397164de49408dc50ce417a36", + "07ee43d2a1684fb0b1445755802b6ea5", + "c867bce7e34b4800903eb9ec99f34784", + "8169e16a9b0146f5a57a015601c2ebcb", + "35ca86faebfd43faaef0202389d958fd", + "f04f37ba10e9498ea61acdce637431ee", + "527bfa6067c84b94a1e70dfadfd4b78e", + "312e85864e074b958d86325b6417a0fa", + "3b0fc37739334025b037a5270c9515bf", + "0dfb7f264674449b92a390324d17c4cf", + "7e58bf25549d4b428f231d528e8fef54", + "461ca08f677a4cba9ec2a388c2e346f3", + "90d31fb52af949b0a2b41e3613827233", + "1afbe347ab364b28b887f49dad54f5d7", + "8483a759cc0e4e12834fc7d08dab3b7e", + "59d8ffb31bb340eba7e0dcebfbbdd977", + "2b62b542c091466cbae559e29ec797bd", + "092a4de220ba4ca2a23a0f273aba601b", + "e0c46565371f437a85a26d44c5b20c5b", + "4831ce9114e5437ea8a24919557c40e2", + "53344cac458d4d5ebdc504744c18b7de", + "f34bf7b0bb424a8e8c00ff75309bbe6f", + "d604bc170c02491fae573c702e790893", + "d8a3bdb8be354365944ab587738280d3", + "89ad2dee66324ae896eec71924aee670", + "31c0c3d684564d5fb87d2e25e6de96eb", + "b7bef190ebed494eb8773ec21d9b7160", + "6c822f08434c4212931dcf097a80b7d4", + "0f8d4b5000174234bded5d4e017aa4e9", + "243f1e7de5414b82aaa4b50482dd964d", + "0961f276155348f98c12d1be4ad78e62", + "95c6cffafe1b4345a905be485b787728", + "43072f923bd24566ae0e20ca9aa3cdc5", + "5a4f80526c2c4b53a1bce182e9b3e5fa", + "400b0a8ef7c64477bf4f02a16b5508a0", + "b03347c3201849778ea3129314ac340c", + "3542b02e36ce4e03850b37c28d88da30", + "7b2abc768054422f8af3d21837400b4a", + "d8e4ceae237d4381aa5e44b020d7564e", + "e5ec838bb84644b6a27e3eaec9d7ac74", + "30ba8c556cc34fde96b530ed66ac376d", + "711439e7dcab4c10ab4300bdbe6b86aa", + "15887401b0814d9386fb4d02d6279412", + "bec0cae37feb48a4add318d970d8ef96", + 
"feb8127671424fa68b9b93a7547e40eb", + "94c89aa435e44c8d9369305c21ca028c", + "7abe3bc7884e4afeb9995f7d7acc8c0f", + "7edccbff1ca145eabd4af6f9da32442a", + "9c11fce2ab1f4811a3d98f9154818825", + "2cf5d1a84ed947ddb16e5f8e6984b01e", + "d5de868032e640d5a07c34c9917190c3", + "d6295baf48b24c929c3ab4a317356e2b", + "df2ed1f8e3754f3a8f30be35935e82f3", + "2f125504e41344c088231f0307d7cb92", + "6a6665e93675459394536fd9f846fbea", + "aed65947759c47c58abe86f1ee279b86", + "b5d7fb93223c4c458e7c80e59daea4d2", + "540140c9c2f541b3a82f7a59e4f0b867", + "d5212aa2a4f74de1970c07e282f0e2bc", + "4a75d002a4ae4fc99625420ec6e580ee", + "695f77c019db487ea60171277073efe6", + "bc3fad5fe0194399add875a6d78907bd", + "42c418329198400bb77cdd3a654a96de", + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + "735b9a74223f4941a2837a1108889f63" + ] + }, + "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/48.0 [00:00= len(tokens):\n", + " break\n", + " return result\n", + "\n", + "\n", + "def check_task_status(es, task_id):\n", + " while True:\n", + " task_response = es.tasks.get(task_id=task_id)\n", + " if task_response[\"completed\"]:\n", + " print(\"Reindexing complete.\")\n", + " break\n", + " else:\n", + " print(\"Indexing...\")\n", + " time.sleep(10)" + ], + "metadata": { + "id": "xB2a9-qtONbQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "##Ingest Pipelines" + ], + "metadata": { + "id": "izMU8HqqP7ld" + } + }, + { + "cell_type": "code", + "source": [ + "# Define the ingest pipeline configuration\n", + "pipeline_body = {\n", + " \"description\": \"Pipeline for processing book passages\",\n", + " \"processors\": [\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": 
\"sentence-transformers__all-minilm-l6-v2\",\n", + " \"target_field\": \"_ingest._value.vector\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": elser_model_id,\n", + " \"target_field\": \"_ingest._value.content_embedding\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " ],\n", + "}\n", + "\n", + "# Create or update the pipeline\n", + "pipeline_id = \"books_dataset_chunker\"\n", + "es = create_es_client()\n", + "es.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)\n", + "print(f\"Ingest pipeline '{pipeline_id}' created/updated successfully.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iUOFJK48OamP", + "outputId": "5dc25103-a2ee-4a19-e184-92ec65c29187" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" + 
] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Index Settings" + ], + "metadata": { + "id": "6ZkRwEGdQBRP" + } + }, + { + "cell_type": "code", + "source": [ + "index_settings = {\n", + " \"settings\": {\n", + " \"number_of_shards\": 2,\n", + " \"number_of_replicas\": 0,\n", + " \"default_pipeline\": \"books_dataset_chunker\",\n", + " },\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " \"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"content_embedding\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", + " },\n", + " \"predicted_value\": {\"type\": \"sparse_vector\"},\n", + " }\n", + " },\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"vector\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", + " },\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"dot_product\",\n", + " },\n", + " }\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "raw_source_index_settings = {\n", + " \"settings\": {\"number_of_shards\": 2, \"number_of_replicas\": 0},\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " 
\"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "# Manage indices\n", + "manage_index(\n", + " es,\n", + " index_name,\n", + " index_settings[\"settings\"],\n", + " index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")\n", + "manage_index(\n", + " es,\n", + " raw_source_index,\n", + " raw_source_index_settings[\"settings\"],\n", + " raw_source_index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vZ3Z5gZbOgjF", + "outputId": "996f6ca5-d27d-4ea0-ed4d-07570b9942ad" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Index harry_potter_dataset_enriched exists. Deleting it...\n", + "Index harry_potter_dataset_enriched deleted!\n", + "Index harry_potter_dataset_enriched created successfully!\n", + "Index harry_potter_dataset-raw exists. Deleting it...\n", + "Index harry_potter_dataset-raw deleted!\n", + "Index harry_potter_dataset-raw created successfully!\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Fetch and Process the Book Text\n", + "\n", + "This section downloads the full text of \"Harry Potter and the Sorcerer's Stone\" from a specified URL and processes it to extract chapters and their titles. The text is then structured into a pandas DataFrame for further analysis and indexing.\n", + "\n", + "### Key Steps:\n", + "1. **Download Text**: The book is fetched using `urllib.request` from the provided URL.\n", + "2. 
**Extract Chapters**: The text is split into chapters based on predefined patterns, omitting the text before the first chapter.\n", + "3. **Capture Chapter Titles**: Chapter titles are extracted and paired with their respective texts.\n", + "4. **Data Structuring**:\n", + " - Convert the list of chapter titles and texts into a DataFrame.\n", + " - Assign sequential numbers to chapters.\n", + " - Add the book title as metadata.\n", + " - Apply a text chunking function to split each chapter into manageable passages.\n", + "\n", + "This prepares the text data for efficient indexing and advanced search operations in Elasticsearch.\n" + ], + "metadata": { + "id": "NPtbLhVOQUF3" + } + }, + { + "cell_type": "code", + "source": [ + "# Fetch and process the book text\n", + "potter_book_url = \"https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt\"\n", + "response = urllib.request.urlopen(potter_book_url)\n", + "harry_potter_book_text = response.read().decode(\"utf-8\")\n", + "chapter_pattern = re.compile(r\"CHAPTER [A-Z]+\", re.IGNORECASE)\n", + "chapters = chapter_pattern.split(harry_potter_book_text)[1:]\n", + "chapter_titles = re.findall(chapter_pattern, harry_potter_book_text)\n", + "chapters_with_titles = list(zip(chapter_titles, chapters))\n", + "\n", + "print(\"Total chapters found:\", len(chapters))\n", + "if chapters_with_titles:\n", + " print(\"First chapter title:\", chapters_with_titles[0][0])\n", + " print(\"Text sample from first chapter:\", chapters_with_titles[0][1][:500])\n", + "\n", + "\n", + "# Structuring chapters into a DataFrame\n", + "df = pd.DataFrame(chapters_with_titles, columns=[\"chapter_title\", \"chapter_full_text\"])\n", + "df[\"chapter\"] = df.index + 1\n", + "df[\"book_title\"] = \"Harry Potter and the Sorcerer’s Stone\"\n", + "df[\"passages\"] = df[\"chapter_full_text\"].apply(lambda text: chunk(text))" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "0L4YI96xOuKn", + "outputId": "7f9c63c7-82d8-4490-aabb-c3629872d80d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total chapters found: 17\n", + "First chapter title: CHAPTER ONE\n", + "Text sample from first chapter: \n", + "\n", + "THE BOY WHO LIVED\n", + "\n", + "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", + "that they were perfectly normal, thank you very much. They were the last\n", + "people you'd expect to be involved in anything strange or mysterious,\n", + "because they just didn't hold with such nonsense.\n", + "\n", + "Mr. Dursley was the director of a firm called Grunnings, which made\n", + "drills. He was a big, beefy man with hardly any neck, although he did\n", + "have a very large mustache. Mrs. Dursley was thin and blonde and had\n", + "nearly t\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). Running this sequence through the model will result in indexing errors\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Indexing DataFrame into Elasticsearch\n", + "\n", + "This section uploads the structured data from a pandas DataFrame into a specified Elasticsearch index. The DataFrame contains chapter information from \"Harry Potter and the Sorcerer's Stone\", including chapter titles, full texts, and additional metadata.\n", + "\n", + "### Key Operation:\n", + "- **Index Data**: The `index_dataframe` function is called with the Elasticsearch client, the raw source index name, and the DataFrame as arguments. 
This operation effectively uploads the data into Elasticsearch, making it searchable and ready for further processing.\n" + ], + "metadata": { + "id": "DKK4574EQaTl" + } + }, + { + "cell_type": "code", + "source": [ + "index_dataframe(es, raw_source_index, df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7ReLAtz1O1HF", + "outputId": "3bf70ccc-804d-4718-e2a7-13dc0008e073" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Indexing documents to harry_potter_dataset-raw...\n", + "Successfully indexed 17 documents.\n", + "Failed to index 0 documents.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Asynchronous Reindexing in Elasticsearch\n", + "\n", + "This section initiates an asynchronous reindex operation to transfer data from the raw source index to the enriched index in Elasticsearch. This process runs in the background, allowing other operations to continue without waiting for completion.\n", + "\n", + "### Key Steps:\n", + "1. **Start Reindex**: The reindex operation is triggered from the `raw_source_index` to the `index_name`, with `wait_for_completion` set to `False` to allow asynchronous execution.\n", + "2. **Retrieve Task ID**: The task ID of the reindex operation is captured and printed for monitoring purposes.\n", + "3. 
**Monitor Progress**: The `check_task_status` function continuously checks the status of the reindex task, providing updates every 10 seconds until the operation is complete.\n" + ], + "metadata": { + "id": "pA5QroYdQgcM" + } + }, + { + "cell_type": "code", + "source": [ + "# Start the reindex operation asynchronously\n", + "response = es.reindex(\n", + " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", + " wait_for_completion=False,\n", + ")\n", + "task_id = response[\"task\"]\n", + "print(\"Task ID:\", task_id)\n", + "check_task_status(es, task_id)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HOCX_lbmO3zl", + "outputId": "014309de-8ec6-4cf8-b647-6bf0e6f512d8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Task ID: _ov-FtHBSkqocXXBG6nu4A:68576798\n", + "Indexing...\n", + "Reindexing complete.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Custom Search Query Construction and Execution\n", + "\n", + "This section constructs and executes a custom search query in Elasticsearch, utilizing a hybrid approach combining vector and text-based search methods to enhance search accuracy and relevance. The specific example used is a user query about the \"Nimbus 2000\".\n", + "\n", + "### Key Steps:\n", + "1. **Define User Query**: The user query is specified as \"what is a nimbus 2000\".\n", + "2. **Set Boost Factors**:\n", + " - `knn_boost_factor`: A value to amplify the importance of the vector-based search component.\n", + " - `text_expansion_boost`: A value to modify the weight of the text-based search component.\n", + "3. **Build Query**: The `build_custom_query` function constructs the search query, incorporating both dense vector and text expansion components.\n", + "4. **Execute Search**: The query is executed against the specified Elasticsearch index.\n", + "5. 
**Identify Relevant Passages**:\n", + " - The search results are analyzed to find the passage with the highest relevance score.\n", + " - The ID and chunk number of the best matching passage are captured and printed.\n", + "6. **Fetch Surrounding Chunks**: Constructs and executes a query to retrieve chunks adjacent to the identified passage for broader context. If the matched chunk is the first chunk, fetches n, n+1, and n+2. If the chunk is the last chunk in the chapter, fetches n, n-1, and n-2. For other chunks, fetches n-1, n, and n+1.\n", + "7. **Display Results**: Outputs text from the relevant and adjacent passages." + ], + "metadata": { + "id": "xJBDwRmDQq4n" + } + }, + { + "cell_type": "code", + "source": [ + "# Custom Search Query Construction\n", + "user_query = \"what is a nimbus 2000\"\n", + "\n", + "\n", + "knn_boost_factor = 20\n", + "text_expansion_boost = 1\n", + "query = build_custom_query(\n", + " build_vector(user_query),\n", + " user_query,\n", + " knn_boost_factor,\n", + " text_expansion_boost,\n", + " debug=False,\n", + ")\n", + "\n", + "# Searching and identifying relevant passages\n", + "results = es.search(index=index_name, body=query, _source=False)\n", + "\n", + "hit_id = None\n", + "chunk_number = None\n", + "\n", + "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", + " highest_score = -1\n", + " best_hit = None\n", + " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", + " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", + " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", + " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", + " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", + " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", + " if inner_hit[\"hits\"]:\n", + " max_score = inner_hit[\"max_score\"]\n", + " if max_score and max_score > highest_score:\n", + " highest_score = max_score\n", + " 
best_hit = inner_hit[\"hits\"][0]\n", + "\n", + " if best_hit:\n", + " first_passage_text = best_hit[\"_source\"][\"text\"]\n", + " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", + " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(\n", + " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", + " )\n", + " print(f\"\\n\")\n", + " else:\n", + " print(f\"ID: {hit_id}, No relevant passages found.\")\n", + "else:\n", + " print(\"No results found.\")\n", + "\n", + "print(f\"Fetch Surrounding Chunks\")\n", + "print(f\"------------------------\")\n", + "\n", + "max_chapter_chunk_result = es.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + ")\n", + "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + "][\"value\"]\n", + "\n", + "adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + ")\n", + "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", + "print_text_from_results(results)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u7NFZBRJO3t7", + "outputId": "6f9ec0d9-bb1d-4235-da45-1c8040ac7036" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Matched Chunk ID: wz8m148BbBK3er50L0-W, Chunk Number: 3, Text:\n", + "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. 
it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. 
there are seven players on each side.\n", + "\n", + "\n", + "Fetch Surrounding Chunks\n", + "------------------------\n", + "\n", + "\n", + "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. 
\" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. 
the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. 
they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" + ] + } + ] + } + ] +} \ No newline at end of file