diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 9ce5d56e..beb4428e 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -986,35 +986,43 @@ "\n", "knn_boost_factor = 20\n", "text_expansion_boost = 1\n", - "query = build_custom_query(build_vector(user_query), user_query, knn_boost_factor, text_expansion_boost, debug=False)\n", + "query = build_custom_query(\n", + " build_vector(user_query),\n", + " user_query,\n", + " knn_boost_factor,\n", + " text_expansion_boost,\n", + " debug=False,\n", + ")\n", "\n", "# Searching and identifying relevant passages\n", "results = esclient.search(index=index_name, body=query, _source=False)\n", "\n", "hit_id = None\n", "chunk_number = None\n", - "chapter_number None\n", + "chapter_number = None\n", "\n", - "if results and results.get('hits') and results['hits'].get('hits'):\n", + "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", " highest_score = -1\n", " best_hit = None\n", - " hit_id = results['hits']['hits'][0]['_id']\n", - " chapter_number = results['hits']['hits'][0]['fields']['chapter'][0]\n", - " if 'inner_hits' in results['hits']['hits'][0]:\n", - " for hit_type in ['text_hits', 'dense_hit', 'sparse_hits']:\n", - " if hit_type in results['hits']['hits'][0]['inner_hits']:\n", - " inner_hit = results['hits']['hits'][0]['inner_hits'][hit_type]['hits']\n", - " if inner_hit['hits']:\n", - " max_score = inner_hit['max_score']\n", + " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", + " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", + " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", + " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", + " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", + " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", + " if inner_hit[\"hits\"]:\n", + " max_score = inner_hit[\"max_score\"]\n", " if max_score and max_score > highest_score:\n", " highest_score = max_score\n", - " best_hit = inner_hit['hits'][0]\n", + " best_hit = inner_hit[\"hits\"][0]\n", "\n", " if best_hit:\n", - " first_passage_text = best_hit['_source']['text']\n", - " chunk_number = best_hit['_source']['chunk_number']\n", - " #print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", - " print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\")\n", + " first_passage_text = best_hit[\"_source\"][\"text\"]\n", + " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", + " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(\n", + " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", + " )\n", " print(f\"\\n\")\n", " else:\n", " print(f\"ID: {hit_id}, No relevant passages found.\")\n", @@ -1024,10 +1032,18 @@ "print(f\"Fetch Surrounding Chunks\")\n", "print(f\"------------------------\")\n", "\n", - "max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)\n", - "max_chunk_number = max_chapter_chunk_result['aggregations']['max_chunk_number']['max_chunk']['value']\n", + "max_chapter_chunk_result = esclient.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + ")\n", + "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + "][\"value\"]\n", "\n", - "adjacent_chunks_query = get_adjacent_chunks_query(hit_id, chunk_number, max_chunk_number, debug=False)\n", + "adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + ")\n", "results = esclient.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", "print_text_from_results(results)" ],