Skip to content

Commit

Permalink
Updated notebook to fix the chapter_number assignment
Browse files Browse the repository at this point in the history
Bug fix: the `chapter_number = None` assignment was missing its `=` sign.
  • Loading branch information
sunilemanjee committed Jun 5, 2024
1 parent fe8141e commit 5b49345
Showing 1 changed file with 35 additions and 19 deletions.
54 changes: 35 additions & 19 deletions notebooks/document-chunking/fetch-surrounding-chunks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -986,35 +986,43 @@
"\n",
"knn_boost_factor = 20\n",
"text_expansion_boost = 1\n",
"query = build_custom_query(build_vector(user_query), user_query, knn_boost_factor, text_expansion_boost, debug=False)\n",
"query = build_custom_query(\n",
" build_vector(user_query),\n",
" user_query,\n",
" knn_boost_factor,\n",
" text_expansion_boost,\n",
" debug=False,\n",
")\n",
"\n",
"# Searching and identifying relevant passages\n",
"results = esclient.search(index=index_name, body=query, _source=False)\n",
"\n",
"hit_id = None\n",
"chunk_number = None\n",
"chapter_number None\n",
"chapter_number = None\n",
"\n",
"if results and results.get('hits') and results['hits'].get('hits'):\n",
"if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n",
" highest_score = -1\n",
" best_hit = None\n",
" hit_id = results['hits']['hits'][0]['_id']\n",
" chapter_number = results['hits']['hits'][0]['fields']['chapter'][0]\n",
" if 'inner_hits' in results['hits']['hits'][0]:\n",
" for hit_type in ['text_hits', 'dense_hit', 'sparse_hits']:\n",
" if hit_type in results['hits']['hits'][0]['inner_hits']:\n",
" inner_hit = results['hits']['hits'][0]['inner_hits'][hit_type]['hits']\n",
" if inner_hit['hits']:\n",
" max_score = inner_hit['max_score']\n",
" hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n",
" chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n",
" if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n",
" for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n",
" if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n",
" inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n",
" if inner_hit[\"hits\"]:\n",
" max_score = inner_hit[\"max_score\"]\n",
" if max_score and max_score > highest_score:\n",
" highest_score = max_score\n",
" best_hit = inner_hit['hits'][0]\n",
" best_hit = inner_hit[\"hits\"][0]\n",
"\n",
" if best_hit:\n",
" first_passage_text = best_hit['_source']['text']\n",
" chunk_number = best_hit['_source']['chunk_number']\n",
" #print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n",
" print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\")\n",
" first_passage_text = best_hit[\"_source\"][\"text\"]\n",
" chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n",
" # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n",
" print(\n",
" f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n",
" )\n",
" print(f\"\\n\")\n",
" else:\n",
" print(f\"ID: {hit_id}, No relevant passages found.\")\n",
Expand All @@ -1024,10 +1032,18 @@
"print(f\"Fetch Surrounding Chunks\")\n",
"print(f\"------------------------\")\n",
"\n",
"max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)\n",
"max_chunk_number = max_chapter_chunk_result['aggregations']['max_chunk_number']['max_chunk']['value']\n",
"max_chapter_chunk_result = esclient.search(\n",
" index=index_name,\n",
" body=get_max_chunk_number_query(chapter_number, debug=False),\n",
" _source=False,\n",
")\n",
"max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n",
" \"max_chunk\"\n",
"][\"value\"]\n",
"\n",
"adjacent_chunks_query = get_adjacent_chunks_query(hit_id, chunk_number, max_chunk_number, debug=False)\n",
"adjacent_chunks_query = get_adjacent_chunks_query(\n",
" hit_id, chunk_number, max_chunk_number, debug=False\n",
")\n",
"results = esclient.search(index=index_name, body=adjacent_chunks_query, _source=False)\n",
"print_text_from_results(results)"
],
Expand Down

0 comments on commit 5b49345

Please sign in to comment.