Skip to content

Commit 8811057

Browse files
committed
Save selected chunks with metadata
Signed-off-by: Anastas Stoyanovsky <[email protected]>
1 parent 55a512a commit 8811057

File tree

1 file changed

+22
-19
lines changed

1 file changed

+22
-19
lines changed

notebooks/instructlab-knowledge/subset-selection.ipynb

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 17,
66
"id": "39792a86-af14-4ce8-bb25-287c0f16d766",
77
"metadata": {},
88
"outputs": [
@@ -113,15 +113,15 @@
113113
},
114114
{
115115
"cell_type": "code",
116-
"execution_count": 27,
116+
"execution_count": 20,
117117
"id": "7c2f16ed-52a3-465d-9961-61e0f6823fea",
118118
"metadata": {},
119119
"outputs": [
120120
{
121121
"name": "stdout",
122122
"output_type": "stream",
123123
"text": [
124-
"/Users/astoyano/Documents/code/examples/notebooks/instructlab-knowledge\n"
124+
"/home/ec2-user/examples/notebooks/instructlab-knowledge\n"
125125
]
126126
}
127127
],
@@ -131,7 +131,7 @@
131131
},
132132
{
133133
"cell_type": "code",
134-
"execution_count": 1,
134+
"execution_count": 21,
135135
"id": "27dcb09d-75bd-448c-ba14-8fb773ed5aa2",
136136
"metadata": {},
137137
"outputs": [
@@ -148,7 +148,7 @@
148148
"'/home/ec2-user/examples/notebooks/instructlab-knowledge/DataCurate4LLMs'"
149149
]
150150
},
151-
"execution_count": 1,
151+
"execution_count": 21,
152152
"metadata": {},
153153
"output_type": "execute_result"
154154
}
@@ -170,11 +170,12 @@
170170
},
171171
{
172172
"cell_type": "code",
173-
"execution_count": 5,
173+
"execution_count": 13,
174174
"id": "e7efc49a-c08d-42f9-ad7a-b0723b49982f",
175175
"metadata": {},
176176
"outputs": [],
177177
"source": [
178+
"!sed -i -e 's ^faiss-gpu$ faiss-gpu-cu12 g' requirements.txt # fix faiss dependency; yes you can use spaces as delimiters for sed expressions\n",
178179
"!pip install -qq -r requirements.txt"
179180
]
180181
},
@@ -217,33 +218,35 @@
217218
},
218219
{
219220
"cell_type": "code",
220-
"execution_count": 9,
221+
"execution_count": 29,
221222
"id": "02b5622c-2101-47df-b366-2afb137cdd30",
222223
"metadata": {},
223224
"outputs": [
224225
{
225226
"name": "stdout",
226227
"output_type": "stream",
227228
"text": [
228-
"\"Rulings:\\n(a) B's ball, first-and-10 on A22. If Team A recovers and does not advance, Team B gets the ball at the spot of recovery.\\n(b) B's ball, first-and-10 on A33. If Team A recovers and advances, but does not reach the line to gain, Team B gets the ball at the dead ball spot.\\n(c) A's ball, first-and-10 on A36. If Team A recovers and advances beyond the line to gain, A has a first down.\\n(d) B's ball, first-and-10 on A20. Illegal pass, as the ball has been beyond the line. The penalty is five yards from the previous spot and a loss of down.\"\n",
229-
"\n",
230-
"\"Rulings:\\n(a) B's ball, first-and-10 on B45. No foul for running into the kicker, since the snap hit the ground. (12-2-10-e)\\n(b) B's ball, first-and-10 on B45. No foul for roughing the kicker, since the snap hit the ground. (12-2-10-e)\\n(c) A's ball, first-and-10 on B40. Unnecessary roughness. If the snap touches the ground, only unnecessary roughness protection applies, and if the contact is unnecessary roughness, it is a foul whether or not B2 touches the punt.\"\n",
231-
"\n",
232-
"\"Rulings:\\n(e) B's ball, first-and-10 on A20. Team B can recover and advance the ball because it is a scrimmage kick. The succeeding spot is the end of the run and Team A's foul will be enforced from that spot.\"\n",
233-
"\n",
234-
"\"Rulings:\\n(a) Third-and-15 on B35, run 10 seconds off the game clock, set the play clock to 25, and start the game clock on the ready for play. Team A has intentionally fouled after the two-minute warning, stopping the clock. (Team B could choose to decline the runoff and start the clock on the snap.)\\n(b) Third-and-5 on B25. Set the play clock to 40 seconds and start the game clock on the ready if Team A chooses. Team B has intentionally fouled after the two-minute warning, stopping the clock.\"\n",
235-
"\n",
236-
"\"A.R. 14.86 PUNT-TEAMBSCORES-DOUBLEFOULAFTERCHANGEOF POSSESSION\\nFourth-and-10 on A7. The punt is partially blocked, and B1 recovers on the A22 and runs for a touchdown. Prior to the score, B2 holds in A's end zone. After the score, A3 grabs B1's facemask and throws him to the ground. Ruling: B's ball, first-and-10 on A1. The live ball offensive holding by B2, combines with the dead ball unnecessary roughness foul by A3, and the penalties offset at the A1. Since the spot of B's foul and the dead ball spot are both in A's end zone, the enforcement of B2's foul would be from the goal line, if that was the only foul; the ball is therefore spotted on the 1-yard line.\"\n",
229+
"{\"chunk\": \"Rulings:\\n(a) B's ball, first-and-10 on A22. If Team A recovers and does not advance, Team B gets the ball at the spot of recovery.\\n(b) B's ball, first-and-10 on A33. If Team A recovers and advances, but does not reach the line to gain, Team B gets the ball at the dead ball spot.\\n(c) A's ball, first-and-10 on A36. If Team A recovers and advances beyond the line to gain, A has a first down.\\n(d) B's ball, first-and-10 on A20. Illegal pass, as the ball has been beyond the line. The penalty is five yards from the previous spot and a loss of down.\", \"file\": \"2022-nfl-rulebook-final\", \"metadata\": {\"schema_name\": \"docling_core.transforms.chunker.DocMeta\", \"version\": \"1.0.0\", \"doc_items\": [{\"self_ref\": \"#/texts/4058\", \"parent\": {\"$ref\": \"#/groups/352\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 152, \"bbox\": {\"l\": 116.259, \"t\": 390.686, \"r\": 548.923, \"b\": 373.313, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 121]}]}, {\"self_ref\": \"#/texts/4059\", \"parent\": {\"$ref\": \"#/groups/352\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 152, \"bbox\": {\"l\": 116.259, \"t\": 371.26, \"r\": 548.916, \"b\": 353.887, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 148]}]}, {\"self_ref\": \"#/texts/4060\", \"parent\": {\"$ref\": \"#/groups/352\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 152, \"bbox\": {\"l\": 116.259, \"t\": 351.608, \"r\": 535.333, \"b\": 343.949, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 111]}]}, {\"self_ref\": \"#/texts/4061\", \"parent\": {\"$ref\": \"#/groups/352\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 152, \"bbox\": {\"l\": 116.259, \"t\": 341.895, \"r\": 548.965, \"b\": 324.523, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 156]}]}], \"headings\": [\"Rulings:\"], \"origin\": {\"mimetype\": \"application/pdf\", \"binary_hash\": 3168899155016516390, \"filename\": \"2022-nfl-rulebook-final.pdf\"}}}\n",
230+
"{\"chunk\": \"Rulings:\\n(a) B's ball, first-and-10 on B45. No foul for running into the kicker, since the snap hit the ground. (12-2-10-e)\\n(b) B's ball, first-and-10 on B45. No foul for roughing the kicker, since the snap hit the ground. (12-2-10-e)\\n(c) A's ball, first-and-10 on B40. Unnecessary roughness. If the snap touches the ground, only unnecessary roughness protection applies, and if the contact is unnecessary roughness, it is a foul whether or not B2 touches the punt.\", \"file\": \"2022-nfl-rulebook-final\", \"metadata\": {\"schema_name\": \"docling_core.transforms.chunker.DocMeta\", \"version\": \"1.0.0\", \"doc_items\": [{\"self_ref\": \"#/texts/4694\", \"parent\": {\"$ref\": \"#/groups/399\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 172, \"bbox\": {\"l\": 116.259, \"t\": 489.171, \"r\": 531.566, \"b\": 481.511, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 114]}]}, {\"self_ref\": \"#/texts/4695\", \"parent\": {\"$ref\": \"#/groups/399\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 172, \"bbox\": {\"l\": 116.259, \"t\": 479.458, \"r\": 520.266, \"b\": 471.798, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 110]}]}, {\"self_ref\": \"#/texts/4696\", \"parent\": {\"$ref\": \"#/groups/399\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 172, \"bbox\": {\"l\": 116.259, \"t\": 469.745, \"r\": 548.973, \"b\": 442.433, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 244]}]}], \"headings\": [\"Rulings:\"], \"origin\": {\"mimetype\": \"application/pdf\", \"binary_hash\": 3168899155016516390, \"filename\": \"2022-nfl-rulebook-final.pdf\"}}}\n",
231+
"{\"chunk\": \"Rulings:\\n(e) B's ball, first-and-10 on A20. Team B can recover and advance the ball because it is a scrimmage kick. The succeeding spot is the end of the run and Team A's foul will be enforced from that spot.\", \"file\": \"2022-nfl-rulebook-final\", \"metadata\": {\"schema_name\": \"docling_core.transforms.chunker.DocMeta\", \"version\": \"1.0.0\", \"doc_items\": [{\"self_ref\": \"#/texts/4554\", \"parent\": {\"$ref\": \"#/groups/386\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 168, \"bbox\": {\"l\": 116.259, \"t\": 586.526, \"r\": 548.996, \"b\": 569.153, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 199]}]}], \"headings\": [\"Rulings:\"], \"origin\": {\"mimetype\": \"application/pdf\", \"binary_hash\": 3168899155016516390, \"filename\": \"2022-nfl-rulebook-final.pdf\"}}}\n",
232+
"{\"chunk\": \"Rulings:\\n(a) Third-and-15 on B35, run 10 seconds off the game clock, set the play clock to 25, and start the game clock on the ready for play. Team A has intentionally fouled after the two-minute warning, stopping the clock. (Team B could choose to decline the runoff and start the clock on the snap.)\\n(b) Third-and-5 on B25. Set the play clock to 40 seconds and start the game clock on the ready if Team A chooses. Team B has intentionally fouled after the two-minute warning, stopping the clock.\", \"file\": \"2022-nfl-rulebook-final\", \"metadata\": {\"schema_name\": \"docling_core.transforms.chunker.DocMeta\", \"version\": \"1.0.0\", \"doc_items\": [{\"self_ref\": \"#/texts/2692\", \"parent\": {\"$ref\": \"#/groups/269\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 110, \"bbox\": {\"l\": 116.259, \"t\": 469.745, \"r\": 548.955, \"b\": 442.433, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 292]}]}, {\"self_ref\": \"#/texts/2693\", \"parent\": {\"$ref\": \"#/groups/269\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"list_item\", \"prov\": [{\"page_no\": 110, \"bbox\": {\"l\": 116.259, \"t\": 440.38, \"r\": 548.951, \"b\": 423.007, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 195]}]}], \"headings\": [\"Rulings:\"], \"origin\": {\"mimetype\": \"application/pdf\", \"binary_hash\": 3168899155016516390, \"filename\": \"2022-nfl-rulebook-final.pdf\"}}}\n",
233+
"{\"chunk\": \"A.R. 14.86 PUNT-TEAMBSCORES-DOUBLEFOULAFTERCHANGEOF POSSESSION\\nFourth-and-10 on A7. The punt is partially blocked, and B1 recovers on the A22 and runs for a touchdown. Prior to the score, B2 holds in A's end zone. After the score, A3 grabs B1's facemask and throws him to the ground. Ruling: B's ball, first-and-10 on A1. The live ball offensive holding by B2, combines with the dead ball unnecessary roughness foul by A3, and the penalties offset at the A1. Since the spot of B's foul and the dead ball spot are both in A's end zone, the enforcement of B2's foul would be from the goal line, if that was the only foul; the ball is therefore spotted on the 1-yard line.\", \"file\": \"2022-nfl-rulebook-final\", \"metadata\": {\"schema_name\": \"docling_core.transforms.chunker.DocMeta\", \"version\": \"1.0.0\", \"doc_items\": [{\"self_ref\": \"#/texts/5593\", \"parent\": {\"$ref\": \"#/body\"}, \"children\": [], \"content_layer\": \"body\", \"label\": \"text\", \"prov\": [{\"page_no\": 199, \"bbox\": {\"l\": 116.259, \"t\": 127.75900000000001, \"r\": 549.001, \"b\": 71.30799999999999, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 606]}]}], \"headings\": [\"A.R. 14.86 PUNT-TEAMBSCORES-DOUBLEFOULAFTERCHANGEOF POSSESSION\"], \"origin\": {\"mimetype\": \"application/pdf\", \"binary_hash\": 3168899155016516390, \"filename\": \"2022-nfl-rulebook-final.pdf\"}}}\n",
237234
"\n"
238235
]
239236
}
240237
],
241238
"source": [
242239
"import json\n",
243240
"\n",
244-
"with open('./chunks/chunks_samples_5_subset.jsonl') as f:\n",
245-
" for line in f.readlines():\n",
246-
" print(json.dumps(json.loads(line)['chunk'], indent=2) + \"\\n\")"
241+
"with open('./chunks/chunks_samples_5_subset.jsonl') as fin:\n",
242+
" with open('selected_chunks.jsonl','w') as fout:\n",
243+
" for line in fin.readlines():\n",
244+
" selected_chunk = json.loads(line)['chunk']\n",
245+
" original_chunk = chunk_lookup[selected_chunk]\n",
246+
" fout.write(json.dumps(original_chunk) + \"\\n\")\n",
247+
"\n",
248+
"with open('selected_chunks.jsonl') as final:\n",
249+
" print(final.read())"
247250
]
248251
}
249252
],

0 commit comments

Comments
 (0)