Save selected chunks with metadata

anastasds · anastasds · commit d2885a51093a · 2025-06-10T13:47:22.000Z
Signed-off-by: Anastas Stoyanovsky <astoyano@redhat.com>
diff --git a/notebooks/instructlab-knowledge/subset-selection.ipynb b/notebooks/instructlab-knowledge/subset-selection.ipynb
@@ -2,18 +2,10 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "39792a86-af14-4ce8-bb25-287c0f16d766",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Read 2024 chunks\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import json\n",
     "import os\n",
@@ -30,57 +22,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "0594d7bb-228b-468a-99a0-38c14651a562",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Cloning into 'DataCurate4LLMs'...\n",
-      "remote: Enumerating objects: 228, done.\u001b[K\n",
-      "remote: Counting objects: 100% (228/228), done.\u001b[K\n",
-      "remote: Compressing objects: 100% (143/143), done.\u001b[K\n",
-      "remote: Total 228 (delta 112), reused 183 (delta 78), pack-reused 0 (from 0)\u001b[K\n",
-      "Receiving objects: 100% (228/228), 255.39 KiB | 6.08 MiB/s, done.\n",
-      "Resolving deltas: 100% (112/112), done.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "!git clone git@github.com:krishnatejakk/DataCurate4LLMs.git"
+    "!git clone git@github.com:krishnatejakk/DataCurate4LLMs.git\n",
+    "%cd DataCurate4LLMs"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "2b2c8b60-daa9-48e4-b680-0b59588609ab",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\n",
-      "    \"output_dir\": \"output\",\n",
-      "    \"encoder_model\": \"BAAI/bge-large-en-v1.5\",\n",
-      "    \"encoder_type\": \"bge\",\n",
-      "    \"instruction\": \"Generate embeddings that capture the semantic meaning of text segments across multiple domains, ensuring generalization and suitability for clustering based on semantic similarity.\",\n",
-      "    \"query_description\": \"default\",\n",
-      "    \"templates\": {\n",
-      "        \"default\": \"{{ chunk }}\"\n",
-      "    },\n",
-      "    \"template_name\": \"default\",\n",
-      "    \"num_folds\": 1,\n",
-      "    \"num_gpus\": 1,\n",
-      "    \"subset_sizes\": [\"5\"],\n",
-      "    \"epsilon\": 0.01,\n",
-      "    \"seed\": 148\n",
-      "}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import random\n",
     "\n",
@@ -113,137 +69,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
-   "id": "7c2f16ed-52a3-465d-9961-61e0f6823fea",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/Users/astoyano/Documents/code/examples/notebooks/instructlab-knowledge\n"
-     ]
-    }
-   ],
-   "source": [
-    "%cd .."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "27dcb09d-75bd-448c-ba14-8fb773ed5aa2",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/home/ec2-user/examples/notebooks/instructlab-knowledge/DataCurate4LLMs\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'/home/ec2-user/examples/notebooks/instructlab-knowledge/DataCurate4LLMs'"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "%cd DataCurate4LLMs\n",
-    "%pwd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "0d700a78-f0cb-48ea-b829-883a4675966c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install -qq submodlib-py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "e7efc49a-c08d-42f9-ad7a-b0723b49982f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -qq -r requirements.txt"
+    "!sed -i -e 's ^faiss-gpu$ faiss-gpu-cu12 g' requirements.txt # fix faiss dependency; yes you can use spaces as delimiters for sed expressions\n",
+    "!pip install -qq -r requirements.txt\n",
+    "!pip install -qq submodlib-py"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "8b086790-5a9f-4e2f-b130-2d9f9f8a0319",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2025-06-06 12:46:09,325 - __main__ - INFO - Processing configuration: ProcessingConfig(instruction='Generate embeddings that capture the semantic meaning of text segments across multiple domains, ensuring generalization and suitability for clustering based on semantic similarity.', query_description='default', templates={'default': '{{ chunk }}'}, batch_size=100000, num_folds=1, subset_sizes=[5], num_gpus=1, seed=148, max_retries=3, retry_delay=1, output_dir='.', template_name='default', combine_files=False, encoder_type='bge', encoder_model='BAAI/bge-large-en-v1.5', epsilon=0.01)\n",
-      "2025-06-06 12:46:11,019 - __main__ - INFO - Processing datasets separately...\n",
-      "2025-06-06 12:46:11,090 - __main__ - INFO - Processing dataset: chunks\n",
-      "2025-06-06 12:46:11,090 - __main__ - INFO - Generating embeddings for chunks\n",
-      "2025-06-06 12:46:11,090 - __main__ - INFO - Embeddings file already exists in ./chunks/embeddings, skipping\n",
-      "2025-06-06 12:46:11,090 - __main__ - INFO - Loading embeddings for subset selection\n",
-      "2025-06-06 12:46:11,100 - __main__ - INFO - Selecting subsets\n",
-      "2025-06-06 12:46:11,100 - __main__ - INFO - Loaded 2025 embeddings for dataset chunks\n",
-      "2025-06-06 12:46:11,102 - __main__ - INFO - Embeddings at top-5 Indices: tensor([[ 0.0074,  0.0125,  0.0055,  ...,  0.0088, -0.0070,  0.0092],\n",
-      "        [-0.0220,  0.0177, -0.0053,  ..., -0.0122, -0.0294,  0.0034],\n",
-      "        [-0.0258,  0.0094,  0.0226,  ...,  0.0247, -0.0270,  0.0251],\n",
-      "        [ 0.0043, -0.0121,  0.0127,  ...,  0.0399, -0.0329,  0.0285],\n",
-      "        [-0.0193, -0.0069,  0.0378,  ..., -0.0013, -0.0378,  0.0177]])\n",
-      "2025-06-06 12:46:17,153 - __mp_main__ - INFO - Processing fold 1 on GPU 0\n",
-      "2025-06-06 12:46:17,336 - __mp_main__ - INFO - Computing similarity matrix for fold 1\n",
-      "2025-06-06 12:46:17,471 - __mp_main__ - INFO - Selecting subset of size 5 for fold 1\n",
-      "[||||||||||||||||||||]100% [Iteration 5 of 5]2025-06-06 12:46:17,888 - __main__ - INFO - Saved metadata to ./chunks_fl_1_partitions_samples_5_metadata.npz\n",
-      "2025-06-06 12:46:17,889 - __main__ - INFO - Saving subsets\n",
-      "Creating json from Arrow format: 100%|███████████| 1/1 [00:00<00:00, 166.00ba/s]\n",
-      "2025-06-06 12:46:17,900 - __main__ - INFO - Saved subset with 5 samples to ./chunks/chunks_samples_5_subset.jsonl\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "!python data_subset_selection.py --input_files '../data/chunks.jsonl' --output_dir '.' --config '../subset_config.json' --retry_delay 1 --subset_sizes 5 --num_gpus 1"
+    "!python data_subset_selection.py --input_files '../data/chunks.jsonl' --output_dir '../data' --config '../subset_config.json' --retry_delay 1 --subset_sizes 5 --num_gpus 1"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "02b5622c-2101-47df-b366-2afb137cdd30",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\"Rulings:\\n(a) B's ball, first-and-10 on A22. If Team A recovers and does not advance, Team B gets the ball at the spot of recovery.\\n(b) B's ball, first-and-10 on A33. If Team A recovers and advances, but does not reach the line to gain, Team B gets the ball at the dead ball spot.\\n(c) A's ball, first-and-10 on A36. If Team A recovers and advances beyond the line to gain, A has a first down.\\n(d)  B's ball, first-and-10 on A20. Illegal pass, as the ball has been beyond the line. The penalty is five yards from the previous spot and a loss of down.\"\n",
-      "\n",
-      "\"Rulings:\\n(a) B's ball, first-and-10 on B45. No foul for running into the kicker, since the snap hit the ground. (12-2-10-e)\\n(b) B's ball, first-and-10 on B45. No foul for roughing the kicker, since the snap hit the ground. (12-2-10-e)\\n(c) A's  ball,  first-and-10  on  B40.  Unnecessary  roughness.  If  the  snap  touches  the  ground,  only  unnecessary roughness protection applies, and if the contact is unnecessary roughness, it is a foul whether or not B2 touches the punt.\"\n",
-      "\n",
-      "\"Rulings:\\n(e) B's ball, first-and-10 on A20. Team B can recover and advance the ball because it is a scrimmage kick. The succeeding spot is the end of the run and Team A's foul will be enforced from that spot.\"\n",
-      "\n",
-      "\"Rulings:\\n(a) Third-and-15 on B35, run 10 seconds off the game clock, set the play clock to 25, and start the game clock on the ready for play. Team A has intentionally fouled after the two-minute warning, stopping the clock. (Team B could choose to decline the runoff and start the clock on the snap.)\\n(b) Third-and-5 on B25. Set the play clock to 40 seconds and start the game clock on the ready if Team A chooses. Team B has intentionally fouled after the two-minute warning, stopping the clock.\"\n",
-      "\n",
-      "\"A.R. 14.86 PUNT-TEAMBSCORES-DOUBLEFOULAFTERCHANGEOF POSSESSION\\nFourth-and-10 on A7. The punt is partially blocked, and B1 recovers on the A22 and runs for a touchdown. Prior to the score, B2 holds in A's end zone. After the score, A3 grabs B1's facemask and throws him to the ground. Ruling: B's ball, first-and-10 on A1. The live ball offensive holding by B2, combines with the dead ball unnecessary roughness foul by A3, and the penalties offset at the A1. Since the spot of B's foul and the dead ball spot are both in A's end zone, the enforcement of B2's foul would be from the goal line, if that was the only foul; the ball is therefore spotted on the 1-yard line.\"\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import json\n",
     "\n",
-    "with open('./chunks/chunks_samples_5_subset.jsonl') as f:\n",
-    "    for line in f.readlines():\n",
-    "        print(json.dumps(json.loads(line)['chunk'], indent=2) + \"\\n\")"
+    "with open('../data/chunks/chunks_samples_5_subset.jsonl') as fin:\n",
+    "    with open('../data/selected_chunks.jsonl','w') as fout:\n",
+    "        for line in fin.readlines():\n",
+    "            selected_chunk = json.loads(line)['chunk']\n",
+    "            original_chunk = chunk_lookup[selected_chunk]\n",
+    "            fout.write(json.dumps(original_chunk) + \"\\n\")\n",
+    "\n",
+    "with open('../data/selected_chunks.jsonl') as final:\n",
+    "    for line in final.readlines():\n",
+    "        print(json.dumps(json.loads(line), indent=2))"
    ]
   }
  ],