rerun with v0.1.4 dev

AnFreTh · Aug 9, 2024 · 20e7c5e · 20e7c5e
1 parent 5dc01a7
commit 20e7c5e
Show file tree

Hide file tree

Showing 3 changed files with 199 additions and 199 deletions.
diff --git a/docs/notebooks/datasets.ipynb b/docs/notebooks/datasets.ipynb
@@ -50,8 +50,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
-      "  warnings.warn(\n"
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
+      "  from tqdm.autonotebook import tqdm, trange\n"
      ]
     }
    ],
@@ -76,57 +76,27 @@
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['Stocktwits_GME_large',\n",
-       " 'BBC_News',\n",
-       " 'Stocktwits_GME',\n",
-       " 'Reddit_GME',\n",
-       " 'Reuters',\n",
-       " 'Spotify',\n",
-       " '20NewsGroups',\n",
-       " 'DummyDataset',\n",
-       " 'Spotify_most_popular',\n",
-       " 'Poliblogs',\n",
-       " 'Spotify_least_popular']"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset = TMDataset()\n",
-    "dataset.get_dataset_list()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-08 22:59:47.537\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:47.907\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:48.146\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:48.510\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:48.690\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
      ]
     }
    ],
    "source": [
+    "dataset = TMDataset()\n",
     "dataset.fetch_dataset(name=\"Reuters\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -142,7 +112,7 @@
        " array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -153,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -169,7 +139,7 @@
        " array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -180,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,20 +159,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m158\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:50.219\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:50.521\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:50.871\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:51.061\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
      ]
     }
    ],
@@ -212,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -422,7 +392,7 @@
        "4  [alexia, and, ice, you, the, came, down, you, ...  "
       ]
      },
-     "execution_count": 9,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -433,7 +403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -443,7 +413,7 @@
        " 'seinabo sey have always wondered your cause when you you can not and ive been you your just see one seinabo sey will you ever tell what really your mind cause the greatest love youll ever find seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving moving have always wondered you your like every word the ive always been and every word but the seinabo sey vargas lagola will you ever tell what really your mind cause the greatest love youll ever find you know seinabo sey vargas lagola seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving cause not seinabo sey vargas lagola seinabo sey and you cant hold heart heart heart heart and you cant hold heart heart heart heart you cant hold heart heart hold heart and you cant hold heart heart heart and you cant hold heart heart you cant you cant you cant you cant hold heart heart heart heart heart moving moving ohohooh ohohooh ohohooh']"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -454,7 +424,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -463,7 +433,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -472,7 +442,7 @@
        "[75, 58]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -490,7 +460,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -514,17 +484,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 258.65it/s]\n",
-      "\u001b[32m2024-08-08 22:59:55.068\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m454\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:55.073\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m469\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:55.074\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
+      "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
+      "\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
      ]
     }
    ],
@@ -541,15 +510,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-08 22:59:55.077\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
-      "\u001b[32m2024-08-08 22:59:55.078\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
      ]
     }
    ],
@@ -561,7 +530,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -634,7 +603,7 @@
        "4  BGHXO      3  [BGHXO]"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -643,6 +612,13 @@
     "dataset.dataframe.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,