Skip to content

Commit

Permalink
rerun with v0.1.4 dev
Browse files Browse the repository at this point in the history
  • Loading branch information
mkumar73 committed Aug 9, 2024
1 parent 5dc01a7 commit 20e7c5e
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 199 deletions.
114 changes: 45 additions & 69 deletions docs/notebooks/datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
Expand All @@ -76,57 +76,27 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Stocktwits_GME_large',\n",
" 'BBC_News',\n",
" 'Stocktwits_GME',\n",
" 'Reddit_GME',\n",
" 'Reuters',\n",
" 'Spotify',\n",
" '20NewsGroups',\n",
" 'DummyDataset',\n",
" 'Spotify_most_popular',\n",
" 'Poliblogs',\n",
" 'Spotify_least_popular']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = TMDataset()\n",
"dataset.get_dataset_list()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:47.537\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:47.907\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.146\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.510\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.690\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
]
}
],
"source": [
"dataset = TMDataset()\n",
"dataset.fetch_dataset(name=\"Reuters\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -142,7 +112,7 @@
" array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -153,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -169,7 +139,7 @@
" array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -180,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -189,20 +159,20 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m158\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.219\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.521\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.871\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:51.061\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
]
}
],
Expand All @@ -212,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -422,7 +392,7 @@
"4 [alexia, and, ice, you, the, came, down, you, ... "
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -433,7 +403,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -443,7 +413,7 @@
" 'seinabo sey have always wondered your cause when you you can not and ive been you your just see one seinabo sey will you ever tell what really your mind cause the greatest love youll ever find seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving moving have always wondered you your like every word the ive always been and every word but the seinabo sey vargas lagola will you ever tell what really your mind cause the greatest love youll ever find you know seinabo sey vargas lagola seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving cause not seinabo sey vargas lagola seinabo sey and you cant hold heart heart heart heart and you cant hold heart heart heart heart you cant hold heart heart hold heart and you cant hold heart heart heart and you cant hold heart heart you cant you cant you cant you cant hold heart heart heart heart heart moving moving ohohooh ohohooh ohohooh']"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -454,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -463,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand All @@ -472,7 +442,7 @@
"[75, 58]"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -490,7 +460,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -514,17 +484,16 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 258.65it/s]\n",
"\u001b[32m2024-08-08 22:59:55.068\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m454\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.073\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m469\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.074\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
"\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
]
}
],
Expand All @@ -541,15 +510,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:55.077\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.078\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
]
}
],
Expand All @@ -561,7 +530,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -634,7 +603,7 @@
"4 BGHXO 3 [BGHXO]"
]
},
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -643,6 +612,13 @@
"dataset.dataframe.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Loading

0 comments on commit 20e7c5e

Please sign in to comment.