From 97f5a33987a038db09ae9b1df6d56aa02d90b284 Mon Sep 17 00:00:00 2001 From: Patrick Urbanke Date: Thu, 3 Aug 2023 22:16:39 +0200 Subject: [PATCH] Adapted baseball to 1.4.0 --- baseball.ipynb | 888 +++++++++++++++++++++++++++++++------------------ 1 file changed, 568 insertions(+), 320 deletions(-) diff --git a/baseball.ipynb b/baseball.ipynb index dfb828e..049b4d7 100644 --- a/baseball.ipynb +++ b/baseball.ipynb @@ -100,14 +100,21 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "getML engine is already running.\n", "\n", - "Connected to project 'baseball'\n", - "localhost:1709/#/listprojects/baseball/\n" + "Connected to project 'baseball'\n" ] } ], @@ -120,7 +127,6 @@ "import pandas as pd\n", "from IPython.display import Image\n", "import matplotlib.pyplot as plt\n", - "plt.style.use('seaborn')\n", "%matplotlib inline \n", "\n", "import featuretools\n", @@ -161,8 +167,7 @@ { "data": { "text/plain": [ - "Connection(conn_id='default',\n", - " dbname='lahman_2014',\n", + "Connection(dbname='lahman_2014',\n", " dialect='mysql',\n", " host='relational.fit.cvut.cz',\n", " port=3306)" @@ -1171,8 +1176,6 @@ " name: allstarfull
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/allstarfull/\n", - " \n", "

\n" ], "text/plain": [ @@ -1207,9 +1210,7 @@ "\n", "4831 rows x 8 columns\n", "memory usage: 0.45 MB\n", - "name: allstarfull\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/allstarfull/" + "type: getml.DataFrame" ] }, "execution_count": 5, @@ -1764,8 +1765,6 @@ " name: awardsplayers
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/awardsplayers/\n", - " \n", "

\n" ], "text/plain": [ @@ -1786,9 +1785,7 @@ "\n", "5795 rows x 6 columns\n", "memory usage: 0.48 MB\n", - "name: awardsplayers\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/awardsplayers/" + "type: getml.DataFrame" ] }, "execution_count": 6, @@ -2692,8 +2689,6 @@ " name: awardsshareplayers
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/awardsshareplayers/\n", - " \n", "

\n" ], "text/plain": [ @@ -2714,9 +2709,7 @@ "\n", "6289 rows x 7 columns\n", "memory usage: 0.47 MB\n", - "name: awardsshareplayers\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/awardsshareplayers/" + "type: getml.DataFrame" ] }, "execution_count": 7, @@ -6187,8 +6180,6 @@ " name: batting
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/batting/\n", - " \n", "

\n" ], "text/plain": [ @@ -6223,9 +6214,7 @@ "\n", "92353 rows x 24 columns\n", "memory usage: 19.29 MB\n", - "name: batting\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/batting/" + "type: getml.DataFrame" ] }, "execution_count": 8, @@ -9295,8 +9284,6 @@ " name: battingpost
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/battingpost/\n", - " \n", "

\n" ], "text/plain": [ @@ -9331,9 +9318,7 @@ "\n", "9798 rows x 22 columns\n", "memory usage: 1.93 MB\n", - "name: battingpost\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/battingpost/" + "type: getml.DataFrame" ] }, "execution_count": 9, @@ -11799,8 +11784,6 @@ " name: fielding
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/fielding/\n", - " \n", "

\n" ], "text/plain": [ @@ -11835,9 +11818,7 @@ "\n", "137975 rows x 18 columns\n", "memory usage: 22.57 MB\n", - "name: fielding\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/fielding/" + "type: getml.DataFrame" ] }, "execution_count": 10, @@ -14053,8 +14034,6 @@ " name: fieldingpost
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/fieldingpost/\n", - " \n", "

\n" ], "text/plain": [ @@ -14089,9 +14068,7 @@ "\n", "10346 rows x 17 columns\n", "memory usage: 1.65 MB\n", - "name: fieldingpost\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/fieldingpost/" + "type: getml.DataFrame" ] }, "execution_count": 11, @@ -18468,8 +18445,6 @@ " name: pitching
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/pitching/\n", - " \n", "

\n" ], "text/plain": [ @@ -18504,9 +18479,7 @@ "\n", "39361 rows x 30 columns\n", "memory usage: 10.11 MB\n", - "name: pitching\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/pitching/" + "type: getml.DataFrame" ] }, "execution_count": 12, @@ -22784,8 +22757,6 @@ " name: pitchingpost
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/pitchingpost/\n", - " \n", "

\n" ], "text/plain": [ @@ -22820,9 +22791,7 @@ "\n", "4197 rows x 30 columns\n", "memory usage: 1.10 MB\n", - "name: pitchingpost\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/pitchingpost/" + "type: getml.DataFrame" ] }, "execution_count": 13, @@ -23424,8 +23393,6 @@ " name: salaries
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/salaries/\n", - " \n", "

\n" ], "text/plain": [ @@ -23446,9 +23413,7 @@ "\n", "23111 rows x 5 columns\n", "memory usage: 1.31 MB\n", - "name: salaries\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/salaries/" + "type: getml.DataFrame" ] }, "execution_count": 14, @@ -24528,8 +24493,6 @@ " name: allstarfull
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/allstarfull/\n", - " \n", "

\n" ], "text/plain": [ @@ -24566,9 +24529,7 @@ "\n", "4831 rows x 9 columns\n", "memory usage: 0.27 MB\n", - "name: allstarfull\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/allstarfull/" + "type: getml.DataFrame" ] }, "execution_count": 15, @@ -25220,8 +25181,6 @@ " name: awardsplayers
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/awardsplayers/\n", - " \n", "

\n" ], "text/plain": [ @@ -25258,9 +25217,7 @@ "\n", "5795 rows x 7 columns\n", "memory usage: 0.24 MB\n", - "name: awardsplayers\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/awardsplayers/" + "type: getml.DataFrame" ] }, "execution_count": 16, @@ -26262,8 +26219,6 @@ " name: awardsshareplayers
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/awardsshareplayers/\n", - " \n", "

\n" ], "text/plain": [ @@ -26300,9 +26255,7 @@ "\n", "6289 rows x 8 columns\n", "memory usage: 0.33 MB\n", - "name: awardsshareplayers\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/awardsshareplayers/" + "type: getml.DataFrame" ] }, "execution_count": 17, @@ -29941,8 +29894,6 @@ " name: batting
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/batting/\n", - " \n", "

\n" ], "text/plain": [ @@ -29979,9 +29930,7 @@ "\n", "92353 rows x 25 columns\n", "memory usage: 17.36 MB\n", - "name: batting\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/batting/" + "type: getml.DataFrame" ] }, "execution_count": 18, @@ -33211,8 +33160,6 @@ " name: battingpost
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/battingpost/\n", - " \n", "

\n" ], "text/plain": [ @@ -33249,9 +33196,7 @@ "\n", "9798 rows x 23 columns\n", "memory usage: 1.65 MB\n", - "name: battingpost\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/battingpost/" + "type: getml.DataFrame" ] }, "execution_count": 19, @@ -35861,8 +35806,6 @@ " name: fielding
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/fielding/\n", - " \n", "

\n" ], "text/plain": [ @@ -35899,9 +35842,7 @@ "\n", "137975 rows x 19 columns\n", "memory usage: 18.76 MB\n", - "name: fielding\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/fielding/" + "type: getml.DataFrame" ] }, "execution_count": 20, @@ -38256,8 +38197,6 @@ " name: fieldingpost
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/fieldingpost/\n", - " \n", "

\n" ], "text/plain": [ @@ -38294,9 +38233,7 @@ "\n", "10346 rows x 18 columns\n", "memory usage: 1.28 MB\n", - "name: fieldingpost\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/fieldingpost/" + "type: getml.DataFrame" ] }, "execution_count": 21, @@ -42864,8 +42801,6 @@ " name: pitching
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/pitching/\n", - " \n", "

\n" ], "text/plain": [ @@ -42902,9 +42837,7 @@ "\n", "39361 rows x 31 columns\n", "memory usage: 9.29 MB\n", - "name: pitching\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/pitching/" + "type: getml.DataFrame" ] }, "execution_count": 22, @@ -47398,8 +47331,6 @@ " name: pitchingpost
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/pitchingpost/\n", - " \n", "

\n" ], "text/plain": [ @@ -47436,9 +47367,7 @@ "\n", "4197 rows x 31 columns\n", "memory usage: 0.97 MB\n", - "name: pitchingpost\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/pitchingpost/" + "type: getml.DataFrame" ] }, "execution_count": 23, @@ -48188,8 +48117,6 @@ " name: salaries
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/salaries/\n", - " \n", "

\n" ], "text/plain": [ @@ -48211,9 +48138,7 @@ "\n", "23111 rows x 7 columns\n", "memory usage: 0.92 MB\n", - "name: salaries\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/salaries/" + "type: getml.DataFrame" ] }, "execution_count": 24, @@ -48267,12 +48192,14 @@ { "data": { "text/html": [ - "

data model

\n", - "

diagram


\n", + "data model\n", + "
\n", + "
diagram
\n", "
allstarfullawardsplayersawardsshareplayersbattingbattingpostfieldingfieldingpostpitchingpitchingpostsalariesplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 daysplayerID = playerIDyear <= yearHorizon: 1.0 days
\n", "
\n", "\n", - "

staging

\n", + "
\n", + "
staging
\n", " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typelabel message
0INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and ALLSTARFULL__STAGING_TABLE_2 over 'playerID' and 'playerID', there are no corresponding entries for 64.710317% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
1INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and AWARDSPLAYERS__STAGING_TABLE_3 over 'playerID' and 'playerID', there are no corresponding entries for 75.376911% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
2INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and AWARDSSHAREPLAYERS__STAGING_TABLE_4 over 'playerID' and 'playerID', there are no corresponding entries for 62.459617% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
3INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and BATTING__STAGING_TABLE_5 over 'playerID' and 'playerID', there are no corresponding entries for 8.765884% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
4INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and BATTINGPOST__STAGING_TABLE_6 over 'playerID' and 'playerID', there are no corresponding entries for 41.018738% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
5INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and FIELDING__STAGING_TABLE_7 over 'playerID' and 'playerID', there are no corresponding entries for 19.270946% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
6INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and FIELDINGPOST__STAGING_TABLE_8 over 'playerID' and 'playerID', there are no corresponding entries for 38.369589% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
7INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and PITCHING__STAGING_TABLE_9 over 'playerID' and 'playerID', there are no corresponding entries for 54.862158% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
8INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and PITCHINGPOST__STAGING_TABLE_10 over 'playerID' and 'playerID', there are no corresponding entries for 73.589274% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
" + ], + "text/plain": [ + " type label message \n", + "0 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "1 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "2 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "3 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "4 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "5 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "6 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "7 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "8 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T..." + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -49221,23 +49368,17 @@ "Staging... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and ALLSTARFULL__STAGING_TABLE_2 over 'playerID' and 'playerID', there are no corresponding entries for 64.710317% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and AWARDSPLAYERS__STAGING_TABLE_3 over 'playerID' and 'playerID', there are no corresponding entries for 75.376911% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and AWARDSSHAREPLAYERS__STAGING_TABLE_4 over 'playerID' and 'playerID', there are no corresponding entries for 62.459617% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and BATTING__STAGING_TABLE_5 over 'playerID' and 'playerID', there are no corresponding entries for 8.765884% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and BATTINGPOST__STAGING_TABLE_6 over 'playerID' and 'playerID', there are no corresponding entries for 41.018738% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and FIELDING__STAGING_TABLE_7 over 'playerID' and 'playerID', there are no corresponding entries for 19.270946% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and FIELDINGPOST__STAGING_TABLE_8 over 'playerID' and 'playerID', there are no corresponding entries for 38.369589% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and PITCHING__STAGING_TABLE_9 over 'playerID' and 'playerID', there are no corresponding entries for 54.862158% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and PITCHINGPOST__STAGING_TABLE_10 over 'playerID' and 'playerID', there are no corresponding entries for 73.589274% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", + "The pipeline check generated 9 issues labeled INFO and 0 issues labeled WARNING.\n", + "To see the issues in full, run .check() on the pipeline.\n", + "\n", "Staging... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", - "FastProp: Trying 3080 features... 100% |██████████| [elapsed: 00:11, remaining: 00:00] \n", + "FastProp: Trying 3080 features... 100% |██████████| [elapsed: 00:12, remaining: 00:00] \n", "FastProp: Building features... 100% |██████████| [elapsed: 00:02, remaining: 00:00] \n", - "XGBoost: Training as predictor... 100% |██████████| [elapsed: 00:24, remaining: 00:00] \n", + "XGBoost: Training as predictor... 100% |██████████| [elapsed: 00:23, remaining: 00:00] \n", "\n", "Trained pipeline.\n", - "Time taken: 0h:0m:39.821604\n", + "Time taken: 0h:0m:41.877725\n", "\n" ] }, @@ -49254,7 +49395,7 @@ " predictors=['XGBoostRegressor'],\n", " preprocessors=['Mapping'],\n", " share_selected_features=0.5,\n", - " tags=['fast_prop', 'container-udGggs'])
url: localhost:1709/#/getpipeline/baseball/Hiv492/0/
" + " tags=['fast_prop', 'container-SHAMaN'])" ], "text/plain": [ "Pipeline(data_model='salaries',\n", @@ -49267,9 +49408,7 @@ " predictors=['XGBoostRegressor'],\n", " preprocessors=['Mapping'],\n", " share_selected_features=0.5,\n", - " tags=['fast_prop', 'container-udGggs'])\n", - "\n", - "url: localhost:1709/#/getpipeline/baseball/Hiv492/0/" + " tags=['fast_prop', 'container-SHAMaN'])" ] }, "execution_count": 31, @@ -49295,16 +49434,234 @@ "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Checking... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and ALLSTARFULL__STAGING_TABLE_2 over 'playerID' and 'playerID', there are no corresponding entries for 64.710317% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and AWARDSPLAYERS__STAGING_TABLE_3 over 'playerID' and 'playerID', there are no corresponding entries for 75.376911% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and AWARDSSHAREPLAYERS__STAGING_TABLE_4 over 'playerID' and 'playerID', there are no corresponding entries for 62.459617% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and BATTING__STAGING_TABLE_5 over 'playerID' and 'playerID', there are no corresponding entries for 8.765884% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and BATTINGPOST__STAGING_TABLE_6 over 'playerID' and 'playerID', there are no corresponding entries for 41.018738% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and FIELDING__STAGING_TABLE_7 over 'playerID' and 'playerID', there are no corresponding entries for 19.270946% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and FIELDINGPOST__STAGING_TABLE_8 over 'playerID' and 'playerID', there are no corresponding entries for 38.369589% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and PITCHING__STAGING_TABLE_9 over 'playerID' and 'playerID', there are no corresponding entries for 54.862158% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and PITCHINGPOST__STAGING_TABLE_10 over 'playerID' and 'playerID', there are no corresponding entries for 73.589274% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n" + "The pipeline check generated 9 issues labeled INFO and 0 issues labeled WARNING.\n" ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typelabel message
0INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and ALLSTARFULL__STAGING_TABLE_2 over 'playerID' and 'playerID', there are no corresponding entries for 64.710317% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
1INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and AWARDSPLAYERS__STAGING_TABLE_3 over 'playerID' and 'playerID', there are no corresponding entries for 75.376911% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
2INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and AWARDSSHAREPLAYERS__STAGING_TABLE_4 over 'playerID' and 'playerID', there are no corresponding entries for 62.459617% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
3INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and BATTING__STAGING_TABLE_5 over 'playerID' and 'playerID', there are no corresponding entries for 8.765884% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
4INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and BATTINGPOST__STAGING_TABLE_6 over 'playerID' and 'playerID', there are no corresponding entries for 41.018738% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
5INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and FIELDING__STAGING_TABLE_7 over 'playerID' and 'playerID', there are no corresponding entries for 19.270946% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
6INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and FIELDINGPOST__STAGING_TABLE_8 over 'playerID' and 'playerID', there are no corresponding entries for 38.369589% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
7INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and PITCHING__STAGING_TABLE_9 over 'playerID' and 'playerID', there are no corresponding entries for 54.862158% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
8INFOFOREIGN KEYS NOT FOUNDWhen joining SALARIES__STAGING_TABLE_1 and PITCHINGPOST__STAGING_TABLE_10 over 'playerID' and 'playerID', there are no corresponding entries for 73.589274% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.
" + ], + "text/plain": [ + " type label message \n", + "0 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "1 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "2 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "3 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "4 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "5 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "6 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "7 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T...\n", + "8 INFO FOREIGN KEYS NOT FOUND When joining SALARIES__STAGING_T..." + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -49324,23 +49681,17 @@ "Staging... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and ALLSTARFULL__STAGING_TABLE_2 over 'playerID' and 'playerID', there are no corresponding entries for 64.710317% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and AWARDSPLAYERS__STAGING_TABLE_3 over 'playerID' and 'playerID', there are no corresponding entries for 75.376911% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and AWARDSSHAREPLAYERS__STAGING_TABLE_4 over 'playerID' and 'playerID', there are no corresponding entries for 62.459617% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and BATTING__STAGING_TABLE_5 over 'playerID' and 'playerID', there are no corresponding entries for 8.765884% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and BATTINGPOST__STAGING_TABLE_6 over 'playerID' and 'playerID', there are no corresponding entries for 41.018738% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and FIELDING__STAGING_TABLE_7 over 'playerID' and 'playerID', there are no corresponding entries for 19.270946% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and FIELDINGPOST__STAGING_TABLE_8 over 'playerID' and 'playerID', there are no corresponding entries for 38.369589% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and PITCHING__STAGING_TABLE_9 over 'playerID' and 'playerID', there are no corresponding entries for 54.862158% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", - "INFO [FOREIGN KEYS NOT FOUND]: When joining SALARIES__STAGING_TABLE_1 and PITCHINGPOST__STAGING_TABLE_10 over 'playerID' and 'playerID', there are no corresponding entries for 73.589274% of entries in 'playerID' in 'SALARIES__STAGING_TABLE_1'. You might want to double-check your join keys.\n", + "The pipeline check generated 9 issues labeled INFO and 0 issues labeled WARNING.\n", + "To see the issues in full, run .check() on the pipeline.\n", + "\n", "Staging... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", - "Relboost: Training features... 100% |██████████| [elapsed: 00:39, remaining: 00:00] \n", + "Relboost: Training features... 100% |██████████| [elapsed: 00:40, remaining: 00:00] \n", "Relboost: Building features... 100% |██████████| [elapsed: 00:04, remaining: 00:00] \n", "XGBoost: Training as predictor... 100% |██████████| [elapsed: 00:04, remaining: 00:00] \n", "\n", "Trained pipeline.\n", - "Time taken: 0h:0m:47.967351\n", + "Time taken: 0h:0m:48.206881\n", "\n" ] }, @@ -49357,7 +49708,7 @@ " predictors=['XGBoostRegressor'],\n", " preprocessors=['Mapping'],\n", " share_selected_features=0.5,\n", - " tags=['relboost', 'container-udGggs'])
url: localhost:1709/#/getpipeline/baseball/qERlSW/0/
" + " tags=['relboost', 'container-SHAMaN'])" ], "text/plain": [ "Pipeline(data_model='salaries',\n", @@ -49370,9 +49721,7 @@ " predictors=['XGBoostRegressor'],\n", " preprocessors=['Mapping'],\n", " share_selected_features=0.5,\n", - " tags=['relboost', 'container-udGggs'])\n", - "\n", - "url: localhost:1709/#/getpipeline/baseball/qERlSW/0/" + " tags=['relboost', 'container-SHAMaN'])" ] }, "execution_count": 33, @@ -49482,7 +49831,7 @@ " 0\n", " \n", " \n", - " 2022-10-31 07:09:48\n", + " 2023-07-30 18:59:23\n", " \n", " \n", " \n", @@ -49511,7 +49860,7 @@ " 1\n", " \n", " \n", - " 2022-10-31 07:10:40\n", + " 2023-07-30 19:00:14\n", " \n", " \n", " \n", @@ -49541,8 +49890,8 @@ ], "text/plain": [ " date time set used target mae rmse rsquared\n", - "0 2022-10-31 07:09:48 train salary 690630.2317 1242307.2495 0.8248\n", - "1 2022-10-31 07:10:40 test salary 763930.032 1401705.6283 0.7883" + "0 2023-07-30 18:59:23 train salary 690630.2317 1242307.2495 0.8248\n", + "1 2023-07-30 19:00:14 test salary 763930.032 1401705.6283 0.7883" ] }, "execution_count": 34, @@ -49640,7 +49989,7 @@ " 0\n", " \n", " \n", - " 2022-10-31 07:10:36\n", + " 2023-07-30 19:00:11\n", " \n", " \n", " \n", @@ -49669,7 +50018,7 @@ " 1\n", " \n", " \n", - " 2022-10-31 07:10:41\n", + " 2023-07-30 19:00:16\n", " \n", " \n", " \n", @@ -49699,8 +50048,8 @@ ], "text/plain": [ " date time set used target mae rmse rsquared\n", - "0 2022-10-31 07:10:36 train salary 459470.8604 793963.8048 0.9284\n", - "1 2022-10-31 07:10:41 test salary 664766.3496 1217213.7658 0.8402" + "0 2023-07-30 19:00:11 train salary 459470.8604 793963.8048 0.9284\n", + "1 2023-07-30 19:00:16 test salary 664766.3496 1217213.7658 0.8402" ] }, "execution_count": 35, @@ -52893,23 +53242,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n" ] } @@ -52930,23 +53279,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n", - "/home/ubuntu/.local/lib/python3.8/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", + "/home/ubuntu/.local/lib/python3.10/site-packages/featuretools/entityset/entityset.py:1906: UserWarning: index index not found in dataframe, creating new integer column\n", " warnings.warn(\n" ] } @@ -52983,44 +53332,6 @@ "execution_count": 58, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "91.880250% of all entries of column 'MAX(allstarfull.startingPos)' are NULL values.\n", - "91.880250% of all entries of column 'MEAN(allstarfull.startingPos)' are NULL values.\n", - "91.880250% of all entries of column 'MIN(allstarfull.startingPos)' are NULL values.\n", - "91.923325% of all entries of column 'SKEW(allstarfull.GP)' are NULL values.\n", - "91.912557% of all entries of column 'SKEW(allstarfull.gameNum)' are NULL values.\n", - "97.571613% of all entries of column 'SKEW(allstarfull.startingPos)' are NULL values.\n", - "96.047814% of all entries of column 'STD(allstarfull.startingPos)' are NULL values.\n", - "91.158734% of all entries of column 'SKEW(pitchingpost.BAOpp)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.BB)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.BFP)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.BK)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.CG)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.ER)' are NULL values.\n", - "91.756407% of all entries of column 'SKEW(pitchingpost.ERA)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.G)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.GF)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.GIDP)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.GS)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.H)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.HBP)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.HR)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.IBB)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.IPouts)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.L)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.R)' are NULL values.\n", - "91.180271% of all entries of column 'SKEW(pitchingpost.SF)' are NULL values.\n", - "91.180271% of all entries of column 'SKEW(pitchingpost.SH)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.SHO)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.SO)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.SV)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.W)' are NULL values.\n", - "91.142580% of all entries of column 'SKEW(pitchingpost.WP)' are NULL values.\n" - ] - }, { "data": { "text/html": [ @@ -155636,8 +155947,6 @@ " name: featuretools_train
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/featuretools_train/\n", - " \n", "

\n" ], "text/plain": [ @@ -155672,9 +155981,7 @@ "\n", "18572 rows x 722 columns\n", "memory usage: 102.37 MB\n", - "name: featuretools_train\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/featuretools_train/" + "type: getml.DataFrame" ] }, "execution_count": 58, @@ -155692,47 +155999,9 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "91.451862% of all entries of column 'MAX(allstarfull.startingPos)' are NULL values.\n", - "91.451862% of all entries of column 'MEAN(allstarfull.startingPos)' are NULL values.\n", - "91.451862% of all entries of column 'MIN(allstarfull.startingPos)' are NULL values.\n", - "91.914519% of all entries of column 'SKEW(allstarfull.GP)' are NULL values.\n", - "91.892487% of all entries of column 'SKEW(allstarfull.gameNum)' are NULL values.\n", - "97.884997% of all entries of column 'SKEW(allstarfull.startingPos)' are NULL values.\n", - "96.342807% of all entries of column 'STD(allstarfull.startingPos)' are NULL values.\n", - "90.746861% of all entries of column 'SKEW(pitchingpost.BAOpp)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.BB)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.BFP)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.BK)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.CG)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.ER)' are NULL values.\n", - "91.341705% of all entries of column 'SKEW(pitchingpost.ERA)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.G)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.GF)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.GIDP)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.GS)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.H)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.HBP)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.HR)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.IBB)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.IPouts)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.L)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.R)' are NULL values.\n", - "90.768892% of all entries of column 'SKEW(pitchingpost.SF)' are NULL values.\n", - "90.768892% of all entries of column 'SKEW(pitchingpost.SH)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.SHO)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.SO)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.SV)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.W)' are NULL values.\n", - "90.724829% of all entries of column 'SKEW(pitchingpost.WP)' are NULL values.\n" - ] - }, { "data": { "text/html": [ @@ -258348,8 +258617,6 @@ " name: featuretools_test
\n", " type: getml.DataFrame
\n", " \n", - " url: localhost:1709/#/getdataframe/baseball/featuretools_test/\n", - " \n", "

\n" ], "text/plain": [ @@ -258384,12 +258651,10 @@ "\n", "4539 rows x 722 columns\n", "memory usage: 25.02 MB\n", - "name: featuretools_test\n", - "type: getml.DataFrame\n", - "url: localhost:1709/#/getdataframe/baseball/featuretools_test/" + "type: getml.DataFrame" ] }, - "execution_count": 59, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -258413,7 +258678,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -258443,7 +258708,7 @@ " tags=['featuretools'])" ] }, - "execution_count": 60, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -258465,7 +258730,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -258477,30 +258742,15 @@ "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Checking... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'day(year)' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_string or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'month(year)' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_string or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( max(allstarfull.gamenum), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( mean(allstarfull.gamenum), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( min(allstarfull.gamenum), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( skew(allstarfull.gamenum), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( std(allstarfull.gamenum), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( sum(allstarfull.gamenum), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( min(batting.stint), 1.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( min(fielding.stint), 1.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( max(fieldingpost.tp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( mean(fieldingpost.tp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( min(fieldingpost.tp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( skew(fieldingpost.tp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( std(fieldingpost.tp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( sum(fieldingpost.tp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( min(pitching.baopp), 0.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", - "WARNING [COLUMN SHOULD BE UNUSED]: All non-NULL entries in column 'COALESCE( min(pitching.stint), 1.000000 )' in POPULATION__STAGING_TABLE_1 are equal to each other. You should consider setting its role to unused_float or using it for comparison only (you can do the latter by setting a unit that contains 'comparison only').\n", + "The pipeline check generated 0 issues labeled INFO and 18 issues labeled WARNING.\n", + "To see the issues in full, run .check() on the pipeline.\n", + "\n", "Staging... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", "Preprocessing... 100% |██████████| [elapsed: 00:00, remaining: 00:00] \n", - "XGBoost: Training as predictor... 100% |██████████| [elapsed: 00:24, remaining: 00:00] \n", + "XGBoost: Training as predictor... 100% |██████████| [elapsed: 00:23, remaining: 00:00] \n", "\n", "Trained pipeline.\n", - "Time taken: 0h:0m:27.394815\n", + "Time taken: 0h:0m:28.412967\n", "\n" ] }, @@ -258516,7 +258766,7 @@ " predictors=['XGBoostRegressor'],\n", " preprocessors=['Imputation'],\n", " share_selected_features=0.5,\n", - " tags=['featuretools'])
url: localhost:1709/#/getpipeline/baseball/LFHvfA/0/
" + " tags=['featuretools'])" ], "text/plain": [ "Pipeline(data_model='population',\n", @@ -258528,12 +258778,10 @@ " predictors=['XGBoostRegressor'],\n", " preprocessors=['Imputation'],\n", " share_selected_features=0.5,\n", - " tags=['featuretools'])\n", - "\n", - "url: localhost:1709/#/getpipeline/baseball/LFHvfA/0/" + " tags=['featuretools'])" ] }, - "execution_count": 61, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -258544,7 +258792,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -258625,7 +258873,7 @@ " 0\n", " \n", " \n", - " 2022-10-31 07:12:56\n", + " 2023-07-30 19:07:40\n", " \n", " \n", " \n", @@ -258654,7 +258902,7 @@ " 1\n", " \n", " \n", - " 2022-10-31 07:12:57\n", + " 2023-07-30 19:07:44\n", " \n", " \n", " \n", @@ -258684,11 +258932,11 @@ ], "text/plain": [ " date time set used target mae rmse rsquared\n", - "0 2022-10-31 07:12:56 featuretools_train salary 704893.7458 1288741.874 0.8128\n", - "1 2022-10-31 07:12:57 featuretools_test salary 776053.9972 1445682.6312 0.775 " + "0 2023-07-30 19:07:40 featuretools_train salary 704893.7458 1288741.874 0.8128\n", + "1 2023-07-30 19:07:44 featuretools_test salary 776053.9972 1445682.6312 0.775 " ] }, - "execution_count": 62, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -258708,7 +258956,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -258718,20 +258966,20 @@ "DROP TABLE IF EXISTS \"FEATURE_1_29\";\n", "\n", "CREATE TABLE \"FEATURE_1_29\" AS\n", - "SELECT LAST( t2.\"gameid__mapping_1_target_1_avg\", t2.\"year, '+1.000000 days'\" ) AS \"feature_1_29\",\n", + "SELECT LAST( t2.\"gameid__mapping_1_target_1_avg\", t2.\"year__1_000000_days\" ) AS \"feature_1_29\",\n", " t1.rowid AS rownum\n", "FROM \"SALARIES__STAGING_TABLE_1\" t1\n", "INNER JOIN \"ALLSTARFULL__STAGING_TABLE_2\" t2\n", "ON t1.\"playerid\" = t2.\"playerid\"\n", - "WHERE t2.\"year, '+1.000000 days'\" <= t1.\"year\"\n", + "WHERE t2.\"year__1_000000_days\" <= t1.\"year\"\n", "GROUP BY t1.rowid;\n", "```" ], "text/plain": [ - "'DROP TABLE IF EXISTS \"FEATURE_1_29\";\\n\\nCREATE TABLE \"FEATURE_1_29\" AS\\nSELECT LAST( t2.\"gameid__mapping_1_target_1_avg\", t2.\"year, \\'+1.000000 days\\'\" ) AS \"feature_1_29\",\\n t1.rowid AS rownum\\nFROM \"SALARIES__STAGING_TABLE_1\" t1\\nINNER JOIN \"ALLSTARFULL__STAGING_TABLE_2\" t2\\nON t1.\"playerid\" = t2.\"playerid\"\\nWHERE t2.\"year, \\'+1.000000 days\\'\" <= t1.\"year\"\\nGROUP BY t1.rowid;'" + "'DROP TABLE IF EXISTS \"FEATURE_1_29\";\\n\\nCREATE TABLE \"FEATURE_1_29\" AS\\nSELECT LAST( t2.\"gameid__mapping_1_target_1_avg\", t2.\"year__1_000000_days\" ) AS \"feature_1_29\",\\n t1.rowid AS rownum\\nFROM \"SALARIES__STAGING_TABLE_1\" t1\\nINNER JOIN \"ALLSTARFULL__STAGING_TABLE_2\" t2\\nON t1.\"playerid\" = t2.\"playerid\"\\nWHERE t2.\"year__1_000000_days\" <= t1.\"year\"\\nGROUP BY t1.rowid;'" ] }, - "execution_count": 63, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -258742,14 +258990,14 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```sql\n", - "-- The size of the SQL code for FEATURE_1_1 is 133317 characters, which is greater than the size_threshold of 50000!\n", + "-- The size of the SQL code for FEATURE_1_1 is 133314 characters, which is greater than the size_threshold of 50000!\n", "-- To display very long features like this anyway, increase the size_threshold or set the size_threshold to None.\n", "DROP TABLE IF EXISTS \"FEATURE_1_1\";\n", "\n", @@ -258757,10 +259005,10 @@ "```" ], "text/plain": [ - "'-- The size of the SQL code for FEATURE_1_1 is 133317 characters, which is greater than the size_threshold of 50000!\\n-- To display very long features like this anyway, increase the size_threshold or set the size_threshold to None.\\nDROP TABLE IF EXISTS \"FEATURE_1_1\";\\n\\nCREATE TABLE \"FEATURE_1_1\";'" + "'-- The size of the SQL code for FEATURE_1_1 is 133314 characters, which is greater than the size_threshold of 50000!\\n-- To display very long features like this anyway, increase the size_threshold or set the size_threshold to None.\\nDROP TABLE IF EXISTS \"FEATURE_1_1\";\\n\\nCREATE TABLE \"FEATURE_1_1\";'" ] }, - "execution_count": 64, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -258787,7 +259035,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -258798,7 +259046,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -258929,7 +259177,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.6" }, "toc": { "base_numbering": 1,