diff --git a/notebooks/data_preparation.ipynb b/notebooks/data_preparation.ipynb new file mode 100644 index 0000000..5df91d1 --- /dev/null +++ b/notebooks/data_preparation.ipynb @@ -0,0 +1,3472 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3e50aaae", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "import xarray as xr\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "id": "516565d1", + "metadata": {}, + "source": [ + "This notebook takes the raw/downloaded information and pre-processes it into a format suitable for AI/ML approaches. This pre-processing procedure assumes all gridded data is at the same spatio-temporal resolution." + ] + }, + { + "cell_type": "markdown", + "id": "7826db32", + "metadata": {}, + "source": [ + "# Outcome variable (or predictand)" + ] + }, + { + "cell_type": "markdown", + "id": "e08681ba", + "metadata": {}, + "source": [ + "## Above Ground Biomass" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b2d9b8b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<xarray.Dataset>\n", + "Dimensions: (latitude: 720, longitude: 1440, time: 84)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2010-01-01 ... 2016-12-01\n", + " * longitude (longitude) float32 -179.9 -179.6 ... 179.6 179.9\n", + " * latitude (latitude) float32 -89.88 -89.62 ... 89.62 89.88\n", + "Data variables:\n", + " abg_avitabile_vod5th (time, latitude, longitude) float64 ...\n", + " abg_avitabile_vodmean (time, latitude, longitude) float64 ...\n", + " abg_avitabile_vod95th (time, latitude, longitude) float64 ...\n", + " abg_baccini_vod5th (time, latitude, longitude) float64 ...\n", + " abg_baccini_vodmean (time, latitude, longitude) float64 ...\n", + " abg_baccini_vod95th (time, latitude, longitude) float64 ...\n", + " abg_saatchi_vod5th (time, latitude, longitude) float64 ...\n", + " abg_saatchi_vodmean (time, latitude, longitude) float64 ...\n", + " abg_saatchi_vod95th (time, latitude, longitude) float64 ...
array(['2010-01-01T00:00:00.000000000', '2010-02-01T00:00:00.000000000',\n", + " '2010-03-01T00:00:00.000000000', '2010-04-01T00:00:00.000000000',\n", + " '2010-05-01T00:00:00.000000000', '2010-06-01T00:00:00.000000000',\n", + " '2010-07-01T00:00:00.000000000', '2010-08-01T00:00:00.000000000',\n", + " '2010-09-01T00:00:00.000000000', '2010-10-01T00:00:00.000000000',\n", + " '2010-11-01T00:00:00.000000000', '2010-12-01T00:00:00.000000000',\n", + " '2011-01-01T00:00:00.000000000', '2011-02-01T00:00:00.000000000',\n", + " '2011-03-01T00:00:00.000000000', '2011-04-01T00:00:00.000000000',\n", + " '2011-05-01T00:00:00.000000000', '2011-06-01T00:00:00.000000000',\n", + " '2011-07-01T00:00:00.000000000', '2011-08-01T00:00:00.000000000',\n", + " '2011-09-01T00:00:00.000000000', '2011-10-01T00:00:00.000000000',\n", + " '2011-11-01T00:00:00.000000000', '2011-12-01T00:00:00.000000000',\n", + " '2012-01-01T00:00:00.000000000', '2012-02-01T00:00:00.000000000',\n", + " '2012-03-01T00:00:00.000000000', '2012-04-01T00:00:00.000000000',\n", + " '2012-05-01T00:00:00.000000000', '2012-06-01T00:00:00.000000000',\n", + " '2012-07-01T00:00:00.000000000', '2012-08-01T00:00:00.000000000',\n", + " '2012-09-01T00:00:00.000000000', '2012-10-01T00:00:00.000000000',\n", + " '2012-11-01T00:00:00.000000000', '2012-12-01T00:00:00.000000000',\n", + " '2013-01-01T00:00:00.000000000', '2013-02-01T00:00:00.000000000',\n", + " '2013-03-01T00:00:00.000000000', '2013-04-01T00:00:00.000000000',\n", + " '2013-05-01T00:00:00.000000000', '2013-06-01T00:00:00.000000000',\n", + " '2013-07-01T00:00:00.000000000', '2013-08-01T00:00:00.000000000',\n", + " '2013-09-01T00:00:00.000000000', '2013-10-01T00:00:00.000000000',\n", + " '2013-11-01T00:00:00.000000000', '2013-12-01T00:00:00.000000000',\n", + " '2014-01-01T00:00:00.000000000', '2014-02-01T00:00:00.000000000',\n", + " '2014-03-01T00:00:00.000000000', '2014-04-01T00:00:00.000000000',\n", + " '2014-05-01T00:00:00.000000000', 
'2014-06-01T00:00:00.000000000',\n", + " '2014-07-01T00:00:00.000000000', '2014-08-01T00:00:00.000000000',\n", + " '2014-09-01T00:00:00.000000000', '2014-10-01T00:00:00.000000000',\n", + " '2014-11-01T00:00:00.000000000', '2014-12-01T00:00:00.000000000',\n", + " '2015-01-01T00:00:00.000000000', '2015-02-01T00:00:00.000000000',\n", + " '2015-03-01T00:00:00.000000000', '2015-04-01T00:00:00.000000000',\n", + " '2015-05-01T00:00:00.000000000', '2015-06-01T00:00:00.000000000',\n", + " '2015-07-01T00:00:00.000000000', '2015-08-01T00:00:00.000000000',\n", + " '2015-09-01T00:00:00.000000000', '2015-10-01T00:00:00.000000000',\n", + " '2015-11-01T00:00:00.000000000', '2015-12-01T00:00:00.000000000',\n", + " '2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',\n", + " '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',\n", + " '2016-05-01T00:00:00.000000000', '2016-06-01T00:00:00.000000000',\n", + " '2016-07-01T00:00:00.000000000', '2016-08-01T00:00:00.000000000',\n", + " '2016-09-01T00:00:00.000000000', '2016-10-01T00:00:00.000000000',\n", + " '2016-11-01T00:00:00.000000000', '2016-12-01T00:00:00.000000000'],\n", + " dtype='datetime64[ns]')
array([-179.875, -179.625, -179.375, ..., 179.375, 179.625, 179.875],\n", + " dtype=float32)
array([-89.875, -89.625, -89.375, ..., 89.375, 89.625, 89.875],\n", + " dtype=float32)
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
[87091200 values with dtype=float64]
<xarray.DataArray 'abg_avitabile_vodmean' (time: 84, latitude: 720, longitude: 1440)>\n", + "[87091200 values with dtype=float64]\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2010-01-01 2010-02-01 ... 2016-12-01\n", + " * longitude (longitude) float32 -179.9 -179.6 -179.4 ... 179.4 179.6 179.9\n", + " * latitude (latitude) float32 -89.88 -89.62 -89.38 ... 89.38 89.62 89.88\n", + "Attributes:\n", + " long_name: AGB estimation using a relation Avitabile et al. as a fct of ...\n", + " units: Mg/h
[87091200 values with dtype=float64]
array(['2010-01-01T00:00:00.000000000', '2010-02-01T00:00:00.000000000',\n", + " '2010-03-01T00:00:00.000000000', '2010-04-01T00:00:00.000000000',\n", + " '2010-05-01T00:00:00.000000000', '2010-06-01T00:00:00.000000000',\n", + " '2010-07-01T00:00:00.000000000', '2010-08-01T00:00:00.000000000',\n", + " '2010-09-01T00:00:00.000000000', '2010-10-01T00:00:00.000000000',\n", + " '2010-11-01T00:00:00.000000000', '2010-12-01T00:00:00.000000000',\n", + " '2011-01-01T00:00:00.000000000', '2011-02-01T00:00:00.000000000',\n", + " '2011-03-01T00:00:00.000000000', '2011-04-01T00:00:00.000000000',\n", + " '2011-05-01T00:00:00.000000000', '2011-06-01T00:00:00.000000000',\n", + " '2011-07-01T00:00:00.000000000', '2011-08-01T00:00:00.000000000',\n", + " '2011-09-01T00:00:00.000000000', '2011-10-01T00:00:00.000000000',\n", + " '2011-11-01T00:00:00.000000000', '2011-12-01T00:00:00.000000000',\n", + " '2012-01-01T00:00:00.000000000', '2012-02-01T00:00:00.000000000',\n", + " '2012-03-01T00:00:00.000000000', '2012-04-01T00:00:00.000000000',\n", + " '2012-05-01T00:00:00.000000000', '2012-06-01T00:00:00.000000000',\n", + " '2012-07-01T00:00:00.000000000', '2012-08-01T00:00:00.000000000',\n", + " '2012-09-01T00:00:00.000000000', '2012-10-01T00:00:00.000000000',\n", + " '2012-11-01T00:00:00.000000000', '2012-12-01T00:00:00.000000000',\n", + " '2013-01-01T00:00:00.000000000', '2013-02-01T00:00:00.000000000',\n", + " '2013-03-01T00:00:00.000000000', '2013-04-01T00:00:00.000000000',\n", + " '2013-05-01T00:00:00.000000000', '2013-06-01T00:00:00.000000000',\n", + " '2013-07-01T00:00:00.000000000', '2013-08-01T00:00:00.000000000',\n", + " '2013-09-01T00:00:00.000000000', '2013-10-01T00:00:00.000000000',\n", + " '2013-11-01T00:00:00.000000000', '2013-12-01T00:00:00.000000000',\n", + " '2014-01-01T00:00:00.000000000', '2014-02-01T00:00:00.000000000',\n", + " '2014-03-01T00:00:00.000000000', '2014-04-01T00:00:00.000000000',\n", + " '2014-05-01T00:00:00.000000000', 
'2014-06-01T00:00:00.000000000',\n", + " '2014-07-01T00:00:00.000000000', '2014-08-01T00:00:00.000000000',\n", + " '2014-09-01T00:00:00.000000000', '2014-10-01T00:00:00.000000000',\n", + " '2014-11-01T00:00:00.000000000', '2014-12-01T00:00:00.000000000',\n", + " '2015-01-01T00:00:00.000000000', '2015-02-01T00:00:00.000000000',\n", + " '2015-03-01T00:00:00.000000000', '2015-04-01T00:00:00.000000000',\n", + " '2015-05-01T00:00:00.000000000', '2015-06-01T00:00:00.000000000',\n", + " '2015-07-01T00:00:00.000000000', '2015-08-01T00:00:00.000000000',\n", + " '2015-09-01T00:00:00.000000000', '2015-10-01T00:00:00.000000000',\n", + " '2015-11-01T00:00:00.000000000', '2015-12-01T00:00:00.000000000',\n", + " '2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',\n", + " '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',\n", + " '2016-05-01T00:00:00.000000000', '2016-06-01T00:00:00.000000000',\n", + " '2016-07-01T00:00:00.000000000', '2016-08-01T00:00:00.000000000',\n", + " '2016-09-01T00:00:00.000000000', '2016-10-01T00:00:00.000000000',\n", + " '2016-11-01T00:00:00.000000000', '2016-12-01T00:00:00.000000000'],\n", + " dtype='datetime64[ns]')
array([-179.875, -179.625, -179.375, ..., 179.375, 179.625, 179.875],\n", + " dtype=float32)
array([-89.875, -89.625, -89.375, ..., 89.375, 89.625, 89.875],\n", + " dtype=float32)
<xarray.Dataset>\n", + "Dimensions: (latitude: 720, longitude: 1440, time: 81)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2010-04-01T12:00:00 ... 2016-12-01T12:...\n", + " * latitude (latitude) float64 -89.88 -89.62 -89.38 ... 89.38 89.62 89.88\n", + " * longitude (longitude) float32 -179.9 -179.6 -179.4 ... 179.4 179.6 179.9\n", + "Data variables:\n", + " danger_risk (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " fwinx (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " ffmcode (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " dufmcode (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " drtcode (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " infsinx (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " fbupinx (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + " fdsrte (time, latitude, longitude) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>\n", + "Attributes:\n", + " CDI: Climate Data Interface version 1.9.8 (https://mpimet.mpg.de...\n", + " Conventions: CF-1.6\n", + " history: Sun Apr 25 20:57:44 2021: cdo -R -f nc -remapbil,n320 -setg...\n", + " institution: European Centre for Medium-Range Weather Forecasts\n", + " CDO: Climate Data Operators version 1.9.8 (https://mpimet.mpg.de...
array(['2010-04-01T12:00:00.000000000', '2010-05-01T12:00:00.000000000',\n", + " '2010-06-01T12:00:00.000000000', '2010-07-01T12:00:00.000000000',\n", + " '2010-08-01T12:00:00.000000000', '2010-09-01T12:00:00.000000000',\n", + " '2010-10-01T12:00:00.000000000', '2010-11-01T12:00:00.000000000',\n", + " '2010-12-01T12:00:00.000000000', '2011-01-01T12:00:00.000000000',\n", + " '2011-02-01T12:00:00.000000000', '2011-03-01T12:00:00.000000000',\n", + " '2011-04-01T12:00:00.000000000', '2011-05-01T12:00:00.000000000',\n", + " '2011-06-01T12:00:00.000000000', '2011-07-01T12:00:00.000000000',\n", + " '2011-08-01T12:00:00.000000000', '2011-09-01T12:00:00.000000000',\n", + " '2011-10-01T12:00:00.000000000', '2011-11-01T12:00:00.000000000',\n", + " '2011-12-01T12:00:00.000000000', '2012-01-01T12:00:00.000000000',\n", + " '2012-02-01T12:00:00.000000000', '2012-03-01T12:00:00.000000000',\n", + " '2012-04-01T12:00:00.000000000', '2012-05-01T12:00:00.000000000',\n", + " '2012-06-01T12:00:00.000000000', '2012-07-01T12:00:00.000000000',\n", + " '2012-08-01T12:00:00.000000000', '2012-09-01T12:00:00.000000000',\n", + " '2012-10-01T12:00:00.000000000', '2012-11-01T12:00:00.000000000',\n", + " '2012-12-01T12:00:00.000000000', '2013-01-01T12:00:00.000000000',\n", + " '2013-02-01T12:00:00.000000000', '2013-03-01T12:00:00.000000000',\n", + " '2013-04-01T12:00:00.000000000', '2013-05-01T12:00:00.000000000',\n", + " '2013-06-01T12:00:00.000000000', '2013-07-01T12:00:00.000000000',\n", + " '2013-08-01T12:00:00.000000000', '2013-09-01T12:00:00.000000000',\n", + " '2013-10-01T12:00:00.000000000', '2013-11-01T12:00:00.000000000',\n", + " '2013-12-01T12:00:00.000000000', '2014-01-01T12:00:00.000000000',\n", + " '2014-02-01T12:00:00.000000000', '2014-03-01T12:00:00.000000000',\n", + " '2014-04-01T12:00:00.000000000', '2014-05-01T12:00:00.000000000',\n", + " '2014-06-01T12:00:00.000000000', '2014-07-01T12:00:00.000000000',\n", + " '2014-08-01T12:00:00.000000000', 
'2014-09-01T12:00:00.000000000',\n", + " '2014-10-01T12:00:00.000000000', '2014-11-01T12:00:00.000000000',\n", + " '2014-12-01T12:00:00.000000000', '2015-01-01T12:00:00.000000000',\n", + " '2015-02-01T12:00:00.000000000', '2015-03-01T12:00:00.000000000',\n", + " '2015-04-01T12:00:00.000000000', '2015-05-01T12:00:00.000000000',\n", + " '2015-06-01T12:00:00.000000000', '2015-07-01T12:00:00.000000000',\n", + " '2015-08-01T12:00:00.000000000', '2015-09-01T12:00:00.000000000',\n", + " '2015-10-01T12:00:00.000000000', '2015-11-01T12:00:00.000000000',\n", + " '2015-12-01T12:00:00.000000000', '2016-01-01T12:00:00.000000000',\n", + " '2016-02-01T12:00:00.000000000', '2016-03-01T12:00:00.000000000',\n", + " '2016-04-01T12:00:00.000000000', '2016-05-01T12:00:00.000000000',\n", + " '2016-06-01T12:00:00.000000000', '2016-07-01T12:00:00.000000000',\n", + " '2016-08-01T12:00:00.000000000', '2016-09-01T12:00:00.000000000',\n", + " '2016-10-01T12:00:00.000000000', '2016-11-01T12:00:00.000000000',\n", + " '2016-12-01T12:00:00.000000000'], dtype='datetime64[ns]')
array([-89.875, -89.625, -89.375, ..., 89.375, 89.625, 89.875])
array([-179.875, -179.625, -179.375, ..., 179.375, 179.625, 179.875],\n", + " dtype=float32)
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "
\n",
+ "
| \n",
+ "\n", + "\n", + " | \n", + "