diff --git a/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb b/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb index 75d3f70..06b8d58 100644 --- a/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb +++ b/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb @@ -26,6 +26,7 @@ "import cartopy.crs as ccrs\n", "import matplotlib.cbook\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", "import xarray as xr\n", "from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils\n", @@ -53,14 +54,11 @@ "stop = \"2000-01\"\n", "\n", "# Region\n", - "lon_slice = slice(28, 41)\n", - "lat_slice = slice(-16, 4)\n", + "lon_slice = slice(-92.10, -84.80)\n", + "lat_slice = slice(46.30, 49.00)\n", "\n", "# Variable\n", - "varname = \"lake_surface_water_temperature\"\n", - "\n", - "# Lakes to show in the analysis\n", - "lakeids = [3, 7, 10]" + "varname = \"lake_surface_water_temperature\"" ] }, { @@ -91,7 +89,7 @@ "id": "7", "metadata": {}, "source": [ - "## Define function to extract region and compute spatial weighted mean" + "## Download data" ] }, { @@ -101,23 +99,16 @@ "metadata": {}, "outputs": [], "source": [ - "def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname, lakeids):\n", - " ds = ds[[varname, \"lakeid\"]]\n", - " ds = ds.chunk({\"time\": 1, \"latitude\": 1_200, \"longitude\": 2_400})\n", - " ds = utils.regionalise(ds, lon_slice=lon_slice, lat_slice=lat_slice)\n", - " dataarrays = []\n", - " for lakeid in lakeids:\n", - " da = ds[varname].where(ds[\"lakeid\"] == lakeid)\n", - " da = diagnostics.spatial_weighted_mean(da)\n", - " dataarrays.append(da.expand_dims(lakeid=[lakeid]))\n", - " return xr.concat(dataarrays, \"lakeid\").to_dataset()\n", - "\n", - "\n", - "def get_lakeid(ds, lon_slice, lat_slice):\n", - " da = ds[\"lakeid\"].isel(time=0)\n", - " da = da.chunk({\"latitude\": 1_200, \"longitude\": 2_400})\n", - " da = utils.regionalise(da, lon_slice=lon_slice, lat_slice=lat_slice)\n", - " return da.to_dataset()" + "requests = download.update_request_date(\n", + " request, start=start, stop=stop, stringify_dates=True\n", + ")\n", + "ds = download.download_and_transform(\n", + " collection_id,\n", + " requests,\n", + " chunks={\"year\": 1, \"month\": 1},\n", + " transform_func=utils.regionalise,\n", + " transform_func_kwargs={\"lon_slice\": lon_slice, \"lat_slice\": lat_slice},\n", + ")" ] }, { @@ -125,7 +116,7 @@ "id": "9", "metadata": {}, "source": [ - "## Download data" + "## Plot lakeid" ] }, { @@ -135,72 +126,47 @@ "metadata": {}, "outputs": [], "source": [ - "chunks = {\"year\": 1, \"month\": 1}\n", - "requests = download.update_request_date(\n", - " request, start=start, stop=stop, stringify_dates=True\n", - ")\n", - "ds = download.download_and_transform(\n", - " collection_id,\n", - " requests,\n", - " chunks=chunks,\n", - " transform_func=spatial_weighted_mean_of_region,\n", - " transform_func_kwargs={\n", - " \"lon_slice\": lon_slice,\n", - " \"lat_slice\": lat_slice,\n", - " \"varname\": varname,\n", - " \"lakeids\": lakeids,\n", - " },\n", - ")\n", - "da = ds[varname].compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "# We use one of the request previously cached\n", - "single_request = {\n", - " k: v if isinstance(v, str) or k not in chunks else v[0]\n", - " for k, v in requests[0].items()\n", - "}\n", - "da_map = download.download_and_transform(\n", - " collection_id,\n", - " single_request,\n", - " chunks=chunks,\n", - " transform_func=get_lakeid,\n", - " transform_func_kwargs={\n", - " \"lon_slice\": lon_slice,\n", - " \"lat_slice\": lat_slice,\n", - " },\n", - ")[\"lakeid\"]" + "_ = plot.projected_map(\n", + " ds[\"lakeid\"].isel(time=0), projection=ccrs.PlateCarree(), show_stats=False\n", + ")" ] }, { "cell_type": "markdown", - "id": "12", + "id": "11", "metadata": {}, "source": [ - "## Plot projected map of lake IDs" + "## Reindex" ] }, { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "12", "metadata": {}, "outputs": [], "source": [ - "for da_to_plot in [da_map, da_map.where(da_map.isin(lakeids))]:\n", - " _ = plot.projected_map(da_to_plot, projection=ccrs.PlateCarree(), show_stats=False)\n", - " plt.show()" + "# Reindex using lakeids\n", + "lakeid = (\n", + " xr.DataArray(np.unique(ds[\"lakeid\"].values), dims=(\"lakeid\"))\n", + " .dropna(\"lakeid\")\n", + " .astype(int)\n", + ")\n", + "ds = ds.drop_vars(\"lakeid\").where(ds[\"lakeid\"] == lakeid).assign_coords(lakeid=lakeid)\n", + "\n", + "# Reindex using min_quality_level\n", + "min_quality_level = xr.DataArray([0, 4], dims=(\"min_quality_level\"))\n", + "ds = ds.where(ds[\"quality_level\"] >= min_quality_level).assign_coords(\n", + " min_quality_level=min_quality_level\n", + ")\n", + "\n", + "# Spatial weighted mean\n", + "da = diagnostics.spatial_weighted_mean(ds[varname])" ] }, { "cell_type": "markdown", - "id": "14", + "id": "13", "metadata": {}, "source": [ "## Plot spatial weighted mean" @@ -209,27 +175,24 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "14", "metadata": {}, "outputs": [], "source": [ - "for lakeid, da_lakeid in da.groupby(\"lakeid\"):\n", - " da_lakeid.dropna(\"time\").plot(label=lakeid)\n", - "plt.legend(title=\"lake ID\")\n", + "da.dropna(\"time\").plot(hue=\"min_quality_level\", col=\"lakeid\", figsize=(10, 6))\n", "plt.grid()\n", "plt.title(\"Spatial weighted mean\")\n", "plt.show()\n", "\n", "# Print missing values\n", - "missings = da.isnull().sum(\"time\") / da.sizes[\"time\"] * 100\n", - "id_digits = max(map(len, da[\"lakeid\"].astype(str).values))\n", - "for lakeid, missing in missings.groupby(\"lakeid\"):\n", - " print(f\"Missing values of lake ID {lakeid:<{id_digits}}: {missing.values:.2f} %\")" + "missings = da.sel(min_quality_level=0).isnull().sum(\"time\") / da.sizes[\"time\"] * 100\n", + "for lakeid, missing in missings.groupby(\"lakeid\", squeeze=False):\n", + " print(f\"Missing values of lake ID {lakeid}: {float(missing.squeeze()):.2f} %\")" ] }, { "cell_type": "markdown", - "id": "16", + "id": "15", "metadata": {}, "source": [ "## Boxplot" @@ -238,31 +201,28 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "16", "metadata": {}, "outputs": [], "source": [ "df = da.to_dataframe()\n", - "df.boxplot(by=\"lakeid\")\n", + "df.boxplot(by=([\"lakeid\", \"min_quality_level\"]))\n", "plt.ylabel(f\"{da.attrs['long_name']} [{da.attrs['units']}]\")\n", "plt.show()\n", "\n", - "# Print statistics\n", - "boxplot_stats = {\n", - " lakeid: matplotlib.cbook.boxplot_stats(df_lakeid.dropna().values.squeeze())\n", - " for lakeid, df_lakeid in df.groupby(\"lakeid\")\n", - "}\n", - "for lakeid, df_lakeid in df.groupby(\"lakeid\"):\n", - " values = df_lakeid.dropna().values.squeeze()\n", - " (boxplot_stats[lakeid],) = matplotlib.cbook.boxplot_stats(values)\n", - "boxplot_stats = pd.DataFrame(boxplot_stats)\n", - "boxplot_stats" + "# Show stats\n", + "stats = {}\n", + "for label, grouped_df in df.groupby([\"lakeid\", \"min_quality_level\"]):\n", + " (stats[label],) = matplotlib.cbook.boxplot_stats(\n", + " grouped_df.dropna().values.squeeze()\n", + " )\n", + "pd.DataFrame(stats)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -276,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.9" } }, "nbformat": 4,