From 0198c0f4f3553280dd399b3e4419f4e67080fa31 Mon Sep 17 00:00:00 2001 From: Joe Ranalli Date: Thu, 4 Apr 2024 16:21:54 -0400 Subject: [PATCH] Automated CMV notebook demo, along with updates to the code-only version. --- README.md | 2 +- demos/automate_cmv_demo.ipynb | 446 ++++++++++++++++++++++++++++++++++ demos/automate_cmv_demo.py | 25 +- 3 files changed, 462 insertions(+), 11 deletions(-) create mode 100644 demos/automate_cmv_demo.ipynb diff --git a/README.md b/README.md index 8ea4279..cb7bde7 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ warranted. - Covers a single hour of data (9:15 - 10:15 UTC on Sept 8, 2013). - `hope_melpitz_10s` contains data sampled at 10s time resolution, acquired by temporally averaging time series data from the original dataset. - - Covers 3 full days, from Sept 8 - Sept 11, 2013. + - Covers 4 full days, from Sept 8 - Sept 11, 2013. - In both cases, data were first postprocessed using only removal of nulls (-9999) and linear interpolation to fill gaps left by the nulls, with a maximum interpolation window of 5s. See `dataio.hope_campaign` for details on diff --git a/demos/automate_cmv_demo.ipynb b/demos/automate_cmv_demo.ipynb new file mode 100644 index 0000000..43ec5c7 --- /dev/null +++ b/demos/automate_cmv_demo.ipynb @@ -0,0 +1,446 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Automation of CMV Detection for Large Time Series\n", + "\n", + "When applying the field analysis methodology, there is often an extensive dataset available showing, e.g. time series data for a plant over the course of an entire year or longer. In order to apply the method, it is necessary to reduce the CMVs to a subset for which the method can be applied, because simply calculating for every possible cloud motion vector pair is computationally intractable. 
\n", + "\n", + "This notebook will demonstrate the process for downselecting a set of interesting CMVs according to the methodology discussed in the 2024 IEEE PVSC paper.\n", + "\n", + "\n", + "## Initialization\n", + "First we need to import relevant packages. " + ], + "id": "f25751d6420017ed" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:31.842693Z", + "start_time": "2024-04-04T20:19:29.837140Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pvlib\n", + "\n", + "from solartoolbox import stats, spatial, cmv" + ], + "id": "f63c27a5889e67a3", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\jar339\\AppData\\Local\\Temp\\ipykernel_19064\\3447369378.py:1: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## The Data\n", + "For this example, we will work on four days of 10s resolution data from the HOPE Melpitz campaign, included in the sample data package. We'll first read it in. Since the CMV routine is built to work on the clearsky index, we'll calculate that while we're at it. The plot shows a sample of the data for a single sensor over the four days. 
" + ], + "id": "2ad28bcd45de2c8e" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:34.439491Z", + "start_time": "2024-04-04T20:19:31.842693Z" + } + }, + "cell_type": "code", + "source": [ + "fn = 'data/hope_melpitz_10s.h5'\n", + "pos = pd.read_hdf(fn, mode=\"r\", key=\"latlon\")\n", + "pos_utm = spatial.latlon2utm(pos['lat'], pos['lon'])\n", + "ts = pd.read_hdf(fn, mode=\"r\", key=\"data\")\n", + "\n", + "loc = pvlib.location.Location(np.mean(pos['lat']), np.mean(pos['lon']))\n", + "cs_ghi = loc.get_clearsky(ts.index, model='simplified_solis')['ghi']\n", + "kt = ts.divide(cs_ghi, axis=0).clip(0,2)\n", + "\n", + "# Plot a single sensor's data\n", + "plt.plot(ts.index, ts[40])\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel('Time')\n", + "plt.ylabel('Irradiance')\n", + "plt.show()" + ], + "id": "a40b8d0bce7985b7", + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 2 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Deciding which Periods to Calculate CMVs For\n", + "I typically calculate CMVs using a one hour period of data. While that's viable for the whole four days here, it would rapidly become onerous over e.g. an entire year, due to the computational intensity of the CMV method. One method to reduce the number of CMVs to calculate is to only calculate them for periods where the irradiance is changing rapidly, indicating the presence of clouds. Here we will use the Variability Score to quantify how variable each one hour period is in the data. \n", + "\n", + "Note that the variability score calculation has a slightly weird form using lambda functions, because doing so can allow the calculation to be performed using the vectorized form of the code. Other techniques might be possible. " + ], + "id": "cba49ff956de6bd6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:39.254139Z", + "start_time": "2024-04-04T20:19:34.439491Z" + } + }, + "cell_type": "code", + "source": [ + "avg_interval = '1h'\n", + "\n", + "# Calc the VS for each 1 hour period\n", + "vs_all = ts.resample(avg_interval).apply(\n", + " lambda x: stats.variability_score(x[ts.columns]))\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax2 = ax.twinx()\n", + "ax.plot(ts.index, ts[40])\n", + "plt.xlabel('Time')\n", + "plt.xticks(rotation=90)\n", + "ax.set_ylabel('Irradiance', color='blue')\n", + "ax.tick_params(axis='y', colors='blue')\n", + "ax2.plot(vs_all.index, vs_all[40], 'r')\n", + "ax2.set_ylabel('Variability Score', color='red')\n", + "ax2.tick_params(axis='y', colors='red')\n", + "plt.show()" + ], + "id": "50eecc1e62ada760", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\jar339\\AppData\\Local\\Temp\\ipykernel_19064\\2407418665.py:5: FutureWarning: 
Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " lambda x: stats.variability_score(x[ts.columns]))\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 3 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Once we have a metric of variability, we can downselect to retain only the N most variable periods. Here we'll use 20 just as an example.", + "id": "ecde5e3cad9e3f40" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:39.268061Z", + "start_time": "2024-04-04T20:19:39.254139Z" + } + }, + "cell_type": "code", + "source": [ + "n_var = 20\n", + "vs = vs_all.median(axis=1).sort_values(ascending=False)\n", + "vs = vs.iloc[0:n_var]\n", + "print(vs)" + ], + "id": "92f462cc466ac188", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2013-09-08 11:00:00+00:00 36.827512\n", + "2013-09-10 10:00:00+00:00 17.346362\n", + "2013-09-09 10:00:00+00:00 16.861423\n", + "2013-09-08 10:00:00+00:00 14.032438\n", + "2013-09-10 08:00:00+00:00 11.659271\n", + "2013-09-10 12:00:00+00:00 7.277602\n", + "2013-09-08 07:00:00+00:00 4.998409\n", + "2013-09-10 14:00:00+00:00 3.878509\n", + "2013-09-08 06:00:00+00:00 3.846806\n", + "2013-09-08 12:00:00+00:00 3.722167\n", + "2013-09-09 12:00:00+00:00 3.016234\n", + "2013-09-09 16:00:00+00:00 2.515403\n", + "2013-09-10 16:00:00+00:00 2.369758\n", + "2013-09-11 12:00:00+00:00 2.250612\n", + "2013-09-09 11:00:00+00:00 2.169986\n", + "2013-09-11 08:00:00+00:00 1.778225\n", + "2013-09-09 13:00:00+00:00 1.750230\n", + "2013-09-09 09:00:00+00:00 1.719327\n", + "2013-09-10 13:00:00+00:00 1.264782\n", + "2013-09-08 09:00:00+00:00 1.185312\n", + "dtype: float64\n" + ] + } + ], + "execution_count": 4 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Calculating the CMVs\n", + "Now that we have a set of periods during which to calculate the CMV, we can loop over them to do the calculations. 
We also want to record some of the useful statistical and quality control information about the quality of the CMVs, because we'd later like to be able to determine which of these are of high quality.\n", + "\n", + "We will assess the CMVs based on the following values:\n", + "- `ngood` - the number of point pairs that passed the Jamaly QC routines\n", + "- `r_corr` - the correlation coefficient of separation vs. delay for the good point pairs (see cmv_demo.ipynb). \n", + "- `flag` - the overall flag from the CMV QC process" + ], + "id": "2f9fbcc2bae8dc4d" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:41.668975Z", + "start_time": "2024-04-04T20:19:39.270073Z" + } + }, + "cell_type": "code", + "source": [ + "cmvs = pd.DataFrame(columns=[\"cld_spd\", \"cld_dir_rad\", \"df_p95\", \"ngood\", \"r_corr\", \"stderr_corr\", \"error_index\", \"flag\"])\n", + "for date in vs.index:\n", + " # Select the subset of data\n", + " hour = pd.date_range(date, date + pd.to_timedelta('1h'), freq='10s')\n", + " kt_hour = kt.loc[hour]\n", + " \n", + " hourlymax = ts.loc[hour].max().quantile(0.95)\n", + "\n", + " # Compute the CMV using the Jamaly method\n", + " cld_spd, cld_dir, dat = cmv.compute_cmv(kt_hour, pos_utm, method='jamaly', options={'minvelocity': 1})\n", + "\n", + " cmvs.loc[date] = [cld_spd, cld_dir, hourlymax, dat.method_data['ngood'], dat.method_data['r_corr'], dat.method_data['stderr_corr'], np.abs(dat.method_data[\"error_index\"]), dat.flag.name]\n", + "pd.options.display.width = 800\n", + "print(cmvs)" + ], + "id": "529475ecae6e313f", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " cld_spd cld_dir_rad df_p95 ngood r_corr stderr_corr error_index flag\n", + "2013-09-08 11:00:00+00:00 17.253624 1.015742 1036.323712 306 0.990169 0.147371 0.102980 GOOD\n", + "2013-09-10 10:00:00+00:00 9.323374 0.446288 967.214252 624 0.970793 0.090337 0.128893 GOOD\n", + "2013-09-09 10:00:00+00:00 12.315270 5.789374 1105.664569 
467 0.951945 0.197357 0.175704 GOOD\n", + "2013-09-08 10:00:00+00:00 15.790708 1.022334 1070.301172 781 0.872326 0.190603 0.202564 GOOD\n", + "2013-09-10 08:00:00+00:00 13.386957 0.736245 752.929056 581 0.989464 0.080910 0.082828 GOOD\n", + "2013-09-10 12:00:00+00:00 27.468773 0.515227 839.723816 412 0.978562 0.275288 0.122317 GOOD\n", + "2013-09-08 07:00:00+00:00 18.609486 0.871618 642.030344 797 0.992215 0.084133 0.081098 GOOD\n", + "2013-09-10 14:00:00+00:00 13.931439 0.698200 657.489758 756 0.943971 0.168528 0.195472 GOOD\n", + "2013-09-08 06:00:00+00:00 21.260689 0.833676 351.352144 680 0.993707 0.088246 0.086116 GOOD\n", + "2013-09-08 12:00:00+00:00 21.164760 0.558943 916.882816 90 0.873551 1.707730 0.241034 GOOD\n", + "2013-09-09 12:00:00+00:00 8.103066 5.511727 896.326129 623 0.966734 0.078114 0.110935 GOOD\n", + "2013-09-09 16:00:00+00:00 17.422306 0.081691 215.279455 491 0.806969 0.365211 0.133241 GOOD\n", + "2013-09-10 16:00:00+00:00 24.613327 0.378690 294.552469 578 0.960366 0.265352 0.146015 GOOD\n", + "2013-09-11 12:00:00+00:00 13.008833 0.057238 447.995486 772 0.983205 0.081499 0.100318 GOOD\n", + "2013-09-09 11:00:00+00:00 11.170620 0.140915 981.699747 933 0.919513 0.143933 0.240121 GOOD\n", + "2013-09-11 08:00:00+00:00 15.618439 1.117748 398.662215 760 0.825436 0.287277 0.173498 GOOD\n", + "2013-09-09 13:00:00+00:00 6.956777 5.610811 801.185779 860 0.879255 0.119534 0.306667 GOOD\n", + "2013-09-09 09:00:00+00:00 14.288631 5.987103 1088.490857 75 0.823863 1.168389 0.201482 GOOD\n", + "2013-09-10 13:00:00+00:00 31.678545 1.045541 675.928391 456 0.804433 0.710315 0.161430 GOOD\n", + "2013-09-08 09:00:00+00:00 18.401612 1.539226 1044.880005 689 0.970706 0.161717 0.092783 GOOD\n" + ] + } + ], + "execution_count": 5 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Downselect to High Quality CMVs\n", + "We can now downselect to only the highest quality CMVs. The limits shown here are just for reference based off this dataset. 
They probably need to be tuned to each specific dataset to appropriately limit the CMVs. The printout shows the ones that failed the QC." + ], + "id": "c379e843cd5efa35" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:41.683230Z", + "start_time": "2024-04-04T20:19:41.668975Z" + } + }, + "cell_type": "code", + "source": [ + "ngood_min = 200\n", + "rval_min = 0.85\n", + "bad_inds = []\n", + "for row in cmvs.itertuples():\n", + " if row.ngood < ngood_min:\n", + " bad_inds.append(row.Index)\n", + " elif row.r_corr < rval_min:\n", + " bad_inds.append(row.Index)\n", + " elif row.flag != 'GOOD':\n", + " bad_inds.append(row.Index)\n", + "print(cmvs.loc[bad_inds])\n", + "\n", + "cmvs = cmvs.drop(index=bad_inds)" + ], + "id": "642d3200bf9398f", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " cld_spd cld_dir_rad df_p95 ngood r_corr stderr_corr error_index flag\n", + "2013-09-08 12:00:00+00:00 21.164760 0.558943 916.882816 90 0.873551 1.707730 0.241034 GOOD\n", + "2013-09-09 16:00:00+00:00 17.422306 0.081691 215.279455 491 0.806969 0.365211 0.133241 GOOD\n", + "2013-09-11 08:00:00+00:00 15.618439 1.117748 398.662215 760 0.825436 0.287277 0.173498 GOOD\n", + "2013-09-09 09:00:00+00:00 14.288631 5.987103 1088.490857 75 0.823863 1.168389 0.201482 GOOD\n", + "2013-09-10 13:00:00+00:00 31.678545 1.045541 675.928391 456 0.804433 0.710315 0.161430 GOOD\n" + ] + } + ], + "execution_count": 6 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Decide which CMVs make a representative set for Field Analysis\n", + "The field analysis requires vectors that are roughly perpendicular. So we will try to downselect to a set of CMVs that will give us a representative scatter across all directions. Since parallel and anti-parallel vectors are both disadvantageous here, we will rotate them all by 180 degrees. The function `cmv.optimum_subset` is capable of performing this optimization. 
For a real dataset, we might want to choose to keep around 10, but here we'll use 5 just to show the downselection process." + ], + "id": "c9d955d991ac2710" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:41.697237Z", + "start_time": "2024-04-04T20:19:41.683230Z" + } + }, + "cell_type": "code", + "source": [ + "nfinal = 5\n", + "# Compute the x and y components of the CMVs for optimum_subset\n", + "vx, vy = spatial.pol2rect(cmvs.cld_spd, cmvs.cld_dir_rad)\n", + "indices = cmv.optimum_subset(vx, vy, n=nfinal)\n", + "print(cmvs.iloc[indices])" + ], + "id": "2a76f4d6f99c3da", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " cld_spd cld_dir_rad df_p95 ngood r_corr stderr_corr error_index flag\n", + "2013-09-10 16:00:00+00:00 24.613327 0.378690 294.552469 578 0.960366 0.265352 0.146015 GOOD\n", + "2013-09-08 07:00:00+00:00 18.609486 0.871618 642.030344 797 0.992215 0.084133 0.081098 GOOD\n", + "2013-09-08 09:00:00+00:00 18.401612 1.539226 1044.880005 689 0.970706 0.161717 0.092783 GOOD\n", + "2013-09-09 12:00:00+00:00 8.103066 5.511727 896.326129 623 0.966734 0.078114 0.110935 GOOD\n", + "2013-09-09 10:00:00+00:00 12.315270 5.789374 1105.664569 467 0.951945 0.197357 0.175704 GOOD\n" + ] + } + ], + "execution_count": 7 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Visualizing the result\n", + "Finally, we have a set of CMVs that are of high quality and represent a broad range of directions so that we are likely to have a good sampling of nearly perpendicular vectors that can be of use for the field analysis methodology. We'll plot the combination of vectors that passed QC and those that were selected for field analysis just to visualize that process." 
+ ], + "id": "4e833173e929bceb" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-04T20:19:41.948803Z", + "start_time": "2024-04-04T20:19:41.697237Z" + } + }, + "cell_type": "code", + "source": [ + "# Plot the vectors visually\n", + "plt.figure()\n", + "for i, (dx, dy) in enumerate(zip(vx, vy)):\n", + " mylabel = 'All CMVs' if i == 0 else '_nolegend_'\n", + " plt.arrow(0, 0, dx, dy, head_width=1, head_length=1, fc='k', label=mylabel)\n", + "for i, (dx, dy) in enumerate(zip(vx.iloc[indices], vy.iloc[indices])):\n", + " mylabel = 'Selected CMVs' if i == 0 else '_nolegend_'\n", + " plt.arrow(0, 0, dx, dy, head_width=1, head_length=1, fc='r', ec='r', label=mylabel)\n", + "plt.xlabel('Eastward Velocity (m/s)')\n", + "plt.ylabel('Northward Velocity (m/s)')\n", + "plt.axis('equal')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ], + "id": "6369160a17047a45", + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 8 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demos/automate_cmv_demo.py b/demos/automate_cmv_demo.py index d8d1e00..56b2995 100644 --- a/demos/automate_cmv_demo.py +++ b/demos/automate_cmv_demo.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np import matplotlib.pyplot as plt +import pvlib from solartoolbox import stats, spatial, cmv @@ -21,12 +22,18 @@ # # LOAD DATA # # ############# -# Load 3 days of data from the HOPE campaign. +# Load 4 days of data from the HOPE campaign. fn = 'data/hope_melpitz_10s.h5' pos = pd.read_hdf(fn, mode="r", key="latlon") pos_utm = spatial.latlon2utm(pos['lat'], pos['lon']) ts = pd.read_hdf(fn, mode="r", key="data") +# We will need the clear sky index for the CMV calculation, so let's compute it +# now. +loc = pvlib.location.Location(np.mean(pos['lat']), np.mean(pos['lon'])) +cs_ghi = loc.get_clearsky(ts.index, model='simplified_solis')['ghi'] +kt = ts.divide(cs_ghi, axis=0) + # ####################### # # FIND VARIABLE HOURS # # ####################### @@ -53,27 +60,25 @@ # Now we'll compute the CMVs for each of those 20 hours. 
# Build a holder for the output data -cmvs = pd.DataFrame(columns=["cld_spd", "cld_dir_rad", "df_p95", "ngood", "rval", "stderr", "error_index"]) +cmvs = pd.DataFrame(columns=["cld_spd", "cld_dir_rad", "df_p95", "ngood", "r_corr", "stderr_corr", "error_index", "flag"]) cmvs_flags = [] # Loop over hours for date in vs.index: # Select the subset of data hour = pd.date_range(date, date + pd.to_timedelta('1h'), freq='10s') + kt_hour = kt.loc[hour] hour = ts.loc[hour] - # Normalize it to yield something like kt (could actually calculate - # clearsky if available. Here, we're just using quantiles) - hourlymax = np.mean(hour.quantile(0.95)) - kt = hour / hourlymax + hourlymax = kt_hour.quantile(0.95).mean() # Compute the CMV using the Jamaly method - cld_spd, cld_dir, dat = cmv.compute_cmv(kt, pos_utm, method='jamaly', options={'minvelocity': 1}) + cld_spd, cld_dir, dat = cmv.compute_cmv(kt_hour, pos_utm, method='jamaly', options={'minvelocity': 1}) # Store the global flag cmvs_flags.append(dat.flag) - cmvs.loc[date] = [cld_spd, cld_dir, hourlymax, dat.method_data['ngood'], dat.method_data['r_corr'], dat.method_data['stderr_corr'], np.abs(dat.method_data["error_index"])] + cmvs.loc[date] = [cld_spd, cld_dir, hourlymax, dat.method_data['ngood'], dat.method_data['r_corr'], dat.method_data['stderr_corr'], np.abs(dat.method_data["error_index"]), dat.flag.name] # Display the 20 CMVs we just acquired pd.options.display.max_columns = None @@ -93,9 +98,9 @@ for row in cmvs.itertuples(): if row.ngood < ngood_min: bad_inds.append(row.Index) - elif row.rval < rval_min: + elif row.r_corr < rval_min: bad_inds.append(row.Index) - elif cmvs_flags[row.index(row.Index)] is not None: + elif cmvs_flags[row.index(row.Index)] is not cmv.Flag.GOOD: bad_inds.append(row.Index) # Drop the bad ones from the DF