Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fixes to nldas_soil_moisture_ml.py and related scripts #236

Open
wants to merge 54 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
55959a5
cross validation of MLDataset Pipeline
Oct 24, 2017
396f9aa
changes with CV sampling
Oct 26, 2017
33bac56
changes to cv_cache
Oct 26, 2017
b422e68
closer to working cross validation for MLDataset
Oct 26, 2017
d45d4e1
CV / xarray experimentation - work in progress
Oct 31, 2017
92054c9
MLDataset cross validation working for pipeline of 1 step that is uns…
Nov 1, 2017
35450c1
wrapped sklearn classes need to wrap score methods as fit, predict, o…
Nov 1, 2017
f86a079
update tests;fix cross validation with most data structures
Nov 3, 2017
5cf646f
a couple tests for Python 2.7
Nov 3, 2017
744109a
avoid dask-searchcv test in conda.recipe;add test_config.yml to MANIF…
Nov 3, 2017
1e7bec8
remove print statement
Nov 3, 2017
83437f5
ensure test_config.yaml included in pkg
Nov 3, 2017
de9efd0
remove elm.mldataset.cross_validation - modify environment.yml for el…
Nov 3, 2017
6267041
fix usage of is_arr utility to separate X, y tuple
Nov 3, 2017
66013e6
1850 passing tests
Nov 4, 2017
a91caf6
dask-searchcv in meta.yaml
Nov 4, 2017
e9b5d85
use elm/label/dev and elm for CI installs
Nov 4, 2017
f6ef7c8
change earthio version for fixing CI build
Nov 4, 2017
948efe5
ensure EARTHIO_CHANNEL_STR is set correctly in .travis.yml
Nov 6, 2017
edbe1f5
ensure ANACONDA_UPLOAD_USER is defined in .travis for pkg upload
Nov 6, 2017
6304e37
change order of channels to ensure dask-searchcv comes from elm
Nov 6, 2017
8a6d46f
subset the number of tests being run in CI
Nov 6, 2017
21a18d9
better diagnostics on upload failure in CI
Nov 6, 2017
8ad7b4c
remove earthio from CI
Nov 6, 2017
9a1734d
be sure to create env from elm's conda build output
Nov 6, 2017
dc47f65
remove diagnostic print from deploy section
Nov 6, 2017
00ea1be
refactor to simplify changes in dask-searchcv
Nov 8, 2017
7d81830
fix pep8 issues
Nov 8, 2017
cca7b36
move some of dask-searchcv PR 61 changes to Elm
Nov 8, 2017
5018e3e
add cross_validation.py - remove commented code
Nov 8, 2017
acdf244
remove extra whitespace
Nov 8, 2017
431b1aa
changes to avoid needing changes in dask-searchcv
Nov 9, 2017
3f78207
space between functions
Nov 9, 2017
589762c
changes for dask-searchcv PR 65 refit changes
Nov 30, 2017
7be6c74
get rid of unicode literals
Dec 2, 2017
82547dd
Merge branch 'fix_data_expl_notebook' of https://github.com/Continuum…
Dec 2, 2017
8111b40
merge Greg's changes and PR 228
Dec 2, 2017
0b4681a
move Elm-Earthio-NLDAS commit 88047abc80684d0ea0c9d831b7887da082b69c84
Dec 2, 2017
3d2e64e
fixes for reading forcing data and ML ideas in NLDAS notebooks
Dec 2, 2017
e8b1299
fixes related to function signatures in py 2.7
Dec 2, 2017
7596431
updates for Python 2.7 - fixes to NLDAS scripts
Dec 6, 2017
56360e0
resolve merge conflicts
Dec 6, 2017
c6669fe
get changes to notebook from PR 232 228
Dec 6, 2017
f6a8338
py2.7 fixes
Dec 7, 2017
17ad50c
fixes to NLDAS examples
Dec 7, 2017
264af45
nldas example fixes
Dec 7, 2017
d759ee7
fixes to soil readers
Dec 7, 2017
8526313
fixes to nldas example
Dec 7, 2017
3d28a30
fixes to soil phys,chem data
Dec 7, 2017
ad7c1b4
fixes to nldas example with soil physical data
Dec 8, 2017
8405d41
fixes to nldas examples with soil moisture ML
Dec 9, 2017
4dbd4a5
fixes to nldas examples with soil moisture ML
Dec 9, 2017
d42b36b
commit just to test out Travis CI config
Dec 12, 2017
3301542
agu poster
Dec 18, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fixes to nldas example with soil physical data
Peter Steinberg committed Dec 8, 2017
commit ad7c1b4e1b356ff777843c0a14a6a39186d137c0
6 changes: 3 additions & 3 deletions examples/changing_structure.py
Original file line number Diff line number Diff line change
@@ -3,13 +3,13 @@
class ChooseWithPreproc(Step):

estimator = None
trans_if = None
use_transform = None
run = True

def _pre_trans(self, X):
X, y = X
if self.trans_if:
return self.trans_if(X, y=y)
if self.use_transform:
return self.use_transform(X, y=y)
return X

def transform(self, X, y=None, **kw):
13 changes: 5 additions & 8 deletions examples/nldas_soil_features.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import print_function, division
from collections import OrderedDict
import numpy as np
import xarray as xr

from read_nldas_soils import SOIL_META, read_nldas_soils
from xarray_filters import MLDataset

_endswith = lambda x, end: x.endswith('_{}'.format(end))

@@ -30,19 +32,14 @@ def _avg_cos_hyd_params(soils_dset, attrs=None):


def flatten_horizons(soils_dset, attrs=None):
arrs = {}
arrs = OrderedDict()
attrs = attrs or soils_dset.attrs.copy()
for k, v in soils_dset.data_vars.items():
if 'horizon' in v.dims:
which_dim = v.dims.index('horizon')
for idx, horizon in enumerate(v.horizon):
slc = (slice(None),) * 3
array_label = '{}_{}'.format(k, horizon)
arrs[array_label] = v[slc]
arrs[array_label].attrs['horizon'] = horizon
arrs[k] = v.mean(dim='horizon')
else:
arrs[k] = v
return xr.Dataset(arrs, attrs=attrs)
return MLDataset(arrs, attrs=attrs)


def nldas_soil_features(soils_dset=None,
125 changes: 71 additions & 54 deletions examples/nldas_soil_moisture_ml.py
Original file line number Diff line number Diff line change
@@ -40,6 +40,9 @@

START_DATE = datetime.datetime(2000, 1, 1, 1, 0, 0)

print('nldas_soil_features')
SOIL_PHYS_CHEM = nldas_soil_features().to_features()
print('post_features')
ONE_HR = datetime.timedelta(hours=1)
TIME_OPERATIONS = ('mean',
'std',
@@ -51,20 +54,28 @@

np.random.seed(42) # TODO remove

def log_trans_only_positive(self, X, y=None, **kw):
Xnew = OrderedDict()
for j in range(X.features.shape[1]):
minn = X.features[:, j].min().values
if minn <= 0:
continue
X.features.values[:, j] = np.log10(X.features.values[:, j])
return X, y
class LogOnlyPositive(Step):
use_transform = False
def transform(self, X, y=None, **kw):
print('LOP,', X, y)
X, y = X
assert y is not None
if not self.get_params()['use_transform']:
return X, y
for j in range(X.features.shape[1]):
minn = X.features[:, j].min().values
if minn <= 0:
continue
X.features.values[:, j] = np.log10(X.features.values[:, j])
return X, y
fit_transform = transform


class Flatten(Step):

def transform(self, X, y=None, **kw):
return X.to_features(), y
feat = X.to_features().features.dropna(dim='space', how='any')
return MLDataset(OrderedDict([('features', feat)]), attrs=X.attrs)

fit_transform = transform

@@ -88,36 +99,22 @@ def transform(self, X, y=None, **kw):
SOIL_PHYS_CHEM = {}
class AddSoilPhysicalChemical(Step):
add = True
soils_dset = None
to_raster = True
avg_cos_hyd_params = False

def transform(self, X, y=None, **kw):
global SOIL_PHYS_CHEM
params = self.get_params().copy()
if not params.pop('add'):
return X
hsh = hash(repr(params))
if hsh in SOIL_PHYS_CHEM:
soils = SOIL_PHYS_CHEM[hsh]
else:
soils = nldas_soil_features(**params)
soils = MLDataset(soils).to_features()
if len(SOIL_PHYS_CHEM) < 3:
SOIL_PHYS_CHEM[hsh] = soils
return X[0].concat_ml_features()
soils = SOIL_PHYS_CHEM.copy()
return X.concat_ml_features()

fit_transform = transform

SCALERS = [preprocessing.StandardScaler()] + [preprocessing.MinMaxScaler()] * 10
np.random.shuffle(SCALERS)
param_distributions = {
'log__kw_args': [dict(trans_if=log_trans_only_positive),
dict(trans_if=None)],
'log__use_transform': [True, False],
'scaler__feature_range': [(x, x * 2) for x in np.linspace(0, 1, 10)],
'pca__n_components': [6, 7, 8, 10, 14, 18],
'pca__estimator': [decomposition.PCA(),
decomposition.FastICA(),],
'pca__estimator__n_components': [6, 7, 8, 10, 14, 18],
'pca__estimator': [decomposition.PCA(),],
#decomposition.FastICA(),],
#decomposition.KernelPCA()],
'pca__run': [True, True, False],
'time__hours_back': [1],#list(np.linspace(1, DEFAULT_MAX_STEPS, 12).astype(np.int32)),
@@ -141,7 +138,7 @@ def transform(self, X, y=None, **kw):
'ngen': 2,
'mu': 16,
'k': 8, # TODO ensure that k is not ignored - make elm issue if it is
'early_stop': None
'early_stop': None,
}

def get_file_name(tag, date):
@@ -157,10 +154,11 @@ def dump(obj, tag, date):
class Sampler(Step):
date = None
def transform(self, dates, y=None, **kw):
print('Sampler Called')
dsets = [slice_nldas_forcing_a(date, X_time_steps=max_time_steps)
for date in dates[:1]]
feats = [dset.to_features().features for dset in dsets]
return MLDataset(OrderedDict([('features', xr.concat(feats))]))
return MLDataset(OrderedDict([('features', xr.concat(feats, dim=feats[0].dims[1]))]))
fit_transform = transform


@@ -169,31 +167,50 @@ def transform(self, dates, y=None, **kw):
dates = np.array([START_DATE - datetime.timedelta(hours=hr)
for hr in range(max_time_steps)])

if __name__ == "__main__":

pipe = Pipeline([
('time', Differencing(layers=FEATURE_LAYERS)),
('flatten', Flatten()),
('soil_phys', AddSoilPhysicalChemical(soils_dset=read_nldas_soils())),
('get_y', GetY(SOIL_MOISTURE)),
('log', preprocessing.FunctionTransformer(func=log_trans_only_positive)),
('scaler', preprocessing.MinMaxScaler(feature_range=(1e-2, 1e-2 + 1))),
('pca', ChooseWithPreproc()),
('estimator', linear_model.LinearRegression(n_jobs=-1)),
])

ea = EaSearchCV(pipe,
n_iter=10,
param_distributions=param_distributions,
sampler=Sampler(),
ngen=NGEN,
model_selection=model_selection,
scheduler=None,
refit=True,
refit_Xy=Sampler().fit_transform([START_DATE]),
cv=KFold(3))
diff = Differencing(layers=FEATURE_LAYERS)
flat = Flatten()
soil_phys = AddSoilPhysicalChemical()
get_y = GetY(SOIL_MOISTURE)
pipe = Pipeline([
('time', diff),
('flatten', flat),
('soil_phys', soil_phys),
('scaler', preprocessing.MinMaxScaler(feature_range=(1e-2, 1e-2 + 1))),
('get_y', get_y),
('log', LogOnlyPositive(use_transform=True)),
('pca', decomposition.PCA()),
('estimator', linear_model.LinearRegression(n_jobs=-1)),
])

sampler = Sampler()
ea = EaSearchCV(pipe,
n_iter=4,
param_distributions=param_distributions,
sampler=sampler,
ngen=2,
model_selection=model_selection,
scheduler=None,
refit=True,
refit_Xy=sampler.fit_transform([START_DATE]),
cv=KFold(3))



def main():
print('Download')
download_data()
print('Downloaded')
print('Fit')
ea.fit(dates)
print('Done')
return ea


if __name__ == "__main__":
import warnings
with warnings.catch_warnings():
warnings.simplefilter('ignore')
ea = main()
'''
date += ONE_HR
current_file = get_file_name('fit_model', date)
10 changes: 6 additions & 4 deletions examples/read_nldas_forcing.py
Original file line number Diff line number Diff line change
@@ -116,17 +116,19 @@ def slice_nldas_forcing_a(date, X_time_steps=144, feature_layers=None, **kw):
def get_y(y_field, X, y=None, sample_weight=None, **kw):
'''Get the VIC Y column out of a flattened Dataset
of FORA and VIC DataArrays'''
y = X.features.sel(layer=y_field)
features = X.features.sel(layer=[x for x in X.features.layer.values
if x != y_field])
feat = X.features.dropna(dim='space', how='any')
y = feat.sel(layer=y_field)
features = feat.sel(layer=[x for x in feat.layer.values
if x != y_field])
X2 = MLDataset(OrderedDict([('features', features)]),
attrs=X.attrs)
print('X2', X2, type(y), getattr(y, 'size', y))
return X2, y


class GetY(Step):
column = SOIL_MOISTURE
def transform(self, X, y=None, **kw):
X, y = X
#X, y = X
return get_y(self.column, X, **self.get_params())

9 changes: 4 additions & 5 deletions examples/read_nldas_soils.py
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import xarray as xr
from xarray_filters import MLDataset
import yaml

SOIL_URL = 'https://ldas.gsfc.nasa.gov/nldas/NLDASsoils.php'
@@ -122,7 +123,7 @@ def read_binary_files(y, x, attrs=None, bin_files=None):
att.update(attrs.copy())
arrs[name_token] = xr.DataArray(arr, coords=coords,
dims=dims, attrs=att)
return xr.Dataset(arrs)
return MLDataset(arrs)


def read_ascii_groups(ascii_groups=None):
@@ -162,7 +163,7 @@ def read_ascii_groups(ascii_groups=None):
dsets.pop(k)
for v in dsets.values():
v.values[v.values == NO_DATA] = np.NaN
return xr.Dataset(dsets)
return MLDataset(dsets)


def read_nldas_soils(ascii_groups=None, bin_files=None):
@@ -173,13 +174,11 @@ def read_nldas_soils(ascii_groups=None, bin_files=None):
if not a in COS_HYD_FILES:
raise ValueError('ascii_groups contains {} not in {}'.format(a, set(COS_HYD_FILES)))
dset_ascii = read_ascii_groups(ascii_groups)
print('dset_ascii', dset_ascii)
example = tuple(dset_ascii.data_vars.keys())[0]
example = dset_ascii[example]
y, x, dims = example.y, example.x, example.dims
dset_bin = read_binary_files(y, x, bin_files=bin_files)
print('dset_bin', dset_bin)
return xr.merge((dset_bin, dset_ascii))
return MLDataset(xr.merge((dset_bin, dset_ascii)))


def download_data(session=None):