Using xarray_filters, Elm, and scikit-learn
This script uses LANDSAT data in a Pipeline of transformations and discusses the current status of Elm with respect to xarray + scikit-learn + dask interactions:
from collections import OrderedDict
from functools import partial
import os
import re
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
import xarray as xr
from dask_glm.datasets import make_regression
from distributed import Client
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

from earthio import load_array, LayerSpec
from earthio.landsat_util import landsat_metadata
from earthio.s3_landsat_util import SceneDownloader
from elm.mldataset.cv_cache import CVCacheSampleId, cv_split
from elm.model_selection import EaSearchCV
from elm.pipeline import steps
from elm.pipeline.pipeline import Pipeline
from xarray_filters import MLDataset
from xarray_filters.datasets import _make_base
from xarray_filters.pipeline import Generic, Step
from xarray_filters.pipe_utils import data_vars_func, for_each_array
LayerSpec controls which files to read and at what resolution. Note we have addressed a naming issue: "bands" throughout the code became "layers".
LAYER_SPECS = [LayerSpec(search_key='name',
                         search_value='B{}.TIF'.format(layer),
                         name='layer_{}'.format(layer),
                         buf_xsize=800,
                         buf_ysize=800)
               for layer in range(1, 8)]
SAMPLE = dict(row=33, path=15, months=tuple(range(1, 13)))
NORMALIZED_DIFFS = ('nbr', 'ndsi', 'ndwi', 'ndvi')
DEFAULT_LAYERS = [layer_spec.name for layer_spec in LAYER_SPECS]
Step is a new class for making a parameterizable, scikit-learn-compatible step of a Pipeline.
Here's a Step that is actually a sampler from files. This is one way in which cross validation in Elm will differ from how people are used to it in scikit-learn: typically one assembles a big matrix and then subsets it for cross validation, but we are starting to allow cross validation over file name groupings, or over any other argument that determines how inputs are formed (a sketch of that idea follows the Sampler definition below).
class Sampler(Step):
    sample_args = None
    layer_specs = None

    def transform(self, sample_args, y=None, **kw):
        print('sample_args', sample_args, kw, self.get_params())
        p = self.get_params()
        s3_landsat = SceneDownloader()
        # Find the scene with the lowest cloud cover for the given row / path / months
        clear_image = s3_landsat.lowest_cloud_cover_image(**sample_args)
        download_url = clear_image.download_url.values[0]
        layer_specs = p['layer_specs']
        if layer_specs is None:
            layer_specs = LAYER_SPECS  # fall back to the module-level default
        local_files = s3_landsat.download_all_layers(download_url)
        this_sample_dir = os.path.dirname(local_files[0])
        X = load_array(this_sample_dir, layer_specs=layer_specs)
        # Attach the parsed Landsat MTL metadata as attrs on the dataset
        meta_file = [f for f in local_files if f.endswith('.txt')][0]
        X.attrs.update(vars(landsat_metadata(meta_file)))
        return X
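As a rough sketch of cross validation over sampler arguments (hypothetical, not current Elm API): the "rows" being split can be the argument dicts themselves, so a fold holds out whole scenes or month groupings rather than rows of an assembled matrix. Here plain scikit-learn KFold is used just to illustrate the idea:

from sklearn.model_selection import KFold

# Hypothetical: one dict of sampler arguments per candidate sample
sampler_args = [dict(row=33, path=15, months=(m,)) for m in range(1, 13)]
for train_idx, test_idx in KFold(n_splits=4).split(sampler_args):
    train_args = [sampler_args[i] for i in train_idx]
    test_args = [sampler_args[i] for i in test_idx]
    # fit on samples formed from train_args; score on samples from test_args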
The following allows radiance or reflectance to be set as a hyperparameter:
def to_radiance_or_reflectance(arr, attrs=None, to='REFLECTANCE'):
    # Look up the gain / offset for this band from the Landsat MTL metadata,
    # e.g. REFLECTANCE_MULT_BAND_3 / REFLECTANCE_ADD_BAND_3 for layer_3
    num = arr.name.split('_')[-1]
    add = attrs.get('{}_ADD_BAND_{}'.format(to, num))
    mult = attrs.get('{}_MULT_BAND_{}'.format(to, num))
    arr.values[:] = arr.values * mult + add
    return arr


class Radiance(Generic):
    def transform(self, X, y=None, **kw):
        return MLDataset(OrderedDict((k, to_radiance_or_reflectance(arr, attrs=X.attrs))
                                     for k, arr in X.data_vars.items()))
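For example, on a synthetic layer (the gain/offset values below are made up; real values come from the Landsat MTL file parsed by landsat_metadata):

arr = xr.DataArray(np.ones((2, 2)) * 100., dims=('y', 'x'), name='layer_3')
# Hypothetical MTL-style gain / offset values for band 3
attrs = {'REFLECTANCE_MULT_BAND_3': 2e-05, 'REFLECTANCE_ADD_BAND_3': -0.1}
out = to_radiance_or_reflectance(arr, attrs=attrs, to='REFLECTANCE')
print(out.values[0, 0])  # 100 * 2e-05 + -0.1 == -0.098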
See the xarray_filters/notebooks directory for more info on the for_each_array decorator - shorthand for applying a function to every array in a dataset.
@for_each_array
def set_nans(arr):
    arr.values = arr.values.astype(np.float32)
    arr.values[arr.values <= 1] = np.NaN
    arr.values[arr.values == 2 ** 16] = np.NaN
    return arr


class SetNan(Step):
    def transform(self, X, y=None, **kw):
        X2 = set_nans(X)
        X2.attrs.update(X.attrs)
        return X2
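A quick check of set_nans on a tiny synthetic dataset, just to show that the decorated function applies per-array masking across the whole dataset:

vals = np.array([[0, 3], [2 ** 16, 5]], dtype=np.float64)
tiny = MLDataset(OrderedDict([('layer_1', xr.DataArray(vals, dims=('y', 'x')))]))
masked = set_nans(tiny)
print(masked.layer_1.values)  # [[nan, 3.], [nan, 5.]]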
Similarly, data_vars_func (also covered in the xarray_filters/notebooks directory) is shorthand for a function that takes a dataset's arrays as keyword arguments and returns a new dataset:
def normed_diff(a, b):
    return (a - b) / (a + b)


@data_vars_func
def normalized_diffs(**dset):
    print('Called with ', dset.keys())
    dset['ndwi'] = normed_diff(dset['layer_4'], dset['layer_5'])
    dset['ndvi'] = normed_diff(dset['layer_5'], dset['layer_4'])
    dset['ndsi'] = normed_diff(dset['layer_2'], dset['layer_6'])
    dset['nbr'] = normed_diff(dset['layer_4'], dset['layer_7'])
    return dset


class NormedDiffs(Step):
    def transform(self, X, y=None, **kw):
        return normalized_diffs(**X.data_vars)
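Since data_vars_func passes a dataset's arrays in as keyword arguments, normalized_diffs can be called directly on any dataset with the expected layer names, exactly as NormedDiffs.transform does above (synthetic data here):

arrs = OrderedDict([('layer_{}'.format(i),
                     xr.DataArray(np.random.uniform(1, 2, (4, 4)), dims=('y', 'x')))
                    for i in range(1, 8)])
with_diffs = normalized_diffs(**MLDataset(arrs).data_vars)
print(sorted(with_diffs.data_vars))  # layers 1-7 plus nbr, ndsi, ndvi, ndwi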
Note that scikit-learn and related tools wrapped in elm.pipeline.steps allow feature selection, such as removing a constant or near-constant column. It is also possible - using the logic below - to do feature selection earlier in the Pipeline, thereby avoiding unnecessary file loading:
class ChooseBands(Step):
    include_normed_diffs = None
    layers = None

    def transform(self, X, y=None, **kw):
        p = self.get_params()
        new = OrderedDict()
        layers = p.get('layers')
        if layers is None:
            layers = DEFAULT_LAYERS
        include_normed_diffs = p.get('include_normed_diffs')
        for layer in layers:
            data_arr = getattr(X, layer)
            new[layer] = data_arr
        if include_normed_diffs:
            for diff in NORMALIZED_DIFFS:
                new[diff] = getattr(X, diff)
        return MLDataset(new)
class DropRows(Step):
    def transform(self, X, y=None, **kw):
        if 'features' not in X.data_vars:
            X = X.to_features()
        features = X.features.dropna('space', how='any')
        return MLDataset(OrderedDict([('features', features)]))

    fit = transform
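In xarray_filters, to_features flattens a dataset's arrays into a single 2-D 'features' DataArray whose rows index a flattened 'space' dimension (one column per layer, though the exact column dimension name may vary by version), so dropping NaN rows is a single dropna call:

vals = np.random.uniform(0, 1, (3, 3))
vals[0, 0] = np.NaN
dset = MLDataset(OrderedDict([('layer_1', xr.DataArray(vals, dims=('y', 'x'))),
                              ('layer_2', xr.DataArray(vals * 2, dims=('y', 'x')))]))
feat = dset.to_features().features
print(feat.dims)   # ('space', <column dim>) - 9 rows before dropping
print(feat.dropna('space', how='any').shape)  # the one NaN row removed: 8 rows remain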
The following is an acceptable hyperparameter dict for the Pipeline:
param_distributions = {'est__n_clusters': list(range(8, 12)),
                       'choose__include_normed_diffs': [True, False],
                       'pca__n_components': list(range(5, 12))}
This dictionary controls the NSGA-2 evolutionary search; the method names refer to functions in deap.tools (selNSGA2, cxTwoPoint, mutUniformInt):
model_selection = {
    'select_method': 'selNSGA2',
    'crossover_method': 'cxTwoPoint',
    'mutate_method': 'mutUniformInt',
    'init_pop': 'random',
    'indpb': 0.5,
    'mutpb': 0.9,
    'cxpb': 0.3,
    'eta': 20,
    'ngen': 2,
    'mu': 16,
    'k': 8,   # TODO ensure that k is not ignored - make an elm issue if it is
    'early_stop': None,
}
sampler = Sampler()
pipe = Pipeline([('sampler', sampler),
                 ('set_nans', SetNan()),
                 ('radiance', Radiance()),
                 ('normed_diffs', NormedDiffs()),
                 ('choose', ChooseBands(include_normed_diffs=True)),
                 ('drop_na', DropRows()),
                 ('standard', steps.preprocessing.StandardScaler()),
                 ('pca', steps.decomposition.PCA(n_components=5)),
                 ('est', steps.cluster.MiniBatchKMeans())])
X = pipe.fit(SAMPLE)
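The step names above match the keys in param_distributions: the usual scikit-learn double-underscore convention applies, so 'est__n_clusters' addresses the n_clusters parameter of the step named 'est'. Assuming elm's Pipeline follows scikit-learn's set_params convention (which those names imply), the same keys work directly:

pipe.set_params(est__n_clusters=10, pca__n_components=7)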
elm.model_selection.EaSearchCV is working at an alpha level with numpy arrays but has cross validation problems when given xarray data structures.
ea = EaSearchCV(pipe,
                param_distributions=param_distributions,
                ngen=2,
                model_selection=model_selection,
                cv=5)
Xt, y = ea.fit(SAMPLE)
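After a successful fit, EaSearchCV is intended to expose the usual search-CV attributes, mirroring scikit-learn's RandomizedSearchCV (attribute availability at the current alpha level may vary):

print(ea.best_params_)               # best parameter set found by the evolutionary search
print(ea.cv_results_['params'][:2])  # parameter sets evaluated across generations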