Using xarray_filters + Elm + scikit-learn

Working with Elm + Dask + Scikit-learn

This page walks through a script that runs Landsat data through a Pipeline of transformations and discusses the current status of Elm with respect to xarray + scikit-learn + dask interactions:

Imports

from collections import OrderedDict
from functools import partial
import os
import re
from urllib.request import urlopen

from distributed import Client
import numpy as np
import pandas as pd
import requests
import xarray as xr

from dask_glm.datasets import make_regression
from earthio import load_array, LayerSpec
from earthio.landsat_util import landsat_metadata
from earthio.s3_landsat_util import SceneDownloader
from elm.mldataset.cv_cache import CVCacheSampleId, cv_split
from elm.model_selection import EaSearchCV
from elm.pipeline import steps
from elm.pipeline.pipeline import Pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from xarray_filters import MLDataset
from xarray_filters.datasets import _make_base
from xarray_filters.pipeline import Generic, Step
from xarray_filters.pipe_utils import data_vars_func, for_each_array

LayerSpec - Controlling which variables to read

LayerSpec controls which files are read and at what resolution. Note that we have addressed a naming issue: "bands" throughout the code became "layers".

LAYER_SPECS = [LayerSpec(search_key='name',
                         search_value='B{}.TIF'.format(layer),
                         name='layer_{}'.format(layer),
                         buf_xsize=800,
                         buf_ysize=800) for layer in range(1, 8)]
SAMPLE = dict(row=33, path=15, months=tuple(range(1,13)))
NORMALIZED_DIFFS = ('nbr', 'ndsi', 'ndwi', 'ndvi')
DEFAULT_LAYERS = [layer_spec.name for layer_spec in LAYER_SPECS]
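
As a quick sanity check, the file-suffix-to-layer-name mapping can be printed straight from LAYER_SPECS without reading any files (this assumes LayerSpec exposes its constructor arguments as attributes, in the usual namedtuple style):

for spec in LAYER_SPECS:
    print(spec.search_value, '->', spec.name)
# B1.TIF -> layer_1
# ...
# B7.TIF -> layer_7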

Step from xarray_filters

Step is a new class for making a parameterizable, scikit-learn-compatible step of a Pipeline.

Here's a Step that is actually a sampler from files. This is one way in which cross validation in Elm will differ from what people are used to in scikit-learn. Typically one assembles a large matrix and cross validates over subsets of its rows, but we are starting to allow cross validation over file name groupings, or over any other argument that determines how inputs are formed.

class Sampler(Step):
    sample_args = None
    layer_specs = None
    def transform(self, sample_args, y=None, **kw):
        print('sample_args', sample_args, kw, self.get_params())
        p = self.get_params()
        s3_landsat = SceneDownloader()
        clear_image = s3_landsat.lowest_cloud_cover_image(**sample_args)
        download_url = clear_image.download_url.values[0]
        layer_specs = p['layer_specs']
        if layer_specs is None:
            layer_specs = LAYER_SPECS
        local_files = s3_landsat.download_all_layers(download_url)
        this_sample_dir = os.path.dirname(local_files[0])
        X = load_array(this_sample_dir, layer_specs=layer_specs)
        meta_file = [f for f in local_files if f.endswith('.txt')][0]
        X.attrs.update(vars(landsat_metadata(meta_file)))
        return X
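
A minimal usage sketch, assuming the AWS-hosted Landsat archive is reachable and that SAMPLE's row/path/months match at least one scene (downloading a full scene takes a while):

sampler = Sampler(layer_specs=LAYER_SPECS)
X = sampler.transform(SAMPLE)
print(list(X.data_vars))  # expected: ['layer_1', ..., 'layer_7']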

Step allows any function to be parameterized for xarray data structures

The following allows radiance or reflectance to be set as a hyperparameter:

def to_radiance_or_reflectance(arr, attrs=None, to='REFLECTANCE'):
    num = arr.name.split('_')[-1]
    add = attrs.get('{}_ADD_BAND_{}'.format(to, num))
    mult = attrs.get('{}_MULT_BAND_{}'.format(to, num))
    arr.values[:] = arr.values * mult + add
    return arr
class Radiance(Generic):
    def transform(self, X, y=None, **kw):
        return MLDataset(OrderedDict((k, to_radiance_or_reflectance(arr, attrs=X.attrs))
                          for k, arr in X.data_vars.items()))
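
The conversion is a per-band linear rescaling, values * mult + add, with coefficients looked up from the scene metadata. A toy check with made-up coefficients (the attribute names follow the Landsat MTL convention surfaced by landsat_metadata):

arr = xr.DataArray(np.array([100., 200.]), name='layer_1')
attrs = {'REFLECTANCE_MULT_BAND_1': 2e-05, 'REFLECTANCE_ADD_BAND_1': -0.1}
print(to_radiance_or_reflectance(arr, attrs=attrs).values)  # [-0.098 -0.096]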

for_each_array

See the xarray_filters/notebooks directory for more info on this decorator - shorthand for applying a function to every array in a dataset.

@for_each_array
def set_nans(arr):
    arr.values = arr.values.astype(np.float32)
    arr.values[arr.values <= 1] = np.NaN
    # 16-bit data cannot exceed 2 ** 16 - 1, so mask that saturation value
    arr.values[arr.values == 2 ** 16 - 1] = np.NaN
    return arr

class SetNan(Step):
    def transform(self, X, y=None, **kw):
        X2 = set_nans(X)
        X2.attrs.update(X.attrs)
        return X2
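
A toy example, assuming an MLDataset can be built from a dict of DataArrays just like an xarray.Dataset. Values <= 1 are treated as no-data and 2 ** 16 - 1 as the 16-bit saturation value:

toy = MLDataset({'layer_1': xr.DataArray(
    np.array([[0, 5], [2 ** 16 - 1, 9]], dtype=np.uint16), dims=('y', 'x'))})
print(SetNan().transform(toy).layer_1.values)
# [[nan  5.]
#  [nan  9.]]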

Operations on >=1 layer - data_vars_func

See the xarray_filters/notebooks directory for more info on this decorator - shorthand for a function that receives all of a dataset's arrays as keyword arguments and returns a new dataset.

def normed_diff(a, b):
    return (a - b) / (a + b)

@data_vars_func
def normalized_diffs(**dset):
    print('Called with ', dset.keys())
    dset['ndwi'] = normed_diff(dset['layer_4'], dset['layer_5'])
    dset['ndvi'] = normed_diff(dset['layer_5'], dset['layer_4'])
    dset['ndsi'] = normed_diff(dset['layer_2'], dset['layer_6'])
    dset['nbr']  = normed_diff(dset['layer_4'], dset['layer_7'])
    return dset

class NormedDiffs(Step):
    def transform(self, X, y=None, **kw):
        return normalized_diffs(**X.data_vars)
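
Every normalized difference above is just (a - b) / (a + b) on a pair of layers. A sketch on random layers, assuming data_vars_func converts the returned dict of arrays back into an MLDataset:

rand_arr = lambda: xr.DataArray(np.random.uniform(1, 2, (4, 4)), dims=('y', 'x'))
toy = MLDataset(OrderedDict(('layer_{}'.format(i), rand_arr())
                            for i in (2, 4, 5, 6, 7)))
out = NormedDiffs().transform(toy)
print(list(out.data_vars))  # the input layers plus ndwi, ndvi, ndsi, nbr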

Parameterize which features are included

Note that scikit-learn and related tools wrapped in elm.pipeline.steps allow feature selection, such as removing a constant or near-constant column. Using the logic below, it is also possible to do feature selection earlier in the Pipeline, thereby avoiding unnecessary file loading operations:

class ChooseBands(Step):
    include_normed_diffs = None
    layers = None
    def transform(self, X, y=None, **kw):
        p = self.get_params()
        new = OrderedDict()
        layers = p.get('layers')
        if layers is None:
            layers = DEFAULT_LAYERS
        include_normed_diffs = p.get('include_normed_diffs')
        for layer in layers:
            data_arr = getattr(X, layer)
            new[layer] = data_arr
        if include_normed_diffs:
            for diff in NORMALIZED_DIFFS:
                new[diff] = getattr(X, diff)
        return MLDataset(new)
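
Because Step follows the scikit-learn get_params / set_params conventions, the class-level attributes above become tunable hyperparameters, addressable inside a Pipeline as 'choose__layers' and 'choose__include_normed_diffs':

choose = ChooseBands(layers=['layer_1', 'layer_2'], include_normed_diffs=False)
print(choose.get_params())
# {'include_normed_diffs': False, 'layers': ['layer_1', 'layer_2']}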

Dropping NaNs from image perimeters

class DropRows(Step):
    def transform(self, X, y=None, **kw):
        if 'features' not in X.data_vars:
            X = X.to_features()
        features = X.features.dropna('space', how='any')
        return MLDataset(OrderedDict([('features', features)]))
    fit = transform
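
For reference, a sketch of what DropRows does, reusing toy from the SetNan example above; this assumes MLDataset.to_features() ravels each 2-D layer into a column of a single 'features' DataArray with a flattened 'space' dimension, so dropna('space', how='any') removes any pixel that is NaN in any layer:

Xf = DropRows().transform(toy)
print(Xf.features.shape)  # rows with a NaN in any layer are gone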

Parameterization

The following is an acceptable hyperparameter dict for the Pipeline:

param_distributions = {'est__n_clusters': list(range(8, 12)),
                       'choose__include_normed_diffs': [True, False],
                       'pca__n_components': list(range(5, 12))}
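
Each key names a Pipeline step plus a double underscore plus a parameter, exactly as in scikit-learn, so a single sampled candidate can be applied by hand to the pipe object constructed in the next section:

pipe.set_params(est__n_clusters=10,
                choose__include_normed_diffs=False,
                pca__n_components=7)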

NSGA-2 Control

This dictionary controls the NSGA-2 evolutionary search (the method names follow DEAP conventions):

model_selection = {
    'select_method': 'selNSGA2',
    'crossover_method': 'cxTwoPoint',
    'mutate_method': 'mutUniformInt',
    'init_pop': 'random',
    'indpb': 0.5,
    'mutpb': 0.9,
    'cxpb':  0.3,
    'eta':   20,
    'ngen':  2,
    'mu':    16,
    'k':     8, # TODO ensure that k is not ignored - make elm issue if it is
    'early_stop': None,
}

Pipeline for xarray + scikit-learn

sampler = Sampler()
pipe = Pipeline([('sampler', sampler),
                  ('set_nans', SetNan()),
                  ('radiance', Radiance()),
                  ('normed_diffs', NormedDiffs()),
                  ('choose', ChooseBands(include_normed_diffs=True)),
                  ('drop_na', DropRows()),
                  ('standard', steps.preprocessing.StandardScaler()),
                  ('pca', steps.decomposition.PCA(n_components=5)),
                  ('est', steps.cluster.MiniBatchKMeans())])

pipe.fit(SAMPLE)

Next steps (2 to 4 weeks)

elm.model_selection.EaSearchCV works to an alpha level with numpy arrays, but has cross validation problems when given xarray data structures.

ea = EaSearchCV(pipe,
                param_distributions=param_distributions,
                ngen=2,
                model_selection=model_selection,
                cv=5)
ea.fit(SAMPLE)
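
After fit completes, results should be available through the usual scikit-learn SearchCV attributes, assuming EaSearchCV keeps those conventions:

print(ea.best_params_)
print(pd.DataFrame(ea.cv_results_).head())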