
YAML Specification for Elm Pipeline


This page collects notes on the goals of the YAML spec for Elm Pipeline: a text format for serializing a Pipeline's steps.

The restructuring of how scikit-learn interacts with dask and xarray will help us use YAML (text) specifications:

from elm.pipeline.steps import decomposition
from elm.mldataset.spec_mixins import SpecMixinBaseEstimator

class PCA(decomposition.PCA, SpecMixinBaseEstimator):
    pass

p = PCA()

p.spec
# {'module': 'elm.pipeline.steps.decomposition',
#  'name': 'PCA',
#  'params': {'copy': True,
#   'iterated_power': 'auto',
#   'n_components': None,
#   'random_state': None,
#   'svd_solver': 'auto',
#   'tol': 0.0,
#   'whiten': False}}

p.from_spec(p.spec)
# PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
#     svd_solver='auto', tol=0.0, whiten=False)
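
Since `p.spec` is a plain dict of built-in types, dumping it to YAML text (and reading it back) is straightforward. A minimal sketch using PyYAML; the round trip through `from_spec` assumes the mixin accepts the loaded dict exactly as shown above:

import yaml

# Dump the spec dict to YAML text ...
text = yaml.safe_dump(p.spec, default_flow_style=False)
print(text)
# module: elm.pipeline.steps.decomposition
# name: PCA
# params:
#   copy: true
#   iterated_power: auto
#   ...

# ... and reconstruct the estimator from the parsed text.
spec = yaml.safe_load(text)
p2 = p.from_spec(spec)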

Below is the spec from a Pipeline of 9 steps (more work is needed to serialize each nested step into its own spec dict, as was shown above for PCA):

{'module': 'elm.pipeline.steps.pipeline',
 'name': 'Pipeline',
 'params': {'_': None,
  'choose': ChooseBands(include_normed_diffs=True, layers=None),
  'choose__include_normed_diffs': True,
  'choose__layers': None,
  'drop_na': DropRows(),
  'est': MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
          init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
          n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
          verbose=0),
  'est__batch_size': 100,
  'est__compute_labels': True,
  'est__init': 'k-means++',
  'est__init_size': None,
  'est__max_iter': 100,
  'est__max_no_improvement': 10,
  'est__n_clusters': 8,
  'est__n_init': 3,
  'est__random_state': None,
  'est__reassignment_ratio': 0.01,
  'est__tol': 0.0,
  'est__verbose': 0,
  'memory': None,
  'normed_diffs': NormedDiffs(),
  'pca': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False),
  'pca__copy': True,
  'pca__iterated_power': 'auto',
  'pca__n_components': 5,
  'pca__random_state': None,
  'pca__svd_solver': 'auto',
  'pca__tol': 0.0,
  'pca__whiten': False,
  'radiance': Radiance(),
  'sampler': Sampler(layer_specs=None, sample_args=None),
  'sampler__layer_specs': None,
  'sampler__sample_args': None,
  'set_nans': SetNan(),
  'standard': StandardScaler(copy=True, with_mean=True, with_std=True),
  'standard__copy': True,
  'standard__with_mean': True,
  'standard__with_std': True,
  'steps': [('sampler', Sampler(layer_specs=None, sample_args=None)),
   ('set_nans', SetNan()),
   ('radiance', Radiance()),
   ('normed_diffs', NormedDiffs()),
   ('choose', ChooseBands(include_normed_diffs=True, layers=None)),
   ('drop_na', DropRows()),
   ('_', None),
   ('standard', StandardScaler(copy=True, with_mean=True, with_std=True)),
   ('pca',
    PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)),
   ('est',
    MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
            init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
            n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
            verbose=0))]}}
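
One way to finish that per-step serialization is to recurse: wherever a value in `params` is itself an estimator, replace it with that estimator's `.spec`. A rough sketch, assuming every step mixes in `SpecMixinBaseEstimator` so each exposes `.spec`; the helper name `to_full_spec` is hypothetical:

from sklearn.base import BaseEstimator

def to_full_spec(spec):
    """Hypothetical helper: recursively replace estimator values in
    a spec's params with those estimators' own spec dicts."""
    params = {}
    for key, value in spec['params'].items():
        if isinstance(value, BaseEstimator):
            params[key] = to_full_spec(value.spec)
        elif isinstance(value, list):
            # The 'steps' list holds (name, estimator) pairs; a step
            # may be None, as with the placeholder ('_', None) above.
            params[key] = [(name, to_full_spec(est.spec)
                            if isinstance(est, BaseEstimator) else est)
                           for name, est in value]
        else:
            params[key] = value
    return {'module': spec['module'], 'name': spec['name'], 'params': params}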

Cross validation (below) is a work in progress when mixing xarray and numpy transformation steps:

# Assumes EaSearchCV is importable from elm.model_selection
from elm.model_selection import EaSearchCV

ea = EaSearchCV(pipe,
                param_distributions=param_distributions,
                ngen=2,
                model_selection=model_selection,
                cv=5)
Xt, y = ea.fit(SAMPLE)
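
For context, `param_distributions` maps double-underscore parameter names from the Pipeline spec above to candidate values, in the usual scikit-learn search style. The specific choices below are hypothetical:

param_distributions = {
    'pca__n_components': [2, 5, 10],        # hypothetical candidates
    'est__n_clusters': list(range(4, 12)),  # hypothetical candidates
}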