# Yaml Specification for Elm Pipeline
This page collects notes on the goals of the YAML spec for Elm Pipeline: a text specification for serializing a Pipeline's steps.

The restructuring of scikit-learn's interactions with dask and xarray will help us support YAML (text) specifications:
```python
>>> from elm.pipeline.steps import decomposition
>>> from elm.mldataset.spec_mixins import SpecMixinBaseEstimator

>>> class PCA(decomposition.PCA, SpecMixinBaseEstimator):
...     pass

>>> p = PCA()
>>> p.spec
{'module': 'elm.pipeline.steps.decomposition',
 'name': 'PCA',
 'params': {'copy': True,
            'iterated_power': 'auto',
            'n_components': None,
            'random_state': None,
            'svd_solver': 'auto',
            'tol': 0.0,
            'whiten': False}}
>>> p.from_spec(p.spec)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
```
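Since `spec` is a plain dict of built-in types, it maps directly onto YAML. A minimal round-trip sketch, assuming PyYAML is installed (`yaml_text`, `loaded`, and `p2` are illustrative names, not part of elm):

```python
import yaml

# Dump the spec dict from the PCA example above to YAML text,
# then load it back and rebuild the estimator from it.
yaml_text = yaml.safe_dump(p.spec, default_flow_style=False)
print(yaml_text)

loaded = yaml.safe_load(yaml_text)
p2 = p.from_spec(loaded)
```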
Here is the spec from a Pipeline of 9 steps (more work is needed here to serialize each step individually, as shown above for PCA):
```python
{'module': 'elm.pipeline.steps.pipeline',
 'name': 'Pipeline',
 'params': {'_': None,
            'choose': ChooseBands(include_normed_diffs=True, layers=None),
            'choose__include_normed_diffs': True,
            'choose__layers': None,
            'drop_na': DropRows(),
            'est': MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                                   init_size=None, max_iter=100, max_no_improvement=10,
                                   n_clusters=8, n_init=3, random_state=None,
                                   reassignment_ratio=0.01, tol=0.0, verbose=0),
            'est__batch_size': 100,
            'est__compute_labels': True,
            'est__init': 'k-means++',
            'est__init_size': None,
            'est__max_iter': 100,
            'est__max_no_improvement': 10,
            'est__n_clusters': 8,
            'est__n_init': 3,
            'est__random_state': None,
            'est__reassignment_ratio': 0.01,
            'est__tol': 0.0,
            'est__verbose': 0,
            'memory': None,
            'normed_diffs': NormedDiffs(),
            'pca': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
                       svd_solver='auto', tol=0.0, whiten=False),
            'pca__copy': True,
            'pca__iterated_power': 'auto',
            'pca__n_components': 5,
            'pca__random_state': None,
            'pca__svd_solver': 'auto',
            'pca__tol': 0.0,
            'pca__whiten': False,
            'radiance': Radiance(),
            'sampler': Sampler(layer_specs=None, sample_args=None),
            'sampler__layer_specs': None,
            'sampler__sample_args': None,
            'set_nans': SetNan(),
            'standard': StandardScaler(copy=True, with_mean=True, with_std=True),
            'standard__copy': True,
            'standard__with_mean': True,
            'standard__with_std': True,
            'steps': [('sampler', Sampler(layer_specs=None, sample_args=None)),
                      ('set_nans', SetNan()),
                      ('radiance', Radiance()),
                      ('normed_diffs', NormedDiffs()),
                      ('choose', ChooseBands(include_normed_diffs=True, layers=None)),
                      ('drop_na', DropRows()),
                      ('_', None),
                      ('standard', StandardScaler(copy=True, with_mean=True, with_std=True)),
                      ('pca',
                       PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
                           svd_solver='auto', tol=0.0, whiten=False)),
                      ('est',
                       MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                                       init_size=None, max_iter=100, max_no_improvement=10,
                                       n_clusters=8, n_init=3, random_state=None,
                                       reassignment_ratio=0.01, tol=0.0, verbose=0))]}}
```
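One way the remaining per-step serialization could work is to recurse through `get_params`, replacing each nested estimator with its own spec dict. The helper below is a hypothetical sketch, not elm's current API; it assumes scikit-learn-style `get_params` and Pipeline-style `(name, estimator)` step lists:

```python
from sklearn.base import BaseEstimator

def to_spec(est):
    """Hypothetical helper: build a nested, YAML-friendly spec dict,
    giving each nested estimator (including Pipeline steps) its own spec."""
    params = {}
    for key, value in est.get_params(deep=False).items():
        if isinstance(value, BaseEstimator):
            value = to_spec(value)
        elif isinstance(value, list):
            # Handle Pipeline-style lists of (name, estimator) pairs;
            # other entries (e.g. the ('_', None) placeholder) pass through.
            value = [(item[0], to_spec(item[1]))
                     if isinstance(item, tuple) and isinstance(item[1], BaseEstimator)
                     else item
                     for item in value]
        params[key] = value
    return {'module': type(est).__module__,
            'name': type(est).__name__,
            'params': params}
```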
Cross-validation (below) is a work in progress when mixing xarray and numpy transformation steps:
```python
ea = EaSearchCV(pipe,
                param_distributions=param_distributions,
                ngen=2,
                model_selection=model_selection,
                cv=5)
Xt, y = ea.fit(SAMPLE)
```
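For reference, `param_distributions` above follows the scikit-learn `step__param` naming convention. A hypothetical search space for the Pipeline shown earlier (the keys match its `pca` and `est` step names; the distributions are only illustrative):

```python
from scipy.stats import randint, uniform

# Illustrative search space keyed by the Pipeline's step names;
# the ranges here are arbitrary examples, not recommendations.
param_distributions = {
    'pca__n_components': randint(2, 8),
    'est__n_clusters': randint(4, 12),
    'est__reassignment_ratio': uniform(0.0, 0.05),
}
```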