Skip to content

Commit

Permalink
Merge pull request #101 from astroswego/regressor-generalization
Browse files Browse the repository at this point in the history
Regressor generalization
  • Loading branch information
dwysocki authored Jun 14, 2016
2 parents f9065bf + 2c08caa commit b63d61f
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 56 deletions.
62 changes: 26 additions & 36 deletions src/plotypus/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import dill
from argparse import ArgumentError, ArgumentParser, SUPPRESS
from pandas import read_table
from sklearn.linear_model import (LassoCV, LassoLarsCV, LassoLarsIC,
LinearRegression, RidgeCV, ElasticNetCV)
import sklearn.linear_model
from sklearn.grid_search import GridSearchCV
from matplotlib import rc_params_from_file
from collections import ChainMap
Expand All @@ -20,8 +19,9 @@
plot_periodogram)
import plotypus
from plotypus.preprocessing import Fourier
from plotypus.utils import (colvec, mad, make_sure_path_exists, pmap,
valid_basename, verbose_print)
from plotypus.utils import (colvec, import_name, mad, make_sure_path_exists,
pmap, strlist_to_dict, valid_basename,
verbose_print)
from plotypus.resources import matplotlibrc

import pkg_resources # part of setuptools
Expand Down Expand Up @@ -60,6 +60,7 @@ def get_args():
parallel_group = parser.add_argument_group('Parallel')
period_group = parser.add_argument_group('Periodogram')
fourier_group = parser.add_argument_group('Fourier')
regressor_group = parser.add_argument_group('Regressor')
outlier_group = parser.add_argument_group('Outlier Detection')
# regression_group = parser.add_argument_group('Regression')

Expand Down Expand Up @@ -276,12 +277,6 @@ def get_args():
default=(2, 20), metavar=('MIN', 'MAX'),
help='range of degrees of fourier fits to use '
'(default = 2 20)')
fourier_group.add_argument('-r', '--regressor',
choices=['LassoCV', 'LassoLarsCV', 'LassoLarsIC', 'OLS', 'RidgeCV',
'ElasticNetCV'],
default='LassoLarsIC',
help='type of regressor to use '
'(default = "Lasso")')
fourier_group.add_argument('--selector',
choices=['Baart', 'GridSearch'],
default='GridSearch',
Expand All @@ -292,15 +287,23 @@ def get_args():
help='form of Fourier series to use in coefficient output, '
'does not affect the fit '
'(default = "cos")')
fourier_group.add_argument('--max-iter', type=int,
default=1000, metavar='N',
help='maximum number of iterations in the regularization path '
'(default = 1000)')
fourier_group.add_argument('--regularization-cv', type=int,
default=None, metavar='N',
help='number of folds used in regularization regularization_cv '
'validation '
'(default = 3)')

## Regressor Group #######################################################

fourier_group.add_argument('-r', '--regressor', type=import_name,
default=sklearn.linear_model.LassoLarsIC,
help='type of regressor to use, loads any Python object named like '
'*module.submodule.etc.object_name*, though it must behave like a '
'scikit-learn regressor '
'(default = "sklearn.linear_model.LassoLarsIC")')
regressor_group.add_argument('--regressor-options', type=str, nargs='+',
default=[], metavar="KEY VALUE",
help='list of key value pairs to pass to regressor object. '
'accepted keys depend on regressor. '
'values which form valid Python literals (e.g. 2, True, [1,2]) '
'are all parsed to their obvious type, or left as strings '
'otherwise '
'(default = None)')

## Outlier Group #########################################################

Expand All @@ -327,22 +330,9 @@ def get_args():
fail_on_error=True)
plotypus.lightcurve.matplotlib.rcParams = rcParams

regressor_choices = {
"LassoCV" : LassoCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False),
"LassoLarsCV" : LassoLarsCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False),
"LassoLarsIC" : LassoLarsIC(max_iter=args.max_iter,
fit_intercept=False),
"OLS" : LinearRegression(fit_intercept=False),
"RidgeCV" : RidgeCV(cv=args.regularization_cv,
fit_intercept=False),
"ElasticNetCV" : ElasticNetCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False)
}
# parse regressor (TODO: and selector) options into a dict
regressor_options = strlist_to_dict(args.regressor_options)

selector_choices = {
"Baart" : None,
"GridSearch" : GridSearchCV
Expand All @@ -363,7 +353,7 @@ def get_args():
}
args.scoring = scoring_choices[args.scoring]

args.regressor = regressor_choices[args.regressor]
args.regressor = args.regressor(**regressor_options)
Selector = selector_choices[args.selector] or GridSearchCV
args.periodogram = periodogram_choices[args.periodogram]
args.sigma_clipping = sigma_clipping_choices[args.sigma_clipping]
Expand Down
16 changes: 10 additions & 6 deletions src/plotypus/lightcurve.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,20 @@
]


def make_predictor(regressor=LassoLarsIC(fit_intercept=False),
def make_predictor(regressor=LassoLarsIC(),
Selector=GridSearchCV, fourier_degree=(2, 25),
selector_processes=1,
use_baart=False, scoring='r2', scoring_cv=3,
**kwargs):
"""make_predictor(regressor=LassoLarsIC(fit_intercept=False), Selector=GridSearchCV, fourier_degree=(2, 25), selector_processes=1, use_baart=False, scoring='r2', scoring_cv=3, **kwargs)
"""make_predictor(regressor=LassoLarsIC(), Selector=GridSearchCV, fourier_degree=(2, 25), selector_processes=1, use_baart=False, scoring='r2', scoring_cv=3, **kwargs)
Makes a predictor object for use in :func:`get_lightcurve`.
**Parameters**
regressor : object with "fit" and "transform" methods, optional
Regression object used for solving Fourier matrix
(default ``sklearn.linear_model.LassoLarsIC(fit_intercept=False)``).
(default ``sklearn.linear_model.LassoLarsIC()``).
Selector : class with "fit" and "predict" methods, optional
Model selection class used for finding the best fit
(default :class:`sklearn.grid_search.GridSearchCV`).
Expand Down Expand Up @@ -328,10 +328,14 @@ def get_lightcurve(data, copy=False, name=None,
# use rephased phase points from *data* in residuals
residuals = np.ma.column_stack((time, phase, mag, residuals, err))
data[:,0] = phase
# Grab the coefficients from the model
coefficients = predictor.named_steps['Regressor'].coef_ \
# grab the regressor used in the model
regressor = predictor.named_steps['Regressor'] \
if isinstance(predictor, Pipeline) \
else predictor.best_estimator_.named_steps['Regressor'].coef_
else predictor.best_estimator_.named_steps['Regressor']
# Grab the coefficients from the model
coefficients = regressor.coef_
intercept = regressor.intercept_
coefficients = np.insert(coefficients, 0, intercept)

# compute R^2 and MSE if they haven't already been
# (one or zero have been computed, depending on the predictor)
Expand Down
23 changes: 9 additions & 14 deletions src/plotypus/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class Fourier(BaseEstimator):
def __init__(self,
degree=3, degree_range=None,
period=None,
regressor=LinearRegression(fit_intercept=False)):
regressor=LinearRegression()):
self.degree = degree
self.degree_range = degree_range
self.period = period
Expand Down Expand Up @@ -203,22 +203,19 @@ def design_matrix(times, period, degree):
.. math::
\begin{bmatrix}
1
& \sin(1 \omega t_0)
\sin(1 \omega t_0)
& \cos(1 \omega t_0)
& \ldots
& \sin(n \omega t_0)
& \cos(n \omega t_0)
\\
\vdots
& \vdots
& \vdots
& \ddots
& \vdots
& \vdots
\\
1
& \sin(1 \omega t_N)
\sin(1 \omega t_N)
& \cos(1 \omega t_N)
& \ldots
& \sin(n \omega t_N)
Expand All @@ -241,15 +238,15 @@ def design_matrix(times, period, degree):
**Returns**
design_matrix : array-like, shape = [n_samples, 2*degree + 1]
design_matrix : array-like, shape = [n_samples, 2*degree]
"""
# pre-compute number of samples
n_samples = np.size(times)
# convert the period into angular frequency
omega = 2*np.pi / period
# initialize coefficient matrix
M = np.empty((n_samples, 2*degree+1))
M = np.empty((n_samples, 2*degree))
# indices
i = np.arange(1, degree+1)
# initialize the Nxn matrix that is repeated within the
Expand All @@ -260,12 +257,10 @@ def design_matrix(times, period, degree):
x[:,:] = i * omega
# multiply each row of x by the times
x.T[:,:] *= times
# place 1's in the first column of the coefficient matrix
M[:,0] = 1
# the odd indices of the coefficient matrix have sine terms
M[:,1::2] = np.sin(x)
# the even indices of the coefficient matrix have cosine terms
M[:,2::2] = np.cos(x)
# the even indices of the coefficient matrix have sine terms
M[:,0::2] = np.sin(x)
# the odd indices of the coefficient matrix have cosine terms
M[:,1::2] = np.cos(x)

return M

Expand Down
103 changes: 103 additions & 0 deletions src/plotypus/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

__all__ = [
'verbose_print',
'import_name',
'literal_eval_str',
'strlist_to_dict',
'pmap',
'make_sure_path_exists',
'valid_basename',
Expand Down Expand Up @@ -45,6 +48,106 @@ def verbose_print(message, *, operation, verbosity):
print(message, file=stderr)


def import_name(dotted_name):
    """import_name(dotted_name)

    Resolve a fully-qualified dotted name to the object it refers to, without
    binding anything in the caller's namespace.

    **Parameters**

    dotted_name : str
        Name of the form *module.submodule.etc.object_name*. Everything before
        the final dot is imported as a module; the part after the final dot is
        looked up as an attribute of that module.

    **Returns**

    obj : object
        The attribute *object_name* of the imported module.

    **Raises**

    ValueError
        If *dotted_name* contains no dot, i.e. it names only a top-level
        module such as *baz* rather than something like *foo.bar.baz*.
    ImportError
        If the imported module has no attribute called *object_name*.
        Exceptions raised while importing the module itself propagate
        unchanged.
    """
    from importlib import import_module

    # Everything left of the last dot is the module path and the remainder is
    # the object name; an empty separator means there was no dot at all.
    parent_name, dot, attr_name = dotted_name.rpartition(".")
    if not dot:
        raise ValueError("must name object within a module, not just a module")

    parent = import_module(parent_name)

    # Translate a missing attribute into an ImportError so that callers see
    # one kind of failure for "this name cannot be loaded".
    try:
        return getattr(parent, attr_name)
    except AttributeError:
        raise ImportError("module '{module}' has no object '{object_name}'"
                          .format(module=parent_name, object_name=attr_name))


def literal_eval_str(string):
    """literal_eval_str(string)

    Equivalent to *ast.literal_eval*, but if *string* cannot be parsed as a
    Python literal, returns the original string instead of raising. Only
    accepts arguments of type *str*.

    **Parameters**

    string : str
        Any string. If it can be parsed as a basic Python object literal
        (e.g. ``"2"``, ``"True"``, ``"[1,2]"``), it will be. See
        *ast.literal_eval* for supported objects.

    **Returns**

    obj : object
        Original string, or the object literal it represents.

    **Raises**

    ValueError
        If *string* is not an instance of *str*.
    """
    from ast import literal_eval

    if not isinstance(string, str):
        raise ValueError("input must be of type str")

    try:
        return literal_eval(string)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval signals "not a literal" in several ways: ValueError for
        # well-formed expressions that are not literals (e.g. a bare name),
        # SyntaxError for text that is not a Python expression at all (e.g.
        # "hello world" or ""), and TypeError for literals that cannot be
        # built (e.g. a dict with an unhashable key). In all of these cases
        # the input is treated as a plain string.
        return string


def strlist_to_dict(lst):
    """strlist_to_dict(lst)

    Build a dictionary from a flat, even-length sequence of strings laid out
    as alternating keys and values. Each value is passed through
    *literal_eval_str*, so values that form basic Python object literals are
    converted to the corresponding object; keys are used as given.

    **Parameters**

    lst : list
        A *list* or equivalent sliceable object with an even number of
        elements, interpreted as *[key1, val1, ..., keyN, valN]*.

    **Returns**

    dct : dict
        A *dict* whose keys are the elements of *lst* at even indices and
        whose values are the (possibly converted) elements at odd indices.

    **Raises**

    ValueError
        If *lst* has an odd number of elements.
    """
    # An odd length means some key is missing its value.
    if len(lst) % 2 != 0:
        raise ValueError("length of list must be even")

    # Pair every even-indexed key with the odd-indexed value that follows it.
    return {key: literal_eval_str(raw_value)
            for key, raw_value in zip(lst[0::2], lst[1::2])}


def pmap(func, args, processes=None, callback=lambda *_, **__: None, **kwargs):
"""pmap(func, args, processes=None, callback=do_nothing, **kwargs)
Expand Down

0 comments on commit b63d61f

Please sign in to comment.