[Feature] add docstring (#7)
RektPunk authored Sep 18, 2024
1 parent 53334a2 commit 37b4884
Showing 4 changed files with 125 additions and 124 deletions.
103 changes: 103 additions & 0 deletions imlightgbm/docstring.py
@@ -0,0 +1,103 @@
from typing import Callable

_space = "\n "
ALL_PARAMS = {
"params": f"dict{_space}Parameters for training. Values passed through ``params`` take precedence over those supplied via arguments.",
"train_set": f"Dataset{_space}Data to be trained on.",
"num_boost_round": f"int, optional (default=100){_space}Number of boosting iterations.",
"valid_sets": f"list of Dataset, or None, optional (default=None){_space}List of data to be evaluated on during training.",
"valid_names": f"list of str, or None, optional (default=None){_space}Names of ``valid_sets``.",
"folds": f"generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None){_space}If generator or iterator, it should yield the train and test indices for each fold.{_space}If object, it should be one of the scikit-learn splitter classes{_space}(https://scikit-learn.org/stable/modules/classes.html#splitter-classes){_space}and have ``split`` method.{_space}This argument has highest priority over other data split arguments.",
"nfold": f"int, optional (default=5){_space}Number of folds in CV.",
"stratified": f"bool, optional (default=True){_space}Whether to perform stratified sampling.",
"shuffle": f"bool, optional (default=True){_space}Whether to shuffle before splitting data.",
"metrics": f"str, list of str, or None, optional (default=None){_space}Evaluation metrics to be monitored while CV.",
"init_model": f"str, pathlib.Path, Booster or None, optional (default=None){_space}Filename of LightGBM model or Booster instance used for continue training.",
"fpreproc": f"callable or None, optional (default=None){_space}Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those.",
"seed": f"int, optional (default=0){_space}Seed used to generate the folds (passed to numpy.random.seed).",
"callbacks": f"list of callable, or None, optional (default=None){_space}List of callback functions that are applied at each iteration.{_space}See Callbacks in Python API for more information.",
"eval_train_metric": f"bool, optional (default=False){_space}Whether to display the train metric in progress.",
"return_cvbooster": f"bool, optional (default=False){_space}Whether to return Booster models trained on each fold through ``CVBooster``.",
"keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and cause the memory error,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
"num_trials": f"int, optional (default=10){_space}Number of hyperparameter tuning trials.",
"get_params": 'callable, optional (default=get_params)\n Number of hyperparameter tuning trials.\n def get_params(trial: optuna.Trial):\n return {\n "alpha": trial.suggest_float("alpha", 0.25, 0.75),\n "gamma": trial.suggest_float("gamma", 0.0, 3.0),\n "num_leaves": trial.suggest_int("num_leaves", 20, 150),\n "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),\n "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),\n "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),\n "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),\n }',
}


PARAMS_MAPPER = {
"train": {
"description": "Perform the training with given parameters.",
"selected_params": [
"params",
"train_set",
"num_boost_round",
"valid_sets",
"valid_names",
"init_model",
"keep_training_booster",
"callbacks",
],
"return_description": f"booster: Booster{_space}The trained Booster model.",
},
"cv": {
"description": "Perform the cross-validation with given parameters.",
"selected_params": [
"params",
"train_set",
"num_boost_round",
"folds",
"nfold",
"stratified",
"shuffle",
"init_model",
"fpreproc",
"seed",
"callbacks",
"eval_train_metric",
"return_cvbooster",
],
"return_description": "eval_results: dict\n History of evaluation results of each metric.\n The dictionary has the following format:\n {'valid metric1-mean': [values], 'valid metric1-stdv': [values],\n 'valid metric2-mean': [values], 'valid metric2-stdv': [values],\n ...}.\n If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.\n If ``eval_train_metric=True``, also returns the train metric history.\n In this case, the dictionary has the following format:\n {'train metric1-mean': [values], 'valid metric1-mean': [values],\n 'train metric2-mean': [values], 'valid metric2-mean': [values],\n ...}.",
},
"optimize": {
"description": "Perform the hyperparameter tuning with optuna.",
"selected_params": [
"train_set",
"num_trials",
"num_boost_round",
"folds",
"nfold",
"stratified",
"shuffle",
"get_params",
"init_model",
"fpreproc",
"seed",
"callbacks",
],
"return_description": f"study: optuna.Study{_space}study.best_params{_space}study.best_value",
},
}


def generate_docstring(
description: str,
selected_params: list[str],
return_description: str = "",
) -> str:
"""Generate a docstring with a provided description, selected parameters, and optional return description."""
docstring = f"{description}\n\n Parameters\n ----------\n"
for param in selected_params:
docstring += f" {param}: {ALL_PARAMS[param]}\n"
if return_description:
docstring += f"\n Returns\n -------\n {return_description}\n"
return docstring


def add_docstring(func_name: str) -> Callable:
"""Decorator to add a docstring to a function based on provided parameters and descriptions."""

def decorator(func: Callable) -> Callable:
func.__doc__ = generate_docstring(**PARAMS_MAPPER[func_name])
return func

return decorator
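
For context, a minimal usage sketch of the new decorator; the function name toy_train below is illustrative only and not part of this commit:

from imlightgbm.docstring import add_docstring

@add_docstring("train")
def toy_train(params, train_set, num_boost_round=100):
    ...

# __doc__ is now the generated text: the "train" description followed by the
# selected parameter entries and the Returns section.
print(toy_train.__doc__)
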
46 changes: 8 additions & 38 deletions imlightgbm/engine.py
@@ -1,44 +1,40 @@
from collections.abc import Iterable
from typing import Any, Callable, Literal
from typing import Any, Callable

import lightgbm as lgb
import numpy as np
import optuna
from sklearn.model_selection import BaseCrossValidator

from imlightgbm.objective import set_params
from imlightgbm.utils import docstring, optimize_doc
from imlightgbm.docstring import add_docstring
from imlightgbm.objective import get_params, set_params


@docstring(lgb.train.__doc__)
@add_docstring("train")
def train(
params: dict[str, Any],
train_set: lgb.Dataset,
num_boost_round: int = 100,
valid_sets: list[lgb.Dataset] = None,
valid_names: list[str] = None,
num_boost_round: int = 100,
init_model: str | lgb.Path | lgb.Booster | None = None,
feature_name: list[str] | Literal["auto"] = "auto",
categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
keep_training_booster: bool = False,
callbacks: list[Callable] | None = None,
) -> lgb.Booster:
_params = set_params(params=params, train_set=train_set)
return lgb.train(
params=_params,
train_set=train_set,
num_boost_round=num_boost_round,
valid_sets=valid_sets,
valid_names=valid_names,
num_boost_round=num_boost_round,
init_model=init_model,
feature_name=feature_name,
categorical_feature=categorical_feature,
keep_training_booster=keep_training_booster,
callbacks=callbacks,
)


@docstring(lgb.cv.__doc__)
@add_docstring("cv")
def cv(
params: dict[str, Any],
train_set: lgb.Dataset,
@@ -47,10 +43,7 @@ def cv(
nfold: int = 5,
stratified: bool = True,
shuffle: bool = True,
metrics: str | list[str] | None = None,
init_model: str | lgb.Path | lgb.Booster | None = None,
feature_name: list[str] | Literal["auto"] = "auto",
categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
fpreproc: Callable[
[lgb.Dataset, lgb.Dataset, dict[str, Any]],
tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
@@ -70,10 +63,7 @@ def cv(
nfold=nfold,
stratified=stratified,
shuffle=shuffle,
metrics=metrics,
init_model=init_model,
feature_name=feature_name,
categorical_feature=categorical_feature,
fpreproc=fpreproc,
seed=seed,
callbacks=callbacks,
@@ -82,18 +72,7 @@
)


def get_params(trial: optuna.Trial):
return {
"alpha": trial.suggest_float("alpha", 0.25, 0.75),
"gamma": trial.suggest_float("gamma", 0.0, 3.0),
"num_leaves": trial.suggest_int("num_leaves", 20, 150),
"learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
"feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
}


@add_docstring("optimize")
def optimize(
train_set: lgb.Dataset,
num_trials: int = 10,
@@ -104,8 +83,6 @@ def optimize(
shuffle: bool = True,
get_params: Callable[[optuna.Trial], dict[str, Any]] = get_params,
init_model: str | lgb.Path | lgb.Booster | None = None,
feature_name: list[str] | Literal["auto"] = "auto",
categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
fpreproc: Callable[
[lgb.Dataset, lgb.Dataset, dict[str, Any]],
tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
@@ -126,13 +103,9 @@ def _objective(trial: optuna.Trial):
stratified=stratified,
shuffle=shuffle,
init_model=init_model,
feature_name=feature_name,
categorical_feature=categorical_feature,
fpreproc=fpreproc,
seed=seed,
callbacks=callbacks,
eval_train_metric=False,
return_cvbooster=False,
)
_keys = [_ for _ in cv_results.keys() if _.endswith("mean")]
assert len(_keys) == 1
@@ -141,6 +114,3 @@ def _objective(trial: optuna.Trial):
study = optuna.create_study(direction="minimize")
study.optimize(_objective, n_trials=num_trials)
return study


optimize.__doc__ = optimize_doc
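
For context, a hedged sketch of how the slimmed-down engine API might be called after this change; the synthetic dataset and the custom search space below are assumptions for illustration, not part of the commit:

import lightgbm as lgb
import optuna
from sklearn.datasets import make_classification
from imlightgbm.engine import optimize, train

# Toy binary-classification data (assumption; any lgb.Dataset would do here).
X, y = make_classification(n_samples=500, random_state=0)
train_set = lgb.Dataset(X, label=y)

def my_params(trial: optuna.Trial) -> dict:
    # Custom search space passed via ``get_params``; overrides the default.
    return {
        "alpha": trial.suggest_float("alpha", 0.3, 0.7),
        "gamma": trial.suggest_float("gamma", 0.0, 2.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
    }

study = optimize(train_set=train_set, num_trials=5, get_params=my_params)
booster = train(params=study.best_params, train_set=train_set)
print(study.best_value, booster.num_trees())
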
14 changes: 14 additions & 0 deletions imlightgbm/objective.py
@@ -3,6 +3,7 @@
from typing import Any, Callable

import numpy as np
import optuna
from lightgbm import Dataset
from sklearn.utils.multiclass import type_of_target

@@ -120,3 +121,16 @@ def set_params(params: dict[str, Any], train_set: Dataset) -> dict[str, Any]:
fobj, feval = _set_fobj_feval(train_set=train_set, alpha=_alpha, gamma=_gamma)
_params.update({OBJECTIVE_STR: fobj, METRIC_STR: feval})
return _params


def get_params(trial: optuna.Trial) -> dict[str, Any]:
"""Get default params."""
return {
"alpha": trial.suggest_float("alpha", 0.25, 0.75),
"gamma": trial.suggest_float("gamma", 0.0, 3.0),
"num_leaves": trial.suggest_int("num_leaves", 20, 150),
"learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
"feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
}
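
A small sketch of how the new default search space could be inspected outside a study, using optuna.trial.FixedTrial with hand-picked values; the values are arbitrary assumptions:

import optuna
from imlightgbm.objective import get_params

# FixedTrial replays pre-chosen values, so each suggest_* call echoes them back.
fixed = optuna.trial.FixedTrial({
    "alpha": 0.5,
    "gamma": 1.0,
    "num_leaves": 63,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
})
print(get_params(fixed))
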
86 changes: 0 additions & 86 deletions imlightgbm/utils.py
@@ -1,26 +1,4 @@
import logging
from typing import Callable


def _modify_docstring(docstring: str) -> str:
lines = docstring.splitlines()

feval_start = next(i for i, line in enumerate(lines) if "feval" in line)
init_model_start = next(i for i, line in enumerate(lines) if "init_model" in line)
del lines[feval_start:init_model_start]

note_start = next(i for i, line in enumerate(lines) if "Note" in line)
returns_start = next(i for i, line in enumerate(lines) if "Returns" in line)
del lines[note_start:returns_start]
return "\n".join(lines)


def docstring(doc: str) -> Callable[[Callable], Callable]:
def decorator(func: Callable) -> Callable:
func.__doc__ = _modify_docstring(doc)
return func

return decorator


def init_logger() -> logging.Logger:
@@ -30,67 +8,3 @@ def init_logger() -> logging.Logger:


logger = init_logger()


optimize_doc = """Perform the cross-validation with given parameters.
Parameters
----------
train_set : Dataset
Data to be trained on.
num_trials : int, optional (default=10)
Number of hyperparameter search trials.
num_boost_round : int, optional (default=100)
Number of boosting iterations.
folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
If generator or iterator, it should yield the train and test indices for each fold.
If object, it should be one of the scikit-learn splitter classes
(https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
and have ``split`` method.
This argument has highest priority over other data split arguments.
nfold : int, optional (default=5)
Number of folds in CV.
stratified : bool, optional (default=True)
Whether to perform stratified sampling.
shuffle : bool, optional (default=True)
Whether to shuffle before splitting data.
get_params : callable, optional (default=get_params)
def get_params(trial: optuna.Trial):
return {
'alpha': trial.suggest_float('alpha', .25, .75),
'gamma': trial.suggest_float('gamma', .0, 3.),
'num_leaves': trial.suggest_int('num_leaves', 20, 150),
'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
}
init_model : str, pathlib.Path, Booster or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of str, or 'auto', optional (default="auto")
**Deprecated.** Set ``feature_name`` on ``train_set`` instead.
Feature names.
If 'auto' and data is pandas DataFrame, data columns names are used.
categorical_feature : list of str or int, or 'auto', optional (default="auto")
**Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
Categorical features.
If list of int, interpreted as indices.
If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
The output cannot be monotonically constrained with respect to a categorical feature.
Floating point numbers in categorical features will be rounded towards 0.
fpreproc : callable or None, optional (default=None)
Preprocessing function that takes (dtrain, dtest, params)
and returns transformed versions of those.
seed : int, optional (default=0)
Seed used to generate the folds (passed to numpy.random.seed).
callbacks : list of callable, or None, optional (default=None)
List of callback functions that are applied at each iteration.
See Callbacks in Python API for more information.
Returns
-------
study: optuna.Study
"""
