Merge pull request #20 from elephaint/torch_missing_values
Adds missing values support to Torch version
elephaint authored Feb 20, 2023
2 parents 6628e2a + 8696215 commit 3a69497
Showing 6 changed files with 16 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -3,7 +3,7 @@
 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 
 # Required
-version: 2
+version: 1
 
 # Set the version of Python and other tools you might need
 build:
4 changes: 4 additions & 0 deletions changelog.md
@@ -1,3 +1,7 @@
+## 2.1.0 ##
+* Added support for missing values in Torch backend
+* Fixed documentation not rendering properly
+
 ## 2.0.0 ##
 * Added `HistGradientBoostingRegressor`, a fork of scikit-learn's version that allows to use PGBM whilst being fully compatible with scikit-learn
 * Deprecated `pgbm_nb` in favor of `HistGradientBoostingRegressor`.
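As an aside to the changelog entries above, here is a minimal usage sketch of the scikit-learn-compatible estimator they mention. It is not part of this commit: the `pgbm.sklearn` import path, the `max_iter` argument, and the plain `fit`/`predict` calls are assumptions based on the estimator being described as a fork of scikit-learn's `HistGradientBoostingRegressor`; check the PGBM documentation for the exact interface.

```python
# Hypothetical sketch (not from this commit); import path and arguments are assumed.
import numpy as np
from pgbm.sklearn import HistGradientBoostingRegressor  # assumed module path

rng = np.random.default_rng(0)
X = rng.normal(size=(1_000, 5))
y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=1_000)

# As a scikit-learn fork, the estimator should follow the usual fit/predict API.
model = HistGradientBoostingRegressor(max_iter=100)
model.fit(X, y)
point_predictions = model.predict(X)
```

The probabilistic outputs that PGBM adds on top of these point predictions are exposed through additional methods; those are covered in the project documentation rather than shown here.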
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -26,7 +26,7 @@
 author = 'Olivier Sprangers'
 
 # The full version, including alpha/beta/rc tags
-release = '2.0'
+release = '2.1'
 
 
 # -- General configuration ---------------------------------------------------
4 changes: 2 additions & 2 deletions docs/features.md
@@ -45,7 +45,7 @@ The table below lists the features per API, which may help you decide which API
 | Feature bagging | Yes | No |
 | Monotone cst | Yes | Yes |
 | Categorical val | No | Yes |
-| Missing values | No | Yes |
+| Missing values | Yes | Yes |
 | Checkpointing | Yes | Yes |
 | Autodiff | Yes | No |

@@ -56,6 +56,6 @@ Description of features:
 * Feature bagging: if we can train on a subsample of the features of the dataset. This may improve model accuracy and speeds up training.
 * Monotone cst: if we can set monotone constraints per feature, using positive, negative or neutral constraints.
 * Categorical val: if the model can natively handle categorical data.
-* Missing values: if the model can natively handle missing values.
+* Missing values: if the model can natively handle missing values (defined as NaNs).
 * Checkpointing: if we can train the model, save it, and continue training later on (a.k.a., 'warm-start').
 * Autodiff: if we can supply a differentiable loss function for which we use autodifferentiation to determine the gradient and hessian.
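To make the 'Missing values' row above concrete: after this commit, the Torch backend is intended to accept NaN entries directly in the feature matrix. The sketch below illustrates that; it is not taken from the repository, and the `pgbm.torch.PGBMRegressor` import path and default hyperparameters are assumptions, so adjust them to the installed version.

```python
# Hypothetical sketch (not from this diff); the import path below is assumed.
import numpy as np
from pgbm.torch import PGBMRegressor  # assumed location of the Torch-backend estimator

rng = np.random.default_rng(1)
X = rng.normal(size=(500, 4))
y = 3.0 * X[:, 1] + rng.normal(scale=0.2, size=500)

# Knock out roughly 10% of the entries to simulate missing values.
mask = rng.random(X.shape) < 0.1
X[mask] = np.nan

model = PGBMRegressor()  # defaults assumed; tune objective/iterations as needed
model.fit(X, y)          # NaNs in X should be handled natively after this change
yhat = model.predict(X)
```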
12 changes: 7 additions & 5 deletions pgbm/torch/pgbm.py
@@ -1020,14 +1020,16 @@ def _predict_forest_mu(X: torch.Tensor, nodes_idx: torch.Tensor, nodes_split_bin
 def _create_feature_bins(X: torch.Tensor, max_bin: int = 256):
     # Create array that contains the bins
     bins = torch.zeros((X.shape[1], max_bin), device=X.device)
-    quantiles = torch.linspace(0, 1, max_bin, device=X.device)
+    quantiles = torch.linspace(0, 1, max_bin - 1, device=X.device)
     # For each feature, create max_bins based on frequency bins.
     for i in range(X.shape[1]):
         xs = X[:, i]
-        current_bin = torch.unique(torch.quantile(xs, quantiles))
-        # A bit inefficiency created here... some features usually have less than max_bin values (e.g. 1/0 indicator features).
-        bins[i, :current_bin.shape[0]] = current_bin
-        bins[i, current_bin.shape[0]:] = current_bin.max()
+        current_bin = torch.unique(torch.nanquantile(xs, quantiles))
+        # First bin is the NaN bin. Then, the rest of the bins follow
+        # A bit inefficiency created here... some features usually have less than max_bin values
+        bins[i, 0] = current_bin[0] - 1.0e-6
+        bins[i, 1:current_bin.shape[0] + 1] = current_bin
+        bins[i, current_bin.shape[0] + 1:] = current_bin.max()
 
     return bins
 
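To see the binning change above in isolation: `torch.nanquantile` ignores NaNs when computing the candidate bin edges, and the first bin is reserved for missing values by placing an edge just below the smallest observed edge. The snippet below is a standalone rework of that logic for a single feature vector, not the function as it appears in the file.

```python
# Standalone illustration of the NaN-aware binning idea from _create_feature_bins.
import torch

x = torch.tensor([0.3, float("nan"), 1.5, 2.0, float("nan"), 0.7])
max_bin = 4

quantiles = torch.linspace(0, 1, max_bin - 1)
# nanquantile skips NaNs, so the edges describe only the observed values.
edges = torch.unique(torch.nanquantile(x, quantiles))

bins = torch.zeros(max_bin)
bins[0] = edges[0] - 1.0e-6              # reserved edge for the NaN bin
bins[1:edges.shape[0] + 1] = edges       # edges computed from observed values
bins[edges.shape[0] + 1:] = edges.max()  # pad any unused slots with the max edge

print(bins)  # first entry sits just below 0.3 and acts as the NaN bin edge
```

At split time a NaN can then be routed to bin 0 while every observed value lands in one of the later bins, which makes the quantile computation NaN-aware without changing the shape of the bins tensor.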
2 changes: 1 addition & 1 deletion setup.py
@@ -57,7 +57,7 @@ def get_openmp_flag():
 if __name__ == "__main__":
     setup(
         name="pgbm",
-        version="2.0.0",
+        version="2.1.0",
         description="Probabilistic Gradient Boosting Machines",
         author="Olivier Sprangers",
         author_email="[email protected]",
