From 2d6e17f9a5acb976e19e20ca86e890c2279506aa Mon Sep 17 00:00:00 2001
From: Marshall Crumiller <mcrumiller@gmail.com>
Date: Mon, 4 Mar 2024 16:05:18 -0500
Subject: [PATCH] Add keep column(s) param

---
 crates/polars-ops/src/frame/mod.rs            | 18 ++++--
 .../polars-ops/src/series/ops/to_dummies.rs   | 60 ++++++++++++-------
 crates/polars-python/src/dataframe/general.rs |  6 +-
 crates/polars-python/src/series/general.rs    |  5 +-
 py-polars/polars/dataframe/frame.py           | 18 +++++-
 py-polars/polars/series/series.py             | 23 ++++++-
 py-polars/tests/unit/dataframe/test_df.py     | 19 ++++++
 py-polars/tests/unit/series/test_series.py    | 10 ++++
 8 files changed, 125 insertions(+), 34 deletions(-)
diff --git a/crates/polars-ops/src/frame/mod.rs b/crates/polars-ops/src/frame/mod.rs
index d72ff4488251..9b0d71c017dc 100644
--- a/crates/polars-ops/src/frame/mod.rs
+++ b/crates/polars-ops/src/frame/mod.rs
@@ -70,8 +70,13 @@ pub trait DataFrameOps: IntoDf {
     ///  +------+------+------+--------+--------+--------+---------+---------+---------+
     /// ```
     #[cfg(feature = "to_dummies")]
-    fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PolarsResult<DataFrame> {
-        self._to_dummies(None, separator, drop_first)
+    fn to_dummies(
+        &self,
+        separator: Option<&str>,
+        drop_first: bool,
+        keep_columns: bool,
+    ) -> PolarsResult<DataFrame> {
+        self._to_dummies(None, separator, drop_first, keep_columns)
     }
 
     #[cfg(feature = "to_dummies")]
@@ -80,8 +85,9 @@ pub trait DataFrameOps: IntoDf {
         columns: Vec<&str>,
         separator: Option<&str>,
         drop_first: bool,
+        keep_columns: bool,
     ) -> PolarsResult<DataFrame> {
-        self._to_dummies(Some(columns), separator, drop_first)
+        self._to_dummies(Some(columns), separator, drop_first, keep_columns)
     }
 
     #[cfg(feature = "to_dummies")]
@@ -90,6 +96,7 @@ pub trait DataFrameOps: IntoDf {
         columns: Option<Vec<&str>>,
         separator: Option<&str>,
         drop_first: bool,
+        keep_columns: bool,
     ) -> PolarsResult<DataFrame> {
         use crate::series::ToDummies;
 
@@ -105,7 +112,10 @@ pub trait DataFrameOps: IntoDf {
             df.get_columns()
                 .par_iter()
                 .map(|s| match set.contains(s.name().as_str()) {
-                    true => s.as_materialized_series().to_dummies(separator, drop_first),
+                    true => {
+                        s.as_materialized_series()
+                            .to_dummies(separator, drop_first, keep_columns)
+                    },
                     false => Ok(s.clone().into_frame()),
                 })
                 .collect::<PolarsResult<Vec<_>>>()
diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs
index eb2cf3a228c1..3a0b1940f62d 100644
--- a/crates/polars-ops/src/series/ops/to_dummies.rs
+++ b/crates/polars-ops/src/series/ops/to_dummies.rs
@@ -13,11 +13,21 @@ type DummyType = i32;
 type DummyCa = Int32Chunked;
 
 pub trait ToDummies {
-    fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PolarsResult<DataFrame>;
+    fn to_dummies(
+        &self,
+        separator: Option<&str>,
+        drop_first: bool,
+        keep_column: bool,
+    ) -> PolarsResult<DataFrame>;
 }
 
 impl ToDummies for Series {
-    fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PolarsResult<DataFrame> {
+    fn to_dummies(
+        &self,
+        separator: Option<&str>,
+        drop_first: bool,
+        keep_column: bool,
+    ) -> PolarsResult<DataFrame> {
         let sep = separator.unwrap_or("_");
         let col_name = self.name();
         let groups = self.group_tuples(true, drop_first)?;
@@ -25,26 +35,32 @@ impl ToDummies for Series {
         // SAFETY: groups are in bounds
         let columns = unsafe { self.agg_first(&groups) };
         let columns = columns.iter().zip(groups.iter()).skip(drop_first as usize);
-        let columns = columns
-            .map(|(av, group)| {
-                // strings are formatted with extra \" \" in polars, so we
-                // extract the string
-                let name = if let Some(s) = av.get_str() {
-                    format_pl_smallstr!("{col_name}{sep}{s}")
-                } else {
-                    // other types don't have this formatting issue
-                    format_pl_smallstr!("{col_name}{sep}{av}")
-                };
-
-                let ca = match group {
-                    GroupsIndicator::Idx((_, group)) => dummies_helper_idx(group, self.len(), name),
-                    GroupsIndicator::Slice([offset, len]) => {
-                        dummies_helper_slice(offset, len, self.len(), name)
-                    },
-                };
-                ca.into_column()
-            })
-            .collect::<Vec<_>>();
+        let columns = columns.map(|(av, group)| {
+            // strings are formatted with extra \" \" in polars, so we
+            // extract the string
+            let name = if let Some(s) = av.get_str() {
+                format_pl_smallstr!("{col_name}{sep}{s}")
+            } else {
+                // other types don't have this formatting issue
+                format_pl_smallstr!("{col_name}{sep}{av}")
+            };
+
+            let ca = match group {
+                GroupsIndicator::Idx((_, group)) => dummies_helper_idx(group, self.len(), name),
+                GroupsIndicator::Slice([offset, len]) => {
+                    dummies_helper_slice(offset, len, self.len(), name)
+                },
+            };
+            ca.into_column()
+        });
+
+        let columns = if keep_column {
+            std::iter::once(self.clone().into_column())
+                .chain(columns)
+                .collect()
+        } else {
+            columns.collect()
+        };
 
         // SAFETY: `dummies_helper` functions preserve `self.len()` length
         unsafe { DataFrame::new_no_length_checks(sort_columns(columns)) }
diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs
index e866e7db1004..cda8ddfb557a 100644
--- a/crates/polars-python/src/dataframe/general.rs
+++ b/crates/polars-python/src/dataframe/general.rs
@@ -550,13 +550,14 @@ impl PyDataFrame {
         Ok(s.map(|s| s.into()))
     }
 
-    #[pyo3(signature = (columns, separator, drop_first=false))]
+    #[pyo3(signature = (columns, separator, drop_first=false, keep_columns=false))]
     pub fn to_dummies(
         &self,
         py: Python,
         columns: Option<Vec<String>>,
         separator: Option<&str>,
         drop_first: bool,
+        keep_columns: bool,
     ) -> PyResult<Self> {
         let df = py
             .allow_threads(|| match columns {
@@ -564,8 +565,9 @@ impl PyDataFrame {
                     cols.iter().map(|x| x as &str).collect(),
                     separator,
                     drop_first,
+                    keep_columns,
                 ),
-                None => self.df.to_dummies(separator, drop_first),
+                None => self.df.to_dummies(separator, drop_first, keep_columns),
             })
             .map_err(PyPolarsErr::from)?;
         Ok(df.into())
diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs
index 3134f5354f09..6426e9786d1c 100644
--- a/crates/polars-python/src/series/general.rs
+++ b/crates/polars-python/src/series/general.rs
@@ -336,15 +336,16 @@ impl PySeries {
         Ok(s.into())
     }
 
-    #[pyo3(signature = (separator, drop_first=false))]
+    #[pyo3(signature = (separator, drop_first=false, keep_column=false))]
     fn to_dummies(
         &self,
         py: Python,
         separator: Option<&str>,
         drop_first: bool,
+        keep_column: bool,
     ) -> PyResult<PyDataFrame> {
         let df = py
-            .allow_threads(|| self.series.to_dummies(separator, drop_first))
+            .allow_threads(|| self.series.to_dummies(separator, drop_first, keep_column))
             .map_err(PyPolarsErr::from)?;
         Ok(df.into())
     }
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 662d09e83d5b..fe5bd409aa89 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -9682,6 +9682,7 @@ def to_dummies(
         *,
         separator: str = "_",
         drop_first: bool = False,
+        keep_columns: bool = False,
     ) -> DataFrame:
         """
         Convert categorical variables into dummy/indicator variables.
@@ -9695,6 +9696,8 @@ def to_dummies(
             Separator/delimiter used when generating column names.
         drop_first
             Remove the first category from the variables being encoded.
+        keep_columns
+            Retain columns used to generated dummy columns.
 
         Examples
         --------
@@ -9727,6 +9730,17 @@ def to_dummies(
         │ 1     ┆ 1     ┆ 1     │
         └───────┴───────┴───────┘
 
+        >>> df.to_dummies(keep_columns=True)
+        shape: (2, 9)
+        ┌─────┬───────┬───────┬─────┬───┬───────┬─────┬───────┬───────┐
+        │ foo ┆ foo_1 ┆ foo_2 ┆ bar ┆ … ┆ bar_4 ┆ ham ┆ ham_a ┆ ham_b │
+        │ --- ┆ ---   ┆ ---   ┆ --- ┆   ┆ ---   ┆ --- ┆ ---   ┆ ---   │
+        │ i64 ┆ u8    ┆ u8    ┆ i64 ┆   ┆ u8    ┆ str ┆ u8    ┆ u8    │
+        ╞═════╪═══════╪═══════╪═════╪═══╪═══════╪═════╪═══════╪═══════╡
+        │ 1   ┆ 1     ┆ 0     ┆ 3   ┆ … ┆ 0     ┆ a   ┆ 1     ┆ 0     │
+        │ 2   ┆ 0     ┆ 1     ┆ 4   ┆ … ┆ 1     ┆ b   ┆ 0     ┆ 1     │
+        └─────┴───────┴───────┴─────┴───┴───────┴─────┴───────┴───────┘
+
         >>> import polars.selectors as cs
         >>> df.to_dummies(cs.integer(), separator=":")
         shape: (2, 5)
@@ -9752,7 +9766,9 @@ def to_dummies(
         """
         if columns is not None:
             columns = _expand_selectors(self, columns)
-        return self._from_pydf(self._df.to_dummies(columns, separator, drop_first))
+        return self._from_pydf(
+            self._df.to_dummies(columns, separator, drop_first, keep_columns)
+        )
 
     def unique(
         self,
diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
index d86c9d29cd0f..37fd5748aaa1 100644
--- a/py-polars/polars/series/series.py
+++ b/py-polars/polars/series/series.py
@@ -2148,7 +2148,11 @@ def quantile(
         return self._s.quantile(quantile, interpolation)
 
     def to_dummies(
-        self, *, separator: str = "_", drop_first: bool = False
+        self,
+        *,
+        separator: str = "_",
+        drop_first: bool = False,
+        keep_column: bool = False,
     ) -> DataFrame:
         """
         Get dummy/indicator variables.
@@ -2159,6 +2163,8 @@ def to_dummies(
             Separator/delimiter used when generating column names.
         drop_first
             Remove the first category from the variable being encoded.
+        keep_column
+            Retain column used to generated dummy columns.
 
         Examples
         --------
@@ -2186,8 +2192,19 @@ def to_dummies(
         │ 1   ┆ 0   │
         │ 0   ┆ 1   │
         └─────┴─────┘
-        """
-        return wrap_df(self._s.to_dummies(separator, drop_first))
+        >>> s.to_dummies(keep_column=True)
+        shape: (3, 4)
+        ┌─────┬─────┬─────┬─────┐
+        │ a   ┆ a_1 ┆ a_2 ┆ a_3 │
+        │ --- ┆ --- ┆ --- ┆ --- │
+        │ i64 ┆ u8  ┆ u8  ┆ u8  │
+        ╞═════╪═════╪═════╪═════╡
+        │ 1   ┆ 1   ┆ 0   ┆ 0   │
+        │ 2   ┆ 0   ┆ 1   ┆ 0   │
+        │ 3   ┆ 0   ┆ 0   ┆ 1   │
+        └─────┴─────┴─────┴─────┘
+        """
+        return wrap_df(self._s.to_dummies(separator, drop_first, keep_column))
 
     @unstable()
     def cut(
diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
index c375e1952347..31d753a3c4a7 100644
--- a/py-polars/tests/unit/dataframe/test_df.py
+++ b/py-polars/tests/unit/dataframe/test_df.py
@@ -806,6 +806,25 @@ def test_to_dummies_drop_first() -> None:
     ]
 
 
+def test_to_dummies_keep_columns() -> None:
+    df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
+    dummies = df.to_dummies(keep_columns=True)
+
+    expected = pl.DataFrame(
+        {
+            "A": ["a", "b", "c"],
+            "A_a": pl.Series([1, 0, 0], dtype=pl.UInt8),
+            "A_b": pl.Series([0, 1, 0], dtype=pl.UInt8),
+            "A_c": pl.Series([0, 0, 1], dtype=pl.UInt8),
+            "B": [1, 3, 5],
+            "B_1": pl.Series([1, 0, 0], dtype=pl.UInt8),
+            "B_3": pl.Series([0, 1, 0], dtype=pl.UInt8),
+            "B_5": pl.Series([0, 0, 1], dtype=pl.UInt8),
+        }
+    )
+    assert_frame_equal(dummies, expected)
+
+
 def test_to_pandas(df: pl.DataFrame) -> None:
     # pyarrow cannot deal with unsigned dictionary integer yet.
     # pyarrow cannot convert a time64 w/ non-zero nanoseconds
diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py
index 0ed478b9aa83..aa34c9d09b78 100644
--- a/py-polars/tests/unit/series/test_series.py
+++ b/py-polars/tests/unit/series/test_series.py
@@ -1357,6 +1357,16 @@ def test_to_dummies_drop_first() -> None:
     assert_frame_equal(result, expected)
 
 
+def test_to_dummies_keep_column() -> None:
+    s = pl.Series("a", [1, 2, 3])
+    result = s.to_dummies(keep_column=True)
+    expected = pl.DataFrame(
+        {"a": [1, 2, 3], "a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]},
+        schema={"a": pl.Int64, "a_1": pl.UInt8, "a_2": pl.UInt8, "a_3": pl.UInt8},
+    )
+    assert_frame_equal(result, expected)
+
+
 def test_to_dummies_null_clash_19096() -> None:
     with pytest.raises(
         DuplicateError, match="column with name '_null' has more than one occurrence"