From 680e2612e69baa2c149985e0fa80d4795e7d26fd Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 7 Mar 2024 18:48:43 +0800 Subject: [PATCH 01/31] Move K8s cloud name into common lib for Jenkins CI (#582) Signed-off-by: Tim Liu --- ci/Jenkinsfile.premerge | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/Jenkinsfile.premerge b/ci/Jenkinsfile.premerge index bf235ea9..8aae41de 100644 --- a/ci/Jenkinsfile.premerge +++ b/ci/Jenkinsfile.premerge @@ -40,7 +40,7 @@ pipeline { agent { kubernetes { label "premerge-init-${BUILD_TAG}" - cloud 'sc-ipp-blossom-prod' + cloud "${common.CLOUD_NAME}" yaml cpuImage } } @@ -107,7 +107,7 @@ pipeline { agent { kubernetes { label "premerge-docker-${BUILD_TAG}" - cloud 'sc-ipp-blossom-prod' + cloud "${common.CLOUD_NAME}" yaml pod.getDockerBuildYAML() workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false) customWorkspace "${CUSTOM_WORKSPACE}" @@ -169,7 +169,7 @@ pipeline { agent { kubernetes { label "premerge-ci-${BUILD_TAG}" - cloud 'sc-ipp-blossom-prod' + cloud "${common.CLOUD_NAME}" yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false) customWorkspace "${CUSTOM_WORKSPACE}" From b6556a2647de597037ca64449c4f69a0ade5d87b Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:13:56 +0800 Subject: [PATCH 02/31] add cloud guardword (#594) Signed-off-by: YanxuanLiu --- ci/Jenkinsfile.premerge | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/Jenkinsfile.premerge b/ci/Jenkinsfile.premerge index 8aae41de..43e5d2ac 100644 --- a/ci/Jenkinsfile.premerge +++ b/ci/Jenkinsfile.premerge @@ -203,6 +203,7 @@ pipeline { // upload log only in case of build failure def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"] guardWords.add("nvidia-smi(?s)(.*?)(?=git)") // hide GPU info + guardWords.add("sc-ipp*") // hide cloud info githubHelper.uploadLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords) githubHelper.updateCommitStatus("$BUILD_URL", "Fail", GitHubCommitState.FAILURE) From 6989591cb00917055bb61c68c54b64dbbffd0cb2 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Tue, 26 Mar 2024 13:21:39 +0800 Subject: [PATCH 03/31] Auto merge PRs to branch-24.06 from branch-24.04 [skip ci] (#595) Signed-off-by: Tim Liu --- .github/workflows/auto-merge.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 1302ecc0..27b0c2fd 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-24.02 + - branch-24.04 types: [closed] jobs: @@ -29,14 +29,14 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: branch-24.02 # force to fetch from latest upstream instead of PR ref + ref: branch-24.04 # force to fetch from latest upstream instead of PR ref - name: auto-merge job uses: ./.github/workflows/auto-merge env: OWNER: NVIDIA REPO_NAME: spark-rapids-ml - HEAD: branch-24.02 - BASE: branch-24.04 + HEAD: branch-24.04 + BASE: branch-24.06 AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR From ca2975e151dd478b9d997be53f928dfa43e1028e Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Thu, 4 Apr 2024 19:45:17 -0700 Subject: [PATCH 04/31] DBSCAN Algorithm for Spark RAPIDS ML (#598) * DBSCAN basis * 
Precomputed distance testcase, usecase comments Signed-off-by: nvssh nssswitch user account * Code structure cleaning and update for ColID extraction * testfile fix * Remove Precomputed mode support * idCol fix * Syntax fix for CI python version * ColID fix, code cleaning for sparse vector input * Avoid core indices calc kernel by default --------- Signed-off-by: nvssh nssswitch user account Co-authored-by: nvssh nssswitch user account --- python/src/spark_rapids_ml/dbscan.py | 644 +++++++++++++++++++++++++++ python/src/spark_rapids_ml/params.py | 43 ++ python/tests/test_dbscan.py | 254 +++++++++++ 3 files changed, 941 insertions(+) create mode 100644 python/src/spark_rapids_ml/dbscan.py create mode 100644 python/tests/test_dbscan.py diff --git a/python/src/spark_rapids_ml/dbscan.py b/python/src/spark_rapids_ml/dbscan.py new file mode 100644 index 00000000..27142072 --- /dev/null +++ b/python/src/spark_rapids_ml/dbscan.py @@ -0,0 +1,644 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from abc import ABCMeta +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + Union, + cast, +) + +import numpy as np +import pandas as pd +from pyspark import keyword_only +from pyspark.ml.param.shared import HasFeaturesCol, Param, Params, TypeConverters +from pyspark.sql import Column +from pyspark.sql.dataframe import DataFrame +from pyspark.sql.functions import col, monotonically_increasing_id +from pyspark.sql.types import ( + DoubleType, + FloatType, + IntegerType, + LongType, + Row, + StructField, + StructType, +) + +from .core import ( + FitInputType, + _ConstructFunc, + _CumlCaller, + _CumlEstimator, + _CumlModel, + _CumlModelWithPredictionCol, + _EvaluateFunc, + _read_csr_matrix_from_unwrapped_spark_vec, + _TransformFunc, + _use_sparse_in_cuml, + alias, + param_alias, +) +from .metrics import EvalMetricInfo +from .params import HasFeaturesCols, HasIDCol, P, _CumlClass, _CumlParams +from .utils import _ArrayOrder, _concat_and_free, _get_spark_session, get_logger + +if TYPE_CHECKING: + import cudf + from pyspark.ml._typing import ParamMap + + +class DBSCANClass(_CumlClass): + @classmethod + def _param_mapping(cls) -> Dict[str, Optional[str]]: + return {} + + def _get_cuml_params_default(self) -> Dict[str, Any]: + return { + "eps": 0.5, + "min_samples": 5, + "metric": "euclidean", + "verbose": False, + "max_mbytes_per_batch": None, + "calc_core_sample_indices": False, + } + + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + + +class _DBSCANCumlParams(_CumlParams, HasFeaturesCol, HasFeaturesCols, HasIDCol): + def __init__(self) -> None: + super().__init__() + self._setDefault( + eps=0.5, + min_samples=5, + metric="euclidean", + max_mbytes_per_batch=None, + calc_core_sample_indices=True, + idCol=alias.row_number, + ) + + eps = Param( + Params._dummy(), + "eps", + ( + f"The maximum distance between 2 points such they reside in the same neighborhood." 
+ ), + typeConverter=TypeConverters.toFloat, + ) + + min_samples = Param( + Params._dummy(), + "min_samples", + ( + f"The number of samples in a neighborhood such that this group can be considered as an important core point (including the point itself)." + ), + typeConverter=TypeConverters.toInt, + ) + + metric = Param( + Params._dummy(), + "metric", + ( + f"The metric to use when calculating distances between points." + f"Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead." + ), + typeConverter=TypeConverters.toString, + ) + + max_mbytes_per_batch = Param( + Params._dummy(), + "max_mbytes_per_batch", + ( + f"Calculate batch size using no more than this number of megabytes for the pairwise distance computation." + f"This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples." + f"If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device." + ), + typeConverter=TypeConverters.toInt, + ) + + calc_core_sample_indices = Param( + Params._dummy(), + "calc_core_sample_indices", + ( + f"Indicates whether the indices of the core samples should be calculated." + f"Setting this to False will avoid unnecessary kernel launches" + ), + typeConverter=TypeConverters.toBoolean, + ) + + idCol = Param( + Params._dummy(), + "idCol", + "id column name.", + typeConverter=TypeConverters.toString, + ) + + def getFeaturesCol(self) -> Union[str, List[str]]: # type: ignore + """ + Gets the value of :py:attr:`featuresCol` or :py:attr:`featuresCols` + """ + if self.isDefined(self.featuresCols): + return self.getFeaturesCols() + elif self.isDefined(self.featuresCol): + return self.getOrDefault("featuresCol") + else: + raise RuntimeError("featuresCol is not set") + + def setFeaturesCol(self: P, value: Union[str, List[str]]) -> P: + """ + Sets the value of :py:attr:`featuresCol` or :py:attr:`featuresCols`. + """ + if isinstance(value, str): + self._set_params(featuresCol=value) + else: + self._set_params(featuresCols=value) + return self + + def setFeaturesCols(self: P, value: List[str]) -> P: + """ + Sets the value of :py:attr:`featuresCols`. Used when input vectors are stored as multiple feature columns. + """ + return self._set_params(featuresCols=value) + + def setPredictionCol(self: P, value: str) -> P: + """ + Sets the value of :py:attr:`predictionCol`. + """ + self._set_params(predictionCol=value) + return self + + def setIdCol(self: P, value: str) -> P: + """ + Sets the value of `idCol`. If not set, an id column will be added with column name `unique_id`. The id column is used to specify nearest neighbor vectors by associated id value. + """ + self._set_params(idCol=value) + return self + + +class DBSCAN(DBSCANClass, _CumlEstimator, _DBSCANCumlParams): + """ + The Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a non-parametric + data clustering algorithm based on data density. It groups points close to each other that form a dense cluster + and mark the far-away points as noise and exclude them from all clusters. + + Parameters + ---------- + featuresCol: str or List[str] + The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n + * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. + * When the value is a list of strings, the feature columns must be numeric types. 
+ + predictionCol: str + the name of the column that stores cluster indices of input vectors. predictionCol should be set when users expect to apply the transform function of a learned model. + + num_workers: + Number of cuML workers, where each cuML worker corresponds to one Spark task + running on one GPU. If not set, spark-rapids-ml tries to infer the number of + cuML workers (i.e. GPUs in cluster) from the Spark environment. + + eps: float (default = 0.5) + The maximum distance between 2 points such they reside in the same neighborhood. + + min_samples: int (default = 5) + The number of samples in a neighborhood such that this group can be considered as + an important core point (including the point itself). + + metric: {'euclidean', 'cosine'}, default = 'euclidean' + The metric to use when calculating distances between points. + Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead + + verbose: int or boolean (default=False) + Logging level. + * ``0`` - Disables all log messages. + * ``1`` - Enables only critical messages. + * ``2`` - Enables all messages up to and including errors. + * ``3`` - Enables all messages up to and including warnings. + * ``4 or False`` - Enables all messages up to and including information messages. + * ``5 or True`` - Enables all messages up to and including debug messages. + * ``6`` - Enables all messages up to and including trace messages. + + max_mbytes_per_batch(optional): int + Calculate batch size using no more than this number of megabytes for the pairwise distance computation. + This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples. + If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device. + + calc_core_sample_indices(optional): boolean (default = True) + Indicates whether the indices of the core samples should be calculated. + Setting this to False will avoid unnecessary kernel launches + + idCol: str (default = 'unique_id') + The internal unique id column name for label matching, will not reveal in the output. + Need to be set to a name that does not conflict with an existing column name in the original input data. + + Examples + ---------- + >>> from spark_rapids_ml.dbscan import DBSCAN + >>> data = [([0.0, 0.0],), + ... ([1.0, 1.0],), + ... ([9.0, 8.0],), + ... ([8.0, 9.0],),] + >>> df = spark.createDataFrame(data, ["features"]) + >>> df.show() + +----------+ + | features| + +----------+ + |[0.0, 0.0]| + |[1.0, 1.0]| + |[9.0, 8.0]| + |[8.0, 9.0]| + +----------+ + >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCol("features") + >>> gpu_model = gpu_dbscan.fit(df) + >>> gpu_model.setPredictionCol("prediction") + >>> transformed = gpu_model.transform(df) + >>> transformed.show() + +----------+----------+ + | features|prediction| + +----------+----------+ + |[0.0, 0.0]| 0| + |[1.0, 1.0]| 0| + |[9.0, 8.0]| 1| + |[8.0, 9.0]| 1| + +----------+----------+ + >>> gpu_dbscan.save("/tmp/dbscan") + >>> gpu_model.save("/tmp/dbscan_model") + + >>> # vector column input + >>> from spark_rapids_ml.dbscan import DBSCAN + >>> from pyspark.ml.linalg import Vectors + >>> data = [(Vectors.dense([0.0, 0.0]),), + ... (Vectors.dense([1.0, 1.0]),), + ... (Vectors.dense([9.0, 8.0]),), + ... 
(Vectors.dense([8.0, 9.0]),),] + >>> df = spark.createDataFrame(data, ["features"]) + >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCol("features") + >>> gpu_dbscan.getFeaturesCol() + 'features' + >>> gpu_model = gpu_dbscan.fit(df) + + + >>> # multi-column input + >>> data = [(0.0, 0.0), + ... (1.0, 1.0), + ... (9.0, 8.0), + ... (8.0, 9.0),] + >>> df = spark.createDataFrame(data, ["f1", "f2"]) + >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCols(["f1", "f2"]) + >>> gpu_dbscan.getFeaturesCols() + ['f1', 'f2'] + >>> gpu_model = gpu_dbscan.fit(df) + """ + + @keyword_only + def __init__( + self, + *, + featuresCol: str = "features", + predictionCol: str = "prediction", + eps: float = 0.5, + min_samples: int = 5, + metric: str = "euclidean", + max_mbytes_per_batch: Optional[int] = None, + calc_core_sample_indices: bool = True, + verbose: Union[int, bool] = False, + **kwargs: Any, + ) -> None: + super().__init__() + self._set_params(**self._input_kwargs) + + max_records_per_batch_str = _get_spark_session().conf.get( + "spark.sql.execution.arrow.maxRecordsPerBatch", "10000" + ) + assert max_records_per_batch_str is not None + self.max_records_per_batch = int(max_records_per_batch_str) + self.BROADCAST_LIMIT = 8 << 30 + + self.verbose = verbose + + def setEps(self: P, value: float) -> P: + return self._set_params(eps=value) + + def getEps(self) -> float: + return self.getOrDefault("eps") + + def setMinSamples(self: P, value: int) -> P: + return self._set_params(min_samples=value) + + def getMinSamples(self) -> int: + return self.getOrDefault("min_samples") + + def setMetric(self: P, value: str) -> P: + return self._set_params(metric=value) + + def getMetric(self) -> str: + return self.getOrDefault("metric") + + def setMaxMbytesPerBatch(self: P, value: Optional[int]) -> P: + return self._set_params(max_mbytes_per_batch=value) + + def getMaxMbytesPerBatch(self) -> Optional[int]: + return self.getOrDefault("max_mbytes_per_batch") + + def setCalcCoreSampleIndices(self: P, value: bool) -> P: + return self._set_params(calc_core_sample_indices=value) + + def getCalcCoreSampleIndices(self) -> bool: + return self.getOrDefault("calc_core_sample_indices") + + def _fit(self, dataset: DataFrame) -> _CumlModel: + if self.getMetric() == "precomputed": + raise ValueError( + "Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead" + ) + + # Create parameter-copied model without accessing the input dataframe + # All information will be retrieved from Model and transform + model = DBSCANModel(verbose=self.verbose, n_cols=0, dtype="") + + model._num_workers = self.num_workers + self._copyValues(model) + + return model + + def _create_pyspark_model(self, result: Row) -> _CumlModel: + raise NotImplementedError("DBSCAN does not support model creation from Row") + + def _get_cuml_fit_func( + self, + dataset: DataFrame, + extra_params: Optional[List[Dict[str, Any]]] = None, + ) -> Callable[ + [FitInputType, Dict[str, Any]], + Dict[str, Any], + ]: + raise NotImplementedError("DBSCAN does not fit and generate model") + + def _out_schema(self) -> Union[StructType, str]: + raise NotImplementedError("DBSCAN does not output for fit and generate model") + + +class DBSCANModel( + DBSCANClass, _CumlCaller, _CumlModelWithPredictionCol, _DBSCANCumlParams +): + def __init__( + self, + n_cols: int, + dtype: str, + verbose: Union[int, bool], + ): + super(DBSCANClass, self).__init__() + super(_CumlModelWithPredictionCol, 
self).__init__(n_cols=n_cols, dtype=dtype) + super(_DBSCANCumlParams, self).__init__() + + self._setDefault( + idCol=alias.row_number, + ) + + self.verbose = verbose + self.BROADCAST_LIMIT = 8 << 30 + self._dbscan_spark_model = None + + def _pre_process_data(self, dataset: DataFrame) -> Tuple[ # type: ignore + List[Column], + Optional[List[str]], + int, + Union[Type[FloatType], Type[DoubleType]], + ]: + ( + select_cols, + multi_col_names, + dimension, + feature_type, + ) = _CumlCaller._pre_process_data(self, dataset) + + # Must retain idCol for label matching + if self.hasParam("idCol") and self.isDefined("idCol"): + id_col_name = self.getOrDefault("idCol") + select_cols.append(col(id_col_name)) + else: + select_cols.append(col(alias.row_number)) + + return select_cols, multi_col_names, dimension, feature_type + + def _out_schema( + self, input_schema: StructType = StructType() + ) -> Union[StructType, str]: + return StructType( + [ + StructField(self._get_prediction_name(), IntegerType(), False), + StructField(self.getIdCol(), LongType(), False), + ] + ) + + def _transform_array_order(self) -> _ArrayOrder: + return "C" + + def _fit_array_order(self) -> _ArrayOrder: + return "C" + + def _require_nccl_ucx(self) -> Tuple[bool, bool]: + return (True, True) + + def _get_cuml_fit_func( + self, + dataset: DataFrame, + extra_params: Optional[List[Dict[str, Any]]] = None, + ) -> Callable[ + [FitInputType, Dict[str, Any]], + Dict[str, Any], + ]: + import cupy as cp + import cupyx + + dtype = self.dtype + n_cols = self.n_cols + array_order = self._fit_array_order() + pred_name = self._get_prediction_name() + idCol_name = self.getIdCol() + + cuda_managed_mem_enabled = ( + _get_spark_session().conf.get("spark.rapids.ml.uvm.enabled", "false") + == "true" + ) + + inputs = [] # type: ignore + + idCol = list( + self.idCols_[0].value + if len(self.idCols_) == 1 + else np.concatenate([chunk.value for chunk in self.idCols_]) + ) + + for pdf_bc in self.raw_data_: + features = pdf_bc.value + + # experiments indicate it is faster to convert to numpy array and then to cupy array than directly + # invoking cupy array on the list + if cuda_managed_mem_enabled: + features = cp.array(features) + + inputs.append(features) + + concated = _concat_and_free(inputs, order=array_order) + + def _cuml_fit( + dfs: FitInputType, + params: Dict[str, Any], + ) -> Dict[str, Any]: + from cuml.cluster.dbscan_mg import DBSCANMG as CumlDBSCANMG + from pyspark import BarrierTaskContext + + context = BarrierTaskContext.get() + partition_id = context.partitionId() + + logger = get_logger(self.__class__) + + dbscan = CumlDBSCANMG( + handle=params[param_alias.handle], + output_type="cudf", + eps=self.getOrDefault("eps"), + min_samples=self.getOrDefault("min_samples"), + metric=self.getOrDefault("metric"), + max_mbytes_per_batch=self.getOrDefault("max_mbytes_per_batch"), + calc_core_sample_indices=self.getOrDefault("calc_core_sample_indices"), + verbose=self.verbose, + ) + dbscan.n_cols = params[param_alias.num_cols] + dbscan.dtype = np.dtype(dtype) + + res = list(dbscan.fit_predict(concated).to_numpy()) + + # Only node 0 from cuML will contain the correct label output + if partition_id == 0: + return { + idCol_name: idCol, + pred_name: res, + } + else: + return { + idCol_name: [], + pred_name: [], + } + + return _cuml_fit + + def _get_cuml_transform_func( + self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None + ) -> Tuple[ + _ConstructFunc, + _TransformFunc, + Optional[_EvaluateFunc], + ]: + raise 
NotImplementedError( + "DBSCAN does not can not have a separate transform UDF" + ) + + def _transform(self, dataset: DataFrame) -> DataFrame: + logger = get_logger(self.__class__) + + spark = _get_spark_session() + + def _chunk_arr( + arr: np.ndarray, BROADCAST_LIMIT: int = self.BROADCAST_LIMIT + ) -> List[np.ndarray]: + """Chunk an array, if oversized, into smaller arrays that can be broadcasted.""" + if arr.nbytes <= BROADCAST_LIMIT: + return [arr] + + rows_per_chunk = BROADCAST_LIMIT // (arr.nbytes // arr.shape[0]) + num_chunks = (arr.shape[0] + rows_per_chunk - 1) // rows_per_chunk + chunks = [ + arr[i * rows_per_chunk : (i + 1) * rows_per_chunk] + for i in range(num_chunks) + ] + + return chunks + + dataset = self._ensureIdCol(dataset) + select_cols, multi_col_names, dimension, _ = self._pre_process_data(dataset) + input_dataset = dataset.select(*select_cols) + pd_dataset: pd.DataFrame = input_dataset.toPandas() + + if multi_col_names: + raw_data = np.array( + pd_dataset.drop(columns=[self.getIdCol()]), + order=self._fit_array_order(), + ) + else: + raw_data = np.array( + list(pd_dataset.drop(columns=[self.getIdCol()])[alias.data]), + order=self._fit_array_order(), + ) + + idCols: np.ndarray = np.array(pd_dataset[self.getIdCol()]) + + # Set input metadata + self.n_cols = len(raw_data[0]) + self.dtype = ( + type(raw_data[0][0][0]).__name__ + if isinstance(raw_data[0][0], List) + or isinstance(raw_data[0][0], np.ndarray) + else type(raw_data[0][0]).__name__ + ) + + # Broadcast preprocessed input dataset and the idCol + broadcast_raw_data = [ + spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(raw_data) + ] + + broadcast_idCol = [ + spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(idCols) + ] + + self.processed_input_cols = input_dataset.drop(self.getIdCol()).columns + self.raw_data_ = broadcast_raw_data + self.idCols_ = broadcast_idCol + self.multi_col_names = multi_col_names + + idCol_name = self.getIdCol() + + default_num_partitions = dataset.rdd.getNumPartitions() + + rdd = self._call_cuml_fit_func( + dataset=dataset, + partially_collect=False, + paramMaps=None, + ) + rdd = rdd.repartition(default_num_partitions) + + pred_df = rdd.toDF() + + # JOIN the transformed label column into the original input dataset + # and discard the internal idCol for row matching + return dataset.join(pred_df, idCol_name).drop(idCol_name) + + def _get_model_attributes(self) -> Optional[Dict[str, Any]]: + """ + Override parent method to bring broadcast variables to driver before JSON serialization. + """ + + self._model_attributes["verbose"] = self.verbose + + return self._model_attributes diff --git a/python/src/spark_rapids_ml/params.py b/python/src/spark_rapids_ml/params.py index 88daf64b..e4b6f8b8 100644 --- a/python/src/spark_rapids_ml/params.py +++ b/python/src/spark_rapids_ml/params.py @@ -30,6 +30,8 @@ from pyspark import SparkContext from pyspark.ml.param import Param, Params, TypeConverters from pyspark.sql import SparkSession +from pyspark.sql.dataframe import DataFrame +from pyspark.sql.functions import col, monotonically_increasing_id from .utils import _get_spark_session, _is_local, get_logger @@ -85,6 +87,47 @@ def getFeaturesCols(self) -> List[str]: return self.getOrDefault(self.featuresCols) +class HasIDCol(Params): + """ + Mixin for param idCol: ID for each row of input dataset for row matching. 
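+    Provides `_ensureIdCol`, which prepends a `monotonically_increasing_id` based id column
+    (renaming the default column name on collision) when the caller has not set `idCol`.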
+ """ + + idCol = Param( + Params._dummy(), # type: ignore + "idCol", + "id column name.", + typeConverter=TypeConverters.toString, + ) + + def __init__(self) -> None: + super(HasIDCol, self).__init__() + + def getIdCol(self) -> str: + """ + Gets the value of `idCol`. + """ + return self.getOrDefault("idCol") + + def _ensureIdCol(self, df: DataFrame) -> DataFrame: + """ + Ensure an id column exists in the input dataframe. Add the column if not exists. + """ + dedup = False + if not self.isSet("idCol"): + while self.getIdCol() in df.columns: + self._set(**{"idCol": self.getIdCol() + "_dedup"}) + dedup = True + + id_col_name = self.getIdCol() + df_withid = df.select(monotonically_increasing_id().alias(id_col_name), "*") + df_withid = ( + df + if self.isSet("idCol") and not dedup + else df.select(monotonically_increasing_id().alias(id_col_name), "*") + ) + return df_withid + + class _CumlClass(object): """ Base class for all _CumlEstimator and _CumlModel implemenations. diff --git a/python/tests/test_dbscan.py b/python/tests/test_dbscan.py new file mode 100644 index 00000000..ebbebb91 --- /dev/null +++ b/python/tests/test_dbscan.py @@ -0,0 +1,254 @@ +# +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Dict, List, Tuple, Type, TypeVar + +import numpy as np +import pyspark +import pytest +from _pytest.logging import LogCaptureFixture +from packaging import version + +if version.parse(pyspark.__version__) < version.parse("3.4.0"): + from pyspark.sql.utils import IllegalArgumentException # type: ignore +else: + from pyspark.errors import IllegalArgumentException # type: ignore + +from pyspark.ml.functions import array_to_vector +from pyspark.ml.linalg import DenseVector, Vectors +from pyspark.sql.dataframe import DataFrame +from pyspark.sql.functions import col + +from spark_rapids_ml.dbscan import DBSCAN, DBSCANModel + +from .sparksession import CleanSparkSession +from .utils import ( + assert_params, + create_pyspark_dataframe, + cuml_supported_data_types, + feature_types, + get_default_cuml_parameters, + idfn, + pyspark_supported_feature_types, +) + + +def test_default_cuml_params() -> None: + from cuml import DBSCAN as CumlDBSCAN + + cuml_params = get_default_cuml_parameters([CumlDBSCAN], ["handle", "output_type"]) + cuml_params["calc_core_sample_indices"] = False + spark_params = DBSCAN()._get_cuml_params_default() + assert cuml_params == spark_params + + +def test_dbscan_params( + gpu_number: int, tmp_path: str, caplog: LogCaptureFixture +) -> None: + # Default constructor + default_spark_params: Dict[str, Any] = {} + default_cuml_params = { + "eps": 0.5, + "min_samples": 5, + "metric": "euclidean", + "verbose": False, + "max_mbytes_per_batch": None, + "calc_core_sample_indices": False, + } + default_dbscan = DBSCAN() + assert_params(default_dbscan, default_spark_params, default_cuml_params) + + # Estimator persistence + path = tmp_path + "/dbscan_tests" + estimator_path = f"{path}/dbscan" + 
default_dbscan.write().overwrite().save(estimator_path) + loaded_dbscan = DBSCAN.load(estimator_path) + assert_params(loaded_dbscan, default_spark_params, default_cuml_params) + + +def test_dbscan_basic( + gpu_number: int, tmp_path: str, caplog: LogCaptureFixture +) -> None: + # reduce the number of GPUs for toy dataset to avoid empty partition + gpu_number = min(gpu_number, 2) + data = [ + ([0.0, 0.0]), + ([1.0, 1.0]), + ([9.0, 8.0]), + ([8.0, 9.0]), + ] + + with CleanSparkSession() as spark: + df = ( + spark.sparkContext.parallelize(data, gpu_number) + .map(lambda row: (row,)) + .toDF(["features"]) + ) + dbscan = DBSCAN(num_workers=gpu_number, min_samples=2, eps=2).setFeaturesCol( + "features" + ) + + dbscan_model = dbscan.fit(df) + + # Model persistence + path = tmp_path + "/dbscan_tests" + model_path = f"{path}/dbscan_model" + dbscan_model.write().overwrite().save(model_path) + dbscan_model_loaded = DBSCANModel.load(model_path) + + # test transform function + dbscan_model.setPredictionCol("prediction") + label_df = dbscan_model.transform(df) + assert ["features", "prediction"] == sorted(label_df.columns) + + o_col = dbscan_model.getPredictionCol() + labels = [row[o_col] for row in label_df.collect()] + + assert len(labels) == 4 + assert labels[0] == labels[1] + assert labels[1] != labels[2] + assert labels[2] == labels[3] + + # Test the loaded model + dbscan_model_loaded.setPredictionCol("prediction") + label_df = dbscan_model_loaded.transform(df) + assert ["features", "prediction"] == sorted(label_df.columns) + + o_col = dbscan_model_loaded.getPredictionCol() + labels = [row[o_col] for row in label_df.collect()] + + assert len(labels) == 4 + assert labels[0] == labels[1] + assert labels[1] != labels[2] + assert labels[2] == labels[3] + + +@pytest.mark.parametrize("data_type", ["byte", "short", "int", "long"]) +def test_dbscan_numeric_type(gpu_number: int, data_type: str) -> None: + # reduce the number of GPUs for toy dataset to avoid empty partition + gpu_number = min(gpu_number, 2) + data = [ + [1, 4, 4, 4, 0], + [2, 2, 2, 2, 1], + [3, 3, 3, 2, 2], + [3, 3, 3, 2, 3], + [5, 2, 1, 3, 4], + ] + + with CleanSparkSession() as spark: + feature_cols = ["c1", "c2", "c3", "c4", "c5"] + schema = ", ".join([f"{c} {data_type}" for c in feature_cols]) + df = spark.createDataFrame(data, schema=schema) + dbscan = DBSCAN(num_workers=gpu_number, featuresCols=feature_cols) + dbscan_model = dbscan.fit(df) + label_df = dbscan_model.transform(df) + + +@pytest.mark.parametrize("feature_type", pyspark_supported_feature_types) +@pytest.mark.parametrize( + "data_shape", + [(1000, 20), pytest.param((10000, 200), marks=pytest.mark.slow)], + ids=idfn, +) +@pytest.mark.parametrize("data_type", cuml_supported_data_types) +@pytest.mark.parametrize("max_record_batch", [100, 10000]) +def test_dbscan( + gpu_number: int, + feature_type: str, + data_shape: Tuple[int, int], + data_type: np.dtype, + max_record_batch: int, +) -> None: + from cuml.datasets import make_blobs + + n_rows = data_shape[0] + n_cols = data_shape[1] + n_clusters = 8 + cluster_std = 1.0 + + eps = 5 + min_samples = 5 + metric = "euclidean" + + X, _ = make_blobs( + n_rows, n_cols, n_clusters, cluster_std=cluster_std, random_state=0 + ) # make_blobs creates a random dataset of isotropic gaussian blobs. 
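+    # Below, a single-GPU cuML DBSCAN is fit on the same data to produce reference labels;
+    # the test then checks that each Spark-assigned cluster id maps to exactly one cuML
+    # cluster id (the ids themselves may differ, only the mapping must be consistent).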
+ + from cuml import DBSCAN as cuDBSCAN + + cuml_dbscan = cuDBSCAN( + eps=eps, min_samples=min_samples, metric=metric, output_type="numpy", verbose=7 + ) + + import cudf + + gdf = cudf.DataFrame(X) + cuml_transformed = cuml_dbscan.fit_predict(gdf) + + sample_to_cluster = dict() + cluster_dict: Dict[int, int] = dict() + + np_df = X.get() + for rid, row in enumerate(np_df): + label = cuml_transformed[rid] + + sample_to_cluster[tuple(row)] = label + + conf = {"spark.sql.execution.arrow.maxRecordsPerBatch": str(max_record_batch)} + with CleanSparkSession(conf) as spark: + df, features_col, _ = create_pyspark_dataframe( + spark, feature_type, data_type, X, None + ) + + dbscan = DBSCAN( + num_workers=gpu_number, + eps=eps, + min_samples=min_samples, + metric=metric, + verbose=7, + ).setFeaturesCol(features_col) + + dbscan_model = dbscan.fit(df) + dbscan_model.setPredictionCol("prediction") + transformed = dbscan_model.transform(df) + + # Check cluster match + label_df = transformed.select("prediction") + feature_df = transformed.drop("prediction") + + label_pdf = label_df.toPandas() + feature_pdf = feature_df.toPandas() + + label_arr = label_pdf.to_numpy().squeeze() + feature_matrix = feature_pdf.to_numpy() + + for rid, row in enumerate(feature_matrix): + if isinstance(row[0], DenseVector): + data = tuple(row[0].toArray()) + elif isinstance(row[0], np.float32) or isinstance(row[0], np.float64): + data = tuple(row) + else: + data = tuple(row[0]) + + # Get the label computed by rapids and cuml + label_rapids = label_arr[rid] + label_cuml = sample_to_cluster[data] + + # Check if the mapping from rapids cluster to cuml cluster holds + if label_rapids in cluster_dict: + assert cluster_dict[label_rapids] == label_cuml + else: + cluster_dict[label_rapids] = label_cuml From ca730ec3fb937dd8e7f4dd1489e9d754a016777c Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:35:27 -0700 Subject: [PATCH 05/31] kNN update for colID extraction (#601) * kNN update for colID extraction * comment fix Signed-off-by: nvssh nssswitch user account --------- Signed-off-by: nvssh nssswitch user account Co-authored-by: nvssh nssswitch user account --- python/src/spark_rapids_ml/dbscan.py | 2 +- python/src/spark_rapids_ml/knn.py | 30 ++++------------------------ 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/python/src/spark_rapids_ml/dbscan.py b/python/src/spark_rapids_ml/dbscan.py index 27142072..d6cf190c 100644 --- a/python/src/spark_rapids_ml/dbscan.py +++ b/python/src/spark_rapids_ml/dbscan.py @@ -191,7 +191,7 @@ def setPredictionCol(self: P, value: str) -> P: def setIdCol(self: P, value: str) -> P: """ - Sets the value of `idCol`. If not set, an id column will be added with column name `unique_id`. The id column is used to specify nearest neighbor vectors by associated id value. + Sets the value of `idCol`. If not set, an id column will be added with column name `unique_id`. The id column is used to specify dbscan vectors by associated id value. 
""" self._set_params(idCol=value) return self diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 51104158..cef0c6cf 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -57,7 +57,7 @@ param_alias, ) from .metrics import EvalMetricInfo -from .params import P, _CumlClass, _CumlParams +from .params import HasIDCol, P, _CumlClass, _CumlParams from .utils import _concat_and_free, get_logger @@ -73,7 +73,9 @@ def _pyspark_class(self) -> Optional[ABCMeta]: return None -class _NearestNeighborsCumlParams(_CumlParams, HasInputCol, HasLabelCol, HasInputCols): +class _NearestNeighborsCumlParams( + _CumlParams, HasInputCol, HasLabelCol, HasInputCols, HasIDCol +): """ Shared Spark Params for NearestNeighbor and NearestNeighborModel. """ @@ -126,30 +128,6 @@ def setIdCol(self: P, value: str) -> P: self._set_params(idCol=value) return self - def getIdCol(self) -> str: - """ - Gets the value of `idCol`. - """ - return self.getOrDefault(self.idCol) - - def _ensureIdCol(self, df: DataFrame) -> DataFrame: - """ - Ensure an id column exists in the input dataframe. Add the column if not exists. - """ - if not self.isSet("idCol") and self.getIdCol() in df.columns: - raise ValueError( - f"Cannot create a default id column since a column with the default name '{self.getIdCol()}' already exists." - + "Please specify an id column" - ) - - id_col_name = self.getIdCol() - df_withid = ( - df - if self.isSet("idCol") - else df.select(monotonically_increasing_id().alias(id_col_name), "*") - ) - return df_withid - class NearestNeighbors( NearestNeighborsClass, _CumlEstimatorSupervised, _NearestNeighborsCumlParams From 0d7743ce3085f5a8613f6a0197fb9635bed25d0a Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Sun, 7 Apr 2024 13:21:51 +0800 Subject: [PATCH 06/31] update actions version (#597) Signed-off-by: YanxuanLiu --- .github/workflows/auto-merge.yml | 2 +- .github/workflows/blossom-ci.yml | 4 ++-- .github/workflows/gcs-benchmark.yml | 2 +- .github/workflows/signoff-check.yml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 27b0c2fd..1b8ecd62 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: branch-24.04 # force to fetch from latest upstream instead of PR ref diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 96fde0da..2f66dcba 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -61,7 +61,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} @@ -69,7 +69,7 @@ jobs: # repo specific steps - name: Setup java - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: adopt java-version: 8 diff --git a/.github/workflows/gcs-benchmark.yml b/.github/workflows/gcs-benchmark.yml index 9ef57ac6..c252f425 100644 --- a/.github/workflows/gcs-benchmark.yml +++ b/.github/workflows/gcs-benchmark.yml @@ -39,7 +39,7 @@ jobs: SERVICE_ACCOUNT: ${{ secrets.GCLOUD_SERVICE_ACCOUNT }} CLUSTER_NAME: github-spark-rapids-ml-${{github.run_number}} steps: - - uses: actions/checkout@v3 + - uses: 
actions/checkout@v4 - name: run benchmark shell: bash diff --git a/.github/workflows/signoff-check.yml b/.github/workflows/signoff-check.yml index 0743afe2..59c5c8fe 100755 --- a/.github/workflows/signoff-check.yml +++ b/.github/workflows/signoff-check.yml @@ -23,7 +23,7 @@ jobs: signoff-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: sigoff-check job uses: ./.github/workflows/signoff-check From dafe207ad423c626cb6da44078f9ecb58804604f Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Tue, 9 Apr 2024 10:53:46 +0800 Subject: [PATCH 07/31] revert action version (#609) Signed-off-by: YanxuanLiu --- .github/workflows/blossom-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 2f66dcba..96fde0da 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -61,7 +61,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v3 with: repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} @@ -69,7 +69,7 @@ jobs: # repo specific steps - name: Setup java - uses: actions/setup-java@v4 + uses: actions/setup-java@v3 with: distribution: adopt java-version: 8 From d17ad0cfd71b4834554d3a71d6b9c4cbc7572ed8 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:18:13 -0700 Subject: [PATCH 08/31] knn overwrite for colID extraction (#612) Signed-off-by: Hongzhe Cheng --- python/src/spark_rapids_ml/knn.py | 19 +++++++++++++++++++ python/tests/test_nearest_neighbors.py | 7 ++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index cef0c6cf..7855962d 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -128,6 +128,25 @@ def setIdCol(self: P, value: str) -> P: self._set_params(idCol=value) return self + def _ensureIdCol(self, df: DataFrame) -> DataFrame: + """ + Ensure an id column exists in the input dataframe. Add the column if not exists. + Overwritten for knn assumption on error for not setting idCol and duplicate exists. + """ + if not self.isSet("idCol") and self.getIdCol() in df.columns: + raise ValueError( + f"Cannot create a default id column since a column with the default name '{self.getIdCol()}' already exists." 
+ + "Please specify an id column" + ) + + id_col_name = self.getIdCol() + df_withid = ( + df + if self.isSet("idCol") + else df.select(monotonically_increasing_id().alias(id_col_name), "*") + ) + return df_withid + class NearestNeighbors( NearestNeighborsClass, _CumlEstimatorSupervised, _NearestNeighborsCumlParams diff --git a/python/tests/test_nearest_neighbors.py b/python/tests/test_nearest_neighbors.py index 53b92027..8cc3a278 100644 --- a/python/tests/test_nearest_neighbors.py +++ b/python/tests/test_nearest_neighbors.py @@ -302,11 +302,12 @@ def assert_indices_equal(indices: List[List[int]]) -> None: ) # vector feature type will be converted to float32 to be compatible with cuml multi-gpu NearestNeighbors Class @pytest.mark.parametrize("data_shape", [(1000, 50)], ids=idfn) @pytest.mark.parametrize("data_type", [np.float32]) -@pytest.mark.parametrize("max_record_batch", [100, 10000]) @pytest.mark.parametrize( - "batch_size", [100, 10000] + "max_record_batch", [pytest.param(100, marks=pytest.mark.slow), 10000] +) +@pytest.mark.parametrize( + "batch_size", [pytest.param(100, marks=pytest.mark.slow), 10000] ) # larger batch_size higher query throughput, yet more memory -@pytest.mark.slow def test_nearest_neighbors( gpu_number: int, feature_type: str, From 41f6c2b6a3416b099dd1e9a12efd04b804c395e6 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:20:47 -0700 Subject: [PATCH 09/31] Sparse Datagen Fix (#607) * Datagen Fix * Auth fix Signed-off-by: nvssh nssswitch user account * Auth fix Signed-off-by: Hongzhe Cheng --------- Signed-off-by: nvssh nssswitch user account Signed-off-by: Hongzhe Cheng Co-authored-by: nvssh nssswitch user account --- python/benchmark/gen_data_distributed.py | 7 ++++++- python/benchmark/test_gen_data.py | 18 +++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/python/benchmark/gen_data_distributed.py b/python/benchmark/gen_data_distributed.py index 89331c62..2a318792 100644 --- a/python/benchmark/gen_data_distributed.py +++ b/python/benchmark/gen_data_distributed.py @@ -717,6 +717,11 @@ def gen_dataframe_and_meta( redundant_cols = 0 orig_cols = cols + # Crop too large value + for i in range(len(density_values)): + if density_values[i] > 1: + density_values[i] = 1 + # Generate ground truth upfront. 
if multinomial_log: ground_truth = np.zeros((cols, n_classes)) @@ -775,7 +780,7 @@ def make_sparse_regression_udf( d = density_values[i] chunk_cols = col_per_chunk[i] - # Generate a column + # Generate a chunk sparse_col = sp.sparse.random( num_rows_per_partition, chunk_cols, diff --git a/python/benchmark/test_gen_data.py b/python/benchmark/test_gen_data.py index 2c5c4c73..b16aadde 100644 --- a/python/benchmark/test_gen_data.py +++ b/python/benchmark/test_gen_data.py @@ -234,7 +234,7 @@ def test_make_regression( ) @pytest.mark.parametrize( "density", - ["0.25", ["0.05", "0.1", "0.2"], pytest.param("0.2", marks=pytest.mark.slow)], + ["0.25", ["0.05", "0.1", "0.2"]], ) @pytest.mark.parametrize( "rows, cols", @@ -244,15 +244,12 @@ def test_make_regression( "density_curve, shuffle", [ ("None", "True"), - ("Linear", "True"), ("Linear", "False"), ("Exponential", "False"), pytest.param("Exponential", "True", marks=pytest.mark.slow), ], ) -@pytest.mark.parametrize( - "n_chunks", ["10", pytest.param("100", marks=pytest.mark.slow)] -) +@pytest.mark.parametrize("n_chunks", ["100"]) def test_make_sparse_regression( dtype: str, use_gpu: str, @@ -296,8 +293,11 @@ def test_make_sparse_regression( density_curve, "--shuffle", shuffle, + "--n_chunk", + n_chunks, ] + # Add parameters with multiple value input_args.append("--bias") if isinstance(bias, List): input_args.extend(bias) @@ -327,7 +327,6 @@ def test_make_sparse_regression( assert len(X) == row_num, "X row number mismatch" for sparseVec in X: - # assert sparseVec.toArray().dtype == np.dtype(dtype), "Unexpected dtype" assert sparseVec.size == col_num, "X col number mismatch" assert y.shape == (row_num,), "y shape mismatch" @@ -400,6 +399,10 @@ def test_make_sparse_regression( ) density_values *= n_chunks_num * density_num / sum(density_values) + for i in range(len(density_values)): + if density_values[i] > 1: + density_values[i] = 1 + col_per_chunk = np.full(n_chunks_num, orig_cols // n_chunks_num) col_per_chunk[: (orig_cols % n_chunks_num)] += 1 chunk_boundary = np.cumsum(col_per_chunk) @@ -416,9 +419,10 @@ def test_make_sparse_regression( assert dense_count >= chunk_size * num_partitions * int( (row_num // num_partitions) * col_density - 1 ) and dense_count <= chunk_size * num_partitions * int( - (row_num // num_partitions) * col_density + 1 + (row_num // num_partitions + 1) * col_density + 1 ) + # Check all clusters exists if logistic_regression == "True": assert np.unique(y).shape[0] == n_classes_num From d3be33c4f180feae3ca5b0f3d62478f714fb65a1 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:38:40 -0700 Subject: [PATCH 10/31] DBSCAN code move, notebook, benchmark (#608) * DBSCAN notebook, benchmark script * Move DBSCAN to clustering Signed-off-by: Hongzhe Cheng * File check in * benchmark script support * Benchmark parameter fix * Transform time fix * DBSCAN data broadcast fix, comment fix * separate change for DBSCAN source code fix * cmdline switch for benchmark score compute * style fix --------- Signed-off-by: Hongzhe Cheng --- README.md | 3 +- notebooks/dbscan.ipynb | 522 +++++++++++++++++ python/benchmark/benchmark/bench_dbscan.py | 313 ++++++++++ python/benchmark/benchmark_runner.py | 2 + python/run_benchmark.sh | 41 ++ python/src/spark_rapids_ml/clustering.py | 588 ++++++++++++++++++- python/src/spark_rapids_ml/dbscan.py | 644 --------------------- python/tests/test_dbscan.py | 2 +- 8 files changed, 1467 insertions(+), 648 deletions(-) create mode 100644 
notebooks/dbscan.ipynb create mode 100644 python/benchmark/benchmark/bench_dbscan.py delete mode 100644 python/src/spark_rapids_ml/dbscan.py diff --git a/README.md b/README.md index e8916ff3..b80d2a31 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ The following table shows the currently supported algorithms. The goal is to ex | Supported Algorithms | Python | Scala | | :--------------------- | :----: | :---: | | CrossValidator | √ | | +| DBSCAN (*) | √ | | | KMeans | √ | | | k-NN (*) | √ | | | LinearRegression | √ | | @@ -44,7 +45,7 @@ The following table shows the currently supported algorithms. The goal is to ex | RandomForestRegressor | √ | | | UMAP (*) | √ | | -Note: Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. +Note: Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. As an alternative to KMeans, we also provide a Spark API for GPU accelerated Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density based clustering algorithm in the RAPIDS cuML library. 
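+
+For example, a minimal DBSCAN sketch (assuming an active `SparkSession` named `spark`; the toy data and the `features` column name are only illustrative):
+
+```python
+from spark_rapids_ml.clustering import DBSCAN
+
+df = spark.createDataFrame(
+    [([0.0, 0.0],), ([1.0, 1.0],), ([9.0, 8.0],), ([8.0, 9.0],)], ["features"]
+)
+
+# eps and min_samples mirror the cuML DBSCAN parameters of the same names
+dbscan = DBSCAN(eps=2.0, min_samples=2).setFeaturesCol("features")
+
+# fit() only captures parameters; the clustering itself runs during transform()
+model = dbscan.fit(df)
+model.setPredictionCol("prediction").transform(df).show()
+```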
## Getting started diff --git a/notebooks/dbscan.ipynb b/notebooks/dbscan.ipynb new file mode 100644 index 00000000..b8b24b37 --- /dev/null +++ b/notebooks/dbscan.ipynb @@ -0,0 +1,522 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DBSCAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a7c21330-7326-4d98-9351-d1b2e4c6143c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create synthetic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "291da378-0e9b-4b53-bf9e-b78c35631f1d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "n_rows = 10000\n", + "n_cols = 500\n", + "n_clusters_data = 10\n", + "cluster_std = 1.0\n", + "dtype='float32'\n", + "from sklearn.datasets import make_blobs\n", + "data, _ = make_blobs(\n", + " n_samples=n_rows, n_features=n_cols, centers=n_clusters_data, cluster_std=cluster_std, random_state=0\n", + " ) # make_blobs creates a random dataset of isotropic gaussian blobs.\n", + "\n", + "data = data.astype(dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a9ae1eaf-a6d7-467d-88c0-644ae814c488", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Convert dataset to Spark DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a9ae1eaf-a6d7-467d-88c0-644ae814c488", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "pd_data = pd.DataFrame({\"features\": list(data)})\n", + "df = spark.createDataFrame(pd_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "90b36ddc-2f90-409b-a213-3f319c736134", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Spark RAPIDS ML DBSCAN (GPU)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f1f994f0-4ca6-4b63-88f7-b0ac94ee3130", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from spark_rapids_ml.clustering import DBSCAN\n", + "gpu_dbscan = DBSCAN(eps=50.0, min_samples=3).setFeaturesCol(\"features\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a5a2cac5-18cd-450a-8571-195be588a361", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Estimator can be persisted and reloaded." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "estimator_path = \"/tmp/dbscan-estimator\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "798f5b5b-cfa6-45e4-aa18-40c0142e894a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "gpu_dbscan.write().overwrite().save(estimator_path)\n", + "gpu_dbscan_loaded = DBSCAN.load(estimator_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "209d8f9c-1d1c-4bdd-880c-10e46f9d0c49", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "start_time = time.time()\n", + "gpu_model = gpu_dbscan_loaded.fit(df)\n", + "print(f\"Fit took: {time.time() - start_time} sec\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "77499eff-5cf2-4ce6-95a1-e45b69abe3cd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "gpu_dbscan_loaded.getEps()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = \"/tmp/dbscan-model\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6874b600-707d-48a7-b780-5d4e939a746b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "gpu_model.write().overwrite().save(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "441c8635-9070-426f-8626-7083f34b7e71", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "gpu_model_loaded = gpu_model.read().load(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c3bfc179-e702-4198-8122-3a8ba98113a4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "transformed_df = gpu_model_loaded.setPredictionCol(\"transformed\").transform(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "de0ce32d-3d09-4e82-b6a7-26978925181a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "transformed_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0c63b707-0e00-4961-8a76-82f347886b83", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "transformed_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": 
"a7c5d3b5-ecb0-435a-8976-bc18a28e3e04", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "transformed_df.show(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare DBSCAN vs KMeans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Ring Shape Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_random_points_in_ring(center, inner_radius, outer_radius, num_points):\n", + " # Generate random angles\n", + " angles = np.random.uniform(0, 2 * np.pi, num_points)\n", + "\n", + " # Generate random radii within the ring\n", + " radii = np.sqrt(np.random.uniform(inner_radius**2, outer_radius**2, num_points))\n", + "\n", + " # Convert polar coordinates to Cartesian coordinates\n", + " x = center[0] + radii * np.cos(angles)\n", + " y = center[1] + radii * np.sin(angles)\n", + "\n", + " # Create array of points\n", + " points = np.column_stack((x, y))\n", + "\n", + " return points\n", + "\n", + "data_inner = generate_random_points_in_ring((0,0), 1, 2, 500)\n", + "data_outer = generate_random_points_in_ring((0,0), 4, 5, 500)\n", + "data = np.concatenate((data_inner, data_outer), axis=0)\n", + "np.random.shuffle(data)\n", + "\n", + "pd_data = pd.DataFrame({\"features\": list(data)})\n", + "df = spark.createDataFrame(pd_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run DBSCAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dbscan = DBSCAN(eps=1.0, min_samples=5).setFeaturesCol(\"features\")\n", + "dbscan_model = dbscan.fit(df)\n", + "dbscan_transformed = dbscan_model.transform(df)\n", + "\n", + "dbscan_pd = dbscan_transformed.toPandas()\n", + "dbscan_np = dbscan_pd.to_numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from spark_rapids_ml.clustering import KMeans\n", + "kmeans = KMeans(k=2).setFeaturesCol(\"features\")\n", + "kmeans_model = kmeans.fit(df)\n", + "kmeans_transformed = kmeans_model.transform(df)\n", + "\n", + "kmeans_pd = kmeans_transformed.toPandas()\n", + "kmeans_np = kmeans_pd.to_numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare Clustering Result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "cluster0 = []\n", + "cluster1 = []\n", + "for p in kmeans_np:\n", + " if (p[1] == 0):\n", + " cluster0.append(p[0])\n", + " else:\n", + " cluster1.append(p[0])\n", + "\n", + "cluster0 = np.array(cluster0)\n", + "cluster1 = np.array(cluster1)\n", + " \n", + "plt.scatter(cluster0[:, 0], cluster0[:, 1], s=5, label=\"cluster 0\")\n", + "plt.scatter(cluster1[:, 0], cluster1[:, 1], s=5, label=\"cluster 1\")\n", + " \n", + "plt.xlabel('X')\n", + "plt.ylabel('Y')\n", + "plt.title('KMeans Clustering Result')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster0 = []\n", + "cluster1 = []\n", + "for p in dbscan_np:\n", + " if (p[1] == 0):\n", + " cluster0.append(p[0])\n", + " else:\n", + " cluster1.append(p[0])\n", + "\n", + "cluster0 = np.array(cluster0)\n", + "cluster1 = 
np.array(cluster1)\n", + " \n", + "plt.scatter(cluster0[:, 0], cluster0[:, 1], s=5, label=\"cluster 0\")\n", + "plt.scatter(cluster1[:, 0], cluster1[:, 1], s=5, label=\"cluster 1\")\n", + " \n", + "plt.xlabel('X')\n", + "plt.ylabel('Y')\n", + "plt.title('DBSCAN Clustering Result')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "spark-rapids-ml-kmeans-demo", + "notebookOrigID": 1026070411409745, + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "vscode": { + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/benchmark/benchmark/bench_dbscan.py b/python/benchmark/benchmark/bench_dbscan.py new file mode 100644 index 00000000..70831596 --- /dev/null +++ b/python/benchmark/benchmark/bench_dbscan.py @@ -0,0 +1,313 @@ +# +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import pprint +import time +from typing import Any, Dict, Iterator, List, Optional, Union + +import numpy as np +import pandas as pd +from pyspark.ml.feature import VectorAssembler +from pyspark.ml.functions import array_to_vector, vector_to_array +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.functions import array, col, sum +from pyspark.sql.types import DoubleType, StructField, StructType + +from .base import BenchmarkBase +from .utils import inspect_default_params_from_func, with_benchmark + + +class BenchmarkDBSCAN(BenchmarkBase): + def _supported_class_params(self) -> Dict[str, Any]: + from pyspark.ml.clustering import KMeans + + params = inspect_default_params_from_func( + KMeans.__init__, + [ + "distanceMeasure", + "featuresCol", + "labelCol", + "predictionCol", + "probabilityCol", + "rawPredictionCol", + "weightCol", + "leafCol", + ], + ) + params["seed"] = 1 + params["eps"] = float + params["min_samples"] = int + return params + + def _parse_arguments(self, argv: List[Any]) -> None: + """Override to set class params based on cpu or gpu run (dbscan or kmeans)""" + pp = pprint.PrettyPrinter() + + self._args = self._parser.parse_args(argv) + print("command line arguments:") + pp.pprint(vars(self._args)) + + if self._args.num_cpus > 0: + supported_class_params = self._supported_class_params() + supported_class_params.pop("eps", None) + supported_class_params.pop("min_samples", None) + else: + supported_class_params = { + "eps": float, + "min_samples": int, + } + self._class_params = { + k: v + for k, v in vars(self._args).items() + if k in supported_class_params and v is not None + } + print("\nclass params:") + pp.pprint(self._class_params) + print() + + def _add_extra_arguments(self) -> None: + self._parser.add_argument( + "--no_cache", + action="store_true", + default=False, + help="whether to enable dataframe repartition, cache and cout outside fit function", + ) + self._parser.add_argument( + "--compute_score", + action="store_true", + default=False, + help="whether to compute algorithm evaluation score for benchmarking", + ) + + def score( + self, + transformed_df: DataFrame, + features_col: str, + prediction_col: str, + ) -> float: + """Computes the silhoutte score for the clustering result. This is a common metric to measure + how well the clustering algorithm performs. + + Parameters + ---------- + transformed_df + Model transformed data. + features_col + Name of features column. + Note: this column is assumed to be of pyspark sql 'array' type. + prediction_col + Name of prediction column + + Returns + ------- + float + The computed silhoutte score. 
+ + """ + from sklearn.metrics import silhouette_score + + sc = transformed_df.rdd.context + + pdf: pd.DataFrame = transformed_df.toPandas() + features_pdf = pdf.drop(columns=[prediction_col]) + prediction_pdf = pdf[prediction_col] + + features_np = np.stack(features_pdf.to_numpy().squeeze()) + prediction_np = prediction_pdf.to_numpy() + + return silhouette_score(features_np, prediction_np) + + def run_once( + self, + spark: SparkSession, + train_df: DataFrame, + features_col: Union[str, List[str]], + transform_df: Optional[DataFrame], + label_name: Optional[str], + ) -> Dict[str, Any]: + num_gpus = self.args.num_gpus + num_cpus = self.args.num_cpus + no_cache = self.args.no_cache + train_path = self.args.train_path + compute_score = self.args.compute_score + + func_start_time = time.time() + + first_col = train_df.dtypes[0][0] + first_col_type = train_df.dtypes[0][1] + is_array_col = True if "array" in first_col_type else False + is_vector_col = True if "vector" in first_col_type else False + is_single_col = is_array_col or is_vector_col + if not is_single_col: + input_cols = [c for c in train_df.schema.names] + output_col = "cluster_idx" + + if num_gpus > 0: + from spark_rapids_ml.clustering import DBSCAN + + assert num_cpus <= 0 + if not no_cache: + + def gpu_cache_df(df: DataFrame) -> DataFrame: + df = df.repartition(num_gpus).cache() + df.count() + return df + + train_df, prepare_time = with_benchmark( + "prepare dataset", lambda: gpu_cache_df(train_df) + ) + + params = self.class_params + print(f"Passing {params} to DBSCAN") + + gpu_estimator = DBSCAN( + num_workers=num_gpus, verbose=self.args.verbose, **params + ) + + if is_single_col: + gpu_estimator = gpu_estimator.setFeaturesCol(first_col) + else: + gpu_estimator = gpu_estimator.setFeaturesCols(input_cols) + + gpu_model, fit_time = with_benchmark( + "gpu fit", lambda: gpu_estimator.fit(train_df) + ) + + transformed_df, transform_time = with_benchmark( + "gpu transform", + lambda: gpu_model.setPredictionCol(output_col).transform(train_df), + ) + + # count doesn't trigger compute so do something not too compute intensive + _, extra_transform_time = with_benchmark( + "gpu transform result gathering", + lambda: transformed_df.agg(sum(output_col)).collect(), + ) + transform_time += extra_transform_time + + total_time = round(time.time() - func_start_time, 2) + print(f"gpu total time: {total_time} sec") + + df_for_scoring = transformed_df + feature_col = first_col + if not is_single_col: + feature_col = "features_array" + df_for_scoring = transformed_df.select( + array(*input_cols).alias("features_array"), output_col + ) + elif is_vector_col: + df_for_scoring = transformed_df.select( + vector_to_array(col(feature_col)), output_col + ) + + if num_cpus > 0: + from pyspark.ml.clustering import KMeans as SparkKMeans + + assert num_gpus <= 0 + if is_array_col: + vector_df = train_df.select( + array_to_vector(train_df[first_col]).alias(first_col) + ) + elif not is_vector_col: + vector_assembler = VectorAssembler(outputCol="features").setInputCols( + input_cols + ) + vector_df = vector_assembler.transform(train_df).drop(*input_cols) + first_col = "features" + else: + vector_df = train_df + + if not no_cache: + + def cpu_cache_df(df: DataFrame) -> DataFrame: + df = df.cache() + df.count() + return df + + vector_df, prepare_time = with_benchmark( + "prepare dataset", lambda: cpu_cache_df(vector_df) + ) + + params = self.class_params + print(f"Passing {params} to KMeans") + + cpu_estimator = ( + SparkKMeans(**params) + .setFeaturesCol(first_col) 
+ .setPredictionCol(output_col) + ) + + cpu_model, fit_time = with_benchmark( + "cpu fit", lambda: cpu_estimator.fit(vector_df) + ) + + print( + f"spark ML: iterations: {cpu_model.summary.numIter}, inertia: {cpu_model.summary.trainingCost}" + ) + + def cpu_transform(df: DataFrame) -> None: + transformed_df = cpu_model.transform(df) + transformed_df.agg(sum(output_col)).collect() + return transformed_df + + transformed_df, transform_time = with_benchmark( + "cpu transform", lambda: cpu_transform(vector_df) + ) + + total_time = time.time() - func_start_time + print(f"cpu total took: {total_time} sec") + + feature_col = first_col + df_for_scoring = transformed_df.select( + vector_to_array(col(feature_col)).alias(feature_col), output_col + ) + + # either cpu or gpu mode is run, not both in same run + score = ( + self.score(df_for_scoring, feature_col, output_col) + if compute_score + else "Not Computed" + ) + print(f"score: {score}") + + if num_gpus > 0: + result = { + "fit_time": fit_time, + "transform_time": transform_time, + "total_time": total_time, + "score": score, + "eps": self.args.eps, + "min_samples": self.args.min_samples, + "num_gpus": num_gpus, + "num_cpus": num_cpus, + "no_cache": no_cache, + "train_path": train_path, + } + else: + result = { + "fit_time": fit_time, + "transform_time": transform_time, + "total_time": total_time, + "score": score, + "k": self.args.k, + "maxIter": self.args.maxIter, + "tol": self.args.tol, + "num_gpus": num_gpus, + "num_cpus": num_cpus, + "no_cache": no_cache, + "train_path": train_path, + } + + return result diff --git a/python/benchmark/benchmark_runner.py b/python/benchmark/benchmark_runner.py index dcca346d..65c1705d 100644 --- a/python/benchmark/benchmark_runner.py +++ b/python/benchmark/benchmark_runner.py @@ -16,6 +16,7 @@ import argparse import sys +from benchmark.bench_dbscan import BenchmarkDBSCAN from benchmark.bench_kmeans import BenchmarkKMeans from benchmark.bench_linear_regression import BenchmarkLinearRegression from benchmark.bench_logistic_regression import BenchmarkLogisticRegression @@ -31,6 +32,7 @@ class BenchmarkRunner: def __init__(self) -> None: registered_algorithms = { + "dbscan": BenchmarkDBSCAN, "kmeans": BenchmarkKMeans, "knn": BenchmarkNearestNeighbors, "linear_regression": BenchmarkLinearRegression, diff --git a/python/run_benchmark.sh b/python/run_benchmark.sh index 0c3ec32b..1419b6df 100755 --- a/python/run_benchmark.sh +++ b/python/run_benchmark.sh @@ -3,6 +3,7 @@ # Usage: ./run_benchmark.sh cpu|gpu|gpu_etl [] # where can be: # all +# dbscan # kmeans # knn # linear_regression @@ -517,3 +518,43 @@ if [[ "${MODE}" =~ "umap" ]] || [[ "${MODE}" == "all" ]]; then $common_confs $spark_rapids_confs_umap \ ${EXTRA_ARGS} fi + +# DBSCAN +if [[ "${MODE}" =~ "dbscan" ]] || [[ "${MODE}" == "all" ]]; then + if [[ ! 
-d "${gen_data_root}/blobs/r${num_rows}_c${num_cols}_float32.parquet" ]]; then + python $gen_data_script blobs \ + --num_rows $num_rows \ + --num_cols $num_cols \ + --output_num_files $output_num_files \ + --dtype "float32" \ + --feature_type "array" \ + --output_dir "${gen_data_root}/blobs/r${num_rows}_c${num_cols}_float32.parquet" \ + $common_confs + + fi + + # DBSCAN involves a large amount of data transfer to the driver for broadcast + spark_rapids_confs_dbscan="$spark_rapids_confs --spark_confs spark.driver.maxResultSize=0" + + # Compute score when datasize is suitable + if (($num_rows * $num_cols < 50000000)); then + spark_rapids_confs_dbscan="$spark_rapids_confs_dbscan --compute_score" + fi + + echo "$sep algo: dbscan $sep" + python ./benchmark/benchmark_runner.py dbscan \ + --eps 100 \ + --min_samples 5 \ + --k 3 \ + --tol 1.0e-20 \ + --maxIter 30 \ + --initMode random \ + --num_gpus $num_gpus \ + --num_cpus $num_cpus \ + --no_cache \ + --num_runs $num_runs \ + --train_path "${gen_data_root}/blobs/r${num_rows}_c${num_cols}_float32.parquet" \ + --report_path "report_dbscan_${cluster_type}.csv" \ + $common_confs $spark_rapids_confs_dbscan \ + ${EXTRA_ARGS} +fi diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index ff27b2b7..803f9156 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -15,7 +15,7 @@ # from abc import ABCMeta -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast import numpy as np import pandas as pd @@ -24,11 +24,16 @@ from pyspark.ml.clustering import KMeansModel as SparkKMeansModel from pyspark.ml.clustering import _KMeansParams from pyspark.ml.linalg import Vector +from pyspark.ml.param.shared import HasFeaturesCol, Param, Params, TypeConverters +from pyspark.sql import Column from pyspark.sql.dataframe import DataFrame +from pyspark.sql.functions import col from pyspark.sql.types import ( ArrayType, DoubleType, + FloatType, IntegerType, + LongType, Row, StringType, StructField, @@ -39,14 +44,17 @@ CumlT, FitInputType, _ConstructFunc, + _CumlCaller, _CumlEstimator, + _CumlModel, _CumlModelWithPredictionCol, _EvaluateFunc, _TransformFunc, + alias, param_alias, ) from .metrics import EvalMetricInfo -from .params import HasFeaturesCols, P, _CumlClass, _CumlParams +from .params import HasFeaturesCols, HasIDCol, P, _CumlClass, _CumlParams from .utils import ( _ArrayOrder, _concat_and_free, @@ -489,3 +497,579 @@ def _transform_internal( return pd.Series(res) return _construct_kmeans, _transform_internal, None + + +class DBSCANClass(_CumlClass): + @classmethod + def _param_mapping(cls) -> Dict[str, Optional[str]]: + return {} + + def _get_cuml_params_default(self) -> Dict[str, Any]: + return { + "eps": 0.5, + "min_samples": 5, + "metric": "euclidean", + "verbose": False, + "max_mbytes_per_batch": None, + "calc_core_sample_indices": False, + } + + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + + +class _DBSCANCumlParams(_CumlParams, HasFeaturesCol, HasFeaturesCols, HasIDCol): + def __init__(self) -> None: + super().__init__() + self._setDefault( + eps=0.5, + min_samples=5, + metric="euclidean", + max_mbytes_per_batch=None, + calc_core_sample_indices=True, + idCol=alias.row_number, + ) + + eps = Param( + Params._dummy(), + "eps", + ( + f"The maximum distance between 2 points such they reside in the same neighborhood." 
+ ), + typeConverter=TypeConverters.toFloat, + ) + + min_samples = Param( + Params._dummy(), + "min_samples", + ( + f"The number of samples in a neighborhood such that this group can be considered as an important core point (including the point itself)." + ), + typeConverter=TypeConverters.toInt, + ) + + metric = Param( + Params._dummy(), + "metric", + ( + f"The metric to use when calculating distances between points." + f"Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead." + ), + typeConverter=TypeConverters.toString, + ) + + max_mbytes_per_batch = Param( + Params._dummy(), + "max_mbytes_per_batch", + ( + f"Calculate batch size using no more than this number of megabytes for the pairwise distance computation." + f"This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples." + f"If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device." + ), + typeConverter=TypeConverters.toInt, + ) + + calc_core_sample_indices = Param( + Params._dummy(), + "calc_core_sample_indices", + ( + f"Indicates whether the indices of the core samples should be calculated." + f"Setting this to False will avoid unnecessary kernel launches" + ), + typeConverter=TypeConverters.toBoolean, + ) + + idCol = Param( + Params._dummy(), + "idCol", + "id column name.", + typeConverter=TypeConverters.toString, + ) + + def getFeaturesCol(self) -> Union[str, List[str]]: # type: ignore + """ + Gets the value of :py:attr:`featuresCol` or :py:attr:`featuresCols` + """ + if self.isDefined(self.featuresCols): + return self.getFeaturesCols() + elif self.isDefined(self.featuresCol): + return self.getOrDefault("featuresCol") + else: + raise RuntimeError("featuresCol is not set") + + def setFeaturesCol(self: P, value: Union[str, List[str]]) -> P: + """ + Sets the value of :py:attr:`featuresCol` or :py:attr:`featuresCols`. + """ + if isinstance(value, str): + self._set_params(featuresCol=value) + else: + self._set_params(featuresCols=value) + return self + + def setFeaturesCols(self: P, value: List[str]) -> P: + """ + Sets the value of :py:attr:`featuresCols`. Used when input vectors are stored as multiple feature columns. + """ + return self._set_params(featuresCols=value) + + def setPredictionCol(self: P, value: str) -> P: + """ + Sets the value of :py:attr:`predictionCol`. + """ + self._set_params(predictionCol=value) + return self + + def setIdCol(self: P, value: str) -> P: + """ + Sets the value of `idCol`. If not set, an id column will be added with column name `unique_id`. The id column is used to specify dbscan vectors by associated id value. + """ + self._set_params(idCol=value) + return self + + +class DBSCAN(DBSCANClass, _CumlEstimator, _DBSCANCumlParams): + """ + The Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a non-parametric + data clustering algorithm based on data density. It groups points close to each other that form a dense cluster + and mark the far-away points as noise and exclude them from all clusters. + + Parameters + ---------- + featuresCol: str or List[str] + The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n + * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. + * When the value is a list of strings, the feature columns must be numeric types. 
+ + predictionCol: str + the name of the column that stores cluster indices of input vectors. predictionCol should be set when users expect to apply the transform function of a learned model. + + num_workers: + Number of cuML workers, where each cuML worker corresponds to one Spark task + running on one GPU. If not set, spark-rapids-ml tries to infer the number of + cuML workers (i.e. GPUs in cluster) from the Spark environment. + + eps: float (default = 0.5) + The maximum distance between 2 points such they reside in the same neighborhood. + + min_samples: int (default = 5) + The number of samples in a neighborhood such that this group can be considered as + an important core point (including the point itself). + + metric: {'euclidean', 'cosine'}, default = 'euclidean' + The metric to use when calculating distances between points. + Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead + + verbose: int or boolean (default=False) + Logging level. + * ``0`` - Disables all log messages. + * ``1`` - Enables only critical messages. + * ``2`` - Enables all messages up to and including errors. + * ``3`` - Enables all messages up to and including warnings. + * ``4 or False`` - Enables all messages up to and including information messages. + * ``5 or True`` - Enables all messages up to and including debug messages. + * ``6`` - Enables all messages up to and including trace messages. + + max_mbytes_per_batch(optional): int + Calculate batch size using no more than this number of megabytes for the pairwise distance computation. + This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples. + If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device. + + calc_core_sample_indices(optional): boolean (default = True) + Indicates whether the indices of the core samples should be calculated. + Setting this to False will avoid unnecessary kernel launches + + idCol: str (default = 'unique_id') + The internal unique id column name for label matching, will not reveal in the output. + Need to be set to a name that does not conflict with an existing column name in the original input data. + + Examples + ---------- + >>> from spark_rapids_ml.clustering import DBSCAN + >>> data = [([0.0, 0.0],), + ... ([1.0, 1.0],), + ... ([9.0, 8.0],), + ... ([8.0, 9.0],),] + >>> df = spark.createDataFrame(data, ["features"]) + >>> df.show() + +----------+ + | features| + +----------+ + |[0.0, 0.0]| + |[1.0, 1.0]| + |[9.0, 8.0]| + |[8.0, 9.0]| + +----------+ + >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCol("features") + >>> gpu_model = gpu_dbscan.fit(df) + >>> gpu_model.setPredictionCol("prediction") + >>> transformed = gpu_model.transform(df) + >>> transformed.show() + +----------+----------+ + | features|prediction| + +----------+----------+ + |[0.0, 0.0]| 0| + |[1.0, 1.0]| 0| + |[9.0, 8.0]| 1| + |[8.0, 9.0]| 1| + +----------+----------+ + >>> gpu_dbscan.save("/tmp/dbscan") + >>> gpu_model.save("/tmp/dbscan_model") + + >>> # vector column input + >>> from spark_rapids_ml.clustering import DBSCAN + >>> from pyspark.ml.linalg import Vectors + >>> data = [(Vectors.dense([0.0, 0.0]),), + ... (Vectors.dense([1.0, 1.0]),), + ... (Vectors.dense([9.0, 8.0]),), + ... 
(Vectors.dense([8.0, 9.0]),),] + >>> df = spark.createDataFrame(data, ["features"]) + >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCol("features") + >>> gpu_dbscan.getFeaturesCol() + 'features' + >>> gpu_model = gpu_dbscan.fit(df) + + + >>> # multi-column input + >>> data = [(0.0, 0.0), + ... (1.0, 1.0), + ... (9.0, 8.0), + ... (8.0, 9.0),] + >>> df = spark.createDataFrame(data, ["f1", "f2"]) + >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCols(["f1", "f2"]) + >>> gpu_dbscan.getFeaturesCols() + ['f1', 'f2'] + >>> gpu_model = gpu_dbscan.fit(df) + """ + + @keyword_only + def __init__( + self, + *, + featuresCol: str = "features", + predictionCol: str = "prediction", + eps: float = 0.5, + min_samples: int = 5, + metric: str = "euclidean", + max_mbytes_per_batch: Optional[int] = None, + calc_core_sample_indices: bool = True, + verbose: Union[int, bool] = False, + **kwargs: Any, + ) -> None: + super().__init__() + self._set_params(**self._input_kwargs) + + max_records_per_batch_str = _get_spark_session().conf.get( + "spark.sql.execution.arrow.maxRecordsPerBatch", "10000" + ) + assert max_records_per_batch_str is not None + self.max_records_per_batch = int(max_records_per_batch_str) + self.BROADCAST_LIMIT = 8 << 30 + + self.verbose = verbose + + def setEps(self: P, value: float) -> P: + return self._set_params(eps=value) + + def getEps(self) -> float: + return self.getOrDefault("eps") + + def setMinSamples(self: P, value: int) -> P: + return self._set_params(min_samples=value) + + def getMinSamples(self) -> int: + return self.getOrDefault("min_samples") + + def setMetric(self: P, value: str) -> P: + return self._set_params(metric=value) + + def getMetric(self) -> str: + return self.getOrDefault("metric") + + def setMaxMbytesPerBatch(self: P, value: Optional[int]) -> P: + return self._set_params(max_mbytes_per_batch=value) + + def getMaxMbytesPerBatch(self) -> Optional[int]: + return self.getOrDefault("max_mbytes_per_batch") + + def setCalcCoreSampleIndices(self: P, value: bool) -> P: + return self._set_params(calc_core_sample_indices=value) + + def getCalcCoreSampleIndices(self) -> bool: + return self.getOrDefault("calc_core_sample_indices") + + def _fit(self, dataset: DataFrame) -> _CumlModel: + if self.getMetric() == "precomputed": + raise ValueError( + "Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead" + ) + + # Create parameter-copied model without accessing the input dataframe + # All information will be retrieved from Model and transform + model = DBSCANModel(verbose=self.verbose, n_cols=0, dtype="") + + model._num_workers = self.num_workers + self._copyValues(model) + + return model + + def _create_pyspark_model(self, result: Row) -> _CumlModel: + raise NotImplementedError("DBSCAN does not support model creation from Row") + + def _get_cuml_fit_func( + self, + dataset: DataFrame, + extra_params: Optional[List[Dict[str, Any]]] = None, + ) -> Callable[ + [FitInputType, Dict[str, Any]], + Dict[str, Any], + ]: + raise NotImplementedError("DBSCAN does not fit and generate model") + + def _out_schema(self) -> Union[StructType, str]: + raise NotImplementedError("DBSCAN does not output for fit and generate model") + + +class DBSCANModel( + DBSCANClass, _CumlCaller, _CumlModelWithPredictionCol, _DBSCANCumlParams +): + def __init__( + self, + n_cols: int, + dtype: str, + verbose: Union[int, bool], + ): + super(DBSCANClass, self).__init__() + super(_CumlModelWithPredictionCol, 
self).__init__(n_cols=n_cols, dtype=dtype) + super(_DBSCANCumlParams, self).__init__() + + self._setDefault( + idCol=alias.row_number, + ) + + self.verbose = verbose + self.BROADCAST_LIMIT = 8 << 30 + self._dbscan_spark_model = None + + def _pre_process_data(self, dataset: DataFrame) -> Tuple[ # type: ignore + List[Column], + Optional[List[str]], + int, + Union[Type[FloatType], Type[DoubleType]], + ]: + ( + select_cols, + multi_col_names, + dimension, + feature_type, + ) = _CumlCaller._pre_process_data(self, dataset) + + # Must retain idCol for label matching + if self.hasParam("idCol") and self.isDefined("idCol"): + id_col_name = self.getOrDefault("idCol") + select_cols.append(col(id_col_name)) + else: + select_cols.append(col(alias.row_number)) + + return select_cols, multi_col_names, dimension, feature_type + + def _out_schema( + self, input_schema: StructType = StructType() + ) -> Union[StructType, str]: + return StructType( + [ + StructField(self._get_prediction_name(), IntegerType(), False), + StructField(self.getIdCol(), LongType(), False), + ] + ) + + def _transform_array_order(self) -> _ArrayOrder: + return "C" + + def _fit_array_order(self) -> _ArrayOrder: + return "C" + + def _require_nccl_ucx(self) -> Tuple[bool, bool]: + return (True, True) + + def _get_cuml_fit_func( + self, + dataset: DataFrame, + extra_params: Optional[List[Dict[str, Any]]] = None, + ) -> Callable[ + [FitInputType, Dict[str, Any]], + Dict[str, Any], + ]: + import cupy as cp + import cupyx + + dtype = self.dtype + n_cols = self.n_cols + array_order = self._fit_array_order() + pred_name = self._get_prediction_name() + idCol_name = self.getIdCol() + + cuda_managed_mem_enabled = ( + _get_spark_session().conf.get("spark.rapids.ml.uvm.enabled", "false") + == "true" + ) + + inputs = [] # type: ignore + + idCol = list( + self.idCols_[0].value + if len(self.idCols_) == 1 + else np.concatenate([chunk.value for chunk in self.idCols_]) + ) + + for pdf_bc in self.raw_data_: + features = pdf_bc.value + + # experiments indicate it is faster to convert to numpy array and then to cupy array than directly + # invoking cupy array on the list + if cuda_managed_mem_enabled: + features = cp.array(features) + + inputs.append(features) + + concated = _concat_and_free(inputs, order=array_order) + + def _cuml_fit( + dfs: FitInputType, + params: Dict[str, Any], + ) -> Dict[str, Any]: + from cuml.cluster.dbscan_mg import DBSCANMG as CumlDBSCANMG + from pyspark import BarrierTaskContext + + context = BarrierTaskContext.get() + partition_id = context.partitionId() + + logger = get_logger(self.__class__) + + dbscan = CumlDBSCANMG( + handle=params[param_alias.handle], + output_type="cudf", + eps=self.getOrDefault("eps"), + min_samples=self.getOrDefault("min_samples"), + metric=self.getOrDefault("metric"), + max_mbytes_per_batch=self.getOrDefault("max_mbytes_per_batch"), + calc_core_sample_indices=self.getOrDefault("calc_core_sample_indices"), + verbose=self.verbose, + ) + dbscan.n_cols = params[param_alias.num_cols] + dbscan.dtype = np.dtype(dtype) + + res = list(dbscan.fit_predict(concated).to_numpy()) + + # Only node 0 from cuML will contain the correct label output + if partition_id == 0: + return { + idCol_name: idCol, + pred_name: res, + } + else: + return { + idCol_name: [], + pred_name: [], + } + + return _cuml_fit + + def _get_cuml_transform_func( + self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None + ) -> Tuple[ + _ConstructFunc, + _TransformFunc, + Optional[_EvaluateFunc], + ]: + raise 
NotImplementedError( + "DBSCAN does not can not have a separate transform UDF" + ) + + def _transform(self, dataset: DataFrame) -> DataFrame: + logger = get_logger(self.__class__) + + spark = _get_spark_session() + + def _chunk_arr( + arr: np.ndarray, BROADCAST_LIMIT: int = self.BROADCAST_LIMIT + ) -> List[np.ndarray]: + """Chunk an array, if oversized, into smaller arrays that can be broadcasted.""" + if arr.nbytes <= BROADCAST_LIMIT: + return [arr] + + rows_per_chunk = BROADCAST_LIMIT // (arr.nbytes // arr.shape[0]) + num_chunks = (arr.shape[0] + rows_per_chunk - 1) // rows_per_chunk + chunks = [ + arr[i * rows_per_chunk : (i + 1) * rows_per_chunk] + for i in range(num_chunks) + ] + + return chunks + + dataset = self._ensureIdCol(dataset) + select_cols, multi_col_names, dimension, _ = self._pre_process_data(dataset) + input_dataset = dataset.select(*select_cols) + pd_dataset: pd.DataFrame = input_dataset.toPandas() + + if multi_col_names: + raw_data = np.array( + pd_dataset.drop(columns=[self.getIdCol()]), + order=self._fit_array_order(), + ) + else: + raw_data = np.array( + list(pd_dataset.drop(columns=[self.getIdCol()])[alias.data]), + order=self._fit_array_order(), + ) + + idCols: np.ndarray = np.array(pd_dataset[self.getIdCol()]) + + # Set input metadata + self.n_cols = len(raw_data[0]) + self.dtype = ( + type(raw_data[0][0][0]).__name__ + if isinstance(raw_data[0][0], List) + or isinstance(raw_data[0][0], np.ndarray) + else type(raw_data[0][0]).__name__ + ) + + # Broadcast preprocessed input dataset and the idCol + broadcast_raw_data = [ + spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(raw_data) + ] + + broadcast_idCol = [ + spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(idCols) + ] + + self.processed_input_cols = input_dataset.drop(self.getIdCol()).columns + self.raw_data_ = broadcast_raw_data + self.idCols_ = broadcast_idCol + self.multi_col_names = multi_col_names + + idCol_name = self.getIdCol() + + default_num_partitions = dataset.rdd.getNumPartitions() + + rdd = self._call_cuml_fit_func( + dataset=dataset, + partially_collect=False, + paramMaps=None, + ) + rdd = rdd.repartition(default_num_partitions) + + pred_df = rdd.toDF() + + # JOIN the transformed label column into the original input dataset + # and discard the internal idCol for row matching + return dataset.join(pred_df, idCol_name).drop(idCol_name) + + def _get_model_attributes(self) -> Optional[Dict[str, Any]]: + """ + Override parent method to bring broadcast variables to driver before JSON serialization. + """ + + self._model_attributes["verbose"] = self.verbose + + return self._model_attributes diff --git a/python/src/spark_rapids_ml/dbscan.py b/python/src/spark_rapids_ml/dbscan.py deleted file mode 100644 index d6cf190c..00000000 --- a/python/src/spark_rapids_ml/dbscan.py +++ /dev/null @@ -1,644 +0,0 @@ -# -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from abc import ABCMeta -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Optional, - Tuple, - Type, - Union, - cast, -) - -import numpy as np -import pandas as pd -from pyspark import keyword_only -from pyspark.ml.param.shared import HasFeaturesCol, Param, Params, TypeConverters -from pyspark.sql import Column -from pyspark.sql.dataframe import DataFrame -from pyspark.sql.functions import col, monotonically_increasing_id -from pyspark.sql.types import ( - DoubleType, - FloatType, - IntegerType, - LongType, - Row, - StructField, - StructType, -) - -from .core import ( - FitInputType, - _ConstructFunc, - _CumlCaller, - _CumlEstimator, - _CumlModel, - _CumlModelWithPredictionCol, - _EvaluateFunc, - _read_csr_matrix_from_unwrapped_spark_vec, - _TransformFunc, - _use_sparse_in_cuml, - alias, - param_alias, -) -from .metrics import EvalMetricInfo -from .params import HasFeaturesCols, HasIDCol, P, _CumlClass, _CumlParams -from .utils import _ArrayOrder, _concat_and_free, _get_spark_session, get_logger - -if TYPE_CHECKING: - import cudf - from pyspark.ml._typing import ParamMap - - -class DBSCANClass(_CumlClass): - @classmethod - def _param_mapping(cls) -> Dict[str, Optional[str]]: - return {} - - def _get_cuml_params_default(self) -> Dict[str, Any]: - return { - "eps": 0.5, - "min_samples": 5, - "metric": "euclidean", - "verbose": False, - "max_mbytes_per_batch": None, - "calc_core_sample_indices": False, - } - - def _pyspark_class(self) -> Optional[ABCMeta]: - return None - - -class _DBSCANCumlParams(_CumlParams, HasFeaturesCol, HasFeaturesCols, HasIDCol): - def __init__(self) -> None: - super().__init__() - self._setDefault( - eps=0.5, - min_samples=5, - metric="euclidean", - max_mbytes_per_batch=None, - calc_core_sample_indices=True, - idCol=alias.row_number, - ) - - eps = Param( - Params._dummy(), - "eps", - ( - f"The maximum distance between 2 points such they reside in the same neighborhood." - ), - typeConverter=TypeConverters.toFloat, - ) - - min_samples = Param( - Params._dummy(), - "min_samples", - ( - f"The number of samples in a neighborhood such that this group can be considered as an important core point (including the point itself)." - ), - typeConverter=TypeConverters.toInt, - ) - - metric = Param( - Params._dummy(), - "metric", - ( - f"The metric to use when calculating distances between points." - f"Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead." - ), - typeConverter=TypeConverters.toString, - ) - - max_mbytes_per_batch = Param( - Params._dummy(), - "max_mbytes_per_batch", - ( - f"Calculate batch size using no more than this number of megabytes for the pairwise distance computation." - f"This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples." - f"If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device." - ), - typeConverter=TypeConverters.toInt, - ) - - calc_core_sample_indices = Param( - Params._dummy(), - "calc_core_sample_indices", - ( - f"Indicates whether the indices of the core samples should be calculated." 
- f"Setting this to False will avoid unnecessary kernel launches" - ), - typeConverter=TypeConverters.toBoolean, - ) - - idCol = Param( - Params._dummy(), - "idCol", - "id column name.", - typeConverter=TypeConverters.toString, - ) - - def getFeaturesCol(self) -> Union[str, List[str]]: # type: ignore - """ - Gets the value of :py:attr:`featuresCol` or :py:attr:`featuresCols` - """ - if self.isDefined(self.featuresCols): - return self.getFeaturesCols() - elif self.isDefined(self.featuresCol): - return self.getOrDefault("featuresCol") - else: - raise RuntimeError("featuresCol is not set") - - def setFeaturesCol(self: P, value: Union[str, List[str]]) -> P: - """ - Sets the value of :py:attr:`featuresCol` or :py:attr:`featuresCols`. - """ - if isinstance(value, str): - self._set_params(featuresCol=value) - else: - self._set_params(featuresCols=value) - return self - - def setFeaturesCols(self: P, value: List[str]) -> P: - """ - Sets the value of :py:attr:`featuresCols`. Used when input vectors are stored as multiple feature columns. - """ - return self._set_params(featuresCols=value) - - def setPredictionCol(self: P, value: str) -> P: - """ - Sets the value of :py:attr:`predictionCol`. - """ - self._set_params(predictionCol=value) - return self - - def setIdCol(self: P, value: str) -> P: - """ - Sets the value of `idCol`. If not set, an id column will be added with column name `unique_id`. The id column is used to specify dbscan vectors by associated id value. - """ - self._set_params(idCol=value) - return self - - -class DBSCAN(DBSCANClass, _CumlEstimator, _DBSCANCumlParams): - """ - The Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a non-parametric - data clustering algorithm based on data density. It groups points close to each other that form a dense cluster - and mark the far-away points as noise and exclude them from all clusters. - - Parameters - ---------- - featuresCol: str or List[str] - The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n - * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. - * When the value is a list of strings, the feature columns must be numeric types. - - predictionCol: str - the name of the column that stores cluster indices of input vectors. predictionCol should be set when users expect to apply the transform function of a learned model. - - num_workers: - Number of cuML workers, where each cuML worker corresponds to one Spark task - running on one GPU. If not set, spark-rapids-ml tries to infer the number of - cuML workers (i.e. GPUs in cluster) from the Spark environment. - - eps: float (default = 0.5) - The maximum distance between 2 points such they reside in the same neighborhood. - - min_samples: int (default = 5) - The number of samples in a neighborhood such that this group can be considered as - an important core point (including the point itself). - - metric: {'euclidean', 'cosine'}, default = 'euclidean' - The metric to use when calculating distances between points. - Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead - - verbose: int or boolean (default=False) - Logging level. - * ``0`` - Disables all log messages. - * ``1`` - Enables only critical messages. - * ``2`` - Enables all messages up to and including errors. - * ``3`` - Enables all messages up to and including warnings. - * ``4 or False`` - Enables all messages up to and including information messages. 
- * ``5 or True`` - Enables all messages up to and including debug messages. - * ``6`` - Enables all messages up to and including trace messages. - - max_mbytes_per_batch(optional): int - Calculate batch size using no more than this number of megabytes for the pairwise distance computation. - This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples. - If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device. - - calc_core_sample_indices(optional): boolean (default = True) - Indicates whether the indices of the core samples should be calculated. - Setting this to False will avoid unnecessary kernel launches - - idCol: str (default = 'unique_id') - The internal unique id column name for label matching, will not reveal in the output. - Need to be set to a name that does not conflict with an existing column name in the original input data. - - Examples - ---------- - >>> from spark_rapids_ml.dbscan import DBSCAN - >>> data = [([0.0, 0.0],), - ... ([1.0, 1.0],), - ... ([9.0, 8.0],), - ... ([8.0, 9.0],),] - >>> df = spark.createDataFrame(data, ["features"]) - >>> df.show() - +----------+ - | features| - +----------+ - |[0.0, 0.0]| - |[1.0, 1.0]| - |[9.0, 8.0]| - |[8.0, 9.0]| - +----------+ - >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCol("features") - >>> gpu_model = gpu_dbscan.fit(df) - >>> gpu_model.setPredictionCol("prediction") - >>> transformed = gpu_model.transform(df) - >>> transformed.show() - +----------+----------+ - | features|prediction| - +----------+----------+ - |[0.0, 0.0]| 0| - |[1.0, 1.0]| 0| - |[9.0, 8.0]| 1| - |[8.0, 9.0]| 1| - +----------+----------+ - >>> gpu_dbscan.save("/tmp/dbscan") - >>> gpu_model.save("/tmp/dbscan_model") - - >>> # vector column input - >>> from spark_rapids_ml.dbscan import DBSCAN - >>> from pyspark.ml.linalg import Vectors - >>> data = [(Vectors.dense([0.0, 0.0]),), - ... (Vectors.dense([1.0, 1.0]),), - ... (Vectors.dense([9.0, 8.0]),), - ... (Vectors.dense([8.0, 9.0]),),] - >>> df = spark.createDataFrame(data, ["features"]) - >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCol("features") - >>> gpu_dbscan.getFeaturesCol() - 'features' - >>> gpu_model = gpu_dbscan.fit(df) - - - >>> # multi-column input - >>> data = [(0.0, 0.0), - ... (1.0, 1.0), - ... (9.0, 8.0), - ... 
(8.0, 9.0),] - >>> df = spark.createDataFrame(data, ["f1", "f2"]) - >>> gpu_dbscan = DBSCAN(eps=3, metric="euclidean").setFeaturesCols(["f1", "f2"]) - >>> gpu_dbscan.getFeaturesCols() - ['f1', 'f2'] - >>> gpu_model = gpu_dbscan.fit(df) - """ - - @keyword_only - def __init__( - self, - *, - featuresCol: str = "features", - predictionCol: str = "prediction", - eps: float = 0.5, - min_samples: int = 5, - metric: str = "euclidean", - max_mbytes_per_batch: Optional[int] = None, - calc_core_sample_indices: bool = True, - verbose: Union[int, bool] = False, - **kwargs: Any, - ) -> None: - super().__init__() - self._set_params(**self._input_kwargs) - - max_records_per_batch_str = _get_spark_session().conf.get( - "spark.sql.execution.arrow.maxRecordsPerBatch", "10000" - ) - assert max_records_per_batch_str is not None - self.max_records_per_batch = int(max_records_per_batch_str) - self.BROADCAST_LIMIT = 8 << 30 - - self.verbose = verbose - - def setEps(self: P, value: float) -> P: - return self._set_params(eps=value) - - def getEps(self) -> float: - return self.getOrDefault("eps") - - def setMinSamples(self: P, value: int) -> P: - return self._set_params(min_samples=value) - - def getMinSamples(self) -> int: - return self.getOrDefault("min_samples") - - def setMetric(self: P, value: str) -> P: - return self._set_params(metric=value) - - def getMetric(self) -> str: - return self.getOrDefault("metric") - - def setMaxMbytesPerBatch(self: P, value: Optional[int]) -> P: - return self._set_params(max_mbytes_per_batch=value) - - def getMaxMbytesPerBatch(self) -> Optional[int]: - return self.getOrDefault("max_mbytes_per_batch") - - def setCalcCoreSampleIndices(self: P, value: bool) -> P: - return self._set_params(calc_core_sample_indices=value) - - def getCalcCoreSampleIndices(self) -> bool: - return self.getOrDefault("calc_core_sample_indices") - - def _fit(self, dataset: DataFrame) -> _CumlModel: - if self.getMetric() == "precomputed": - raise ValueError( - "Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead" - ) - - # Create parameter-copied model without accessing the input dataframe - # All information will be retrieved from Model and transform - model = DBSCANModel(verbose=self.verbose, n_cols=0, dtype="") - - model._num_workers = self.num_workers - self._copyValues(model) - - return model - - def _create_pyspark_model(self, result: Row) -> _CumlModel: - raise NotImplementedError("DBSCAN does not support model creation from Row") - - def _get_cuml_fit_func( - self, - dataset: DataFrame, - extra_params: Optional[List[Dict[str, Any]]] = None, - ) -> Callable[ - [FitInputType, Dict[str, Any]], - Dict[str, Any], - ]: - raise NotImplementedError("DBSCAN does not fit and generate model") - - def _out_schema(self) -> Union[StructType, str]: - raise NotImplementedError("DBSCAN does not output for fit and generate model") - - -class DBSCANModel( - DBSCANClass, _CumlCaller, _CumlModelWithPredictionCol, _DBSCANCumlParams -): - def __init__( - self, - n_cols: int, - dtype: str, - verbose: Union[int, bool], - ): - super(DBSCANClass, self).__init__() - super(_CumlModelWithPredictionCol, self).__init__(n_cols=n_cols, dtype=dtype) - super(_DBSCANCumlParams, self).__init__() - - self._setDefault( - idCol=alias.row_number, - ) - - self.verbose = verbose - self.BROADCAST_LIMIT = 8 << 30 - self._dbscan_spark_model = None - - def _pre_process_data(self, dataset: DataFrame) -> Tuple[ # type: ignore - List[Column], - Optional[List[str]], - int, - 
Union[Type[FloatType], Type[DoubleType]], - ]: - ( - select_cols, - multi_col_names, - dimension, - feature_type, - ) = _CumlCaller._pre_process_data(self, dataset) - - # Must retain idCol for label matching - if self.hasParam("idCol") and self.isDefined("idCol"): - id_col_name = self.getOrDefault("idCol") - select_cols.append(col(id_col_name)) - else: - select_cols.append(col(alias.row_number)) - - return select_cols, multi_col_names, dimension, feature_type - - def _out_schema( - self, input_schema: StructType = StructType() - ) -> Union[StructType, str]: - return StructType( - [ - StructField(self._get_prediction_name(), IntegerType(), False), - StructField(self.getIdCol(), LongType(), False), - ] - ) - - def _transform_array_order(self) -> _ArrayOrder: - return "C" - - def _fit_array_order(self) -> _ArrayOrder: - return "C" - - def _require_nccl_ucx(self) -> Tuple[bool, bool]: - return (True, True) - - def _get_cuml_fit_func( - self, - dataset: DataFrame, - extra_params: Optional[List[Dict[str, Any]]] = None, - ) -> Callable[ - [FitInputType, Dict[str, Any]], - Dict[str, Any], - ]: - import cupy as cp - import cupyx - - dtype = self.dtype - n_cols = self.n_cols - array_order = self._fit_array_order() - pred_name = self._get_prediction_name() - idCol_name = self.getIdCol() - - cuda_managed_mem_enabled = ( - _get_spark_session().conf.get("spark.rapids.ml.uvm.enabled", "false") - == "true" - ) - - inputs = [] # type: ignore - - idCol = list( - self.idCols_[0].value - if len(self.idCols_) == 1 - else np.concatenate([chunk.value for chunk in self.idCols_]) - ) - - for pdf_bc in self.raw_data_: - features = pdf_bc.value - - # experiments indicate it is faster to convert to numpy array and then to cupy array than directly - # invoking cupy array on the list - if cuda_managed_mem_enabled: - features = cp.array(features) - - inputs.append(features) - - concated = _concat_and_free(inputs, order=array_order) - - def _cuml_fit( - dfs: FitInputType, - params: Dict[str, Any], - ) -> Dict[str, Any]: - from cuml.cluster.dbscan_mg import DBSCANMG as CumlDBSCANMG - from pyspark import BarrierTaskContext - - context = BarrierTaskContext.get() - partition_id = context.partitionId() - - logger = get_logger(self.__class__) - - dbscan = CumlDBSCANMG( - handle=params[param_alias.handle], - output_type="cudf", - eps=self.getOrDefault("eps"), - min_samples=self.getOrDefault("min_samples"), - metric=self.getOrDefault("metric"), - max_mbytes_per_batch=self.getOrDefault("max_mbytes_per_batch"), - calc_core_sample_indices=self.getOrDefault("calc_core_sample_indices"), - verbose=self.verbose, - ) - dbscan.n_cols = params[param_alias.num_cols] - dbscan.dtype = np.dtype(dtype) - - res = list(dbscan.fit_predict(concated).to_numpy()) - - # Only node 0 from cuML will contain the correct label output - if partition_id == 0: - return { - idCol_name: idCol, - pred_name: res, - } - else: - return { - idCol_name: [], - pred_name: [], - } - - return _cuml_fit - - def _get_cuml_transform_func( - self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None - ) -> Tuple[ - _ConstructFunc, - _TransformFunc, - Optional[_EvaluateFunc], - ]: - raise NotImplementedError( - "DBSCAN does not can not have a separate transform UDF" - ) - - def _transform(self, dataset: DataFrame) -> DataFrame: - logger = get_logger(self.__class__) - - spark = _get_spark_session() - - def _chunk_arr( - arr: np.ndarray, BROADCAST_LIMIT: int = self.BROADCAST_LIMIT - ) -> List[np.ndarray]: - """Chunk an array, if oversized, into smaller arrays 
that can be broadcasted.""" - if arr.nbytes <= BROADCAST_LIMIT: - return [arr] - - rows_per_chunk = BROADCAST_LIMIT // (arr.nbytes // arr.shape[0]) - num_chunks = (arr.shape[0] + rows_per_chunk - 1) // rows_per_chunk - chunks = [ - arr[i * rows_per_chunk : (i + 1) * rows_per_chunk] - for i in range(num_chunks) - ] - - return chunks - - dataset = self._ensureIdCol(dataset) - select_cols, multi_col_names, dimension, _ = self._pre_process_data(dataset) - input_dataset = dataset.select(*select_cols) - pd_dataset: pd.DataFrame = input_dataset.toPandas() - - if multi_col_names: - raw_data = np.array( - pd_dataset.drop(columns=[self.getIdCol()]), - order=self._fit_array_order(), - ) - else: - raw_data = np.array( - list(pd_dataset.drop(columns=[self.getIdCol()])[alias.data]), - order=self._fit_array_order(), - ) - - idCols: np.ndarray = np.array(pd_dataset[self.getIdCol()]) - - # Set input metadata - self.n_cols = len(raw_data[0]) - self.dtype = ( - type(raw_data[0][0][0]).__name__ - if isinstance(raw_data[0][0], List) - or isinstance(raw_data[0][0], np.ndarray) - else type(raw_data[0][0]).__name__ - ) - - # Broadcast preprocessed input dataset and the idCol - broadcast_raw_data = [ - spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(raw_data) - ] - - broadcast_idCol = [ - spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(idCols) - ] - - self.processed_input_cols = input_dataset.drop(self.getIdCol()).columns - self.raw_data_ = broadcast_raw_data - self.idCols_ = broadcast_idCol - self.multi_col_names = multi_col_names - - idCol_name = self.getIdCol() - - default_num_partitions = dataset.rdd.getNumPartitions() - - rdd = self._call_cuml_fit_func( - dataset=dataset, - partially_collect=False, - paramMaps=None, - ) - rdd = rdd.repartition(default_num_partitions) - - pred_df = rdd.toDF() - - # JOIN the transformed label column into the original input dataset - # and discard the internal idCol for row matching - return dataset.join(pred_df, idCol_name).drop(idCol_name) - - def _get_model_attributes(self) -> Optional[Dict[str, Any]]: - """ - Override parent method to bring broadcast variables to driver before JSON serialization. - """ - - self._model_attributes["verbose"] = self.verbose - - return self._model_attributes diff --git a/python/tests/test_dbscan.py b/python/tests/test_dbscan.py index ebbebb91..607dd88f 100644 --- a/python/tests/test_dbscan.py +++ b/python/tests/test_dbscan.py @@ -32,7 +32,7 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import col -from spark_rapids_ml.dbscan import DBSCAN, DBSCANModel +from spark_rapids_ml.clustering import DBSCAN, DBSCANModel from .sparksession import CleanSparkSession from .utils import ( From 6408c5298b4dd9fb02d43c24979c8387e7745116 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Fri, 12 Apr 2024 23:31:22 +0800 Subject: [PATCH 11/31] Remove build link from the pre-merge-CI workflow (#605) We need to remove all the CI job's link from the pre-merge-CI workflow for security concern. 
Signed-off-by: Tim Liu --- ci/Jenkinsfile.premerge | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/Jenkinsfile.premerge b/ci/Jenkinsfile.premerge index 43e5d2ac..a73620d6 100644 --- a/ci/Jenkinsfile.premerge +++ b/ci/Jenkinsfile.premerge @@ -87,7 +87,7 @@ pipeline { def title = githubHelper.getIssue().title if (title ==~ /.*\[skip ci\].*/) { - githubHelper.updateCommitStatus("$BUILD_URL", "Skipped", GitHubCommitState.SUCCESS) + githubHelper.updateCommitStatus("", "Skipped", GitHubCommitState.SUCCESS) currentBuild.result == "SUCCESS" skipped = true return @@ -116,7 +116,7 @@ pipeline { steps { script { - githubHelper.updateCommitStatus("$BUILD_URL", "Running - preparing", GitHubCommitState.PENDING) + githubHelper.updateCommitStatus("", "Running - preparing", GitHubCommitState.PENDING) checkout( changelog: false, poll: true, @@ -178,7 +178,7 @@ pipeline { steps { script { - githubHelper.updateCommitStatus("$BUILD_URL", "Running - tests", GitHubCommitState.PENDING) + githubHelper.updateCommitStatus("", "Running - tests", GitHubCommitState.PENDING) container('gpu') { timeout(time: 2, unit: 'HOURS') { // step only timeout for test run common.resolveIncompatibleDriverIssue(this) @@ -198,7 +198,7 @@ pipeline { } if (currentBuild.currentResult == "SUCCESS") { - githubHelper.updateCommitStatus("$BUILD_URL", "Success", GitHubCommitState.SUCCESS) + githubHelper.updateCommitStatus("", "Success", GitHubCommitState.SUCCESS) } else { // upload log only in case of build failure def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"] @@ -206,7 +206,7 @@ pipeline { guardWords.add("sc-ipp*") // hide cloud info githubHelper.uploadLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords) - githubHelper.updateCommitStatus("$BUILD_URL", "Fail", GitHubCommitState.FAILURE) + githubHelper.updateCommitStatus("", "Fail", GitHubCommitState.FAILURE) } if (TEMP_IMAGE_BUILD) { From 6e1c0d1f6427b7cd3ebcbd06727c0b32b182e8ae Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:49:26 -0700 Subject: [PATCH 12/31] DBSCAN broadcast fix (#617) * DBSCAN broadcast fix Signed-off-by: Hongzhe Cheng * Comment delete --------- Signed-off-by: Hongzhe Cheng --- python/src/spark_rapids_ml/clustering.py | 43 ++++++++++++------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 803f9156..bee1e368 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -914,25 +914,8 @@ def _get_cuml_fit_func( == "true" ) - inputs = [] # type: ignore - - idCol = list( - self.idCols_[0].value - if len(self.idCols_) == 1 - else np.concatenate([chunk.value for chunk in self.idCols_]) - ) - - for pdf_bc in self.raw_data_: - features = pdf_bc.value - - # experiments indicate it is faster to convert to numpy array and then to cupy array than directly - # invoking cupy array on the list - if cuda_managed_mem_enabled: - features = cp.array(features) - - inputs.append(features) - - concated = _concat_and_free(inputs, order=array_order) + idCol_bc = self.idCols_ + raw_data_dc = self.raw_data_ def _cuml_fit( dfs: FitInputType, @@ -941,11 +924,29 @@ def _cuml_fit( from cuml.cluster.dbscan_mg import DBSCANMG as CumlDBSCANMG from pyspark import BarrierTaskContext + inputs = [] # type: ignore + + idCol = list( + idCol_bc[0].value + if len(idCol_bc) == 1 + else np.concatenate([chunk.value for chunk in idCol_bc]) + ) + + for 
pdf_bc in raw_data_dc: + features = pdf_bc.value + + # experiments indicate it is faster to convert to numpy array and then to cupy array than directly + # invoking cupy array on the list + if cuda_managed_mem_enabled: + features = cp.array(features) + + inputs.append(features) + + concated = _concat_and_free(inputs, order=array_order) + context = BarrierTaskContext.get() partition_id = context.partitionId() - logger = get_logger(self.__class__) - dbscan = CumlDBSCANMG( handle=params[param_alias.handle], output_type="cudf", From 3d50d0ef204f9e4b750de49821ce008c34b313f3 Mon Sep 17 00:00:00 2001 From: eordentlich Date: Mon, 15 Apr 2024 10:01:27 -0700 Subject: [PATCH 13/31] update to rapids 24.04; temp. patch dbscan test till new algorithm parameter is supported (#619) Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- docker/Dockerfile.pip | 2 +- docker/Dockerfile.python | 2 +- docs/source/conf.py | 2 +- notebooks/aws-emr/init-bootstrap-action.sh | 2 +- notebooks/databricks/init-pip-cuda-11.8.sh | 2 +- notebooks/dataproc/README.md | 2 +- notebooks/dataproc/spark_rapids_ml.sh | 2 +- python/README.md | 6 +++--- python/benchmark/databricks/init-pip-cuda-11.8.sh | 2 +- python/benchmark/dataproc/init_benchmark.sh | 2 +- python/pyproject.toml | 2 +- python/src/spark_rapids_ml/__init__.py | 2 +- python/tests/test_dbscan.py | 2 ++ 14 files changed, 17 insertions(+), 15 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index d9ba0a33..2a489efa 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG CUML_VER=24.02 +ARG CUML_VER=24.04 RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER python=3.9 cuda-version=11.8 \ && conda clean --all -f -y diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index 85eccf7b..244f84ad 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG PYSPARK_VERSION=3.3.1 -ARG RAPIDS_VERSION=24.2.0 +ARG RAPIDS_VERSION=24.4.0 ARG ARCH=amd64 #ARG ARCH=arm64 # Install packages to build spark-rapids-ml diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index dcd5aed1..16aeaac8 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -17,7 +17,7 @@ ARG CUDA_VERSION=11.8.0 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -ARG CUML_VERSION=24.02 +ARG CUML_VERSION=24.04 # Install packages to build spark-rapids-ml RUN apt update -y \ diff --git a/docs/source/conf.py b/docs/source/conf.py index c3918c07..1865e609 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,7 +9,7 @@ project = 'spark-rapids-ml' copyright = '2024, NVIDIA' author = 'NVIDIA' -release = '24.02.0' +release = '24.04.0' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh index dc865d58..1f6d5f44 100755 --- a/notebooks/aws-emr/init-bootstrap-action.sh +++ b/notebooks/aws-emr/init-bootstrap-action.sh @@ -8,7 +8,7 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure 
--enable-optimizations && make altinstall" -RAPIDS_VERSION=24.2.0 +RAPIDS_VERSION=24.4.0 # install scikit-learn sudo /usr/local/bin/pip3.9 install scikit-learn diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh index 23cade50..70811100 100644 --- a/notebooks/databricks/init-pip-cuda-11.8.sh +++ b/notebooks/databricks/init-pip-cuda-11.8.sh @@ -4,7 +4,7 @@ SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) -RAPIDS_VERSION=24.2.0 +RAPIDS_VERSION=24.4.0 SPARK_RAPIDS_VERSION=24.02.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md index efb0a44d..7a19cba2 100644 --- a/notebooks/dataproc/README.md +++ b/notebooks/dataproc/README.md @@ -29,7 +29,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D - Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS: ``` export CUDA_VERSION=11.8 - export RAPIDS_VERSION=24.2.0 + export RAPIDS_VERSION=24.4.0 gcloud dataproc clusters create $USER-spark-rapids-ml \ --image-version=2.1-ubuntu \ diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh index 7c0f29ec..1b914038 100644 --- a/notebooks/dataproc/spark_rapids_ml.sh +++ b/notebooks/dataproc/spark_rapids_ml.sh @@ -1,6 +1,6 @@ #!/bin/bash -RAPIDS_VERSION=24.2.0 +RAPIDS_VERSION=24.4.0 # patch existing packages mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2" diff --git a/python/README.md b/python/README.md index c5e89c80..a2f75afa 100644 --- a/python/README.md +++ b/python/README.md @@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 11.8: ```bash -conda create -n rapids-24.02 \ +conda create -n rapids-24.04 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=24.02 python=3.9 cuda-version=11.8 + cuml=24.04 python=3.9 cuda-version=11.8 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -19,7 +19,7 @@ conda create -n rapids-24.02 \ Once you have the conda environment, activate it and install the required packages. 
```bash -conda activate rapids-24.02 +conda activate rapids-24.04 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/benchmark/databricks/init-pip-cuda-11.8.sh b/python/benchmark/databricks/init-pip-cuda-11.8.sh index fb6d2313..c7664ba4 100644 --- a/python/benchmark/databricks/init-pip-cuda-11.8.sh +++ b/python/benchmark/databricks/init-pip-cuda-11.8.sh @@ -5,7 +5,7 @@ BENCHMARK_ZIP=/dbfs/path/to/benchmark.zip # IMPORTANT: specify rapids fully 23.10.0 and not 23.10 # also, in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) -RAPIDS_VERSION=24.2.0 +RAPIDS_VERSION=24.4.0 SPARK_RAPIDS_VERSION=24.02.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar diff --git a/python/benchmark/dataproc/init_benchmark.sh b/python/benchmark/dataproc/init_benchmark.sh index a8ad85fb..b59a4f1f 100755 --- a/python/benchmark/dataproc/init_benchmark.sh +++ b/python/benchmark/dataproc/init_benchmark.sh @@ -8,7 +8,7 @@ function get_metadata_attribute() { /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } -RAPIDS_VERSION=$(get_metadata_attribute rapids-version 24.2.0) +RAPIDS_VERSION=$(get_metadata_attribute rapids-version 24.4.0) # patch existing packages mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2" diff --git a/python/pyproject.toml b/python/pyproject.toml index ffd2fefa..2264fc18 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "spark-rapids-ml" -version = "24.2.0" +version = "24.4.0" authors = [ { name="Jinfeng Li", email="jinfeng@nvidia.com" }, { name="Bobby Wang", email="bobwang@nvidia.com" }, diff --git a/python/src/spark_rapids_ml/__init__.py b/python/src/spark_rapids_ml/__init__.py index 2760fe80..f368b6ca 100644 --- a/python/src/spark_rapids_ml/__init__.py +++ b/python/src/spark_rapids_ml/__init__.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -__version__ = "24.02.0" +__version__ = "24.04.0" diff --git a/python/tests/test_dbscan.py b/python/tests/test_dbscan.py index 607dd88f..22b7710d 100644 --- a/python/tests/test_dbscan.py +++ b/python/tests/test_dbscan.py @@ -52,6 +52,8 @@ def test_default_cuml_params() -> None: cuml_params = get_default_cuml_parameters([CumlDBSCAN], ["handle", "output_type"]) cuml_params["calc_core_sample_indices"] = False spark_params = DBSCAN()._get_cuml_params_default() + # TODO: support algorithm parameter added in cuML 24.04 + cuml_params.pop("algorithm") assert cuml_params == spark_params From 7202ff427dfd1fbce4d290724b16b437ae6e4fa8 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Tue, 16 Apr 2024 15:14:27 -0700 Subject: [PATCH 14/31] Algorithm parameter support for DBSCAN (#622) * Add in algorithm parameter support for DBSCAN Signed-off-by: Hongzhe Cheng * comment fix --------- Signed-off-by: Hongzhe Cheng --- python/src/spark_rapids_ml/clustering.py | 20 ++++++++++++++++++++ python/tests/test_dbscan.py | 15 +++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index bee1e368..94f36158 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -509,6 +509,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "eps": 0.5, "min_samples": 5, "metric": "euclidean", + "algorithm": "brute", "verbose": False, "max_mbytes_per_batch": None, "calc_core_sample_indices": False, @@ -525,6 +526,7 @@ def __init__(self) -> None: eps=0.5, min_samples=5, metric="euclidean", + algorithm="brute", max_mbytes_per_batch=None, calc_core_sample_indices=True, idCol=alias.row_number, @@ -558,6 +560,13 @@ def __init__(self) -> None: typeConverter=TypeConverters.toString, ) + algorithm = Param( + Params._dummy(), + "algorithm", + (f"The algorithm to be used by for nearest neighbor computations."), + typeConverter=TypeConverters.toString, + ) + max_mbytes_per_batch = Param( Params._dummy(), "max_mbytes_per_batch", @@ -660,6 +669,9 @@ class DBSCAN(DBSCANClass, _CumlEstimator, _DBSCANCumlParams): The metric to use when calculating distances between points. Spark Rapids ML does not support the 'precomputed' mode from sklearn and cuML, please use those libraries instead + algorithm: {'brute', 'rbc'}, default = 'brute' + The algorithm to be used by for nearest neighbor computations. + verbose: int or boolean (default=False) Logging level. * ``0`` - Disables all log messages. 
@@ -751,6 +763,7 @@ def __init__( eps: float = 0.5, min_samples: int = 5, metric: str = "euclidean", + algorithm: str = "brute", max_mbytes_per_batch: Optional[int] = None, calc_core_sample_indices: bool = True, verbose: Union[int, bool] = False, @@ -786,6 +799,12 @@ def setMetric(self: P, value: str) -> P: def getMetric(self) -> str: return self.getOrDefault("metric") + def setAlgorithm(self: P, value: str) -> P: + return self._set_params(algorithm=value) + + def getAlgorithm(self) -> str: + return self.getOrDefault("algorithm") + def setMaxMbytesPerBatch(self: P, value: Optional[int]) -> P: return self._set_params(max_mbytes_per_batch=value) @@ -953,6 +972,7 @@ def _cuml_fit( eps=self.getOrDefault("eps"), min_samples=self.getOrDefault("min_samples"), metric=self.getOrDefault("metric"), + algorithm=self.getOrDefault("algorithm"), max_mbytes_per_batch=self.getOrDefault("max_mbytes_per_batch"), calc_core_sample_indices=self.getOrDefault("calc_core_sample_indices"), verbose=self.verbose, diff --git a/python/tests/test_dbscan.py b/python/tests/test_dbscan.py index 22b7710d..db94932a 100644 --- a/python/tests/test_dbscan.py +++ b/python/tests/test_dbscan.py @@ -52,8 +52,6 @@ def test_default_cuml_params() -> None: cuml_params = get_default_cuml_parameters([CumlDBSCAN], ["handle", "output_type"]) cuml_params["calc_core_sample_indices"] = False spark_params = DBSCAN()._get_cuml_params_default() - # TODO: support algorithm parameter added in cuML 24.04 - cuml_params.pop("algorithm") assert cuml_params == spark_params @@ -166,13 +164,17 @@ def test_dbscan_numeric_type(gpu_number: int, data_type: str) -> None: ids=idfn, ) @pytest.mark.parametrize("data_type", cuml_supported_data_types) -@pytest.mark.parametrize("max_record_batch", [100, 10000]) +@pytest.mark.parametrize( + "max_record_batch", [pytest.param(100, marks=pytest.mark.slow), 10000] +) +@pytest.mark.parametrize("algorithm", ["brute", "rbc"]) def test_dbscan( gpu_number: int, feature_type: str, data_shape: Tuple[int, int], data_type: np.dtype, max_record_batch: int, + algorithm: str, ) -> None: from cuml.datasets import make_blobs @@ -192,7 +194,12 @@ def test_dbscan( from cuml import DBSCAN as cuDBSCAN cuml_dbscan = cuDBSCAN( - eps=eps, min_samples=min_samples, metric=metric, output_type="numpy", verbose=7 + eps=eps, + min_samples=min_samples, + metric=metric, + algorithm=algorithm, + output_type="numpy", + verbose=7, ) import cudf From 793851a13902d1015fb9f2ac59dc2a534ead252a Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 17 Apr 2024 07:54:27 +0800 Subject: [PATCH 15/31] [data-gen] add example in the help (#621) Signed-off-by: Bobby Wang --- python/benchmark/gen_data.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/python/benchmark/gen_data.py b/python/benchmark/gen_data.py index fb859997..26b170aa 100644 --- a/python/benchmark/gen_data.py +++ b/python/benchmark/gen_data.py @@ -465,15 +465,27 @@ def main(registered_data_gens: Dict[str, Any], repartition: bool) -> None: parser = argparse.ArgumentParser( description="Generate random dataset.", - usage="""gen_data.py [] - - Supported types are: - blobs Generate random blobs datasets using sklearn's make_blobs - regression Generate random regression datasets using sklearn's make_regression - classification Generate random classification datasets using sklearn's make_classification - low_rank_matrix Generate random dataset using sklearn's make_low_rank_matrix - sparse_regression Generate random sparse regression datasets stored as 
sparse vectors - default Generate default dataset using pyspark RandomRDDs.uniformVectorRDD + usage="""python gen_data_distributed.py [] + +Supported types are: + blobs Generate random blobs datasets using sklearn's make_blobs + regression Generate random regression datasets using sklearn's make_regression + classification Generate random classification datasets using sklearn's make_classification + low_rank_matrix Generate random dataset using sklearn's make_low_rank_matrix + sparse_regression Generate random sparse regression datasets stored as sparse vectors + default Generate default dataset using pyspark RandomRDDs.uniformVectorRDD + +Example: +python gen_data_distributed.py [regression|blobs|low_rank_matrix|default|classification|sparse_regression] \\ + --feature_type array \\ + --num_rows 5000 \\ + --num_cols 3000 \\ + --dtype "float64" \\ + --output_num_files 100 \\ + --overwrite \\ + --output_dir "./5k_3k_float64.parquet" \\ + --spark_confs "spark.master=local[*]" \\ + --spark_confs "spark.driver.memory=128g" """, ) parser.add_argument("type", help="Generate random dataset") From d487be33358109adb324f055b5dec4661563f371 Mon Sep 17 00:00:00 2001 From: eordentlich Date: Tue, 16 Apr 2024 19:58:36 -0700 Subject: [PATCH 16/31] work around to support pyspark 3.3 with rapids 24.04 (#623) Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/src/spark_rapids_ml/__init__.py b/python/src/spark_rapids_ml/__init__.py index f368b6ca..68826205 100644 --- a/python/src/spark_rapids_ml/__init__.py +++ b/python/src/spark_rapids_ml/__init__.py @@ -14,3 +14,15 @@ # limitations under the License. # __version__ = "24.04.0" + +import pandas as pd +import pyspark + +# patch pandas 2.0+ for backward compatibility with psypark < 3.4 +from packaging import version + +if version.parse(pyspark.__version__) < version.parse("3.4.0") and version.parse( + pd.__version__ +) >= version.parse("2.0.0"): + pd.DataFrame.iteritems = pd.DataFrame.items + pd.Series.iteritems = pd.Series.items From 9e95ef8b81b0eac67bdbf2e3bebcaf9562a20de0 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Thu, 18 Apr 2024 10:54:38 -0700 Subject: [PATCH 17/31] Databricks notebook readme fix (#627) Signed-off-by: Hongzhe Cheng --- notebooks/databricks/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index ee0be0a2..bd1bb530 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -33,9 +33,16 @@ If you already have a Databricks account, you can run the example notebooks on a ```bash export WS_SAVE_DIR="/path/to/directory/in/workspace" databricks workspace mkdirs ${WS_SAVE_DIR} --profile ${PROFILE} + ``` + For Mac + ```bash databricks workspace import --format AUTO --content $(base64 -i init-pip-cuda-11.8.sh) ${WS_SAVE_DIR}/init-pip-cuda-11.8.sh --profile ${PROFILE} ``` -- Create a cluster using **Databricks 12.2 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**. + For Linux + ```bash + databricks workspace import --format AUTO --content $(base64 -w 0 init-pip-cuda-11.8.sh) ${WS_SAVE_DIR}/init-pip-cuda-11.8.sh --profile ${PROFILE} + ``` +- Create a cluster using **Databricks 13.3 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**. 
- **Init Scripts** - add the workspace path to the uploaded init script, e.g. `${WS_SAVE_DIR}/init-pip-cuda-11.8.sh`. - **Spark** From 49f5069fe574b111b4b281c64ec89be4cdc382ce Mon Sep 17 00:00:00 2001 From: Jinfeng Li Date: Fri, 19 Apr 2024 10:39:18 -0700 Subject: [PATCH 18/31] Support standardization for sparse matrix in logistic regression (#629) * support standardization for sparse vectors per cuml 24.04 * revise test cases to test sparse standardization * revise * revise docstring regarding sparse standardization --------- Signed-off-by: Jinfeng --- python/src/spark_rapids_ml/classification.py | 19 ++---------- python/tests/test_logistic_regression.py | 31 ++++++++------------ 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index df1d8988..aba4c241 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -849,9 +849,7 @@ class LogisticRegression( fitIntercept: Whether to fit an intercept term. standardization: - Whether to standardize the training data. If true, spark rapids ml sets enable_sparse_data_optim=False - to densify sparse vectors into dense vectors for fitting. Currently there is no support for sparse vectors - standardization in cuml yet. + Whether to standardize the training data before fit. num_workers: Number of cuML workers, where each cuML worker corresponds to one Spark task running on one GPU. If not set, spark-rapids-ml tries to infer the number of @@ -945,15 +943,6 @@ def _get_cuml_fit_func( fit_intercept = self.getFitIntercept() logger = get_logger(self.__class__) - if ( - self.getStandardization() is True - and self.getOrDefault("enable_sparse_data_optim") is not False - ): - logger.warning( - ( - "when standardization is True, spark rapids ml forces densifying sparse vectors to dense vectors for training." - ) - ) def _logistic_regression_fit( dfs: FitInputType, @@ -977,17 +966,13 @@ def _logistic_regression_fit( concated, cupyx.scipy.sparse.csr_matrix ) - # densifying sparse vectors into dense to use standardization - if standardization is True and is_sparse is True: - concated = concated.toarray() - pdesc = PartitionDescriptor.build( [concated.shape[0]], params[param_alias.num_cols], ) # Use cupy to standardize dataset as a workaround to gain better numeric stability - standarization_with_cupy = standardization + standarization_with_cupy = standardization and not is_sparse if standarization_with_cupy is True: import cupy as cp diff --git a/python/tests/test_logistic_regression.py b/python/tests/test_logistic_regression.py index 8d5c0551..b6f935c8 100644 --- a/python/tests/test_logistic_regression.py +++ b/python/tests/test_logistic_regression.py @@ -1575,7 +1575,7 @@ def test_sparse_nlp20news( y = twenty_train.target.tolist() conf: Dict[str, Any] = { - # "spark.rapids.ml.uvm.enabled": True # Commenting this out can resolve a cudaMemSet error + "spark.rapids.ml.uvm.enabled": True } # enable memory management to run the test case on GPU with small memory (e.g. 2G) with CleanSparkSession(conf) as spark: data = [ @@ -1624,17 +1624,15 @@ def test_sparse_nlp20news( or abs(gpu_model.objective - cpu_objective) < tolerance ) - # temporarily comment out uvm and compare_model - # assert "CUDA managed memory enabled." 
in caplog.text - # if standardization is True: - # compare_model( - # gpu_model, - # cpu_model, - # df_train, - # unit_tol=tolerance, - # total_tol=tolerance, - # accuracy_and_probability_only=True, - # ) + if standardization is True: + compare_model( + gpu_model, + cpu_model, + df_train, + unit_tol=tolerance, + total_tol=tolerance, + accuracy_and_probability_only=True, + ) @pytest.mark.parametrize("fit_intercept", [True, False]) @@ -1764,10 +1762,6 @@ def test_compat_standardization( blor_model = blor.fit(bdf) - if isinstance(blor, LogisticRegression): - warning_log = "when standardization is True, spark rapids ml forces densifying sparse vectors to dense vectors for training." - assert warning_log in caplog.text - blor_model.setFeaturesCol("features") blor_model.setProbabilityCol("newProbability") blor_model.setRawPredictionCol("newRawPrediction") @@ -1901,7 +1895,8 @@ def train_model(EstimatorClass, ModelClass): # type: ignore @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( - "reg_factors", [(0.0, 0.0), (0.1, 0.0), (0.1, 1.0), (0.1, 0.2)] + "reg_factors", + [(0.0, 0.0), (0.1, 0.0), (0.1, 1.0), (0.1, 0.2)], ) def test_standardization_sparse_example( fit_intercept: bool, @@ -1981,8 +1976,6 @@ def sparse_to_df(X: csr_matrix, y: List[float]) -> DataFrame: cpu_lr = SparkLogisticRegression(**est_params) gpu_model = gpu_lr.fit(df) - warning_log = "when standardization is True, spark rapids ml forces densifying sparse vectors to dense vectors for training." - assert warning_log in caplog.text cpu_model = cpu_lr.fit(df) From 4d58143c4e36d818ec71ed8e9cd18ce0ec456517 Mon Sep 17 00:00:00 2001 From: eordentlich Date: Fri, 19 Apr 2024 12:06:35 -0700 Subject: [PATCH 19/31] add dbscan to api docs (#600) * add dbscan to api docs Signed-off-by: Erik Ordentlich * make consistent with source changes Signed-off-by: Erik Ordentlich --------- Signed-off-by: Erik Ordentlich --- docs/site/compatibility.md | 3 ++- docs/source/spark_rapids_ml.rst | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/site/compatibility.md b/docs/site/compatibility.md index 26443a15..ec5cd6d6 100644 --- a/docs/site/compatibility.md +++ b/docs/site/compatibility.md @@ -11,6 +11,7 @@ The following table shows the currently supported algorithms. The goal is to ex | Supported Algorithms | Python | Scala | | :--------------------- | :----: | :---: | | CrossValidator | √ | | +| DBSCAN (*) | √ | | | KMeans | √ | | | k-NN (*) | √ | | | LinearRegression | √ | | @@ -20,7 +21,7 @@ The following table shows the currently supported algorithms. The goal is to ex | RandomForestRegressor | √ | | | UMAP (*) | √ | | -Note: Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. +Note: Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. 
As an alternative to KMeans, we also provide a Spark API for GPU accelerated Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density based clustering algorithm in the RAPIDS cuML library. ## Supported Versions diff --git a/docs/source/spark_rapids_ml.rst b/docs/source/spark_rapids_ml.rst index 5fcbdb53..03470908 100644 --- a/docs/source/spark_rapids_ml.rst +++ b/docs/source/spark_rapids_ml.rst @@ -57,6 +57,8 @@ Clustering :template: autosummary/class_with_docs.rst :toctree: api + DBSCAN + DBSCANModel KMeans KMeansModel @@ -111,4 +113,4 @@ UMAP :toctree: api UMAP - UMAPModel \ No newline at end of file + UMAPModel From 6d7ff1020647ebc4b4957f9c405836738101101f Mon Sep 17 00:00:00 2001 From: eordentlich Date: Sat, 20 Apr 2024 11:49:13 -0700 Subject: [PATCH 20/31] keep eval related computations and data on gpu (#587) * keep eval related computations and data on gpu Signed-off-by: Erik Ordentlich * extend to regression and rf Signed-off-by: Erik Ordentlich * clean up Signed-off-by: Erik Ordentlich * fix types Signed-off-by: Erik Ordentlich --------- Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/classification.py | 113 ++++++++++++------- python/src/spark_rapids_ml/core.py | 8 +- python/src/spark_rapids_ml/regression.py | 30 +++-- python/src/spark_rapids_ml/tree.py | 32 +++++- 4 files changed, 124 insertions(+), 59 deletions(-) diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index aba4c241..34b33a38 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -111,18 +111,19 @@ class _ClassificationModelEvaluationMixIn: _this_model: Union["RandomForestClassificationModel", "LogisticRegressionModel"] def _get_evaluate_fn(self, eval_metric_info: EvalMetricInfo) -> _EvaluateFunc: - def _evaluate( - input: TransformInputType, - transformed: TransformInputType, - ) -> pd.DataFrame: - # calculate the count of (label, prediction) - # TBD: keep all intermediate transform output on gpu as long as possible to avoid copies - - if eval_metric_info.eval_metric == transform_evaluate_metric.accuracy_like: - comb = pd.DataFrame( + if eval_metric_info.eval_metric == transform_evaluate_metric.accuracy_like: + + def _evaluate( + input: TransformInputType, + transformed: "cp.ndarray", + ) -> pd.DataFrame: + # calculate the count of (label, prediction) + import cudf + + comb = cudf.DataFrame( { "label": input[alias.label], - "prediction": transformed[pred.prediction], + "prediction": transformed, } ) confusion = ( @@ -131,16 +132,19 @@ def _evaluate( .reset_index(name="total") ) + confusion = confusion.to_pandas() + return confusion - else: - # once data is maintained on gpu replace with cuml.metrics.log_loss - from spark_rapids_ml.metrics.MulticlassMetrics import log_loss - _log_loss = log_loss( - np.array(input[alias.label]), - np.array(list(transformed[pred.probability])), - eval_metric_info.eps, - ) + else: + + def _evaluate( + input: TransformInputType, + transformed: "cp.ndarray", + ) -> pd.DataFrame: + from cuml.metrics import log_loss + + _log_loss = log_loss(input[alias.label], transformed, normalize=False) _log_loss_pdf = pd.DataFrame( {"total": [len(input[alias.label])], "log_loss": [_log_loss]} @@ -621,16 +625,26 @@ def _get_cuml_transform_func( ]: _construct_rf, _, _ = super()._get_cuml_transform_func(dataset) - def _predict(rf: CumlT, pdf: TransformInputType) -> pd.Series: - data = {} - rf.update_labels = False - data[pred.prediction] = rf.predict(pdf) + if 
eval_metric_info: + if eval_metric_info.eval_metric == transform_evaluate_metric.log_loss: + + def _predict(rf: CumlT, pdf: TransformInputType) -> "cp.ndarray": + rf.update_labels = False + return rf.predict_proba(pdf) + + else: + + def _predict(rf: CumlT, pdf: TransformInputType) -> "cp.ndarray": + rf.update_labels = False + return rf.predict(pdf) + + else: + + def _predict(rf: CumlT, pdf: TransformInputType) -> pd.Series: + data = {} + rf.update_labels = False + data[pred.prediction] = rf.predict(pdf) - # non log-loss metric doesn't need probs. - if ( - not eval_metric_info - or eval_metric_info.eval_metric == transform_evaluate_metric.log_loss - ): probs = rf.predict_proba(pdf) if isinstance(probs, pd.DataFrame): # For 2302, when input is multi-cols, the output will be DataFrame @@ -639,7 +653,7 @@ def _predict(rf: CumlT, pdf: TransformInputType) -> pd.Series: # should be np.ndarray data[pred.probability] = pd.Series(list(probs)) - return pd.DataFrame(data) + return pd.DataFrame(data) _evaluate = ( self._get_evaluate_fn(eval_metric_info) if eval_metric_info else None @@ -1456,21 +1470,34 @@ def _construct_lr() -> CumlT: self._get_evaluate_fn(eval_metric_info) if eval_metric_info else None ) - def _predict(lr: CumlT, pdf: TransformInputType) -> pd.DataFrame: - import cupy as cp + if eval_metric_info: + if eval_metric_info.eval_metric == transform_evaluate_metric.log_loss: + + def _predict(lr: CumlT, pdf: TransformInputType) -> "cp.ndarray": + + return lr.predict_proba(pdf) + + else: + + def _predict(lr: CumlT, pdf: TransformInputType) -> "cp.ndarray": + + return lr.predict(pdf) + + else: + + def _predict(lr: CumlT, pdf: TransformInputType) -> pd.DataFrame: + import cupy as cp + + data = {} + + scores = lr.decision_function(pdf).T + assert isinstance(scores, cp.ndarray) + _num_classes = max(scores.shape[1] if len(scores.shape) == 2 else 2, 2) + + data[pred.prediction] = pd.Series( + list(_predict_labels(scores, _num_classes).get()) + ) - data = {} - scores = lr.decision_function(pdf).T - assert isinstance(scores, cp.ndarray) - _num_classes = max(scores.shape[1] if len(scores.shape) == 2 else 2, 2) - data[pred.prediction] = pd.Series( - list(_predict_labels(scores, _num_classes).get()) - ) - # non log-loss metric doesn't need probs. 
- if ( - not eval_metric_info - or eval_metric_info.eval_metric == transform_evaluate_metric.log_loss - ): data[pred.probability] = pd.Series( list(_predict_proba(scores, _num_classes).get()) ) @@ -1482,7 +1509,7 @@ def _predict(lr: CumlT, pdf: TransformInputType) -> pd.DataFrame: raw_prediction = scores data[pred.raw_prediction] = pd.Series(list(cp.asnumpy(raw_prediction))) - return pd.DataFrame(data) + return pd.DataFrame(data) return _construct_lr, _predict, _evaluate diff --git a/python/src/spark_rapids_ml/core.py b/python/src/spark_rapids_ml/core.py index ca9c53f1..6069258d 100644 --- a/python/src/spark_rapids_ml/core.py +++ b/python/src/spark_rapids_ml/core.py @@ -85,6 +85,7 @@ if TYPE_CHECKING: import cudf + import cupy as cp from pyspark.ml._typing import ParamMap CumlT = Any @@ -104,13 +105,16 @@ _ConstructFunc = Callable[..., Union[CumlT, List[CumlT]]] # Function to do the inference using cuml instance constructed by _ConstructFunc -_TransformFunc = Callable[[CumlT, TransformInputType], pd.DataFrame] +_TransformFunc = Union[ + Callable[[CumlT, TransformInputType], pd.DataFrame], + Callable[[CumlT, TransformInputType], "cp.ndarray"], +] # Function to do evaluation based on the prediction result got from _TransformFunc _EvaluateFunc = Callable[ [ TransformInputType, # input dataset with label column - TransformInputType, # inferred dataset with prediction column + "cp.ndarray", # inferred dataset with prediction column ], pd.DataFrame, ] diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index fed56792..997c3915 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -77,6 +77,7 @@ from .utils import PartitionDescriptor, _get_spark_session, cudf_to_cuml_array, java_uid if TYPE_CHECKING: + import cupy as cp from pyspark.ml._typing import ParamMap T = TypeVar("T") @@ -143,13 +144,15 @@ def _transform_evaluate( @staticmethod def _calculate_regression_metrics( input: TransformInputType, - transformed: TransformInputType, + transformed: "cp.array", ) -> pd.DataFrame: """calculate the metrics: mean/m2n/m2/l1 ... 
input must have `alias.label` column""" - comb = pd.DataFrame( + import cudf + + comb = cudf.DataFrame( { "label": input[alias.label], "prediction": transformed, @@ -159,10 +162,12 @@ def _calculate_regression_metrics( total_cnt = comb.shape[0] return pd.DataFrame( data={ - reg_metrics.mean: [comb.mean().to_list()], - reg_metrics.m2n: [(comb.var(ddof=0) * total_cnt).to_list()], - reg_metrics.m2: [comb.pow(2).sum().to_list()], - reg_metrics.l1: [comb.abs().sum().to_list()], + reg_metrics.mean: [comb.mean().to_arrow().to_pylist()], + reg_metrics.m2n: [ + (comb.var(ddof=0) * total_cnt).to_arrow().to_pylist() + ], + reg_metrics.m2: [comb.pow(2).sum().to_arrow().to_pylist()], + reg_metrics.l1: [comb.abs().sum().to_arrow().to_pylist()], reg_metrics.total_count: total_cnt, } ) @@ -741,9 +746,16 @@ def _construct_lr() -> CumlT: return lrs - def _predict(lr: CumlT, pdf: TransformInputType) -> pd.Series: - ret = lr.predict(pdf) - return pd.Series(ret) + if eval_metric_info: + + def _predict(lr: CumlT, pdf: TransformInputType) -> "cp.ndarray": + return lr.predict(pdf) + + else: + + def _predict(lr: CumlT, pdf: TransformInputType) -> pd.Series: + ret = lr.predict(pdf) + return pd.Series(ret) return _construct_lr, _predict, self._calculate_regression_metrics diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index fdb5afe4..e87ebd04 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -18,7 +18,18 @@ import math import pickle from abc import abstractmethod -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + Union, + cast, +) import numpy as np import pandas as pd @@ -62,6 +73,9 @@ translate_trees, ) +if TYPE_CHECKING: + import cupy as cp + class _RandomForestClass(_CumlClass): @classmethod @@ -582,10 +596,18 @@ def _construct_rf() -> CumlT: return rfs - def _predict(rf: CumlT, pdf: TransformInputType) -> pd.Series: - rf.update_labels = False - ret = rf.predict(pdf) - return pd.Series(ret) + if eval_metric_info: + + def _predict(rf: CumlT, pdf: TransformInputType) -> "cp.ndarray": + rf.update_labels = False + return rf.predict(pdf) + + else: + + def _predict(rf: CumlT, pdf: TransformInputType) -> pd.Series: + rf.update_labels = False + ret = rf.predict(pdf) + return pd.Series(ret) # TBD: figure out why RF algo's warns regardless of what np array order is set return _construct_rf, _predict, None From 8432305cb1cd7864724fd37cb39d3c6c0c9b2432 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:51:06 -0700 Subject: [PATCH 21/31] DBSCAN support for 64bit indextype (#634) Signed-off-by: Hongzhe Cheng --- python/src/spark_rapids_ml/clustering.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 94f36158..73c36b96 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -934,7 +934,8 @@ def _get_cuml_fit_func( ) idCol_bc = self.idCols_ - raw_data_dc = self.raw_data_ + raw_data_bc = self.raw_data_ + data_size = self.data_size def _cuml_fit( dfs: FitInputType, @@ -951,7 +952,7 @@ def _cuml_fit( else np.concatenate([chunk.value for chunk in idCol_bc]) ) - for pdf_bc in raw_data_dc: + for pdf_bc in raw_data_bc: features = pdf_bc.value # experiments indicate it is faster to convert to numpy 
array and then to cupy array than directly @@ -980,7 +981,9 @@ def _cuml_fit( dbscan.n_cols = params[param_alias.num_cols] dbscan.dtype = np.dtype(dtype) - res = list(dbscan.fit_predict(concated).to_numpy()) + # Set out_dtype tp 64bit to get larger indexType in cuML for avoiding overflow + out_dtype = np.int32 if data_size < 2147000000 else np.int64 + res = list(dbscan.fit_predict(concated, out_dtype=out_dtype).to_numpy()) # Only node 0 from cuML will contain the correct label output if partition_id == 0: @@ -1044,6 +1047,7 @@ def _chunk_arr( order=self._fit_array_order(), ) + self.data_size = len(raw_data) * len(raw_data[0]) idCols: np.ndarray = np.array(pd_dataset[self.getIdCol()]) # Set input metadata From 96de8d29d21092bd8279dae88ff8012a68a0d5f4 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Wed, 1 May 2024 11:48:07 -0700 Subject: [PATCH 22/31] Log print fix for logistic regression (#636) Signed-off-by: Hongzhe Cheng --- python/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run_benchmark.sh b/python/run_benchmark.sh index 1419b6df..f6b4a3f4 100755 --- a/python/run_benchmark.sh +++ b/python/run_benchmark.sh @@ -423,7 +423,7 @@ if [[ "${MODE}" =~ "logistic_regression" ]] || [[ "${MODE}" == "all" ]]; then family="Multinomial" fi - echo "$sep algo: sparse ${family} logistic regression - elasticnet regularization $sep" + echo "$sep algo: ${family} logistic regression - elasticnet regularization $sep" python ./benchmark/benchmark_runner.py logistic_regression \ --standardization False \ --maxIter 200 \ From 9f9b4e6600bd75c147f52ce24f0a3b0014ed7168 Mon Sep 17 00:00:00 2001 From: Er1cCheng <107245098+Er1cCheng@users.noreply.github.com> Date: Fri, 3 May 2024 09:13:37 -0700 Subject: [PATCH 23/31] DBSCAN notebook Twitter exmaple (#638) * Twitter DBSCAN exmaple Signed-off-by: Hongzhe Cheng * Parquet Save --------- Signed-off-by: Hongzhe Cheng --- notebooks/dbscan.ipynb | 106 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/notebooks/dbscan.ipynb b/notebooks/dbscan.ipynb index b8b24b37..e9b198cc 100644 --- a/notebooks/dbscan.ipynb +++ b/notebooks/dbscan.ipynb @@ -481,6 +481,110 @@ "plt.grid(True)\n", "plt.show()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Twitter Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download Data and Store to Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Full dataset\n", + "# !curl --output twitter.h5.h5 https://b2share.eudat.eu/api/files/189c8eaf-d596-462b-8a07-93b5922c4a9f/twitter.h5.h5\n", + "\n", + "# Partial small dataset\n", + "!curl --output twitterSmall.h5.h5 https://b2share.eudat.eu/api/files/189c8eaf-d596-462b-8a07-93b5922c4a9f/twitterSmall.h5.h5\n", + "\n", + "import h5py\n", + "import pyarrow\n", + "import pyarrow.parquet as pq\n", + "\n", + "with h5py.File('twitterSmall.h5.h5', 'r') as f: \n", + " data = f[\"DBSCAN\"][:]\n", + "\n", + "df=pd.DataFrame(data, columns=['f1', 'f2'])\n", + "arrow_table = pyarrow.Table.from_pandas(df)\n", + "\n", + "# REMEMBER to change the dbfs path to your designated space\n", + "# Or to local like \"./twitter.parquet\"\n", + "dbfs_path = \"/dbfs/temp/twitter.parquet\"\n", + "pq.write_table(arrow_table, dbfs_path)\n", + "\n", + "df = spark.read.parquet(dbfs_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"### Run DBSCAN over Twitter Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "start_time = time.time()\n", + "\n", + "eps = 0.1\n", + "gpu_dbscan = DBSCAN(eps=eps, min_samples=40, metric=\"euclidean\")\n", + "gpu_dbscan.setFeaturesCols([\"f1\", \"f2\"])\n", + "gpu_model = gpu_dbscan.fit(df)\n", + "gpu_model.setPredictionCol(\"prediction\")\n", + "transformed = gpu_model.transform(df)\n", + "transformed.show()\n", + "\n", + "end_time = time.time()\n", + "elapsed_time = (end_time - start_time)\n", + "\n", + "print(\"Time\", elapsed_time)\n", + "\n", + "dbscan_np = transformed.toPandas().to_numpy()\n", + "\n", + "n_cluster = max(dbscan_np[:,2])\n", + "clusters = [[[],[]] for i in range(int(n_cluster) + 1)]\n", + "\n", + "for p in dbscan_np:\n", + " if int(p[2]) == -1:\n", + " continue\n", + "\n", + " clusters[int(p[2])][0].append(p[0])\n", + " clusters[int(p[2])][1].append(p[1])\n", + "\n", + "clusters = sorted(clusters, key=lambda x: len(x[0]), reverse=True)\n", + "print(\"Number of clusters: \", len(clusters))\n", + "\n", + "for i, c in enumerate(clusters):\n", + " plt.scatter(c[0], c[1], s=0.5, label=f\"cluster {i}\")\n", + " \n", + "plt.xlabel('X')\n", + "plt.ylabel('Y')\n", + "plt.title(f'Twitter API Geo Clusters with DBSCAN eps={eps}')\n", + "plt.show()\n", + "# plt.savefig('plot.png', dpi=1200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -509,7 +613,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.9.19" }, "vscode": { "interpreter": { From 97826c33e37c215ca3aa47b9aade3850fdb07186 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 8 May 2024 02:12:46 +0800 Subject: [PATCH 24/31] Support stage-level scheduling for Yarn and K8s (#640) Signed-off-by: Bobby Wang --- python/src/spark_rapids_ml/core.py | 11 +-- python/tests/test_common_estimator.py | 97 ++++++++++++++++++++------- 2 files changed, 81 insertions(+), 27 deletions(-) diff --git a/python/src/spark_rapids_ml/core.py b/python/src/spark_rapids_ml/core.py index 6069258d..5d60f8d4 100644 --- a/python/src/spark_rapids_ml/core.py +++ b/python/src/spark_rapids_ml/core.py @@ -905,10 +905,13 @@ def _skip_stage_level_scheduling(self, spark_version: str, conf: SparkConf) -> b ) return True - if not _is_standalone_or_localcluster(conf): + if "3.4.0" <= spark_version < "3.5.1" and not _is_standalone_or_localcluster( + conf + ): self.logger.info( - "Stage-level scheduling in spark-rapids-ml requires spark standalone or " - "local-cluster mode" + "For Spark %s, Stage-level scheduling in spark-rapids-ml requires spark " + "standalone or local-cluster mode", + spark_version, ) return True @@ -956,7 +959,7 @@ def _try_stage_level_scheduling(self, rdd: RDD) -> RDD: ss = _get_spark_session() sc = ss.sparkContext - if self._skip_stage_level_scheduling(ss.version, sc.getConf()): + if _is_local(sc) or self._skip_stage_level_scheduling(ss.version, sc.getConf()): return rdd # executor_cores will not be None diff --git a/python/tests/test_common_estimator.py b/python/tests/test_common_estimator.py index 4a2d1bc4..a5adb03f 100644 --- a/python/tests/test_common_estimator.py +++ b/python/tests/test_common_estimator.py @@ -532,56 +532,107 @@ def test_num_workers_validation() -> None: def test_stage_level_scheduling() -> None: dummy = SparkRapidsMLDummy() - assert dummy._skip_stage_level_scheduling("3.3.1", SparkConf()) - 
conf = SparkConf().setMaster("yarn") - assert dummy._skip_stage_level_scheduling("3.4.0", conf) + standalone_conf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") + ) - # lack of executor cores/gpu configuration => skip - conf = SparkConf().setMaster("spark://foo") - assert dummy._skip_stage_level_scheduling("3.4.0", conf) + # the correct configurations should not skip stage-level scheduling + assert not dummy._skip_stage_level_scheduling("3.4.0", standalone_conf) + assert not dummy._skip_stage_level_scheduling("3.4.1", standalone_conf) + assert not dummy._skip_stage_level_scheduling("3.5.0", standalone_conf) + assert not dummy._skip_stage_level_scheduling("3.5.1", standalone_conf) + + # spark version < 3.4.0 + assert dummy._skip_stage_level_scheduling("3.3.0", standalone_conf) + + # spark.executor.cores is not set + bad_conf = ( + SparkConf() + .setMaster("spark://foo") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") + ) + assert dummy._skip_stage_level_scheduling("3.4.0", bad_conf) - # spark.executor.cores=1 => skip - conf = ( + # spark.executor.cores=1 + bad_conf = ( SparkConf() .setMaster("spark://foo") .set("spark.executor.cores", "1") + .set("spark.task.cpus", "1") .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", "0.08") ) - assert dummy._skip_stage_level_scheduling("3.4.0", conf) + assert dummy._skip_stage_level_scheduling("3.4.0", bad_conf) - # spark.executor.resource.gpu.amount > 1 => skip - conf = ( + # spark.executor.resource.gpu.amount is not set + bad_conf = ( SparkConf() .setMaster("spark://foo") .set("spark.executor.cores", "12") - .set("spark.executor.resource.gpu.amount", "2") + .set("spark.task.cpus", "1") + .set("spark.task.resource.gpu.amount", "0.08") ) - assert dummy._skip_stage_level_scheduling("3.4.0", conf) + assert dummy._skip_stage_level_scheduling("3.4.0", bad_conf) - # executor.gpu.amount = task.gpu.amount => skip - conf = ( + # spark.executor.resource.gpu.amount>1 + bad_conf = ( SparkConf() .setMaster("spark://foo") .set("spark.executor.cores", "12") - .set("spark.executor.resource.gpu.amount", "1") - .set("spark.task.resource.gpu.amount", "1") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "2") + .set("spark.task.resource.gpu.amount", "0.08") ) - assert dummy._skip_stage_level_scheduling("3.4.0", conf) + assert dummy._skip_stage_level_scheduling("3.4.0", bad_conf) - conf = ( + # spark.task.resource.gpu.amount is not set + bad_conf = ( SparkConf() .setMaster("spark://foo") .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") .set("spark.executor.resource.gpu.amount", "1") ) - assert not dummy._skip_stage_level_scheduling("3.4.0", conf) + assert not dummy._skip_stage_level_scheduling("3.4.0", bad_conf) - conf = ( + # spark.task.resource.gpu.amount=1 + bad_conf = ( SparkConf() .setMaster("spark://foo") .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") .set("spark.executor.resource.gpu.amount", "1") - .set("spark.task.resource.gpu.amount", "0.08") + .set("spark.task.resource.gpu.amount", "1") ) - assert not dummy._skip_stage_level_scheduling("3.4.0", conf) + assert dummy._skip_stage_level_scheduling("3.4.0", bad_conf) + + # For Yarn and K8S + for mode in ["yarn", "k8s://"]: + for gpu_amount in ["0.08", "0.2", "1.0"]: + conf = 
( + SparkConf() + .setMaster(mode) + .set("spark.executor.cores", "12") + .set("spark.task.cpus", "1") + .set("spark.executor.resource.gpu.amount", "1") + .set("spark.task.resource.gpu.amount", gpu_amount) + ) + assert dummy._skip_stage_level_scheduling("3.3.0", conf) + assert dummy._skip_stage_level_scheduling("3.4.0", conf) + assert dummy._skip_stage_level_scheduling("3.4.1", conf) + assert dummy._skip_stage_level_scheduling("3.5.0", conf) + + # This will be fixed when spark 4.0.0 is released. + if gpu_amount == "1.0": + assert dummy._skip_stage_level_scheduling("3.5.1", conf) + else: + # Starting from 3.5.1+, stage-level scheduling is working for Yarn and K8s + assert not dummy._skip_stage_level_scheduling("3.5.1", conf) From c12d1b4469c1b0059b571edd387673d2e5e7ec3c Mon Sep 17 00:00:00 2001 From: Jinfeng Li Date: Wed, 8 May 2024 14:31:47 -0700 Subject: [PATCH 25/31] Add Approximate Nearest Neighbors algorithm (i.e. IVF_FLAT) from Cuml (#630) * Get toy example working * square ivfflat dists, add join API, add test with/without setting idCol * test key APIs, and add parametrize * fix a bug relates to returned id * move dictionary typeconverter to a class * revised per comments that can be easily addressed * remove brute option from approximatenearestneighbors * add example and docstring to Estimator class and tested the examples in pyspark shell * add docstring to kneighbors and approxSimilarityJoin * reuse code: exact knn working * get ann working, runslow tested * test getter setter * support metric argument and all values except cosine and correlation * fix mypy error --------- Signed-off-by: Jinfeng --- python/src/spark_rapids_ml/core.py | 13 +- python/src/spark_rapids_ml/knn.py | 751 ++++++++++++++++-- .../test_approximate_nearest_neighbors.py | 297 +++++++ python/tests/test_nearest_neighbors.py | 83 +- 4 files changed, 1060 insertions(+), 84 deletions(-) create mode 100644 python/tests/test_approximate_nearest_neighbors.py diff --git a/python/src/spark_rapids_ml/core.py b/python/src/spark_rapids_ml/core.py index 5d60f8d4..6303240c 100644 --- a/python/src/spark_rapids_ml/core.py +++ b/python/src/spark_rapids_ml/core.py @@ -1349,10 +1349,17 @@ def _transform_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame: ) # TODO try to concatenate all the data and do the transform. + has_row_number = None for pdf in pdf_iter: + if has_row_number is None: + has_row_number = True if alias.row_number in pdf.columns else False + else: + assert has_row_number == (alias.row_number in pdf.columns) + for index, cuml_object in enumerate(cuml_objects): - # Transform the dataset - if use_sparse_array: + if has_row_number: + data = cuml_transform_func(cuml_object, pdf) + elif use_sparse_array: features = _read_csr_matrix_from_unwrapped_spark_vec( pdf[select_cols] ) @@ -1362,10 +1369,12 @@ def _transform_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame: else: nparray = np.array(list(pdf[select_cols[0]]), order=array_order) data = cuml_transform_func(cuml_object, nparray) + # Evaluate the dataset if necessary. 
if evaluate_func is not None: data = evaluate_func(pdf, data) data[pred.model_index] = index + yield data return dataset.mapInPandas(_transform_udf, schema=schema) # type: ignore diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 7855962d..fc55f59f 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -15,12 +15,14 @@ # import asyncio -from abc import ABCMeta +from abc import ABCMeta, abstractmethod from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import numpy as np import pandas as pd from pyspark import keyword_only +from pyspark.broadcast import Broadcast +from pyspark.ml import Estimator from pyspark.ml.functions import vector_to_array from pyspark.ml.linalg import VectorUDT from pyspark.ml.param.shared import ( @@ -49,6 +51,7 @@ FitInputType, _ConstructFunc, _CumlCaller, + _CumlEstimator, _CumlEstimatorSupervised, _CumlModel, _EvaluateFunc, @@ -58,7 +61,7 @@ ) from .metrics import EvalMetricInfo from .params import HasIDCol, P, _CumlClass, _CumlParams -from .utils import _concat_and_free, get_logger +from .utils import _concat_and_free, _get_spark_session, get_logger class NearestNeighborsClass(_CumlClass): @@ -105,6 +108,12 @@ def setK(self: P, value: int) -> P: self._set_params(k=value) return self + def getK(self: P) -> int: + """ + Get the value of `k`. + """ + return self.getOrDefault("k") + def setInputCol(self: P, value: Union[str, List[str]]) -> P: """ Sets the value of :py:attr:`inputCol` or :py:attr:`inputCols`. @@ -340,9 +349,111 @@ def read(cls) -> MLReader: ) -class NearestNeighborsModel( - _CumlCaller, _CumlModel, NearestNeighborsClass, _NearestNeighborsCumlParams -): +class _NNModelBase(_CumlModel, _NearestNeighborsCumlParams): + + def _transform(self, dataset: DataFrame) -> DataFrame: + raise NotImplementedError( + f"{self.__class__} does not provide a transform function. Use 'kneighbors' instead." + ) + + def _get_cuml_transform_func( + self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None + ) -> Tuple[ + _ConstructFunc, + _TransformFunc, + Optional[_EvaluateFunc], + ]: + raise NotImplementedError( + "'_CumlModel._get_cuml_transform_func' method is not implemented. Use 'kneighbors' instead." + ) + + @abstractmethod + def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]: + raise NotImplementedError() + + def exactNearestNeighborsJoin( + self, + query_df: DataFrame, + distCol: str = "distCol", + ) -> DataFrame: + """ + This function returns the k exact nearest neighbors (knn) in item_df of each query vector in query_df. + item_df is the dataframe passed to the fit function of the NearestNeighbors estimator. + Note that the knn relationship is asymmetric with respect to the input datasets (e.g., if x is a knn of y + , y is not necessarily a knn of x). + + Parameters + ---------- + query_df: pyspark.sql.DataFrame + the query_df dataframe. Each row represents a query vector. + + distCol: str + the name of the output distance column + + Returns + ------- + knnjoin_df: pyspark.sql.DataFrame + the result dataframe that has three columns (item_df, query_df, distCol). + item_df column is of struct type that includes as fields all the columns of input item dataframe. + Similarly, query_df column is of struct type that includes as fields all the columns of input query dataframe. + distCol is the distance column. 
A row in knnjoin_df is in the format (v1, v2, dist(v1, v2)), + where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2). + """ + + id_col_name = self.getIdCol() + + # call kneighbors then prepare return results + (item_df_withid, query_df_withid, knn_df) = self.kneighbors(query_df) + + from pyspark.sql.functions import arrays_zip, col, explode, struct + + knn_pair_df = knn_df.select( + f"query_{id_col_name}", + explode(arrays_zip("indices", "distances")).alias("zipped"), + ).select( + f"query_{id_col_name}", + col("zipped.indices").alias(f"item_{id_col_name}"), + col("zipped.distances").alias(distCol), + ) + + item_df_struct = item_df_withid.select(struct("*").alias("item_df")) + query_df_struct = query_df_withid.select(struct("*").alias("query_df")) + + knnjoin_df = item_df_struct.join( + knn_pair_df, + item_df_struct[f"item_df.{id_col_name}"] + == knn_pair_df[f"item_{id_col_name}"], + ) + knnjoin_df = knnjoin_df.join( + query_df_struct, + knnjoin_df[f"query_{id_col_name}"] + == query_df_struct[f"query_df.{id_col_name}"], + ) + + if self.isSet(self.idCol): + knnjoin_df = knnjoin_df.select("item_df", "query_df", distCol) + else: + knnjoin_df = knnjoin_df.select( + knnjoin_df["item_df"].dropFields(id_col_name).alias("item_df"), + knnjoin_df["query_df"].dropFields(id_col_name).alias("query_df"), + distCol, + ) + + return knnjoin_df + + def write(self) -> MLWriter: + raise NotImplementedError( + f"{self.__class__} does not support saving/loading, just re-fit the estimator to re-create a model." + ) + + @classmethod + def read(cls) -> MLReader: + raise NotImplementedError( + f"{cls} does not support loading/loading, just re-fit the estimator to re-create a model." + ) + + +class NearestNeighborsModel(_CumlCaller, _NNModelBase, NearestNeighborsClass): def __init__( self, item_df_withid: DataFrame, @@ -590,11 +701,499 @@ async def do_allGather() -> List[str]: return _cuml_fit - def _transform(self, dataset: DataFrame) -> DataFrame: + +class ApproximateNearestNeighborsClass(_CumlClass): + + @classmethod + def _param_mapping(cls) -> Dict[str, Optional[str]]: + return { + "k": "n_neighbors", + "algorithm": "algorithm", + "metric": "metric", + "algoParams": "algo_params", + } + + def _get_cuml_params_default(self) -> Dict[str, Any]: + return { + "n_neighbors": 5, + "verbose": False, + "algorithm": "ivfflat", + "metric": "euclidean", + "algo_params": None, + } + + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + + +class DictTypeConverters(TypeConverters): + @staticmethod + def _toDict(value: Any) -> Dict[str, Any]: + """ + Convert a value to a Dict type for Param typeConverter, if possible. 
+ """ + if isinstance(value, Dict): + return {TypeConverters.toString(k): v for k, v in value.items()} + raise TypeError("Could not convert %s to Dict[str, Any]" % value) + + +class _ApproximateNearestNeighborsParams(_NearestNeighborsCumlParams): + def __init__(self) -> None: + super().__init__() + self._setDefault(algorithm="ivfflat") + self._setDefault(algoParams=None) + self._setDefault(metric="euclidean") + + algorithm = Param( + Params._dummy(), + "algorithm", + "The algorithm to use for approximate nearest neighbors search.", + typeConverter=TypeConverters.toString, + ) + + algoParams = Param( + Params._dummy(), + "algoParams", + "The parameters to use to set up a neighbor algorithm.", + typeConverter=DictTypeConverters._toDict, + ) + + metric = Param( + Params._dummy(), + "metric", + "The distance metric to use.", + typeConverter=TypeConverters.toString, + ) + + def setAlgorithm(self: P, value: str) -> P: + """ + Sets the value of `algorithm`. + """ + assert value == "ivfflat", "Only IVFFLAT algorithm is currently supported" + self._set_params(algorithm=value) + return self + + def getAlgorithm(self: P) -> str: + """ + Gets the value of `algorithm`. + """ + return self.getOrDefault("algorithm") + + def setAlgoParams(self: P, value: Dict[str, Any]) -> P: + """ + Sets the value of `algoParams`. + """ + self._set_params(algoParams=value) + return self + + def getAlgoParams(self: P) -> Dict[str, Any]: + """ + Gets the value of `algoParams`. + """ + return self.getOrDefault("algoParams") + + def setMetric(self: P, value: str) -> P: + """ + Sets the value of `metric`. + """ + self._set_params(metric=value) + return self + + def getMetric(self: P) -> str: + """ + Gets the value of `metric`. + """ + return self.getOrDefault("metric") + + +class ApproximateNearestNeighbors( + ApproximateNearestNeighborsClass, _CumlEstimator, _ApproximateNearestNeighborsParams +): + """ + ApproximateNearestNeighbors retrieves k approximate nearest neighbors (ANNs) in item vectors for each query. + The key APIs are similar to the NearestNeighbor class which returns the exact k nearest neighbors. + The ApproximateNearestNeighbors is currently built on the IVFFLAT algorithm of cuML, and is expected to support + other algorithms such as IVFPQ. + + IVFFLAT algorithm trains a set of kmeans centers, then partition every item vector to the closest center. In the query processing + phase, a query will be partitioned into a number of closest centers, and probe all the items associated with those centers. In + the end the top k closest items will be returned as the approximate nearest neighbors. + + The current implementation build kmeans index independently on each data partition (or maxRecordsPerBatch if Arrow is enabled) of item_df. + Queries will be broadcast to all GPUs, then every query probes closest centers on individual index. Local topk results will be aggregated to obtain + global topk ANNs. + + + Parameters + ---------- + k: int (default = 5) + the default number of approximate nearest neighbors to retrieve for each query. + + algorithm: str (default = 'ivfflat') + the algorithm parameter to be passed into cuML. It currently must be 'ivfflat'. Other algorithms such as + 'ivfpq' are expected to be supported later. + + algoParams: Optional[Dict[str, Any]] (default = None) + if set, algoParam is used to configure the algorithm, on each data partition (or maxRecordsPerBatch if Arrow is enabled) of the item_df. 
+ Note this class constructs the kmeans index independently on individual data partition (or maxRecordPerBatch if Arrow is enabled). + + When algorithm is 'ivfflat': + *nlist: (int) number of kmeans clusters to partition the dataframe into. + *nprobe: (int) number of closest clusters to probe for topk ANNs. + + metric: str (default = "euclidean") + the distance metric to use. 'ivfflat' algorithm supports ['euclidean', 'sqeuclidean', 'l2', 'inner_product']. + + inputCol: str or List[str] + The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n + * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. + * When the value is a list of strings, the feature columns must be numeric types. + + idCol: str + the name of the column in a dataframe that uniquely identifies each vector. idCol should be set + if such a column exists in the dataframe. If idCol is not set, a column with the name `unique_id` + will be automatically added to the dataframe and used as unique identifier for each vector. + + verbose: + Logging level. + * ``0`` - Disables all log messages. + * ``1`` - Enables only critical messages. + * ``2`` - Enables all messages up to and including errors. + * ``3`` - Enables all messages up to and including warnings. + * ``4 or False`` - Enables all messages up to and including information messages. + * ``5 or True`` - Enables all messages up to and including debug messages. + * ``6`` - Enables all messages up to and including trace messages. + + Examples + -------- + >>> from spark_rapids_ml.knn import ApproximateNearestNeighbors + >>> data = [(0, [0.0, 0.0]), + ... (1, [1.0, 1.0]), + ... (2, [2.0, 2.0]), + ... (3, [30.0, 30.0]), + ... (4, [40.0, 40.0]), + ... (5, [50.0, 50.0]),] + >>> data_df = spark.createDataFrame(data, schema="id int, features array") + >>> data_df = data_df.repartition(2) # ensure each partition having more data vectors than the 'nlist' of 'ivfflat' + >>> query = [(10, [0.0, 0.0]), + ... 
(11, [50.0, 50.0]),] + >>> query_df = spark.createDataFrame(query, schema="id int, features array") + >>> topk = 2 + >>> gpu_knn = ApproximateNearestNeighbors().setAlgorithm('ivfflat').setAlgoParams({"nlist" : 2, "nprobe": 1}) + >>> gpu_knn = gpu_knn.setInputCol("features").setIdCol("id").setK(topk) + >>> gpu_model = gpu_knn.fit(data_df) + >>> (data_df, query_df, knn_df) = gpu_model.kneighbors(query_df) + >>> knn_df.show() + +--------+-------+----------------+ + |query_id|indices| distances| + +--------+-------+----------------+ + | 10| [0, 1]|[0.0, 1.4142134]| + | 11| [5, 4]|[0.0, 14.142137]| + +--------+-------+----------------+ + >>> data_df.show() + +---+------------+ + | id| features| + +---+------------+ + | 0| [0.0, 0.0]| + | 1| [1.0, 1.0]| + | 4|[40.0, 40.0]| + | 2| [2.0, 2.0]| + | 3|[30.0, 30.0]| + | 5|[50.0, 50.0]| + +---+------------+ + + >>> query_df.show() + +---+------------+ + | id| features| + +---+------------+ + | 10| [0.0, 0.0]| + | 11|[50.0, 50.0]| + +---+------------+ + + >>> knnjoin_df = gpu_model.approxSimilarityJoin(query_df, distCol="EuclideanDistance") + +-----------------+------------------+-----------------+ + | item_df| query_df|EuclideanDistance| + +-----------------+------------------+-----------------+ + | {0, [0.0, 0.0]}| {10, [0.0, 0.0]}| 0.0| + | {1, [1.0, 1.0]}| {10, [0.0, 0.0]}| 1.4142134| + |{5, [50.0, 50.0]}|{11, [50.0, 50.0]}| 0.0| + |{4, [40.0, 40.0]}|{11, [50.0, 50.0]}| 14.142137| + +-----------------+------------------+-----------------+ + + + >>> # vector column input + >>> from spark_rapids_ml.knn import ApproximateNearestNeighbors + >>> from pyspark.ml.linalg import Vectors + >>> data = [(0, Vectors.dense([0.0, 0.0])), + ... (1, Vectors.dense([1.0, 1.0])), + ... (2, Vectors.dense([2.0, 2.0])), + ... (3, Vectors.dense([30.0, 30.0])), + ... (4, Vectors.dense([40.0, 40.0])), + ... (5, Vectors.dense([50.0, 50.0])),] + >>> data_df = spark.createDataFrame(data, ["id", "features"]).repartition(2) + >>> query = [(10, Vectors.dense([0.0, 0.0])), + ... (11, Vectors.dense([50.0, 50.0])),] + >>> query_df = spark.createDataFrame(query, ["id", "features"]) + >>> topk = 2 + >>> gpu_knn = ApproximateNearestNeighbors().setAlgorithm('ivfflat').setAlgoParams({"nlist" : 2, "nprobe": 1}) + >>> gpu_knn = gpu_knn.setInputCol("features").setIdCol("id").setK(topk) + >>> gpu_model = gpu_knn.fit(data_df) + >>> (data_df, query_df, knn_df) = gpu_model.kneighbors(query_df) + >>> knn_df.show() + + + >>> # multi-column input + >>> from spark_rapids_ml.knn import ApproximateNearestNeighbors + >>> data = [(0, 0.0, 0.0), + ... (1, 1.0, 1.0), + ... (2, 2.0, 2.0), + ... (3, 30.0, 30.0), + ... (4, 40.0, 40.0), + ... (5, 50.0, 50.0),] + >>> data_df = spark.createDataFrame(data, schema="id int, f1 float, f2 float").repartition(2) + >>> query = [(10, 0.0, 0.0), + ... 
(11, 50.0, 50.0),] + >>> query_df = spark.createDataFrame(query, schema="id int, f1 float, f2 float") + >>> topk = 2 + >>> gpu_knn = ApproximateNearestNeighbors().setAlgorithm('ivfflat').setAlgoParams({"nlist" : 2, "nprobe": 1}) + >>> gpu_knn = gpu_knn.setInputCols(["f1", "f2"]).setIdCol("id").setK(topk) + >>> gpu_model = gpu_knn.fit(data_df) + >>> (data_df, query_df, knn_df) = gpu_model.kneighbors(query_df) + >>> knn_df.show() + """ + + @keyword_only + def __init__( + self, + *, + k: Optional[int] = None, + algorithm: str = "ivfflat", + metric: str = "euclidean", + algoParams: Optional[Dict[str, Any]] = None, + inputCol: Optional[Union[str, List[str]]] = None, + idCol: Optional[str] = None, + verbose: Union[int, bool] = False, + **kwargs: Any, + ) -> None: + super().__init__() + assert algorithm in {"ivfflat"}, "currently only ivfflat algorithm is supported" + self._set_params(**self._input_kwargs) + + def _fit(self, item_df: DataFrame) -> "ApproximateNearestNeighborsModel": # type: ignore + self._item_df_withid = self._ensureIdCol(item_df) + + model = self._create_pyspark_model( + Row( + item_df_withid=self._item_df_withid, + ) + ) + model._float32_inputs = self._float32_inputs + self._copyValues(model) + self._copy_cuml_params(model) # type: ignore + return model + + def _create_pyspark_model(self, result: Row) -> "ApproximateNearestNeighborsModel": # type: ignore + return ApproximateNearestNeighborsModel._from_row(result) + + def _out_schema(self) -> Union[StructType, str]: # type: ignore + """ + This class overrides _fit and will not call _out_schema. + """ + pass + + def _get_cuml_fit_func(self, dataset: DataFrame) -> Callable[ # type: ignore + [FitInputType, Dict[str, Any]], + Dict[str, Any], + ]: + """ + This class overrides _fit and will not call _get_cuml_fit_func. + """ + pass + + def write(self) -> MLWriter: raise NotImplementedError( - "NearestNeighborsModel does not provide a transform function. Use 'kneighbors' instead." + "ApproximateNearestNeighbors does not support saving/loading, just re-create the estimator." ) + @classmethod + def read(cls) -> MLReader: + raise NotImplementedError( + "ApproximateNearestNeighbors does not support saving/loading, just re-create the estimator." + ) + + +class ApproximateNearestNeighborsModel( + ApproximateNearestNeighborsClass, _NNModelBase, _ApproximateNearestNeighborsParams +): + def __init__( + self, + item_df_withid: DataFrame, + ): + super().__init__() + + self._item_df_withid = item_df_withid + + self.bcast_qids: Optional[Broadcast] = None + self.bcast_qfeatures: Optional[Broadcast] = None + + def _out_schema(self) -> Union[StructType, str]: # type: ignore + return ( + f"query_{self.getIdCol()} long, indices array, distances array" + ) + + def _pre_process_data( + self, dataset: DataFrame + ) -> Tuple[DataFrame, List[str], bool, List[str]]: + + dataset, select_cols, input_is_multi_cols, tmp_cols = super()._pre_process_data( + dataset + ) + + if self.hasParam("idCol") and self.isDefined("idCol"): + id_col_name = self.getOrDefault("idCol") + dataset = dataset.withColumnRenamed(id_col_name, alias.row_number) + + select_cols.append(alias.row_number) + + return dataset, select_cols, input_is_multi_cols, tmp_cols + + # TODO: should we support dtype? 
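In `kneighbors`, the query set is collected to the driver and broadcast to every executor, so each partition of item vectors can build its own IVF-Flat index and probe it against all queries before the partition-local top-k results are aggregated (see `_broadcast_as_nparray` below). A minimal sketch of that broadcast step in isolation — the session setup, schema, and variable names are illustrative assumptions, not part of the library API:

```python
import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# A small query dataframe; in practice this is the query_df passed to kneighbors().
query_df = spark.createDataFrame(
    [(10, [0.0, 0.0]), (11, [50.0, 50.0])], "id long, features array<float>"
)

# Collect to the driver and convert to NumPy, matching cuML's float32 expectation.
query_pdf = query_df.toPandas()
query_ids = query_pdf["id"].to_numpy()
query_features = np.array(list(query_pdf["features"]), dtype="float32")

# Broadcast both arrays; each task later reads bcast_features.value to probe the
# index it built from its own partition of item vectors.
bcast_ids = spark.sparkContext.broadcast(query_ids)
bcast_features = spark.sparkContext.broadcast(query_features)
```

Because the whole query set has to fit in executor memory, the method below also carries a `BROADCAST_LIMIT` argument (8 GiB by default).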
+ def _broadcast_as_nparray( + self, + query_df_withid: DataFrame, + dtype: Union[str, np.dtype] = "float32", + BROADCAST_LIMIT: int = 8 << 30, + ) -> Tuple[Broadcast, Broadcast]: + """ + broadcast idCol and inputCol/inputCols of a query_df + the broadcast splits an array by the BROADCAST_LIMIT bytes + """ + + query_df_withid, select_cols, input_is_multi_cols, tmp_cols = ( + self._pre_process_data(query_df_withid) + ) + query_id_pd = query_df_withid.select(*select_cols).toPandas() + + id_col = alias.row_number + query_ids = query_id_pd[id_col].to_numpy() # type: ignore + query_pd = query_id_pd.drop(id_col, axis=1) # type: ignore + + if input_is_multi_cols: + assert len(query_pd.columns) == len(self.getInputCols()) + query_features = query_pd.to_numpy() + else: + assert len(query_pd.columns) == 1 + query_features = np.array(list(query_pd[query_pd.columns[0]]), dtype=dtype) + + bcast_qids = _get_spark_session().sparkContext.broadcast(query_ids) + bcast_qfeatures = _get_spark_session().sparkContext.broadcast(query_features) + + return (bcast_qids, bcast_qfeatures) + + @classmethod + def _agg_topk( + cls: Type["ApproximateNearestNeighborsModel"], + knn_df: DataFrame, + id_col_name: str, + indices_col_name: str, + distances_col_name: str, + k: int, + ascending: bool = True, + ) -> DataFrame: + from pyspark.sql.functions import pandas_udf + + @pandas_udf("array") # type: ignore + def func_agg_indices(indices: pd.Series, distances: pd.Series) -> list[int]: + flat_indices = indices.explode().reset_index(drop=True) + flat_distances = ( + distances.explode().reset_index(drop=True).astype("float32") + ) + assert len(flat_indices) == len(flat_distances) + if ascending: + topk_index = flat_distances.nsmallest(k).index + else: + topk_index = flat_distances.nlargest(k).index + + res = flat_indices[topk_index].to_numpy() + return res + + @pandas_udf("array") # type: ignore + def func_agg_distances(distances: pd.Series) -> list[float]: + flat_distances = ( + distances.explode().reset_index(drop=True).astype("float32") + ) + if ascending: + res = flat_distances.nsmallest(k).to_numpy() + else: + res = flat_distances.nlargest(k).to_numpy() + + return res + + res_df = knn_df.groupBy(id_col_name).agg( + func_agg_indices( + knn_df[indices_col_name], knn_df[distances_col_name] + ).alias(indices_col_name), + func_agg_distances(knn_df[distances_col_name]).alias(distances_col_name), + ) + + return res_df + + def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]: + """Return the approximate nearest neighbors for each query in query_df. The data + vectors (or equivalently item vectors) should be provided through the fit + function (see Examples in the spark_rapids_ml.knn.ApproximateNearestNeighbors). The + distance measure here is euclidean distance and the number of target approximate + nearest neighbors can be set through setK(). The function currently only + supports float32 type and will convert other data types into float32. + + Parameters + ---------- + query_df: pyspark.sql.DataFrame + query vectors where each row corresponds to one query. The query_df can be in the + format of a single array column, a single vector column, or multiple float columns. + + Returns + ------- + query_df: pyspark.sql.DataFrame + the query_df itself if it has an id column set through setIdCol(). If not, + a monotonically increasing id column will be added. + + item_df: pyspark.sql.DataFrame + the item_df (or equivalently data_df) itself if it has an id column set + through setIdCol(). 
If not, a monotonically increasing id column will be added. + + knn_df: pyspark.sql.DataFrame + the result k approximate nearest neighbors (ANNs) dataframe that has three + columns (id, indices, distances). Each row of knn_df corresponds to the k-ANNs + result of a query vector, identified by the id column. The indices/distances + column stores the ids/distances of knn item_df vectors. + """ + + query_df_withid = self._ensureIdCol(query_df) + self.bcast_qids, self.bcast_qfeatures = self._broadcast_as_nparray( + query_df_withid + ) + + knn_df = self._transform_evaluate_internal( + self._item_df_withid, schema=self._out_schema() + ) + k = self.getK() + + query_id_col_name = f"query_{self.getIdCol()}" + + ascending = False if self.getMetric() == "inner_product" else True + + knn_df_agg = self.__class__._agg_topk( + knn_df, + query_id_col_name, + "indices", + "distances", + k, + ascending, + ) + + return (self._item_df_withid, query_df_withid, knn_df_agg) + def _get_cuml_transform_func( self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None ) -> Tuple[ @@ -602,20 +1201,90 @@ def _get_cuml_transform_func( _TransformFunc, Optional[_EvaluateFunc], ]: - raise NotImplementedError( - "'_CumlModel._get_cuml_transform_func' method is not implemented. Use 'kneighbors' instead." - ) - def exactNearestNeighborsJoin( + cuml_alg_params = self.cuml_params.copy() + assert cuml_alg_params["metric"] in { + "euclidean", + "sqeuclidean", + "inner_product", + "l2", + } + + def _construct_sgnn() -> CumlT: + + from cuml.neighbors import NearestNeighbors as SGNN + + nn_object = SGNN(output_type="cupy", **cuml_alg_params) + + return nn_object + + row_number_col = alias.row_number + input_col, input_cols = self._get_input_columns() + assert input_col is not None or input_cols is not None + id_col_name = self.getIdCol() + + bcast_qids = self.bcast_qids + bcast_qfeatures = self.bcast_qfeatures + + assert bcast_qids is not None and bcast_qfeatures is not None + + def _transform_internal( + nn_object: CumlT, df: Union[pd.DataFrame, np.ndarray] + ) -> pd.DataFrame: + + item_row_number = df[row_number_col].to_numpy() + item = df.drop(row_number_col, axis=1) # type: ignore + if input_col is not None: + assert len(item.columns) == 1 + item = np.array(list(item[item.columns[0]]), order="C") + + if len(item) == 0: + return pd.DataFrame( + { + f"query_{id_col_name}": [], + "indices": [], + "distances": [], + } + ) + + nn_object.fit(item) + import cupy as cp + + distances, indices = nn_object.kneighbors(bcast_qfeatures.value) + + # Note cuML kneighbors applys an extra square root on the l2 distances. + # Here applies square to obtain the actual l2 distances. + if cuml_alg_params["algorithm"] == "ivfflat": + if ( + cuml_alg_params["metric"] == "euclidean" + or cuml_alg_params["metric"] == "l2" + ): + distances = distances * distances + + indices = indices.get() + indices_global = item_row_number[indices] + + res = pd.DataFrame( + { + f"query_{id_col_name}": bcast_qids.value, + "indices": list(indices_global), + "distances": list(distances.get()), + } + ) + return res + + return _construct_sgnn, _transform_internal, None + + def approxSimilarityJoin( self, query_df: DataFrame, distCol: str = "distCol", ) -> DataFrame: """ - This function returns the k exact nearest neighbors (knn) in item_df of each query vector in query_df. - item_df is the dataframe passed to the fit function of the NearestNeighbors estimator. 
- Note that the knn relationship is asymmetric with respect to the input datasets (e.g., if x is a knn of y - , y is not necessarily a knn of x). + This function returns the k approximate nearest neighbors (k-ANNs) in item_df of each query vector in query_df. + item_df is the dataframe passed to the fit function of the ApproximateNearestNeighbors estimator. + Note that the knn relationship is asymmetric with respect to the input datasets (e.g., if x is a ann of y + , y is not necessarily a ann of x). Parameters ---------- @@ -635,54 +1304,4 @@ def exactNearestNeighborsJoin( where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2). """ - id_col_name = self.getIdCol() - - # call kneighbors then prepare return results - (item_df_withid, query_df_withid, knn_df) = self.kneighbors(query_df) - - from pyspark.sql.functions import arrays_zip, col, explode, struct - - knn_pair_df = knn_df.select( - f"query_{id_col_name}", - explode(arrays_zip("indices", "distances")).alias("zipped"), - ).select( - f"query_{id_col_name}", - col("zipped.indices").alias(f"item_{id_col_name}"), - col("zipped.distances").alias(distCol), - ) - - item_df_struct = item_df_withid.select(struct("*").alias("item_df")) - query_df_struct = query_df_withid.select(struct("*").alias("query_df")) - - knnjoin_df = item_df_struct.join( - knn_pair_df, - item_df_struct[f"item_df.{id_col_name}"] - == knn_pair_df[f"item_{id_col_name}"], - ) - knnjoin_df = knnjoin_df.join( - query_df_struct, - knnjoin_df[f"query_{id_col_name}"] - == query_df_struct[f"query_df.{id_col_name}"], - ) - - if self.isSet(self.idCol): - knnjoin_df = knnjoin_df.select("item_df", "query_df", distCol) - else: - knnjoin_df = knnjoin_df.select( - knnjoin_df["item_df"].dropFields(id_col_name).alias("item_df"), - knnjoin_df["query_df"].dropFields(id_col_name).alias("query_df"), - distCol, - ) - - return knnjoin_df - - def write(self) -> MLWriter: - raise NotImplementedError( - "NearestNeighborsModel does not support saving/loading, just re-fit the estimator to re-create a model." - ) - - @classmethod - def read(cls) -> MLReader: - raise NotImplementedError( - "NearestNeighborsModel does not support loading/loading, just re-fit the estimator to re-create a model." 
- ) + return self.exactNearestNeighborsJoin(query_df, distCol) diff --git a/python/tests/test_approximate_nearest_neighbors.py b/python/tests/test_approximate_nearest_neighbors.py new file mode 100644 index 00000000..b1aa16c8 --- /dev/null +++ b/python/tests/test_approximate_nearest_neighbors.py @@ -0,0 +1,297 @@ +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np +import pandas as pd +import pytest +from _pytest.logging import LogCaptureFixture +from pyspark.sql import DataFrame +from pyspark.sql.functions import col +from pyspark.sql.types import Row +from sklearn.datasets import make_blobs + +from spark_rapids_ml.core import alias +from spark_rapids_ml.knn import ( + ApproximateNearestNeighbors, + ApproximateNearestNeighborsModel, +) + +from .sparksession import CleanSparkSession +from .test_nearest_neighbors import ( + NNEstimator, + NNModel, + func_test_example_no_id, + func_test_example_with_id, + reconstruct_knn_df, +) +from .utils import ( + array_equal, + create_pyspark_dataframe, + get_default_cuml_parameters, + idfn, + pyspark_supported_feature_types, +) + + +def test_default_cuml_params() -> None: + from cuml import NearestNeighbors as CumlNearestNeighbors + + # obtain n_neighbors, verbose, algorithm, algo_params, metric + cuml_params = get_default_cuml_parameters( + [CumlNearestNeighbors], + [ + "handle", + "p", + "metric_expanded", + "metric_params", + "output_type", + ], + ) + + spark_params = ApproximateNearestNeighbors()._get_cuml_params_default() + cuml_params["algorithm"] = "ivfflat" # change cuml default 'auto' to 'ivfflat' + assert cuml_params == spark_params + + +@pytest.mark.parametrize( + "algo_and_params", + [("ivfflat", {"nlist": 1, "nprobe": 2})], +) +@pytest.mark.parametrize( + "func_test", + [func_test_example_no_id, func_test_example_with_id], +) +def test_example( + algo_and_params: Tuple[str, Optional[dict[str, Any]]], + func_test: Callable[[NNEstimator, str], Tuple[NNEstimator, NNModel]], + gpu_number: int, + tmp_path: str, +) -> None: + algorithm = algo_and_params[0] + algoParams = algo_and_params[1] + + gpu_knn = ApproximateNearestNeighbors(algorithm=algorithm, algoParams=algoParams) + gpu_knn, gpu_model = func_test(tmp_path, gpu_knn) # type: ignore + + for obj in [gpu_knn, gpu_model]: + assert obj._cuml_params["algorithm"] == algorithm + assert obj._cuml_params["algo_params"] == algoParams + + +@pytest.mark.parametrize( + "combo", + [ + ("array", 10000, None), + ("vector", 2000, {"nlist": 10, "nprobe": 2}), + ("multi_cols", 5000, {"nlist": 20, "nprobe": 4}), + ], +) # vector feature type will be converted to float32 to be compatible with cuml single-GPU NearestNeighbors Class +@pytest.mark.parametrize("data_shape", [(10000, 50)], ids=idfn) +@pytest.mark.parametrize("data_type", [np.float32]) +def test_ivfflat( + combo: Tuple[str, int, Optional[Dict[str, Any]]], + data_shape: Tuple[int, int], + data_type: np.dtype, + metric: str = "euclidean", +) -> Tuple[ApproximateNearestNeighbors, ApproximateNearestNeighborsModel]: + + feature_type = combo[0] + max_record_batch = combo[1] + algoParams = combo[2] + n_neighbors = 50 + n_clusters = 10 + tolerance = 1e-4 + + expected_avg_recall = 0.95 + + X, _ = make_blobs( + n_samples=data_shape[0], + n_features=data_shape[1], + centers=n_clusters, + random_state=0, + ) # make_blobs creates a random dataset of isotropic gaussian blobs. 
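The test sweeps `max_record_batch` together with `algoParams` because, as noted in the class docstring, each data partition (or Arrow batch) gets its own IVF-Flat index, so a batch must contain more vectors than the `nlist` kmeans clusters trained on it. Roughly, the knob a user would tune looks like the following sketch (the concrete values are assumptions for illustration):

```python
from pyspark.sql import SparkSession

nlist = 50           # assumed number of IVF lists per local index
batch_size = 10000   # must be well above nlist so kmeans has enough vectors per batch

spark = (
    SparkSession.builder
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", str(batch_size))
    .getOrCreate()
)
```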
+ + # set average norm sq to be 1 to allow comparisons with default error thresholds + # below + root_ave_norm_sq = np.sqrt(np.average(np.linalg.norm(X, ord=2, axis=1) ** 2)) + X = X / root_ave_norm_sq + + # obtain exact knn distances and indices + if metric == "inner_product": + from cuml import NearestNeighbors as cuNN + + cuml_knn = cuNN( + algorithm="brute", + n_neighbors=n_neighbors, + output_type="numpy", + metric=metric, + ) + cuml_knn.fit(X) + distances_exact, indices_exact = cuml_knn.kneighbors(X) + else: + from sklearn.neighbors import NearestNeighbors as skNN + + sk_knn = skNN(algorithm="brute", n_neighbors=n_neighbors, metric=metric) + sk_knn.fit(X) + distances_exact, indices_exact = sk_knn.kneighbors(X) + + def cal_avg_recall(indices_ann: np.ndarray) -> float: + assert indices_ann.shape == indices_exact.shape + assert indices_ann.shape == (len(X), n_neighbors) + retrievals = [np.intersect1d(a, b) for a, b in zip(indices_ann, indices_exact)] + recalls = np.array([len(nns) / n_neighbors for nns in retrievals]) + return recalls.mean() + + def cal_avg_dist_gap(distances_ann: np.ndarray) -> float: + assert distances_ann.shape == distances_exact.shape + assert distances_ann.shape == (len(X), n_neighbors) + gaps = np.abs(distances_ann - distances_exact) + return gaps.mean() + + y = np.arange(len(X)) # use label column as id column + + conf = {"spark.sql.execution.arrow.maxRecordsPerBatch": str(max_record_batch)} + with CleanSparkSession(conf) as spark: + data_df, features_col, label_col = create_pyspark_dataframe( + spark, feature_type, data_type, X, y + ) + assert label_col is not None + data_df = data_df.withColumn(label_col, col(label_col).cast("long")) + id_col = label_col + + knn_est = ( + ApproximateNearestNeighbors( + algorithm="ivfflat", algoParams=algoParams, k=n_neighbors, metric=metric + ) + .setInputCol(features_col) + .setIdCol(id_col) + ) + + # test kneighbors: obtain spark results + knn_model = knn_est.fit(data_df) + + for obj in [knn_est, knn_model]: + assert obj.getK() == n_neighbors + assert obj.getAlgorithm() == "ivfflat" + assert obj.getAlgoParams() == algoParams + if feature_type == "multi_cols": + assert obj.getInputCols() == features_col + else: + assert obj.getInputCol() == features_col + assert obj.getIdCol() == id_col + + query_df = data_df + (item_df_withid, query_df_withid, knn_df) = knn_model.kneighbors(query_df) + + knn_df = knn_df.sort(f"query_{id_col}") + knn_df_collect = knn_df.collect() + + # test kneighbors: collect spark results for comparison with cuml results + distances = np.array([r["distances"] for r in knn_df_collect]) + indices = np.array([r["indices"] for r in knn_df_collect]) + + # test kneighbors: compare top-1 nn indices(self) and distances(self) + + if metric != "inner_product": + self_index = [knn[0] for knn in indices] + assert np.all(self_index == y) + + self_distance = [dist[0] for dist in distances] + assert self_distance == [0.0] * len(X) + + # test kneighbors: compare with cuml ANN on avg_recall and dist + from cuml import NearestNeighbors as cuNN + + cuml_ivfflat = cuNN( + algorithm="ivfflat", + algo_params=algoParams, + n_neighbors=n_neighbors, + metric=metric, + ) + cuml_ivfflat.fit(X) + distances_cumlann, indices_cumlann = cuml_ivfflat.kneighbors(X) + if metric == "euclidean" or metric == "l2": + distances_cumlann **= 2 # square up cuml distances to get l2 distances + + avg_recall_cumlann = cal_avg_recall(indices_cumlann) + avg_recall = cal_avg_recall(indices) + assert abs(avg_recall - avg_recall_cumlann) < tolerance + + 
avg_dist_gap_cumlann = cal_avg_dist_gap(distances_cumlann) + avg_dist_gap = cal_avg_dist_gap(distances) + assert abs(avg_dist_gap - avg_dist_gap_cumlann) < tolerance + + # test kneighbors: compare with sklearn brute NN on avg_recall and dist + assert avg_recall >= expected_avg_recall + assert np.all(np.abs(avg_dist_gap) < tolerance) + + # test exactNearestNeighborsJoin + knnjoin_df = knn_model.approxSimilarityJoin(query_df_withid) + + ascending = False if metric == "inner_product" else True + reconstructed_knn_df = reconstruct_knn_df( + knnjoin_df, row_identifier_col=knn_model.getIdCol(), ascending=ascending + ) + reconstructed_collect = reconstructed_knn_df.collect() + + def get_sorted_indices(row: Row) -> list[int]: + row_dists = row["distances"] + row_indices = row["indices"] + idx_sorted = sorted( + range(len(row_indices)), key=lambda i: (row_dists[i], row_indices[i]) + ) + indices_sorted = [row["indices"][idx] for idx in idx_sorted] + return indices_sorted + + def assert_row_equal(r1: Row, r2: Row) -> None: + assert r1[f"query_{id_col}"] == r2[f"query_{id_col}"] + r1_distances = r1["distances"] + r2_distances = r2["distances"] + assert r1_distances == r2_distances + + # sort indices in case two neighbors having same distance to the query + r1_indices_sorted = get_sorted_indices(r1) + r2_indices_sorted = get_sorted_indices(r2) + assert r1_indices_sorted == r2_indices_sorted + + assert len(reconstructed_collect) == len(knn_df_collect) + for i in range(len(reconstructed_collect)): + r1 = reconstructed_collect[i] + r2 = knn_df_collect[i] + assert_row_equal(r1, r2) + + return (knn_est, knn_model) + + +@pytest.mark.parametrize( + "combo", # feature_type, max_batch_size, algo_param, metric + [ + ("array", 2000, {"nlist": 10, "nprobe": 2}, "sqeuclidean"), + ("vector", 5000, {"nlist": 20, "nprobe": 4}, "l2"), + ("multi_cols", 2000, {"nlist": 10, "nprobe": 2}, "inner_product"), + ], +) +def test_metric( + combo: Tuple[str, int, Optional[Dict[str, Any]], str], +) -> None: + data_shape = (10000, 50) + data_type = np.float32 + + from cuml.neighbors import VALID_METRICS + + assert VALID_METRICS["ivfflat"] == { + "euclidean", + "sqeuclidean", + "cosine", + "inner_product", + "l2", + "correlation", + } + + gpu_est, gpu_model = test_ivfflat( + combo=(combo[0], combo[1], combo[2]), + data_shape=data_shape, + data_type=data_type, + metric=combo[3], + ) + assert gpu_est._cuml_params["metric"] == combo[3] + assert gpu_model._cuml_params["metric"] == combo[3] diff --git a/python/tests/test_nearest_neighbors.py b/python/tests/test_nearest_neighbors.py index 8cc3a278..c50d605b 100644 --- a/python/tests/test_nearest_neighbors.py +++ b/python/tests/test_nearest_neighbors.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Tuple, Union import numpy as np import pandas as pd @@ -8,7 +8,12 @@ from sklearn.datasets import make_blobs from spark_rapids_ml.core import alias -from spark_rapids_ml.knn import NearestNeighbors +from spark_rapids_ml.knn import ( + ApproximateNearestNeighbors, + ApproximateNearestNeighborsModel, + NearestNeighbors, + NearestNeighborsModel, +) from .sparksession import CleanSparkSession from .utils import ( @@ -19,6 +24,9 @@ pyspark_supported_feature_types, ) +NNEstimator = Union[NearestNeighbors, ApproximateNearestNeighbors] +NNModel = Union[NearestNeighborsModel, ApproximateNearestNeighborsModel] + def test_default_cuml_params(caplog: LogCaptureFixture) -> None: from cuml import NearestNeighbors as CumlNearestNeighbors @@ -48,9 +56,9 @@ def 
test_default_cuml_params(caplog: LogCaptureFixture) -> None: assert nn_float32._float32_inputs -def test_example(gpu_number: int, tmp_path: str) -> None: - # reduce the number of GPUs for toy dataset to avoid empty partition - gpu_number = min(gpu_number, 2) +def func_test_example_no_id( + tmp_path: str, gpu_knn: NNEstimator +) -> Tuple[NNEstimator, NNModel]: data = [ ([1.0, 1.0], "a"), @@ -79,15 +87,18 @@ def test_example(gpu_number: int, tmp_path: str) -> None: data_df = spark.createDataFrame(data, schema) query_df = spark.createDataFrame(query, schema) - gpu_knn = NearestNeighbors(num_workers=gpu_number) gpu_knn = gpu_knn.setInputCol("features") gpu_knn = gpu_knn.setK(topk) + assert topk == gpu_knn.getK() + with pytest.raises(NotImplementedError): gpu_knn.save(tmp_path + "/knn_esimator") gpu_model = gpu_knn.fit(data_df) + assert topk == gpu_knn.getK() + with pytest.raises(NotImplementedError): gpu_model.save(tmp_path + "/knn_model") @@ -137,7 +148,14 @@ def assert_indices_equal(indices: List[List[int]]) -> None: gpu_model.transform(query_df) # test exactNearestNeighborsJoin - knnjoin_df = gpu_model.exactNearestNeighborsJoin(query_df, distCol="distCol") + + if isinstance(gpu_knn, NearestNeighbors): + knnjoin_df = gpu_model.exactNearestNeighborsJoin( + query_df, distCol="distCol" + ) + else: + knnjoin_df = gpu_model.approxSimilarityJoin(query_df, distCol="distCol") + knnjoin_df.show() assert len(knnjoin_df.dtypes) == 3 @@ -171,7 +189,7 @@ def assert_knn_metadata_equal(knn_metadata: List[List[str]]) -> None: assert_knn_metadata_equal(reconstructed_knn_metadata) reconstructed_distances = [r.distances for r in reconstructed_rows] assert_distances_equal(reconstructed_distances) - reconstructed_query_ids = [r.query_id for r in reconstructed_rows] + reconstructed_query_ids = [r.query_metadata for r in reconstructed_rows] assert reconstructed_query_ids == ["qa", "qb", "qc", "qd", "qe"] knnjoin_items = ( @@ -216,10 +234,20 @@ def assert_knn_metadata_equal(knn_metadata: List[List[str]]) -> None: assert knnjoin_queries[i]["features"] == query[i][0] assert knnjoin_queries[i]["metadata"] == query[i][1] + return gpu_knn, gpu_model + -def test_example_with_id(gpu_number: int) -> None: +def test_example(gpu_number: int, tmp_path: str) -> None: # reduce the number of GPUs for toy dataset to avoid empty partition gpu_number = min(gpu_number, 2) + gpu_knn = NearestNeighbors(num_workers=gpu_number) + func_test_example_no_id(tmp_path, gpu_knn) + + +def func_test_example_with_id( + tmp_path: str, gpu_knn: NNEstimator +) -> Tuple[NNEstimator, NNModel]: + # reduce the number of GPUs for toy dataset to avoid empty partition data = [ (101, [1.0, 1.0], "a"), @@ -247,7 +275,6 @@ def test_example_with_id(gpu_number: int) -> None: data_df = spark.createDataFrame(data, schema) query_df = spark.createDataFrame(query, schema) - gpu_knn = NearestNeighbors(num_workers=gpu_number) gpu_knn = gpu_knn.setInputCol("features") gpu_knn = gpu_knn.setIdCol("id") gpu_knn = gpu_knn.setK(topk) @@ -272,7 +299,13 @@ def assert_indices_equal(indices: List[List[int]]) -> None: assert indices[4] == [108, 107] # test exactNearestNeighborsJoin - knnjoin_df = gpu_model.exactNearestNeighborsJoin(query_df, distCol="distCol") + if isinstance(gpu_model, NearestNeighborsModel): + knnjoin_df = gpu_model.exactNearestNeighborsJoin( + query_df, distCol="distCol" + ) + else: + knnjoin_df = gpu_model.approxSimilarityJoin(query_df, distCol="distCol") + knnjoin_df.show() assert len(knnjoin_df.dtypes) == 3 @@ -296,6 +329,15 @@ def 
assert_indices_equal(indices: List[List[int]]) -> None: reconstructed_query_ids = [r.query_id for r in reconstructed_rows] assert reconstructed_query_ids == [201, 202, 203, 204, 205] + return (gpu_knn, gpu_model) + + +def test_example_with_id(gpu_number: int, tmp_path: str) -> None: + # reduce the number of GPUs for toy dataset to avoid empty partition + gpu_number = min(gpu_number, 2) + gpu_knn = NearestNeighbors(num_workers=gpu_number) + func_test_example_no_id(tmp_path, gpu_knn) + @pytest.mark.parametrize( "feature_type", pyspark_supported_feature_types @@ -521,11 +563,16 @@ def check_dtypes(res_df: DataFrame, from_spark: bool) -> None: def reconstruct_knn_df( - knnjoin_df: DataFrame, row_identifier_col: str, distCol: str = "distCol" + knnjoin_df: DataFrame, + row_identifier_col: str, + distCol: str = "distCol", + ascending: bool = True, ) -> DataFrame: """ This function accepts the returned dataframe (denoted as knnjoin_df) of exactNearestNeighborsjoin, then reconstructs the returned dataframe (i.e. knn_df) of kneighbors. + + Note the reconstructed knn_df does not guarantee the same indices as the original knn_df, because the distances to two neighbors can be the same. """ knn_df: DataFrame = knnjoin_df.select( knnjoin_df[f"query_df.{row_identifier_col}"].alias(f"query_id"), @@ -534,21 +581,25 @@ def reconstruct_knn_df( ) def functor(pdf: pd.DataFrame) -> pd.DataFrame: - pdf = pdf.sort_values(by=["distance"]) + pdf = pdf.sort_values(by=["distance"], ascending=ascending) indices = pdf["index"].tolist() distances = pdf["distance"].tolist() query_id = pdf[f"query_id"].tolist()[0] return pd.DataFrame( - {"query_id": [query_id], "indices": [indices], "distances": [distances]} + { + f"query_{row_identifier_col}": [query_id], + "indices": [indices], + "distances": [distances], + } ) knn_df = knn_df.groupBy("query_id").applyInPandas( functor, - schema=f"query_id {knn_df.dtypes[0][1]}, " + schema=f"query_{row_identifier_col} {knn_df.dtypes[0][1]}, " + f"indices array<{knn_df.dtypes[1][1]}>, " + f"distances array<{knn_df.dtypes[2][1]}>", ) - knn_df = knn_df.sort("query_id") + knn_df = knn_df.sort(f"query_{row_identifier_col}") return knn_df From b24434187a09a48a5703645b1eb5e67244cd8b65 Mon Sep 17 00:00:00 2001 From: eordentlich Date: Wed, 8 May 2024 17:10:48 -0700 Subject: [PATCH 26/31] add approx knn to api docs [skip ci] (#644) * minor doc updates Signed-off-by: Erik Ordentlich * fix bullets in doc string Signed-off-by: Erik Ordentlich --------- Signed-off-by: Erik Ordentlich --- README.md | 2 +- docs/site/compatibility.md | 2 +- docs/source/spark_rapids_ml.rst | 2 ++ python/src/spark_rapids_ml/knn.py | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b80d2a31..7247b276 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The following table shows the currently supported algorithms. The goal is to ex | CrossValidator | √ | | | DBSCAN (*) | √ | | | KMeans | √ | | -| k-NN (*) | √ | | +| approx/exact k-NN (*) | √ | | | LinearRegression | √ | | | LogisticRegression | √ | | | PCA | √ | √ | diff --git a/docs/site/compatibility.md b/docs/site/compatibility.md index ec5cd6d6..e83acc9c 100644 --- a/docs/site/compatibility.md +++ b/docs/site/compatibility.md @@ -13,7 +13,7 @@ The following table shows the currently supported algorithms. 
The goal is to ex | CrossValidator | √ | | | DBSCAN (*) | √ | | | KMeans | √ | | -| k-NN (*) | √ | | +| approx/exact k-NN (*) | √ | | | LinearRegression | √ | | | LogisticRegression | √ | | | PCA | √ | √ | diff --git a/docs/source/spark_rapids_ml.rst b/docs/source/spark_rapids_ml.rst index 03470908..d3f2868d 100644 --- a/docs/source/spark_rapids_ml.rst +++ b/docs/source/spark_rapids_ml.rst @@ -87,6 +87,8 @@ Nearest Neighbors :template: autosummary/class_with_docs.rst :toctree: api + ApproximateNearestNeighbors + ApproximateNearestNeighborsModel NearestNeighbors NearestNeighborsModel diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index fc55f59f..70907ec4 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -838,8 +838,8 @@ class ApproximateNearestNeighbors( Note this class constructs the kmeans index independently on individual data partition (or maxRecordPerBatch if Arrow is enabled). When algorithm is 'ivfflat': - *nlist: (int) number of kmeans clusters to partition the dataframe into. - *nprobe: (int) number of closest clusters to probe for topk ANNs. + * nlist: (int) number of kmeans clusters to partition the dataframe into. + * nprobe: (int) number of closest clusters to probe for topk ANNs. metric: str (default = "euclidean") the distance metric to use. 'ivfflat' algorithm supports ['euclidean', 'sqeuclidean', 'l2', 'inner_product']. From 1788e9d3df8e783ff2bcc6ced167df4854770be3 Mon Sep 17 00:00:00 2001 From: Jinfeng Li Date: Wed, 8 May 2024 22:59:59 -0700 Subject: [PATCH 27/31] Fix NearestNeighbors _ensureIdCol to check whether id_col in df.columns instead of relying on isSet(idCol) (#642) * fix ensureIdCol to avoid using isSet(idCol) * simply the logic of ensureIdCol * try set idCol to None --------- Signed-off-by: Jinfeng --- python/src/spark_rapids_ml/knn.py | 83 ++++++++++++------- .../test_approximate_nearest_neighbors.py | 4 +- python/tests/test_nearest_neighbors.py | 13 ++- 3 files changed, 66 insertions(+), 34 deletions(-) diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 70907ec4..eb720689 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -85,7 +85,7 @@ class _NearestNeighborsCumlParams( def __init__(self) -> None: super().__init__() - self._setDefault(idCol=alias.row_number) + self._setDefault(idCol=None) k = Param( Params._dummy(), @@ -114,6 +114,16 @@ def getK(self: P) -> int: """ return self.getOrDefault("k") + def _getIdColOrDefault(self) -> str: + """ + Gets the value of `idCol`. + """ + + res = self.getIdCol() + if res is None: + res = alias.row_number + return res + def setInputCol(self: P, value: Union[str, List[str]]) -> P: """ Sets the value of :py:attr:`inputCol` or :py:attr:`inputCols`. @@ -142,19 +152,32 @@ def _ensureIdCol(self, df: DataFrame) -> DataFrame: Ensure an id column exists in the input dataframe. Add the column if not exists. Overwritten for knn assumption on error for not setting idCol and duplicate exists. """ - if not self.isSet("idCol") and self.getIdCol() in df.columns: - raise ValueError( - f"Cannot create a default id column since a column with the default name '{self.getIdCol()}' already exists." 
- + "Please specify an id column" - ) id_col_name = self.getIdCol() - df_withid = ( - df - if self.isSet("idCol") - else df.select(monotonically_increasing_id().alias(id_col_name), "*") - ) - return df_withid + if id_col_name is None: + if alias.row_number in df.columns: + raise ValueError( + f"Trying to create an id column with default name {alias.row_number}. But a column with the same name already exists." + ) + else: + get_logger(self.__class__).info( + f"idCol not set. Spark Rapids ML will create one with default name {alias.row_number}." + ) + df_withid = df.select( + monotonically_increasing_id().alias(alias.row_number), "*" + ) + return df_withid + else: + if id_col_name in df.columns: + return df + else: + get_logger(self.__class__).info( + f"column {id_col_name} does not exists in the input dataframe. Spark Rapids ML will create the {id_col_name} column." + ) + df_withid = df.select( + monotonically_increasing_id().alias(alias.row_number), "*" + ) + return df_withid class NearestNeighbors( @@ -179,7 +202,7 @@ class NearestNeighbors( * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. - idCol: str + idCol: str (default = None) the name of the column in a dataframe that uniquely identifies each vector. idCol should be set if such a column exists in the dataframe. If idCol is not set, a column with the name `unique_id` will be automatically added to the dataframe and used as unique identifier for each vector. @@ -400,7 +423,7 @@ def exactNearestNeighborsJoin( where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2). """ - id_col_name = self.getIdCol() + id_col_name = self._getIdColOrDefault() # call kneighbors then prepare return results (item_df_withid, query_df_withid, knn_df) = self.kneighbors(query_df) @@ -471,7 +494,9 @@ def _out_schema(self) -> Union[StructType, str]: # type: ignore return StructType( [ StructField( - f"query_{self.getIdCol()}", ArrayType(LongType(), False), False + f"query_{self._getIdColOrDefault()}", + ArrayType(LongType(), False), + False, ), StructField( "indices", ArrayType(ArrayType(LongType(), False), False), False @@ -509,11 +534,8 @@ def _pre_process_data( # type: ignore select_cols.append(col(alias.label)) - if self.hasParam("idCol") and self.isDefined("idCol"): - id_col_name = self.getOrDefault("idCol") - select_cols.append(col(id_col_name).alias(alias.row_number)) - else: - select_cols.append(col(alias.row_number)) + id_col_name = self._getIdColOrDefault() + select_cols.append(col(id_col_name).alias(alias.row_number)) return select_cols, multi_col_names, dimension, feature_type @@ -561,8 +583,8 @@ def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFra pipelinedrdd = self._call_cuml_fit_func(union_df, partially_collect=False) pipelinedrdd = pipelinedrdd.repartition(query_default_num_partitions) # type: ignore - query_id_col_name = f"query_{self.getIdCol()}" - id_col_type = dict(union_df.dtypes)[self.getIdCol()] + query_id_col_name = f"query_{self._getIdColOrDefault()}" + id_col_type = dict(union_df.dtypes)[self._getIdColOrDefault()] knn_rdd = pipelinedrdd.flatMap( lambda row: list( zip(row[query_id_col_name], row["indices"], row["distances"]) @@ -584,7 +606,7 @@ def _get_cuml_fit_func( ]: label_isdata = self._label_isdata label_isquery = self._label_isquery - id_col_name = self.getIdCol() + id_col_name = self._getIdColOrDefault() def 
_cuml_fit( dfs: FitInputType, @@ -849,7 +871,7 @@ class ApproximateNearestNeighbors( * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. - idCol: str + idCol: str (default = None) the name of the column in a dataframe that uniquely identifies each vector. idCol should be set if such a column exists in the dataframe. If idCol is not set, a column with the name `unique_id` will be automatically added to the dataframe and used as unique identifier for each vector. @@ -1037,9 +1059,7 @@ def __init__( self.bcast_qfeatures: Optional[Broadcast] = None def _out_schema(self) -> Union[StructType, str]: # type: ignore - return ( - f"query_{self.getIdCol()} long, indices array, distances array" - ) + return f"query_{self._getIdColOrDefault()} long, indices array, distances array" def _pre_process_data( self, dataset: DataFrame @@ -1049,9 +1069,8 @@ def _pre_process_data( dataset ) - if self.hasParam("idCol") and self.isDefined("idCol"): - id_col_name = self.getOrDefault("idCol") - dataset = dataset.withColumnRenamed(id_col_name, alias.row_number) + id_col_name = self._getIdColOrDefault() + dataset = dataset.withColumnRenamed(id_col_name, alias.row_number) select_cols.append(alias.row_number) @@ -1179,7 +1198,7 @@ def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFra ) k = self.getK() - query_id_col_name = f"query_{self.getIdCol()}" + query_id_col_name = f"query_{self._getIdColOrDefault()}" ascending = False if self.getMetric() == "inner_product" else True @@ -1221,7 +1240,7 @@ def _construct_sgnn() -> CumlT: row_number_col = alias.row_number input_col, input_cols = self._get_input_columns() assert input_col is not None or input_cols is not None - id_col_name = self.getIdCol() + id_col_name = self._getIdColOrDefault() bcast_qids = self.bcast_qids bcast_qfeatures = self.bcast_qfeatures diff --git a/python/tests/test_approximate_nearest_neighbors.py b/python/tests/test_approximate_nearest_neighbors.py index b1aa16c8..2d5f546e 100644 --- a/python/tests/test_approximate_nearest_neighbors.py +++ b/python/tests/test_approximate_nearest_neighbors.py @@ -229,7 +229,9 @@ def cal_avg_dist_gap(distances_ann: np.ndarray) -> float: ascending = False if metric == "inner_product" else True reconstructed_knn_df = reconstruct_knn_df( - knnjoin_df, row_identifier_col=knn_model.getIdCol(), ascending=ascending + knnjoin_df, + row_identifier_col=knn_model._getIdColOrDefault(), + ascending=ascending, ) reconstructed_collect = reconstructed_knn_df.collect() diff --git a/python/tests/test_nearest_neighbors.py b/python/tests/test_nearest_neighbors.py index c50d605b..10baedae 100644 --- a/python/tests/test_nearest_neighbors.py +++ b/python/tests/test_nearest_neighbors.py @@ -234,6 +234,17 @@ def assert_knn_metadata_equal(knn_metadata: List[List[str]]) -> None: assert knnjoin_queries[i]["features"] == query[i][0] assert knnjoin_queries[i]["metadata"] == query[i][1] + # Test fit(dataset, ParamMap) that copies existing estimator + # After copy, self.isSet("idCol") becomes true. 
But the added id column does not exist in the dataframe + paramMap = gpu_knn.extractParamMap() + gpu_model_v2 = gpu_knn.fit(data_df, paramMap) + + assert gpu_knn.isSet("idCol") is False + assert gpu_model_v2.isSet("idCol") is True + + (_, _, knn_df_v2) = gpu_model_v2.kneighbors(query_df) + assert knn_df_v2.collect() == knn_df.collect() + return gpu_knn, gpu_model @@ -432,7 +443,7 @@ def test_nearest_neighbors( knn_model.setIdCol(item_df_withid.dtypes[0][0]) knnjoin_df = knn_model.exactNearestNeighborsJoin(query_df_withid) reconstructed_knn_df = reconstruct_knn_df( - knnjoin_df, row_identifier_col=knn_model.getIdCol() + knnjoin_df, row_identifier_col=knn_model._getIdColOrDefault() ) assert reconstructed_knn_df.collect() == knn_df.collect() From 9fff486c725d68c9f9cdffc1a89cc0420c99134b Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Thu, 9 May 2024 14:05:30 +0800 Subject: [PATCH 28/31] update blossom-ci auth list (#647) Signed-off-by: YanxuanLiu --- .github/workflows/blossom-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 96fde0da..28379d7f 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -44,8 +44,6 @@ jobs: GaryShen2008,\ NvTimLiu,\ YanxuanLiu,\ - zhanga5,\ - Er1cCheng,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person From c59795af8541682b8f06d2398c6a856b2aa0bc3a Mon Sep 17 00:00:00 2001 From: Jinfeng Li Date: Thu, 9 May 2024 09:39:45 -0700 Subject: [PATCH 29/31] add notebook for ann and remove exactNearestNeighborsJoin API from ANN class (#650) Signed-off-by: Jinfeng --- notebooks/approx-nearest-neighbors.ipynb | 466 +++++++++++++++++++++++ python/src/spark_rapids_ml/knn.py | 58 +-- 2 files changed, 499 insertions(+), 25 deletions(-) create mode 100644 notebooks/approx-nearest-neighbors.ipynb diff --git a/notebooks/approx-nearest-neighbors.ipynb b/notebooks/approx-nearest-neighbors.ipynb new file mode 100644 index 00000000..2f6838d5 --- /dev/null +++ b/notebooks/approx-nearest-neighbors.ipynb @@ -0,0 +1,466 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "db673eda-86ee-47a7-975d-aa3d36c2f407", + "metadata": {}, + "source": [ + "# Approximate Nearest Neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeb78fae-ae08-4b64-8daa-b579af1d9ba6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from spark_rapids_ml.knn import ApproximateNearestNeighbors\n", + "from pyspark.sql.functions import col" + ] + }, + { + "cell_type": "markdown", + "id": "17955bd7-b911-4da6-ad1b-02ae4a5f0a2b", + "metadata": {}, + "source": [ + "### Create synthetic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38c671ba-b6de-4414-992f-957e28f3a8be", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dim = 2000\n", + "dtype = 'float32'\n", + "np.random.seed(1)\n", + "\n", + "# items\n", + "num_vecs = 100000\n", + "vec = np.random.randn(dim).reshape([1,dim])\n", + "arr = np.random.randn(num_vecs).reshape([num_vecs,1])\n", + "items = arr * vec\n", + "items = items.astype(dtype)\n", + "\n", + "# items extra data\n", + "items_extra = np.random.randn(num_vecs)\n", + "\n", + "# queries\n", + "num_vecs = 50\n", + "vec = np.random.randn(dim).reshape([1,dim])\n", + "arr = 
np.random.randn(num_vecs).reshape([num_vecs,1])\n", + "queries = arr * vec\n", + "queries = queries.astype(dtype)\n", + "\n", + "# queries extra data\n", + "queries_extra = np.random.randn(num_vecs)" + ] + }, + { + "cell_type": "markdown", + "id": "4fb6683a", + "metadata": {}, + "source": [ + "### Configure Spark\n", + "It is highly recommend to increase the spark.sql.execution.arrow.maxRecordsPerBatch from the default 10000 to a larger value. Spark Rapids ML applies cuML approximate nearest neighbor search on every data batch independently, and some algorithms have requirements on the batch size. For example, the ivfflat algorithm requires that the number of vectors in a batch must be more than the number of kmeans centroids (specified by 'nlist')." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "901aab99", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.set(\"spark.sql.execution.arrow.maxRecordsPerBatch\", 0) # set to unlimited" + ] + }, + { + "cell_type": "markdown", + "id": "01d6b576-460a-41d1-a22b-0eb303afcccc", + "metadata": {}, + "source": [ + "### Convert dataset to Spark DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be00961-aac8-4bb1-bcc8-8971971ff78f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pd_items = pd.DataFrame({\"features\": list(items), \"extra\": items_extra})\n", + "item_df = spark.createDataFrame(pd_items, \"features array, extra float\")\n", + "\n", + "pd_queries = pd.DataFrame({\"features\": list(queries), \"extra\": queries_extra})\n", + "query_df = spark.createDataFrame(pd_queries, \"features array, extra float\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c0155a-b72f-4db3-a9aa-51522b18ee61", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "item_df.show(5, truncate=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aafd06b2-8150-49d4-840f-3271e9914e76", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query_df.show(5, truncate=80)" + ] + }, + { + "cell_type": "markdown", + "id": "204b7e72-737e-4a8d-81ce-5a275cb7446a", + "metadata": {}, + "source": [ + "## Spark RAPIDS ML (GPU)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c105d965-cec5-430b-be88-3a9d1476147c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "knn = ApproximateNearestNeighbors(k=2)\n", + "knn.setInputCol(\"features\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "731d2f06-7ca4-4395-a856-e5a8ca403d49", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "knn_model = knn.fit(item_df)" + ] + }, + { + "cell_type": "markdown", + "id": "2d1680c4-fde6-4016-a06c-1db89b53db43", + "metadata": {}, + "source": [ + "Note: `fit` just stores a reference to the `item_df` in the returned model. As such, saving the estimator or model is not supported, since their only state is the referenced dataset. Instead, just re-create and re-fit the estimator on the dataset, as needed." + ] + }, + { + "cell_type": "markdown", + "id": "7210e792-f7aa-4581-8b84-aae2b52d6baf", + "metadata": {}, + "source": [ + "#### kneighbors\n", + "\n", + "This API takes a DataFrame of query vectors, and returns the `k` approximate nearest item vectors for each query vector, represented by their unique ids and distances. The unique ids are automatically generated if not provided, so the input datasets are also returned with their unique ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61d09a34-15a1-456a-a6dd-fc9abad2716a", + "metadata": {}, + "outputs": [], + "source": [ + "item_id_df, query_id_df, neighbor_df = knn_model.kneighbors(query_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fde68c05-5cde-4b1e-8ed3-ae5fe2d7015b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# original item_df is returned with unique identifiers\n", + "item_id_df.show(5, truncate=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "281c204b-5804-4ae8-9064-e6fe08355618", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# original query_df is returned with unique identifiers\n", + "query_id_df.show(5, truncate=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "457b702d-1af0-4c38-b1b5-dd1a1f5f1dec", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# neighbor_df shows the nearest item vectors for each query vector, represented by their unique ids and distances.\n", + "neighbor_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62f81b49-6ff3-4656-8b0c-27b32a3f60de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# change the value of 'k'\n", + "knn_model.setK(3)\n", + "_, _, neighbor_df = knn_model.kneighbors(query_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a52e7e03-0bac-4378-b219-eda5d00523da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "neighbor_df.show()" + ] + }, + { + "cell_type": "markdown", + "id": "21d0f88c-8ba0-4f25-a42f-45c864972f54", + "metadata": {}, + "source": [ + "#### approxNearestNeighborsJoin\n", + "\n", + "This API returns a join of the query vectors and their `k` approximate nearest item vectors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dc0fd29-f25e-4d89-95fe-ebd2efbfe9ea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "result_df = knn_model.approxSimilarityJoin(query_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5196d375-db08-4292-a865-61fecd07fe41", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "result_df.orderBy(\"query_df\", \"item_df\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "7d1d41c5-c6c5-4f37-942a-cecb882a7862", + "metadata": {}, + "source": [ + "For each returned query or item vector, all columns from the original input DataFrame will be returned as a single struct column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27993cac-bc6d-4f57-b6da-549857c9218f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "result_df.select(\"query_df.*\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "8cd56670-7633-4fe6-ab75-0fd680c63baa", + "metadata": {}, + "source": [ + "# PySpark\n", + "\n", + "PySpark does not have an exact kNN implementation, but it does have an LSH-based Approximate Nearest Neighbors implementation, shown here to illustrate the similarity between the APIs. However, the algorithms are very different, so their results are only roughly comparable, and it would require elaborate tuning of parameters to produce similar results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a21c783f-c4ed-43b3-a869-7395b94152f3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pyspark.ml.feature import BucketedRandomProjectionLSH\n", + "from pyspark.ml.functions import array_to_vector\n", + "from pyspark.ml.linalg import Vectors\n", + "from pyspark.sql.functions import col\n", + "\n", + "item_vector_df = item_df.select(array_to_vector(item_df.features).alias(\"features\"))\n", + "query_vector_df = query_df.select(array_to_vector(query_df.features).alias(\"features\"))\n", + "key = Vectors.dense([1.0] * dim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4288d38-6426-41c0-87a2-a75bc9d4bbda", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "item_vector_df.show(5, truncate=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "655f8071-c90e-464b-aa25-58d96bbeebea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query_vector_df.show(5, truncate=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8712f3f-4662-409e-8172-29308ec84e0c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "brp = BucketedRandomProjectionLSH(inputCol=\"features\", outputCol=\"hashes\", bucketLength=2.0, numHashTables=3)\n", + "model = brp.fit(item_vector_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4aa5813b-41b9-4b51-9977-721e6f90118e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Feature Transformation\n", + "print(\"The hashed dataset where hashed values are stored in the column 'hashes':\")\n", + "model.transform(item_vector_df).show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90f7049a-5ed1-44a8-b0d0-be63f6458e0a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Compute the locality sensitive hashes for the input rows, then perform approximate similarity join.\n", + "# We could avoid computing hashes by passing in the already-transformed dataset, e.g.\n", + "# `model.approxSimilarityJoin(transformed_item_vector_df, transformed_query_vector_df, 3.0)`\n", + "print(\"Approximately joining items and queries on Euclidean distance smaller than 3.0:\")\n", + "model.approxSimilarityJoin(item_vector_df, query_vector_df, 3.0, distCol=\"EuclideanDistance\")\\\n", + " .select(col(\"datasetA.features\").alias(\"item\"),\n", + " col(\"datasetB.features\").alias(\"query\"),\n", + " col(\"EuclideanDistance\")).orderBy(\"query\", \"item\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e70d24b7-db70-4041-9794-7cd5451bad76", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Compute the locality sensitive hashes for the input rows, then perform approximate nearest neighbor search.\n", + "# We could avoid computing hashes by passing in the already-transformed dataset, e.g.\n", + "# `model.approxNearestNeighbors(transformed_item_vector_df, key, 2)`\n", + "print(\"Approximately searching item vectors for 2 nearest neighbors of the key:\")\n", + "model.approxNearestNeighbors(item_vector_df, key, 2).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a8dc90-379a-409b-8c8a-a26f33910c0d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# saves the LSH hashes for the input rows\n", + "model.write().overwrite().save(\"/tmp/ann_model\")" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index eb720689..97709f9c 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -394,34 +394,11 @@ def _get_cuml_transform_func( def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]: raise NotImplementedError() - def exactNearestNeighborsJoin( + def _nearest_neighbors_join( self, query_df: DataFrame, distCol: str = "distCol", ) -> DataFrame: - """ - This function returns the k exact nearest neighbors (knn) in item_df of each query vector in query_df. - item_df is the dataframe passed to the fit function of the NearestNeighbors estimator. - Note that the knn relationship is asymmetric with respect to the input datasets (e.g., if x is a knn of y - , y is not necessarily a knn of x). - - Parameters - ---------- - query_df: pyspark.sql.DataFrame - the query_df dataframe. Each row represents a query vector. - - distCol: str - the name of the output distance column - - Returns - ------- - knnjoin_df: pyspark.sql.DataFrame - the result dataframe that has three columns (item_df, query_df, distCol). - item_df column is of struct type that includes as fields all the columns of input item dataframe. - Similarly, query_df column is of struct type that includes as fields all the columns of input query dataframe. - distCol is the distance column. A row in knnjoin_df is in the format (v1, v2, dist(v1, v2)), - where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2). - """ id_col_name = self._getIdColOrDefault() @@ -723,6 +700,37 @@ async def do_allGather() -> List[str]: return _cuml_fit + def exactNearestNeighborsJoin( + self, + query_df: DataFrame, + distCol: str = "distCol", + ) -> DataFrame: + """ + This function returns the k exact nearest neighbors (knn) in item_df of each query vector in query_df. + item_df is the dataframe passed to the fit function of the NearestNeighbors estimator. + Note that the knn relationship is asymmetric with respect to the input datasets (e.g., if x is a knn of y + , y is not necessarily a knn of x). + + Parameters + ---------- + query_df: pyspark.sql.DataFrame + the query_df dataframe. Each row represents a query vector. + + distCol: str + the name of the output distance column + + Returns + ------- + knnjoin_df: pyspark.sql.DataFrame + the result dataframe that has three columns (item_df, query_df, distCol). + item_df column is of struct type that includes as fields all the columns of input item dataframe. + Similarly, query_df column is of struct type that includes as fields all the columns of input query dataframe. + distCol is the distance column. A row in knnjoin_df is in the format (v1, v2, dist(v1, v2)), + where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2). 
+ """ + + return self._nearest_neighbors_join(query_df=query_df, distCol=distCol) + class ApproximateNearestNeighborsClass(_CumlClass): @@ -1323,4 +1331,4 @@ def approxSimilarityJoin( where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2). """ - return self.exactNearestNeighborsJoin(query_df, distCol) + return self._nearest_neighbors_join(query_df, distCol) From 0198d3fffe03c61f11f9d51beb1be2265871d509 Mon Sep 17 00:00:00 2001 From: eordentlich Date: Thu, 9 May 2024 10:54:05 -0700 Subject: [PATCH 30/31] =?UTF-8?q?remove=20unsupported=20save,load,read,wri?= =?UTF-8?q?te=20from=20api=20docs=20for=20knn=20estimat=E2=80=A6=20(#646)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove unsupported save,load,read,write from api docs for knn estimator, model classes Signed-off-by: Erik Ordentlich * fix class names in error messages Signed-off-by: Erik Ordentlich * typo Signed-off-by: Erik Ordentlich --------- Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/knn.py | 48 ++++++++++++++++++++++++++++- python/src/spark_rapids_ml/utils.py | 12 +++++++- python/tests/test_utils.py | 29 ++++++++++++++++- 3 files changed, 86 insertions(+), 3 deletions(-) diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 97709f9c..389151c7 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -361,12 +361,27 @@ def _get_cuml_fit_func(self, dataset: DataFrame) -> Callable[ # type: ignore pass def write(self) -> MLWriter: + """Unsupported.""" raise NotImplementedError( "NearestNeighbors does not support saving/loading, just re-create the estimator." ) @classmethod def read(cls) -> MLReader: + """Unsupported.""" + raise NotImplementedError( + "NearestNeighbors does not support saving/loading, just re-create the estimator." + ) + + def save(self, path: str) -> None: + """Unsupported.""" + raise NotImplementedError( + "NearestNeighbors does not support saving/loading, just re-create the estimator." + ) + + @classmethod + def load(cls, path: str) -> MLReader: + """Unsupported.""" raise NotImplementedError( "NearestNeighbors does not support saving/loading, just re-create the estimator." ) @@ -442,14 +457,29 @@ def _nearest_neighbors_join( return knnjoin_df def write(self) -> MLWriter: + """Unsupported.""" raise NotImplementedError( f"{self.__class__} does not support saving/loading, just re-fit the estimator to re-create a model." ) @classmethod def read(cls) -> MLReader: + """Unsupported.""" raise NotImplementedError( - f"{cls} does not support loading/loading, just re-fit the estimator to re-create a model." + f"{cls} does not support saving/loading, just re-fit the estimator to re-create a model." + ) + + def save(self, path: str) -> None: + """Unsupported.""" + raise NotImplementedError( + f"{self.__class__} does not support saving/loading, just re-create the estimator." + ) + + @classmethod + def load(cls, path: str) -> MLReader: + """Unsupported.""" + raise NotImplementedError( + f"{cls} does not support saving/loading, just re-create the estimator." ) @@ -1040,13 +1070,29 @@ def _get_cuml_fit_func(self, dataset: DataFrame) -> Callable[ # type: ignore """ pass + # for the following 4 methods leave doc string as below so that they are filtered out from api docs def write(self) -> MLWriter: + """Unsupported.""" raise NotImplementedError( "ApproximateNearestNeighbors does not support saving/loading, just re-create the estimator." 
) @classmethod def read(cls) -> MLReader: + """Unsupported.""" + raise NotImplementedError( + "ApproximateNearestNeighbors does not support saving/loading, just re-create the estimator." + ) + + @classmethod + def load(cls, path: str) -> MLReader: + """Unsupported.""" + raise NotImplementedError( + "ApproximateNearestNeighbors does not support saving/loading, just re-create the estimator." + ) + + def save(self, path: str) -> None: + """Unsupported.""" raise NotImplementedError( "ApproximateNearestNeighbors does not support saving/loading, just re-create the estimator." ) diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 193d7508..da6b4eb0 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -56,7 +56,17 @@ def _unsupported_methods_attributes(clazz: Any) -> Set[str]: _unsupported_methods: List[str] = sum( [_method_names_from_param(k) for k in _unsupported_params], [] ) - return set(_unsupported_params + _unsupported_methods) + methods_and_functions = inspect.getmembers( + clazz, + predicate=lambda member: inspect.isfunction(member) + or inspect.ismethod(member), + ) + _other_unsupported = [ + entry[0] + for entry in methods_and_functions + if entry and (entry[1].__doc__) == "Unsupported." + ] + return set(_unsupported_params + _unsupported_methods + _other_unsupported) else: return set() diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py index b634388a..0f7ca860 100644 --- a/python/tests/test_utils.py +++ b/python/tests/test_utils.py @@ -61,8 +61,35 @@ class A: def _param_mapping(cls) -> Dict[str, Optional[str]]: return {"param1": "param2", "param3": None, "param4": ""} + @classmethod + def unsupported_method(cls) -> None: + """Unsupported.""" + pass + + def unsupported_function(self) -> None: + """Unsupported.""" + pass + + @classmethod + def supported_method(cls) -> None: + """supported""" + pass + + def supported_function(self) -> None: + """supported""" + pass + assert _unsupported_methods_attributes(A) == set( - ["param3", "getParam3", "setParam3", "param4", "getParam4", "setParam4"] + [ + "param3", + "getParam3", + "setParam3", + "param4", + "getParam4", + "setParam4", + "unsupported_method", + "unsupported_function", + ] ) From ac4785c8300fd991b71ac7b0ddb7654b5ae126e6 Mon Sep 17 00:00:00 2001 From: Gary Shen Date: Fri, 10 May 2024 09:40:58 +0800 Subject: [PATCH 31/31] Add one more guardword for uploaded log (#586) Signed-off-by: Gary Shen --- ci/Jenkinsfile.premerge | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/Jenkinsfile.premerge b/ci/Jenkinsfile.premerge index a73620d6..44f321e2 100644 --- a/ci/Jenkinsfile.premerge +++ b/ci/Jenkinsfile.premerge @@ -1,6 +1,6 @@ #!/usr/local/env groovy /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -201,7 +201,7 @@ pipeline { githubHelper.updateCommitStatus("", "Success", GitHubCommitState.SUCCESS) } else { // upload log only in case of build failure - def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"] + def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com", "sc-ipp-*"] guardWords.add("nvidia-smi(?s)(.*?)(?=git)") // hide GPU info guardWords.add("sc-ipp*") // hide cloud info githubHelper.uploadLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords)
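
The API-doc change in patch 30/31 above works through a docstring sentinel: any estimator or model method whose docstring is exactly "Unsupported." is picked up by the inspect.getmembers scan added to _unsupported_methods_attributes in utils.py and can then be excluded from the generated API documentation. Below is a minimal, self-contained sketch of that pattern, based only on the snippet visible in the diff; the helper and class names here (members_marked_unsupported, Example) are illustrative and are not part of the library's API.

    import inspect
    from typing import Any, Set


    def members_marked_unsupported(clazz: Any) -> Set[str]:
        # Collect the functions and methods defined on the class, then keep only
        # those whose docstring is exactly the "Unsupported." sentinel.
        members = inspect.getmembers(
            clazz,
            predicate=lambda m: inspect.isfunction(m) or inspect.ismethod(m),
        )
        return {name for name, member in members if member.__doc__ == "Unsupported."}


    class Example:
        def write(self) -> None:
            """Unsupported."""
            raise NotImplementedError("Example does not support saving/loading.")

        def fit(self) -> None:
            """A supported entry point."""
            pass


    # Only the sentinel-tagged method is reported, so a docs generator can skip it
    # while regular methods such as fit() remain documented.
    assert members_marked_unsupported(Example) == {"write"}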