Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise error when combining precomputed_knn and sample_fraction<1.0 in UMAP #825

Merged
merged 6 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 44 additions & 17 deletions python/src/spark_rapids_ml/umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,8 @@ class UMAP(UMAPClass, _CumlEstimatorSupervised, _UMAPCumlParams):
Either one of a tuple (indices, distances) of arrays of shape (n_samples, n_neighbors), a pairwise distances
dense array of shape (n_samples, n_samples) or a KNN graph sparse array (preferably CSR/COO). This feature
allows the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function.
This function should match the metric used to train the UMAP embeedings.
This function should match the metric used to train the UMAP embeddings. Note: supplying a precomputed KNN graph
with sample_fraction < 1.0 is not supported, as the KNN graph must be built on the same subset used to fit the model.

random_state : int, RandomState instance (optional, default=None)
The seed used by the random number generator during embedding initialization and during sampling used by the
Expand Down Expand Up @@ -914,6 +915,11 @@ def _create_pyspark_model(self, result: Row) -> _CumlModel:

def _fit(self, dataset: DataFrame) -> "UMAPModel":
if self.getSampleFraction() < 1.0:
if self.cuml_params["precomputed_knn"] is not None:
raise ValueError(
f"Both precomputed_knn and sample_fraction < 1.0 cannot be used simultaneously, as the KNN graph must be built on the same subset used to fit the model."
)

data_subset = dataset.sample(
withReplacement=False,
fraction=self.getSampleFraction(),
Expand Down Expand Up @@ -1160,24 +1166,45 @@ def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]:
indices = csr_chunk.indices
indptr = csr_chunk.indptr
data = csr_chunk.data
yield pd.DataFrame(
data=[
if cuda_managed_mem_enabled:
yield pd.DataFrame(
data=[
{
"embedding_": list(embedding[start:end].get()),
"indices": list(indices.get()),
"indptr": list(indptr.get()),
"data": list(data.get()),
"shape": [end - start, dimension],
}
]
)
else:
yield pd.DataFrame(
data=[
{
"embedding_": list(embedding[start:end]),
"indices": list(indices),
"indptr": list(indptr),
"data": list(data),
"shape": [end - start, dimension],
}
]
)
else:
if cuda_managed_mem_enabled:
yield pd.DataFrame(
{
"embedding_": embedding[start:end].tolist(),
"indices": indices.tolist(),
"indptr": indptr.tolist(),
"data": data.tolist(),
"shape": [end - start, dimension],
"embedding_": list(embedding[start:end].get()),
"raw_data_": list(raw_data[start:end].get()),
}
]
)
else:
yield pd.DataFrame(
{
"embedding_": embedding[start:end].tolist(),
"raw_data_": raw_data[start:end].tolist(),
}
)
)
else:
yield pd.DataFrame(
{
"embedding_": list(embedding[start:end]),
"raw_data_": list(raw_data[start:end]),
}
)

output_df = dataset.mapInPandas(_train_udf, schema=self._out_schema())

Expand Down
29 changes: 29 additions & 0 deletions python/tests/test_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,35 @@ def test_umap_precomputed_knn(
df = spark.createDataFrame(X.tolist(), ",".join(schema))
df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols)

try:
umap = UMAP(
num_workers=gpu_number,
metric="sqeuclidean",
sample_fraction=0.5,
precomputed_knn=precomputed_knn,
)
umap.fit(df)
assert False, "We should have raised an error"
except ValueError as e:
assert (
"precomputed_knn and sample_fraction < 1.0 cannot be used simultaneously"
in str(e)
)

try:
umap = UMAP(
num_workers=gpu_number,
metric="sqeuclidean",
precomputed_knn=precomputed_knn,
)
umap.setSampleFraction(0.5).fit(df)
assert False, "We should have raised an error"
except ValueError as e:
assert (
"precomputed_knn and sample_fraction < 1.0 cannot be used simultaneously"
in str(e)
)

umap = UMAP(
num_workers=gpu_number,
metric="sqeuclidean",
Expand Down
Loading