Remove support for deprecated ray.util.data.MLDataset (#218)
Ray MLDataset has been deprecated. We should no longer maintain support for it in newer xgboost-ray versions.
amogkam authored May 23, 2022
1 parent 929b3e3 · commit 8d07e8d
Showing 5 changed files with 8 additions and 141 deletions.
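
Migration note for downstream users (not part of the commit itself): where code previously built an MLDataset with ray.util.data.read_parquet, a Ray Dataset can be passed to RayDMatrix instead, since RayDataset remains a supported data source. A minimal sketch, assuming a Ray version that ships ray.data and placeholder file paths:

    # Migration sketch (assumptions: ray.data is available; paths are placeholders).
    import ray
    from xgboost_ray import RayDMatrix

    # Before this commit (now removed):
    #   from ray.util import data as ml_data
    #   dataset = ml_data.read_parquet(
    #       ["data_1.parquet", "data_2.parquet"], num_shards=2)

    # After: read a Ray Dataset, which xgboost-ray still supports.
    dataset = ray.data.read_parquet(["data_1.parquet", "data_2.parquet"])
    dmatrix = RayDMatrix(dataset, label="label")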
8 changes: 3 additions & 5 deletions xgboost_ray/data_sources/__init__.py
@@ -3,7 +3,6 @@
 from xgboost_ray.data_sources.pandas import Pandas
 from xgboost_ray.data_sources.modin import Modin
 from xgboost_ray.data_sources.dask import Dask
-from xgboost_ray.data_sources.ml_dataset import MLDataset
 from xgboost_ray.data_sources.petastorm import Petastorm
 from xgboost_ray.data_sources.csv import CSV
 from xgboost_ray.data_sources.parquet import Parquet
@@ -12,12 +11,11 @@
 from xgboost_ray.data_sources.partitioned import Partitioned
 
 data_sources = [
-    Numpy, Pandas, Partitioned, Modin, Dask, MLDataset, Petastorm, CSV,
-    Parquet, ObjectStore, RayDataset
+    Numpy, Pandas, Partitioned, Modin, Dask, Petastorm, CSV, Parquet,
+    ObjectStore, RayDataset
 ]
 
 __all__ = [
     "DataSource", "RayFileType", "Numpy", "Pandas", "Modin", "Dask",
-    "MLDataset", "Petastorm", "CSV", "Parquet", "ObjectStore", "RayDataset",
-    "Partitioned"
+    "Petastorm", "CSV", "Parquet", "ObjectStore", "RayDataset", "Partitioned"
 ]
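
A quick sanity check that MLDataset is gone from the registry after this change (a sketch; the expected output follows from the diff above):

    # Inspect the data source registry (sketch).
    from xgboost_ray.data_sources import data_sources

    print([source.__name__ for source in data_sources])
    # Expected: ['Numpy', 'Pandas', 'Partitioned', 'Modin', 'Dask',
    #            'Petastorm', 'CSV', 'Parquet', 'ObjectStore', 'RayDataset']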
2 changes: 1 addition & 1 deletion xgboost_ray/data_sources/data_source.py
@@ -24,7 +24,7 @@ class DataSource:
     """Abstract class for data sources.
 
     xgboost_ray supports reading from various sources, such as files
-    (e.g. CSV, Parquet) or distributed datasets (Ray MLDataset, Modin).
+    (e.g. CSV, Parquet) or distributed datasets (Modin).
 
     This abstract class defines an interface to read from these sources.
     New data sources can be added by implementing this interface.
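
As the docstring notes, new data sources implement this interface. A hypothetical sketch follows; is_data_type matches the usage seen elsewhere in this commit, while the exact load_data signature is an assumption for illustration:

    # Hypothetical custom data source (sketch; DictSource and the load_data
    # signature are assumptions, not part of this commit).
    from typing import Any, Optional, Sequence

    import pandas as pd

    from xgboost_ray.data_sources.data_source import DataSource, RayFileType


    class DictSource(DataSource):
        """Hypothetical source that loads plain Python dicts of columns."""

        @staticmethod
        def is_data_type(data: Any,
                         filetype: Optional[RayFileType] = None) -> bool:
            # Claim plain dicts, e.g. {"a": [1, 2], "b": [3, 4]}.
            return isinstance(data, dict)

        @staticmethod
        def load_data(data: Any,
                      ignore: Optional[Sequence[str]] = None,
                      indices: Optional[Sequence[int]] = None,
                      **kwargs) -> pd.DataFrame:
            # Convert to a DataFrame, dropping ignored columns and
            # selecting row indices if requested.
            df = pd.DataFrame(data)
            if ignore:
                df = df.drop(columns=list(ignore), errors="ignore")
            if indices is not None:
                df = df.iloc[indices]
            return df

    # To be picked up, such a class would also need to be added to the
    # data_sources registry in data_sources/__init__.py (shown above).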
83 changes: 0 additions & 83 deletions xgboost_ray/data_sources/ml_dataset.py

This file was deleted.

20 changes: 4 additions & 16 deletions xgboost_ray/matrix.py
@@ -20,14 +20,8 @@
 from ray import logger
 from ray.util.annotations import PublicAPI, DeveloperAPI
 
-from xgboost_ray.util import Unavailable
 from xgboost_ray.data_sources import DataSource, data_sources, RayFileType
 
-try:
-    from ray.util.data import MLDataset
-except ImportError:
-    MLDataset = Unavailable
-
 try:
     from ray.data.dataset import Dataset as RayDataset
 except (ImportError, ModuleNotFoundError):
@@ -46,7 +40,7 @@ class RayDataset:
 if TYPE_CHECKING:
     from xgboost_ray.xgb import xgboost as xgb
 
-Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series, MLDataset]
+Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series]
 
 
 def concat_dataframes(dfs: List[Optional[pd.DataFrame]]):
@@ -404,7 +398,7 @@ def get_data_source(self) -> Type[DataSource]:
 
         # Todo (krfricke): It would be good to have a more general way to
         # check for compatibility here. Combine with test below?
-        if not (isinstance(self.data, (Iterable, MLDataset, RayDataset))
+        if not (isinstance(self.data, (Iterable, RayDataset))
                 or hasattr(self.data, "__partitioned__")) or invalid_data:
             raise ValueError(
                 f"Distributed data loading only works with already "
@@ -444,7 +438,7 @@ def get_data_source(self) -> Type[DataSource]:
                 f"with FileType: {self.filetype} for a distributed dataset."
                 "\nFIX THIS by passing a supported data type. Supported "
                 "data types for distributed datasets are a list of "
-                "CSV or Parquet sources as well as Ray MLDatasets. If using "
+                "CSV or Parquet sources. If using "
                 "Modin, Dask, or Petastorm, make sure the library is "
                 "installed.")
@@ -586,7 +580,7 @@ class RayDMatrix:
 
     Args:
         data: Data object. Can be a pandas dataframe, pandas series,
-            numpy array, Ray MLDataset, modin dataframe, string pointing to
+            numpy array, modin dataframe, string pointing to
             a csv or parquet file, or list of strings pointing to csv or
             parquet files.
         label: Optional label object. Can be a pandas series, numpy array,
@@ -874,13 +868,10 @@ def get_data(
 
 def _can_load_distributed(source: Data) -> bool:
     """Returns True if it might be possible to use distributed data loading"""
-    from xgboost_ray.data_sources.ml_dataset import MLDataset
     from xgboost_ray.data_sources.modin import Modin
 
     if isinstance(source, (int, float, bool)):
         return False
-    elif MLDataset.is_data_type(source):
-        return True
     elif Modin.is_data_type(source):
         return True
     elif isinstance(source, str):
@@ -902,12 +893,9 @@ def _can_load_distributed(source: Data) -> bool:
 
 def _detect_distributed(source: Data) -> bool:
     """Returns True if we should try to use distributed data loading"""
-    from xgboost_ray.data_sources.ml_dataset import MLDataset
     from xgboost_ray.data_sources.modin import Modin
     if not _can_load_distributed(source):
         return False
-    if MLDataset.is_data_type(source):
-        return True
     if Modin.is_data_type(source):
         return True
     if isinstance(source, Iterable) and not isinstance(source, str) and \
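
Per the updated Args docstring above, the inputs RayDMatrix accepts after this change look like the following (a sketch; file names are placeholders):

    # Input types accepted by RayDMatrix after this change (sketch; the
    # parquet paths are placeholders and must exist for distributed loading).
    import numpy as np
    import pandas as pd
    from xgboost_ray import RayDMatrix

    # In-memory inputs: numpy arrays or pandas objects.
    mat_np = RayDMatrix(np.random.rand(8, 4), label=np.random.rand(8))

    df = pd.DataFrame({"a": [1.0, 2.0], "label": [0, 1]})
    mat_df = RayDMatrix(df, label="label")

    # A list of CSV/Parquet files triggers distributed loading.
    mat_files = RayDMatrix(["part_0.parquet", "part_1.parquet"], label="label")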
36 changes: 0 additions & 36 deletions xgboost_ray/tests/test_matrix.py
@@ -289,32 +289,6 @@ def testFromMultiParquetString(self):
         self._testMatrixCreation(
             [data_file_1, data_file_2], "label", distributed=True)
 
-    def testFromMLDataset(self):
-        try:
-            from ray.util import data as ml_data
-        except ImportError:
-            self.skipTest("MLDataset not available in current Ray version.")
-            return
-
-        with tempfile.TemporaryDirectory() as dir:
-            data_file_1 = os.path.join(dir, "data_1.parquet")
-            data_file_2 = os.path.join(dir, "data_2.parquet")
-
-            data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
-            data_df["label"] = pd.Series(self.y)
-
-            df_1 = data_df[0:len(data_df) // 2]
-            df_2 = data_df[len(data_df) // 2:]
-
-            df_1.to_parquet(data_file_1)
-            df_2.to_parquet(data_file_2)
-
-            dataset = ml_data.read_parquet(
-                [data_file_1, data_file_2], num_shards=2)
-
-            self._testMatrixCreation(dataset, "label", distributed=False)
-            self._testMatrixCreation(dataset, "label", distributed=True)
-
     def testDetectDistributed(self):
         with tempfile.TemporaryDirectory() as dir:
             parquet_file = os.path.join(dir, "file.parquet")
@@ -339,16 +313,6 @@ def testDetectDistributed(self):
             mat = RayDMatrix([csv_file] * 3, lazy=True)
             self.assertTrue(mat.distributed)
 
-            try:
-                from ray.util import data as ml_data
-                mat = RayDMatrix(
-                    ml_data.read_parquet(parquet_file, num_shards=1),
-                    lazy=True)
-                self.assertTrue(mat.distributed)
-            except ImportError:
-                print("MLDataset not available in current Ray version. "
-                      "Skipping part of test.")
-
     def testTooManyActorsDistributed(self):
         """Test error when too many actors are passed"""
         with self.assertRaises(RuntimeError):
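
If equivalent coverage for distributed datasets is wanted, a test mirroring the deleted one could target Ray Datasets instead. A sketch, not part of this commit, assuming ray.data is available:

    # Sketch of a Ray Dataset analogue to the removed testFromMLDataset
    # (hypothetical; reuses the test class's self.x / self.y fixtures).
    def testFromRayDataset(self):
        import ray

        with tempfile.TemporaryDirectory() as dir:
            data_file = os.path.join(dir, "data.parquet")
            data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
            data_df["label"] = pd.Series(self.y)
            data_df.to_parquet(data_file)

            dataset = ray.data.read_parquet(data_file)
            self._testMatrixCreation(dataset, "label", distributed=True)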
