From 494052f8263d3a9dd34ad4d39d9a21d34784eb6a Mon Sep 17 00:00:00 2001 From: Tilman Krokotsch Date: Wed, 31 Jan 2024 14:53:40 +0100 Subject: [PATCH] docs: update and fix format (#56) --- docs/use_cases/feature_extraction.md | 3 ++- docs/use_cases/libraries.md | 14 +++++++++----- docs/use_cases/tasks.md | 4 ++-- rul_datasets/adaption.py | 6 +++++- rul_datasets/baseline.py | 1 + rul_datasets/core.py | 6 ++++++ rul_datasets/reader/abstract.py | 6 ++++++ rul_datasets/reader/saving.py | 3 +++ rul_datasets/reader/scaling.py | 7 ++++++- rul_datasets/ssl.py | 3 +++ rul_datasets/utils.py | 16 +++++++++------- 11 files changed, 52 insertions(+), 17 deletions(-) diff --git a/docs/use_cases/feature_extraction.md b/docs/use_cases/feature_extraction.md index 21a6ee6..57629e2 100644 --- a/docs/use_cases/feature_extraction.md +++ b/docs/use_cases/feature_extraction.md @@ -3,7 +3,8 @@ It may be useful to extract hand-crafted features, i.e. RMS or P2P, from this vi The [RulDataModule][rul_datasets.core.RulDataModule] provides the option to use a custom feature extractor on each window of data. The feature extractor can be anything that can be called as a function. -It should take a numpy array with the shape `[num_windows, window_size, num_features]` and return another array. +It should take a feature array with the shape `[num_windows, window_size, num_features]` and a target array with the shape `[num_windows]`. +The return value should be the transformed feature and target arrays. Depending on whether a `window_size` is supplied to the data module, the expected output shape of the feature extractor is: * `window_size is None`: `[num_new_windows, new_window_size, features]` diff --git a/docs/use_cases/libraries.md b/docs/use_cases/libraries.md index af4b109..ef28be1 100644 --- a/docs/use_cases/libraries.md +++ b/docs/use_cases/libraries.md @@ -9,24 +9,28 @@ The RUL Datasets library implements several data modules that are 100% compatibl import pytorch_lightning as pl import rul_datasets -import rul_estimator +import rul_estimator # (1)! cmapss_fd1 = rul_datasets.CmapssReader(fd=1) dm = rul_datasets.RulDataModule(cmapss_fd1, batch_size=32) -my_rul_estimator = rul_estimator.MyRulEstimator() # (1)! +my_rul_estimator = rul_estimator.MyRulEstimator() # (2)! trainer = pl.Trainer(max_epochs=100) -trainer.fit(my_rul_estimator, dm) # (2)! +trainer.fit(my_rul_estimator, dm) # (3)! trainer.test(my_rul_estimator, dm) ``` -1. This should be a subclass of [LightningModule][pytorch_lightning.core.LightningModule]. -2. The trainer calls the data module's `prepare_data` and `setup` functions automatically. +1. This is a hypothetical module containing your model. +2. This should be a subclass of [LightningModule][lightning.pytorch.core.LightningModule]. +3. The trainer calls the data module's `prepare_data` and `setup` functions automatically. The RUL datasets library loads all data into memory at once and uses the main process for creating batches, i.e. `num_workers=0` for all dataloaders. +Unnecessary copies are avoided by using shared memory for both Numpy and PyTorch. +This means that modifying a batch directly, e.g., `features += 1` should be avoided. + When data is held in memory, multiple data loading processes are unnecessary and may even slow down training. The warning produced by PyTorch Lightning that `num_workers` is too low is, therefore, suppressed. 
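As a point of reference for the feature extractor contract described in the `docs/use_cases/feature_extraction.md` hunk above, here is a minimal sketch of such an extractor. The function name is hypothetical, and the `feature_extractor`/`window_size` keyword arguments shown in the trailing comment are assumptions, not confirmed by this patch:

```python
import numpy as np


def rms_extractor(features: np.ndarray, targets: np.ndarray):
    # features: [num_windows, window_size, num_features], targets: [num_windows]
    # Collapse each window to its per-channel RMS value; targets pass through unchanged.
    rms = np.sqrt(np.mean(features**2, axis=1))  # -> [num_windows, num_features]
    return rms, targets


# Assumed wiring (keyword names may differ): because the extractor returns
# un-windowed feature vectors, a new window_size would be supplied so the
# data module can re-window the extracted features.
# dm = rul_datasets.RulDataModule(
#     cmapss_fd1, batch_size=32, feature_extractor=rms_extractor, window_size=10
# )
```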
diff --git a/docs/use_cases/tasks.md b/docs/use_cases/tasks.md index 679333c..6477b60 100644 --- a/docs/use_cases/tasks.md +++ b/docs/use_cases/tasks.md @@ -96,8 +96,8 @@ For validation and testing, the data module returns data loaders for the full la ## Unsupervised Domain Adaption Unsupervised domain adaption uses a labeled dataset form a source domain to train a model for a target domain for which only unlabeled data is available. -All included dataset consist of multiple sub-datasets that can be viewed as different domains. -As the sub-dataset still bear a sufficient similarity to each other, domain adaption between them should be possible. +All included datasets consist of multiple sub-datasets that can be viewed as different domains. +As the sub-datasets still bear a sufficient similarity to each other, domain adaption between them should be possible. The `get_compatible` function is useful to construct a reader for a different sub-dataset from an existing one: ```pycon diff --git a/rul_datasets/adaption.py b/rul_datasets/adaption.py index dc6e21b..7979bad 100644 --- a/rul_datasets/adaption.py +++ b/rul_datasets/adaption.py @@ -134,6 +134,7 @@ def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The training data loader """ @@ -156,6 +157,7 @@ def val_dataloader(self, *args: Any, **kwargs: Any) -> List[DataLoader]: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The source, target and an optional paired validation data loader. """ @@ -184,6 +186,7 @@ def test_dataloader(self, *args: Any, **kwargs: Any) -> List[DataLoader]: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The source and target test data loader. """ @@ -323,9 +326,10 @@ def split_healthy( targets: List of target time series. by_max_rul: Whether to split healthy and degrading data by max RUL value. by_steps: Split healthy and degrading data after this number of time steps. + Returns: healthy: Dataset of healthy data. - degrading: Dataset of degrading data. + degraded: Dataset of degrading data. """ if not by_max_rul and (by_steps is None): raise ValueError("Either 'by_max_rul' or 'by_steps' need to be set.") diff --git a/rul_datasets/baseline.py b/rul_datasets/baseline.py index 3b8faee..b437c96 100644 --- a/rul_datasets/baseline.py +++ b/rul_datasets/baseline.py @@ -111,6 +111,7 @@ def test_dataloader(self, *args: Any, **kwargs: Any) -> List[DataLoader]: Args: *args: Passed down to each data module. **kwargs: Passed down to each data module. + Returns: The test dataloaders of all sub-datasets. """ diff --git a/rul_datasets/core.py b/rul_datasets/core.py index 3618123..adeb385 100644 --- a/rul_datasets/core.py +++ b/rul_datasets/core.py @@ -190,6 +190,7 @@ def is_mutually_exclusive(self, other: "RulDataModule") -> bool: Args: other: Data module to check exclusivity against. + Returns: Whether both data modules are mutually exclusive. """ @@ -254,6 +255,7 @@ def load_split( split: The desired split to load. alias: The split as which the loaded data should be treated. degraded_only: Whether to only load degraded samples. + Returns: The feature and target tensors of the split's runs. 
""" @@ -319,6 +321,7 @@ def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The training data loader """ @@ -347,6 +350,7 @@ def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The validation data loader """ @@ -370,6 +374,7 @@ def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The test data loader """ @@ -395,6 +400,7 @@ def to_dataset(self, split: str, alias: Optional[str] = None) -> "RulDataset": Args: split: The split to place inside the dataset. alias: The split the loaded data should be treated as. + Returns: A dataset containing the requested split. """ diff --git a/rul_datasets/reader/abstract.py b/rul_datasets/reader/abstract.py index 313415e..639df45 100644 --- a/rul_datasets/reader/abstract.py +++ b/rul_datasets/reader/abstract.py @@ -139,6 +139,7 @@ def default_window_size(self, fd: int) -> int: Args: fd: The index of a sub-dataset. + Returns: The default window size for the sub-dataset. """ @@ -170,6 +171,7 @@ def load_complete_split( Args: split: The name of the split to load. alias: The split as which the loaded data should be treated. + Returns: features: The complete, scaled features of the desired split. targets: The capped target values corresponding to the features. @@ -194,6 +196,7 @@ def load_split( Args: split: The desired split to load. alias: The split as which the loaded data should be treated. + Returns: features: The scaled, truncated features of the desired split. targets: The truncated targets of the desired split. @@ -250,6 +253,7 @@ def get_compatible( percent_fail_runs: Override this value in the new reader. truncate_val: Override this value in the new reader. consolidate_window_size: How to consolidate the window size of the readers. + Returns: A compatible reader with optional overrides. """ @@ -300,6 +304,7 @@ def get_complement( Args: percent_broken: Override this value in the new reader. truncate_val: Override this value in the new reader. + Returns: A compatible reader with all development runs missing in this one. """ @@ -338,6 +343,7 @@ def is_mutually_exclusive(self, other: "AbstractReader") -> bool: Args: other: The reader to check exclusivity against. + Returns: Whether the readers are mutually exclusive. """ diff --git a/rul_datasets/reader/saving.py b/rul_datasets/reader/saving.py index 951bbbb..8b15b7b 100644 --- a/rul_datasets/reader/saving.py +++ b/rul_datasets/reader/saving.py @@ -42,6 +42,7 @@ def load(save_path: str, memmap: bool = False) -> Tuple[np.ndarray, np.ndarray]: save_path: Path that was supplied to the [save][rul_datasets.reader.saving.save] function. memmap: whether to use memmap to avoid loading the whole run into memory + Returns: features: The feature array saved in `save_path` targets: The target array saved in `save_path` @@ -64,6 +65,7 @@ def load_multiple( Args: save_paths: The list of run files to load. 
memmap: See [load][rul_datasets.reader.saving.load] + Returns: features: The feature arrays saved in `save_paths` targets: The target arrays saved in `save_paths` @@ -84,6 +86,7 @@ def exists(save_path: str) -> bool: Args: save_path: the `save_path` the [save][rul_datasets.reader.saving.save] function was called with + Returns: Whether the files exist """ diff --git a/rul_datasets/reader/scaling.py b/rul_datasets/reader/scaling.py index 925ba1e..4d71870 100644 --- a/rul_datasets/reader/scaling.py +++ b/rul_datasets/reader/scaling.py @@ -86,6 +86,7 @@ def partial_fit( Args: features: The feature array to be scaled. operation_conditions: The condition values compared against the boundaries. + Returns: The partially fitted scaler. """ @@ -112,6 +113,7 @@ def transform( Args: features: The features to be scaled. operation_conditions: The condition values compared against the boundaries. + Returns: The scaled features. """ @@ -160,6 +162,7 @@ def fit_scaler( features: The RUL features. scaler: The scaler to be fit. Defaults to a StandardScaler. operation_conditions: The operation conditions for condition aware scaling. + Returns: The fitted scaler. """ @@ -218,6 +221,7 @@ def load_scaler(save_path: str) -> Scaler: Args: save_path: The path the scaler was saved to. + Returns: The loaded scaler. """ @@ -233,7 +237,7 @@ def scale_features( operation_conditions: Optional[List[np.ndarray]] = None, ) -> List[np.ndarray]: """ - Scaler the RUL features with a given scaler. + Scale the RUL features with a given scaler. The features can have a shape of `[num_time_steps, channels]` or `[num_windows, window_size, channels]`. The scaler needs to work on the channel dimension. If it @@ -246,6 +250,7 @@ def scale_features( features: The RUL features to be scaled. scaler: The already fitted scaler. operation_conditions: The operation conditions for condition aware scaling. + Returns: The scaled features. """ diff --git a/rul_datasets/ssl.py b/rul_datasets/ssl.py index bb7dfc2..0ecc398 100644 --- a/rul_datasets/ssl.py +++ b/rul_datasets/ssl.py @@ -115,6 +115,7 @@ def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The training data loader """ @@ -132,6 +133,7 @@ def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The labeled validation data loader. """ @@ -144,6 +146,7 @@ def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: Args: *args: Ignored. Only for adhering to parent class interface. **kwargs: Ignored. Only for adhering to parent class interface. + Returns: The labeled test data loader. """ diff --git a/rul_datasets/utils.py b/rul_datasets/utils.py index 68469ca..283da8c 100644 --- a/rul_datasets/utils.py +++ b/rul_datasets/utils.py @@ -12,16 +12,17 @@ def get_files_in_path(path: str, condition: Optional[Callable] = None) -> List[str]: """ - Return the paths of all files in a path that satisfy a condition in alphabetical - order. + Return the paths of all files in a path that satisfy a condition in alphabetical + order. - If the condition is `None` all files are returned. + If the condition is `None` all files are returned. 
+ + Args: + path: the path to look into + condition: the include-condition for files - Args: - path: the path to look into - condition: the include-condition for files Returns: - all files that satisfy the condition in alphabetical order + all files that satisfy the condition in alphabetical order """ if condition is None: feature_files = [f for f in os.listdir(path)] @@ -83,6 +84,7 @@ def extract_windows( window_size: length of the sliding window dilation: dilation of the sliding window mode: create windows either in memory or on disk + Returns: array of sliding windows """
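The `get_compatible` call referenced in the `docs/use_cases/tasks.md` hunk could be used roughly as follows; this is a sketch only, and passing the target sub-dataset index as the first positional argument is an assumption:

```python
import rul_datasets

source = rul_datasets.CmapssReader(fd=3)
# Build a reader for another sub-dataset (here FD001) that stays compatible
# with the source reader, e.g. regarding window size and scaling.
target = source.get_compatible(1)

source_dm = rul_datasets.RulDataModule(source, batch_size=32)
target_dm = rul_datasets.RulDataModule(target, batch_size=32)
```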
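Similarly, the `fit_scaler`/`scale_features` pair documented in `rul_datasets/reader/scaling.py` might be used like this. The feature shapes follow the docstrings above; that both functions accept a list of per-run arrays is an assumption based on their return types:

```python
import numpy as np

from rul_datasets.reader import scaling

# Illustrative data: two runs, each shaped [num_windows, window_size, channels].
train_features = [np.random.randn(100, 30, 14), np.random.randn(80, 30, 14)]

scaler = scaling.fit_scaler(train_features)  # defaults to a StandardScaler
scaled = scaling.scale_features(train_features, scaler)
```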