Merge branch 'tensor_comm' into bf16_decomposer
ZiyueXu77 authored Dec 3, 2024
2 parents 048da33 + f694a14 commit 5bd20f3
Showing 49 changed files with 776 additions and 347 deletions.
2 changes: 1 addition & 1 deletion docs/resources/log.config
@@ -21,7 +21,7 @@ args=(sys.stdout,)
class=FileHandler
level=ERROR
formatter=fullFormatter
args=('error.log', 'a')
args=('error_log.txt', 'a')

[formatter_fullFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
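
For reference, an INI-style config like this is typically loaded with `logging.config.fileConfig`; a minimal sketch (the path is illustrative, and the full file also defines the `[loggers]`, `[handlers]`, and `[formatters]` index sections):

```python
import logging.config

logging.config.fileConfig("docs/resources/log.config", disable_existing_loggers=False)

# with the FileHandler configured above, this lands in error_log.txt
logging.getLogger(__name__).error("something went wrong")
```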
15 changes: 5 additions & 10 deletions examples/advanced/job_api/tf/README.md
@@ -7,9 +7,8 @@ All examples in this folder are based on using [TensorFlow](https://tensorflow.o

## Simulated Federated Learning with CIFAR-10 Using TensorFlow

This example shows `Tensorflow`-based classic Federated Learning
algorithms, namely FedAvg and FedOpt on CIFAR10
dataset. This example is analogous to [the example using `Pytorch`
This example demonstrates TensorFlow-based federated learning algorithms on the CIFAR-10 dataset.
It is analogous to [the example using `Pytorch`
backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
on the same dataset, where the same experiments
were conducted and analyzed. You should expect the same
@@ -21,7 +20,7 @@ client-side training logics (details in file
and the new
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
APIs were used to programmatically set up an
`nvflare` job to be exported or ran by simulator (details in file
NVFlare job to be exported or run by the simulator (details in file
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
alleviating the need to write job config files and simplifying the
development process.
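
For orientation, the pattern looks roughly like this (a sketch; names, arguments, and counts are illustrative, not the exact setup in `tf_fl_script_runner_cifar10.py`):

```python
from nvflare.app_common.workflows.fedavg import FedAvg
from nvflare.job_config.api import FedJob
from nvflare.job_config.script_runner import ScriptRunner

job = FedJob(name="cifar10_tf_fedavg")

# server side: a FedAvg controller coordinating 2 clients for 3 rounds
job.to(FedAvg(num_clients=2, num_rounds=3), "server")

# client side: each site runs the Client API training script
for i in range(2):
    runner = ScriptRunner(script="src/cifar10_tf_fl_alpha_split.py")
    job.to(runner, f"site-{i + 1}")

job.simulator_run("/tmp/nvflare/workspace")  # or export_job(...) for deployment
```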
@@ -65,12 +64,8 @@ script.
> `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
> TF_GPU_ALLOCATOR=cuda_malloc_async`
The set-up of all experiments in this example are kept the same as
[the example using `Pytorch`
backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
to the `Pytorch` example for more details. Similar to the Pytorch
example, we here also use Dirichelet sampling on CIFAR10 data labels
to simulate data heterogeneity among data splits for different client
We use Dirichlet sampling (implementation from [FedMA](https://github.com/IBM/FedMA)) on
CIFAR-10 data labels to simulate data heterogeneity among data splits for different client
sites, controlled by an alpha value, ranging from 0 (not including 0)
to 1. A high alpha value indicates less data heterogeneity, i.e., an
alpha value equal to 1.0 would result in homogeneous data distribution
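As a standalone sketch of that idea (illustrative only — the example itself uses the FedMA implementation linked above):

```python
import numpy as np

def dirichlet_split(labels, n_sites, alpha, seed=0):
    """Assign sample indices to sites with per-class Dirichlet proportions."""
    rng = np.random.default_rng(seed)
    site_indices = [[] for _ in range(n_sites)]
    for c in np.unique(labels):
        idx = np.flatnonzero(labels == c)
        rng.shuffle(idx)
        # smaller alpha -> more skewed class proportions across sites
        proportions = rng.dirichlet(alpha * np.ones(n_sites))
        cut_points = (np.cumsum(proportions)[:-1] * len(idx)).astype(int)
        for site, chunk in enumerate(np.split(idx, cut_points)):
            site_indices[site].extend(chunk.tolist())
    return site_indices
```

With `alpha=1.0` each class is spread nearly uniformly across sites; as `alpha` shrinks toward 0, each class concentrates on fewer sites.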
12 changes: 6 additions & 6 deletions examples/advanced/job_api/tf/run_jobs.sh
@@ -25,7 +25,7 @@ GPU_INDX=0
WORKSPACE=/tmp

# Run centralized training job
python ./tf_fl_script_executor_cifar10.py \
python ./tf_fl_script_runner_cifar10.py \
--algo centralized \
--n_clients 1 \
--num_rounds 25 \
@@ -39,7 +39,7 @@ python ./tf_fl_script_executor_cifar10.py \
# Run FedAvg with different alpha values
for alpha in 1.0 0.5 0.3 0.1; do

python ./tf_fl_script_executor_cifar10.py \
python ./tf_fl_script_runner_cifar10.py \
--algo fedavg \
--n_clients 8 \
--num_rounds 50 \
@@ -53,7 +53,7 @@ done


# Run FedOpt job
python ./tf_fl_script_executor_cifar10.py \
python ./tf_fl_script_runner_cifar10.py \
--algo fedopt \
--n_clients 8 \
--num_rounds 50 \
@@ -65,7 +65,7 @@ python ./tf_fl_script_executor_cifar10.py \


# Run FedProx job.
python ./tf_fl_script_executor_cifar10.py \
python ./tf_fl_script_runner_cifar10.py \
--algo fedprox \
--n_clients 8 \
--num_rounds 50 \
@@ -77,11 +77,11 @@ python ./tf_fl_script_executor_cifar10.py \


# Run scaffold job
python ./tf_fl_script_executor_cifar10.py \
python ./tf_fl_script_runner_cifar10.py \
--algo scaffold \
--n_clients 8 \
--num_rounds 50 \
--batch_size 64 \
--epochs 4 \
--alpha 0.1 \
--gpu $GPU_INDX
--gpu $GPU_INDX
33 changes: 28 additions & 5 deletions examples/advanced/llm_hf/sft_job.py
@@ -19,9 +19,10 @@
from nvflare.app_common.widgets.intime_model_selector import IntimeModelSelector
from nvflare.app_common.workflows.fedavg import FedAvg
from nvflare.app_opt.pt.file_model_persistor import PTFileModelPersistor
from nvflare.app_opt.pt.tensor_params_converter import PTReceiveParamsConverter, PTSendParamsConverter
from nvflare.app_opt.quantization.numpy_dequantizor import NumpyModelDequantizor
from nvflare.app_opt.quantization.numpy_quantizor import NumpyModelQuantizor
from nvflare.job_config.script_runner import ScriptRunner
from nvflare.job_config.script_runner import BaseScriptRunner


def main():
@@ -46,6 +47,7 @@ def main():
job_dir = args.job_dir
model_name_or_path = args.model_name_or_path
train_mode = args.train_mode
message_mode = args.message_mode

# Create the FedJob
if train_mode.lower() == "sft":
@@ -87,11 +89,26 @@ def main():
site_name = f"site-{client_id}"
data_path_train = os.path.join(args.data_path, client_id, "training.jsonl")
data_path_valid = os.path.join(args.data_path, client_id, "validation.jsonl")
runner = ScriptRunner(
script=train_script,
script_args=f"--model_name_or_path {model_name_or_path} --data_path_train {data_path_train} --data_path_valid {data_path_valid} --output_path {output_path} --train_mode {train_mode} --clean_up {clean_up}",
)

if message_mode == "tensor":
# Add params converters and send to client
job.to(PTSendParamsConverter(), site_name, id="pt_send")
job.to(PTReceiveParamsConverter(), site_name, id="pt_receive")
runner = BaseScriptRunner(
script=train_script,
script_args=f"--model_name_or_path {model_name_or_path} --data_path_train {data_path_train} --data_path_valid {data_path_valid} --output_path {output_path} --train_mode {train_mode} --message_mode {message_mode} --clean_up {clean_up}",
from_nvflare_converter_id="pt_receive",
to_nvflare_converter_id="pt_send",
)
elif message_mode == "numpy":
runner = BaseScriptRunner(
script=train_script,
script_args=f"--model_name_or_path {model_name_or_path} --data_path_train {data_path_train} --data_path_valid {data_path_valid} --output_path {output_path} --train_mode {train_mode} --message_mode {message_mode} --clean_up {clean_up}",
)
else:
raise ValueError(f"Invalid message_mode: {message_mode}, only numpy and tensor are supported.")
job.to(runner, site_name, tasks=["train"])

if args.quantize_mode:
job.to(quantizor, site_name, tasks=["train"], filter_type=FilterType.TASK_RESULT)
job.to(dequantizor, site_name, tasks=["train"], filter_type=FilterType.TASK_DATA)
@@ -157,6 +174,12 @@ def define_parser():
default=None,
help="quantization mode, float16 or blockwise8, default to None (no quantization)",
)
parser.add_argument(
"--message_mode",
type=str,
default="numpy",
help="message mode, numpy or tensor, default to numpy",
)
parser.add_argument(
"--threads",
type=int,
14 changes: 11 additions & 3 deletions examples/advanced/llm_hf/src/hf_sft_peft_fl.py
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -69,6 +69,12 @@ def main():
default="SFT",
help="training mode, SFT or PEFT, default to SFT",
)
parser.add_argument(
"--message_mode",
type=str,
default="numpy",
help="message mode, numpy or tensor, default to numpy",
)
parser.add_argument("--local_epoch", type=int, default=1)
parser.add_argument("--clean_up", type=int, default=0)
args = parser.parse_args()
@@ -232,8 +238,10 @@ def evaluate(input_weights, mode):
for key in list(out_param.keys()):
out_param["model." + key] = out_param.pop(key).cpu()

# cast out_param to float32 preparing for communication
out_param = {k: v.to(torch.float32) for k, v in out_param.items()}
if args.message_mode.lower() == "numpy":
# cast out_param to float32 preparing for communication with numpy
# otherwise do nothing
out_param = {k: v.to(torch.float32) for k, v in out_param.items()}

# construct trained FL model
output_model = flare.FLModel(
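
The conditional cast above is the reason the `message_mode` flag exists: numpy has no bfloat16 dtype, so numpy-based messages must upcast weights to float32, while tensor mode (via the `PTSendParamsConverter`/`PTReceiveParamsConverter` pair wired up in `sft_job.py`) ships torch tensors directly. A standalone illustration:

```python
import torch

w = torch.randn(4, dtype=torch.bfloat16)
# w.numpy() would fail here: numpy has no bfloat16 dtype,
# so "numpy" message mode upcasts before conversion...
w32 = w.to(torch.float32).numpy()
# ...while "tensor" message mode sends torch tensors as-is,
# keeping bf16 weights intact end to end.
```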
@@ -240,11 +240,11 @@ def _train_step_data_side(self, batch_indices):
     def _val_step_data_side(self, batch_indices):
         t_start = timer()
         self.model.eval()
-
-        inputs = self.valid_dataset.get_batch(batch_indices)
-        inputs = inputs.to(self.device)
-
-        _val_activations = self.model.forward(inputs)  # keep on site-1
+        with torch.no_grad():
+            inputs = self.valid_dataset.get_batch(batch_indices)
+            inputs = inputs.to(self.device)
+
+            _val_activations = self.model.forward(inputs)  # keep on site-1
 
         self.compute_stats_pool.record_value(category="_val_step_data_side", value=timer() - t_start)
 
@@ -295,23 +295,24 @@ def _train_step_label_side(self, batch_indices, activations, fl_ctx: FLContext):
     def _val_step_label_side(self, batch_indices, activations, fl_ctx: FLContext):
         t_start = timer()
         self.model.eval()
-
-        labels = self.valid_dataset.get_batch(batch_indices)
-        labels = labels.to(self.device)
-
-        if self.fp16:
-            activations = activations.type(torch.float32)  # return to default pytorch precision
-
-        activations = activations.to(self.device)
-
-        pred = self.model.forward(activations)
-
-        loss = self.criterion(pred, labels)
-        self.val_loss.append(loss.unsqueeze(0))  # unsqueeze needed for later concatenation
-
-        _, pred_labels = torch.max(pred, 1)
-
-        self.val_pred_labels.extend(pred_labels.unsqueeze(0))
-        self.val_labels.extend(labels.unsqueeze(0))
+        with torch.no_grad():
+            labels = self.valid_dataset.get_batch(batch_indices)
+            labels = labels.to(self.device)
+
+            if self.fp16:
+                activations = activations.type(torch.float32)  # return to default pytorch precision
+
+            activations = activations.to(self.device)
+
+            pred = self.model.forward(activations)
+
+            loss = self.criterion(pred, labels)
+            self.val_loss.append(loss.unsqueeze(0))  # unsqueeze needed for later concatenation
+
+            _, pred_labels = torch.max(pred, 1)
+
+            self.val_pred_labels.extend(pred_labels.unsqueeze(0))
+            self.val_labels.extend(labels.unsqueeze(0))
 
         self.compute_stats_pool.record_value(category="_val_step_label_side", value=timer() - t_start)

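For context, a small standalone illustration (not from the example) of what the `no_grad` wrapper changes: activations produced inside it carry no autograd state, so validation uses less memory and the tensors are safe to serialize and ship to the other site.

```python
import torch

model = torch.nn.Linear(8, 4)
x = torch.randn(2, 8)

with torch.no_grad():
    act = model(x)
print(act.requires_grad)   # False: plain tensor, cheap to send between sites

act2 = model(x)            # without no_grad the autograd graph is retained
print(act2.requires_grad)  # True: act2.numpy() would raise until .detach()
```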
2 changes: 2 additions & 0 deletions examples/advanced/xgboost/histogram-based/requirements.txt
@@ -3,5 +3,7 @@ pandas
scikit-learn
torch
tensorboard
matplotlib
shap
# requires xgboost 2.2; for now, install a nightly build
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
2 changes: 2 additions & 0 deletions examples/advanced/xgboost/tree-based/requirements.txt
@@ -3,5 +3,7 @@ pandas
scikit-learn
torch
tensorboard
matplotlib
shap
# requires xgboost 2.2; for now, install a nightly build
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
29 changes: 10 additions & 19 deletions examples/getting_started/tf/README.md
@@ -1,26 +1,21 @@
# Getting Started with NVFlare (TensorFlow)
[![TensorFlow Logo](https://upload.wikimedia.org/wikipedia/commons/a/ab/TensorFlow_logo.svg)](https://tensorflow.org/)

We provide several examples to quickly get you started using NVFlare's Job API.
We provide several examples to help you quickly get started with NVFlare.
All examples in this folder are based on using [TensorFlow](https://tensorflow.org/) as the model training framework.

## Simulated Federated Learning with CIFAR-10 Using TensorFlow

This example shows `Tensorflow`-based classic Federated Learning
algorithms, namely FedAvg and FedOpt on CIFAR10
dataset. This example is analogous to [the example using `Pytorch`
backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
on the same dataset, where same experiments
were conducted and analyzed. You should expect the same
experimental results when comparing this example with the `Pytorch` one.
This example demonstrates TensorFlow-based federated learning algorithms,
FedAvg and FedOpt, on the CIFAR-10 dataset.

In this example, the latest Client APIs were used to implement
client-side training logics (details in file
[`cifar10_tf_fl_alpha_split.py`](src/cifar10_tf_fl_alpha_split.py)),
and the new
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
APIs were used to programmatically set up an
`nvflare` job to be exported or ran by simulator (details in file
NVFlare job to be exported or run by the simulator (details in file
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
alleviating the need to write job config files and simplifying the
development process.
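
Once assembled, the `FedJob` can either be exported as a job config or run directly in the simulator — a sketch (paths illustrative):

```python
# assuming `job` is the FedJob built in tf_fl_script_runner_cifar10.py
job.export_job("/tmp/nvflare/jobs")                   # write a deployable job config
job.simulator_run("/tmp/nvflare/workspace", gpu="0")  # or run it locally in the simulator
```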
@@ -64,12 +59,8 @@ script.
> `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
> TF_GPU_ALLOCATOR=cuda_malloc_async`
The set-up of all experiments in this example are kept the same as
[the example using `Pytorch`
backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
to the `Pytorch` example for more details. Similar to the Pytorch
example, we here also use Dirichelet sampling on CIFAR10 data labels
to simulate data heterogeneity among data splits for different client
We use Dirichlet sampling (implementation from [FedMA](https://github.com/IBM/FedMA)) on
CIFAR-10 data labels to simulate data heterogeneity among data splits for different client
sites, controlled by an alpha value, ranging from 0 (not including 0)
to 1. A high alpha value indicates less data heterogeneity, i.e., an
alpha value equal to 1.0 would result in homogeneous data distribution
@@ -111,11 +102,11 @@ for alpha in 1.0 0.5 0.3 0.1; do
done
```

## 2. Results
## 3. Results

Now let's compare experimental results.

### 2.1 Centralized training vs. FedAvg for homogeneous split
### 3.1 Centralized training vs. FedAvg for homogeneous split
Let's first compare FedAvg with homogeneous data split
(i.e. `alpha=1.0`) and centralized training. As can be seen from the
figure and table below, FedAvg can achieve similar performance to
@@ -129,7 +120,7 @@ no difference in data distributions among different clients.

![Central vs. FedAvg](./figs/fedavg-vs-centralized.png)

### 2.2 Impact of client data heterogeneity
### 3.2 Impact of client data heterogeneity

Here we compare the impact of data heterogeneity by varying the
`alpha` value, where lower values cause higher heterogeneity. As can
@@ -145,7 +136,7 @@ as data heterogeneity becomes higher.

![Impact of client data heterogeneity](./figs/fedavg-diff-alphas.png)

> [!NOTE]
> More examples can be found at https://nvidia.github.io/NVFlare.
35 changes: 0 additions & 35 deletions examples/getting_started/tf/run_jobs.sh
@@ -50,38 +50,3 @@ for alpha in 1.0 0.5 0.3 0.1; do
--workspace $WORKSPACE

done


# Run FedOpt job
python ./tf_fl_script_runner_cifar10.py \
--algo fedopt \
--n_clients 8 \
--num_rounds 50 \
--batch_size 64 \
--epochs 4 \
--alpha 0.1 \
--gpu $GPU_INDX \
--workspace $WORKSPACE


# Run FedProx job.
python ./tf_fl_script_runner_cifar10.py \
--algo fedprox \
--n_clients 8 \
--num_rounds 50 \
--batch_size 64 \
--epochs 4 \
--fedprox_mu 1e-5 \
--alpha 0.1 \
--gpu $GPU_INDX


# Run scaffold job
python ./tf_fl_script_runner_cifar10.py \
--algo scaffold \
--n_clients 8 \
--num_rounds 50 \
--batch_size 64 \
--epochs 4 \
--alpha 0.1 \
--gpu $GPU_INDX
4 changes: 2 additions & 2 deletions examples/hello-world/hello-tf/README.md
@@ -4,7 +4,7 @@ Example of using [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.htm
using federated averaging ([FedAvg](https://arxiv.org/abs/1602.05629))
and [TensorFlow](https://tensorflow.org/) as the deep learning training framework.

> **_NOTE:_** This example uses the [MNIST](http://yann.lecun.com/exdb/mnist/) handwritten digits dataset and will load its data within the trainer code.
> **_NOTE:_** This example uses the [MNIST](https://www.tensorflow.org/datasets/catalog/mnist) handwritten digits dataset and will load its data within the trainer code.
See the [Hello TensorFlow](https://nvflare.readthedocs.io/en/main/examples/hello_tf_job_api.html#hello-tf-job-api) example documentation page for details on this
example.
@@ -48,7 +48,7 @@ In scenarios where multiple clients are involved, you have to prevent TensorFlow
by setting the following flags.

```bash
TF_FORCE_GPU_ALLOW_GROWTH=true TF_GPU_ALLOCATOR=cuda_malloc_async
TF_FORCE_GPU_ALLOW_GROWTH=true TF_GPU_ALLOCATOR=cuda_malloc_async python3 fedavg_script_runner_tf.py
```
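
The same behavior can also be requested from inside the training script, before TensorFlow initializes the GPU — a sketch, not part of this example:

```python
import os

# must be set before TensorFlow initializes CUDA
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

import tensorflow as tf  # import after setting the env vars

# the equivalent API call, per visible GPU:
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)
```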

If you possess more GPUs than clients, a good strategy is to run one client on each GPU.