From a9d3bb376918baaba5a9a358236414170a907ee5 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 6 Sep 2024 11:05:22 -0700 Subject: [PATCH] address comments --- .../FSDP_adavnced_tutorial.rst | 704 ------------- intermediate_source/FSDP_tutorial.rst | 447 -------- .../TCPStore_libuv_backend.rst | 286 ----- intermediate_source/TP_tutorial.rst | 361 ------- .../_torch_export_nightly_tutorial.py | 635 ----------- .../autograd_saved_tensors_hooks_tutorial.py | 514 --------- .../ax_multiobjective_nas_tutorial.py | 516 --------- .../char_rnn_classification_tutorial.py | 531 ---------- .../char_rnn_generation_tutorial.py | 433 -------- .../compiled_autograd_tutorial.py | 80 +- .../custom_function_conv_bn_tutorial.py | 394 ------- ...stom_function_double_backward_tutorial.rst | 301 ------ intermediate_source/ddp_series_minGPT.rst | 87 -- intermediate_source/ddp_series_multinode.rst | 94 -- intermediate_source/ddp_tutorial.rst | 375 ------- intermediate_source/dist_tuto.rst | 636 ----------- intermediate_source/dqn_with_rnn_tutorial.py | 468 --------- .../dynamic_quantization_bert_tutorial.rst | 568 ---------- intermediate_source/ensembling.py | 175 --- .../flask_rest_api_tutorial.py | 335 ------ ...ced_alignment_with_torchaudio_tutorial.rst | 11 - intermediate_source/forward_ad_usage.py | 246 ----- intermediate_source/fx_conv_bn_fuser.py | 262 ----- intermediate_source/fx_profiling_tutorial.py | 236 ----- intermediate_source/inductor_debug_cpu.py | 637 ----------- intermediate_source/jacobians_hessians.py | 349 ------ intermediate_source/mario_rl_tutorial.py | 791 -------------- intermediate_source/memory_format_tutorial.py | 389 ------- intermediate_source/mnist_train_nas.py | 171 --- .../model_parallel_tutorial.py | 357 ------- intermediate_source/neural_tangent_kernels.py | 251 ----- .../nvfuser_intro_tutorial.rst | 8 - .../optimizer_step_in_backward_tutorial.py | 268 ----- intermediate_source/parametrizations.py | 393 ------- intermediate_source/per_sample_grads.py | 225 ---- intermediate_source/pinmem_nonblock.py | 728 ------------- intermediate_source/pipelining_tutorial.rst | 236 ----- .../process_group_cpp_extension_tutorial.rst | 307 ------ intermediate_source/pruning_tutorial.py | 403 ------- .../quantized_transfer_learning_tutorial.rst | 516 --------- intermediate_source/realtime_rpi.rst | 345 ------ intermediate_source/reinforcement_ppo.py | 705 ------------- .../reinforcement_q_learning.py | 464 -------- intermediate_source/rpc_async_execution.rst | 524 --------- .../rpc_param_server_tutorial.rst | 386 ------- intermediate_source/rpc_tutorial.rst | 622 ----------- .../scaled_dot_product_attention_tutorial.py | 407 ------- .../seq2seq_translation_tutorial.py | 872 --------------- .../spatial_transformer_tutorial.py | 257 ----- .../speech_recognition_pipeline_tutorial.rst | 10 - .../tensorboard_profiler_tutorial.py | 501 --------- intermediate_source/tensorboard_tutorial.rst | 404 ------- .../text_to_speech_with_torchaudio.rst | 10 - intermediate_source/tiatoolbox_tutorial.rst | 994 ------------------ intermediate_source/torch_compile_tutorial.py | 606 ----------- .../torch_export_nightly_tutorial.rst | 858 --------------- intermediate_source/torch_export_tutorial.py | 768 -------------- intermediate_source/torchrec_tutorial.rst | 244 ----- intermediate_source/torchserve_with_ipex.rst | 394 ------- .../torchserve_with_ipex_2.rst | 447 -------- intermediate_source/torchvision_tutorial.py | 534 ---------- 61 files changed, 43 insertions(+), 25033 deletions(-) delete mode 100644 
intermediate_source/FSDP_adavnced_tutorial.rst delete mode 100644 intermediate_source/FSDP_tutorial.rst delete mode 100644 intermediate_source/TCPStore_libuv_backend.rst delete mode 100644 intermediate_source/TP_tutorial.rst delete mode 100644 intermediate_source/_torch_export_nightly_tutorial.py delete mode 100644 intermediate_source/autograd_saved_tensors_hooks_tutorial.py delete mode 100644 intermediate_source/ax_multiobjective_nas_tutorial.py delete mode 100644 intermediate_source/char_rnn_classification_tutorial.py delete mode 100644 intermediate_source/char_rnn_generation_tutorial.py delete mode 100644 intermediate_source/custom_function_conv_bn_tutorial.py delete mode 100644 intermediate_source/custom_function_double_backward_tutorial.rst delete mode 100644 intermediate_source/ddp_series_minGPT.rst delete mode 100644 intermediate_source/ddp_series_multinode.rst delete mode 100644 intermediate_source/ddp_tutorial.rst delete mode 100644 intermediate_source/dist_tuto.rst delete mode 100644 intermediate_source/dqn_with_rnn_tutorial.py delete mode 100644 intermediate_source/dynamic_quantization_bert_tutorial.rst delete mode 100644 intermediate_source/ensembling.py delete mode 100644 intermediate_source/flask_rest_api_tutorial.py delete mode 100644 intermediate_source/forced_alignment_with_torchaudio_tutorial.rst delete mode 100644 intermediate_source/forward_ad_usage.py delete mode 100644 intermediate_source/fx_conv_bn_fuser.py delete mode 100644 intermediate_source/fx_profiling_tutorial.py delete mode 100644 intermediate_source/inductor_debug_cpu.py delete mode 100644 intermediate_source/jacobians_hessians.py delete mode 100755 intermediate_source/mario_rl_tutorial.py delete mode 100644 intermediate_source/memory_format_tutorial.py delete mode 100644 intermediate_source/mnist_train_nas.py delete mode 100644 intermediate_source/model_parallel_tutorial.py delete mode 100644 intermediate_source/neural_tangent_kernels.py delete mode 100644 intermediate_source/nvfuser_intro_tutorial.rst delete mode 100644 intermediate_source/optimizer_step_in_backward_tutorial.py delete mode 100644 intermediate_source/parametrizations.py delete mode 100644 intermediate_source/per_sample_grads.py delete mode 100644 intermediate_source/pinmem_nonblock.py delete mode 100644 intermediate_source/pipelining_tutorial.rst delete mode 100644 intermediate_source/process_group_cpp_extension_tutorial.rst delete mode 100644 intermediate_source/pruning_tutorial.py delete mode 100644 intermediate_source/quantized_transfer_learning_tutorial.rst delete mode 100644 intermediate_source/realtime_rpi.rst delete mode 100644 intermediate_source/reinforcement_ppo.py delete mode 100644 intermediate_source/reinforcement_q_learning.py delete mode 100644 intermediate_source/rpc_async_execution.rst delete mode 100644 intermediate_source/rpc_param_server_tutorial.rst delete mode 100644 intermediate_source/rpc_tutorial.rst delete mode 100644 intermediate_source/scaled_dot_product_attention_tutorial.py delete mode 100755 intermediate_source/seq2seq_translation_tutorial.py delete mode 100644 intermediate_source/spatial_transformer_tutorial.py delete mode 100644 intermediate_source/speech_recognition_pipeline_tutorial.rst delete mode 100644 intermediate_source/tensorboard_profiler_tutorial.py delete mode 100644 intermediate_source/tensorboard_tutorial.rst delete mode 100644 intermediate_source/text_to_speech_with_torchaudio.rst delete mode 100644 intermediate_source/tiatoolbox_tutorial.rst delete mode 100644 
intermediate_source/torch_compile_tutorial.py delete mode 100644 intermediate_source/torch_export_nightly_tutorial.rst delete mode 100644 intermediate_source/torch_export_tutorial.py delete mode 100644 intermediate_source/torchrec_tutorial.rst delete mode 100644 intermediate_source/torchserve_with_ipex.rst delete mode 100644 intermediate_source/torchserve_with_ipex_2.rst delete mode 100644 intermediate_source/torchvision_tutorial.py diff --git a/intermediate_source/FSDP_adavnced_tutorial.rst b/intermediate_source/FSDP_adavnced_tutorial.rst deleted file mode 100644 index f7ee1e7de1..0000000000 --- a/intermediate_source/FSDP_adavnced_tutorial.rst +++ /dev/null @@ -1,704 +0,0 @@ -Advanced Model Training with Fully Sharded Data Parallel (FSDP) -=============================================================== - -**Author**: `Hamid Shojanazeri `__, `Less -Wright `__, `Rohan Varma -`__, `Yanli Zhao -`__ - - -This tutorial introduces more advanced features of Fully Sharded Data Parallel -(FSDP) as part of the PyTorch 1.12 release. To get familiar with FSDP, please -refer to the `FSDP getting started tutorial -`__. - -In this tutorial, we fine-tune a HuggingFace (HF) T5 model with FSDP for text -summarization as a working example. - -The example uses the WikiHow dataset and, for simplicity, we showcase the training on a -single node, a P4dn instance with 8 A100 GPUs. We will soon have a blog post on -large-scale FSDP training on a multi-node cluster; please stay tuned for that on -the PyTorch Medium channel. - -FSDP is a production-ready package with a focus on ease of use, performance, and -long-term support. One of the main benefits of FSDP is reducing the memory -footprint on each GPU. This enables training of larger models with lower total -memory vs DDP, and leverages the overlap of computation and communication to -train models efficiently. -This reduced memory pressure can be leveraged to either train larger models or -increase batch size, potentially helping overall training throughput. You can -read more about PyTorch FSDP `here -`__. - - -FSDP Features in This Tutorial ------------------------------- -* Transformer Auto Wrap Policy -* Mixed Precision -* Initializing FSDP Model on Device -* Sharding Strategy -* Backward Prefetch -* Model Checkpoint Saving via Streaming to CPU - - - -Recap on How FSDP Works ------------------------ - -At a high level, FSDP works as follows: - -*In constructor* - -* Shard model parameters so that each rank only keeps its own shard - -*In forward pass* - -* Run `all_gather` to collect all shards from all ranks to recover the full - parameter for this FSDP unit, then run the forward computation -* Discard the non-owned parameter shards it has just collected to free memory - -*In backward pass* - -* Run `all_gather` to collect all shards from all ranks to recover the full - parameter in this FSDP unit, then run the backward computation -* Discard non-owned parameters to free memory -* Run `reduce_scatter` to sync gradients - - -Fine-tuning HF T5 ------------------ -HF T5 pre-trained models are available in several sizes, ranging from -small with 60 million parameters to XXL with 11 billion parameters. In this -tutorial, we demonstrate the fine-tuning of a T5 3B model with FSDP for text -summarization using the WikiHow dataset. The main focus of this tutorial is to -highlight the different features available in FSDP that are helpful for training -large-scale models above 3B parameters. We also cover specific features for -Transformer-based models. 
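To make the recap above concrete, below is a minimal, self-contained sketch (not part of the original example) that wraps a tiny model in FSDP and runs one forward/backward step; the comments map each call to the constructor, forward, and backward behavior described in the recap. It assumes a ``torchrun`` launch (which sets ``RANK``, ``WORLD_SIZE``, and ``LOCAL_RANK``) and at least one CUDA GPU per process; the ``Sequential`` toy model and the ``demo`` function name are placeholders for illustration only.

.. code-block:: python

   import os
   import torch
   import torch.distributed as dist
   from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

   def demo():
       local_rank = int(os.environ.get("LOCAL_RANK", 0))
       torch.cuda.set_device(local_rank)
       dist.init_process_group("nccl")  # torchrun supplies RANK/WORLD_SIZE/MASTER_ADDR

       model = torch.nn.Sequential(
           torch.nn.Linear(1024, 1024), torch.nn.ReLU(), torch.nn.Linear(1024, 10)
       ).to(local_rank)
       # Constructor: parameters are flattened and sharded; each rank keeps only its shard.
       model = FSDP(model, device_id=torch.cuda.current_device())

       # Forward: each FSDP unit all-gathers its shards, runs the computation,
       # then frees the non-owned shards.
       out = model(torch.randn(8, 1024, device=local_rank))
       # Backward: shards are all-gathered again for the gradient computation,
       # then gradients are reduce-scattered across ranks.
       out.sum().backward()

       dist.destroy_process_group()

   if __name__ == "__main__":
       demo()

The T5 fine-tuning example that follows applies the same pattern at scale, adding the wrapping policy, mixed precision, and checkpointing features listed above.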
The code for this tutorial is available in `Pytorch -examples -`__. - - -*Setup* - -1.1 Install PyTorch Nightlies - -We will install PyTorch nightlies, as some of the features such as activation -checkpointing is available in nightlies and will be added in next PyTorch -release after 1.12. - -.. code-block:: bash - - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html - -1.2 Dataset Setup - -Please create a `data` folder, download the WikiHow dataset from `wikihowAll.csv -`__ and -`wikihowSep.cs `__, -and place them in the `data` folder. We will use the wikihow dataset from -`summarization_dataset -`__. - -Next, we add the following code snippets to a Python script “T5_training.py”. - -.. note:: - The full source code for this tutorial is available in `PyTorch examples - `__. - -1.3 Import necessary packages: - -.. code-block:: python - - import os - import argparse - import torch - import torch.nn as nn - import torch.nn.functional as F - import torch.optim as optim - from transformers import AutoTokenizer, GPT2TokenizerFast - from transformers import T5Tokenizer, T5ForConditionalGeneration - import functools - from torch.optim.lr_scheduler import StepLR - import torch.nn.functional as F - import torch.distributed as dist - import torch.multiprocessing as mp - from torch.nn.parallel import DistributedDataParallel as DDP - from torch.utils.data.distributed import DistributedSampler - from transformers.models.t5.modeling_t5 import T5Block - - from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - checkpoint_wrapper, - CheckpointImpl, - apply_activation_checkpointing_wrapper) - - from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - MixedPrecision, - BackwardPrefetch, - ShardingStrategy, - FullStateDictConfig, - StateDictType, - ) - from torch.distributed.fsdp.wrap import ( - transformer_auto_wrap_policy, - enable_wrap, - wrap, - ) - from functools import partial - from torch.utils.data import DataLoader - from pathlib import Path - from summarization_dataset import * - from transformers.models.t5.modeling_t5 import T5Block - from typing import Type - import time - import tqdm - from datetime import datetime - -1.4 Distributed training setup. -Here we use two helper functions to initialize the processes for distributed -training, and then to clean up after training completion. In this tutorial, we -are going to use torch elastic, using `torchrun -`__ , which will set the -worker `RANK` and `WORLD_SIZE` automatically. - -.. code-block:: python - - def setup(): - # initialize the process group - dist.init_process_group("nccl") - - def cleanup(): - dist.destroy_process_group() - -2.1 Set up the HuggingFace T5 model: - -.. code-block:: python - - def setup_model(model_name): - model = T5ForConditionalGeneration.from_pretrained(model_name) - tokenizer = T5Tokenizer.from_pretrained(model_name) - return model, tokenizer - -We also, add couple of helper functions here for date and formatting memory -metrics. - -.. 
code-block:: python - - def get_date_of_run(): - """create date and time for file save uniqueness - example: 2022-05-07-08:31:12_PM' - """ - date_of_run = datetime.now().strftime("%Y-%m-%d-%I:%M:%S_%p") - print(f"--> current date and time of run = {date_of_run}") - return date_of_run - - def format_metrics_to_gb(item): - """quick function to format numbers to gigabyte and round to 4 digit precision""" - metric_num = item / g_gigabyte - metric_num = round(metric_num, ndigits=4) - return metric_num - - -2.2 Define a train function: - -.. code-block:: python - - def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): - model.train() - local_rank = int(os.environ['LOCAL_RANK']) - fsdp_loss = torch.zeros(2).to(local_rank) - - if sampler: - sampler.set_epoch(epoch) - if rank==0: - inner_pbar = tqdm.tqdm( - range(len(train_loader)), colour="blue", desc="r0 Training Epoch" - ) - for batch in train_loader: - for key in batch.keys(): - batch[key] = batch[key].to(local_rank) - optimizer.zero_grad() - output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"] ) - loss = output["loss"] - loss.backward() - optimizer.step() - fsdp_loss[0] += loss.item() - fsdp_loss[1] += len(batch) - if rank==0: - inner_pbar.update(1) - - dist.all_reduce(fsdp_loss, op=dist.ReduceOp.SUM) - train_accuracy = fsdp_loss[0] / fsdp_loss[1] - - - if rank == 0: - inner_pbar.close() - print( - f"Train Epoch: \t{epoch}, Loss: \t{train_accuracy:.4f}" - ) - return train_accuracy - -2.3 Define a validation function: - -.. code-block:: python - - def validation(model, rank, world_size, val_loader): - model.eval() - correct = 0 - local_rank = int(os.environ['LOCAL_RANK']) - fsdp_loss = torch.zeros(3).to(local_rank) - if rank == 0: - inner_pbar = tqdm.tqdm( - range(len(val_loader)), colour="green", desc="Validation Epoch" - ) - with torch.no_grad(): - for batch in val_loader: - for key in batch.keys(): - batch[key] = batch[key].to(local_rank) - output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"]) - fsdp_loss[0] += output["loss"].item() # sum up batch loss - fsdp_loss[1] += len(batch) - - if rank==0: - inner_pbar.update(1) - - dist.all_reduce(fsdp_loss, op=dist.ReduceOp.SUM) - val_loss = fsdp_loss[0] / fsdp_loss[1] - if rank == 0: - inner_pbar.close() - print(f"Validation Loss: {val_loss:.4f}") - return val_loss - - -2.4 Define a distributed train function that wraps the model in FSDP: - - -.. 
code-block:: python - - - def fsdp_main(args): - - model, tokenizer = setup_model("t5-base") - - local_rank = int(os.environ['LOCAL_RANK']) - rank = int(os.environ['RANK']) - world_size = int(os.environ['WORLD_SIZE']) - - - dataset = load_dataset('wikihow', 'all', data_dir='data/') - print(dataset.keys()) - print("Size of train dataset: ", dataset['train'].shape) - print("Size of Validation dataset: ", dataset['validation'].shape) - - - #wikihow(tokenizer, type_path, num_samples, input_length, output_length, print_text=False) - train_dataset = wikihow(tokenizer, 'train', 1500, 512, 150, False) - val_dataset = wikihow(tokenizer, 'validation', 300, 512, 150, False) - - sampler1 = DistributedSampler(train_dataset, rank=rank, num_replicas=world_size, shuffle=True) - sampler2 = DistributedSampler(val_dataset, rank=rank, num_replicas=world_size) - - setup() - - - train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} - test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': False} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - - train_loader = torch.utils.data.DataLoader(train_dataset,**train_kwargs) - val_loader = torch.utils.data.DataLoader(val_dataset, **test_kwargs) - - t5_auto_wrap_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls={ - T5Block, - }, - ) - sharding_strategy: ShardingStrategy = ShardingStrategy.SHARD_GRAD_OP #for Zero2 and FULL_SHARD for Zero3 - torch.cuda.set_device(local_rank) - - - #init_start_event = torch.cuda.Event(enable_timing=True) - #init_end_event = torch.cuda.Event(enable_timing=True) - - #init_start_event.record() - - bf16_ready = ( - torch.version.cuda - and torch.cuda.is_bf16_supported() - and LooseVersion(torch.version.cuda) >= "11.0" - and dist.is_nccl_available() - and nccl.version() >= (2, 10) - ) - - if bf16_ready: - mp_policy = bfSixteen - else: - mp_policy = None # defaults to fp32 - - # model is on CPU before input to FSDP - model = FSDP(model, - auto_wrap_policy=t5_auto_wrap_policy, - mixed_precision=mp_policy, - #sharding_strategy=sharding_strategy, - device_id=torch.cuda.current_device()) - - optimizer = optim.AdamW(model.parameters(), lr=args.lr) - - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - best_val_loss = float("inf") - curr_val_loss = float("inf") - file_save_name = "T5-model-" - - if rank == 0: - time_of_run = get_date_of_run() - dur = [] - train_acc_tracking = [] - val_acc_tracking = [] - training_start_time = time.time() - - if rank == 0 and args.track_memory: - mem_alloc_tracker = [] - mem_reserved_tracker = [] - - for epoch in range(1, args.epochs + 1): - t0 = time.time() - train_accuracy = train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) - if args.run_validation: - curr_val_loss = validation(model, rank, world_size, val_loader) - scheduler.step() - - if rank == 0: - - print(f"--> epoch {epoch} completed...entering save and stats zone") - - dur.append(time.time() - t0) - train_acc_tracking.append(train_accuracy.item()) - - if args.run_validation: - val_acc_tracking.append(curr_val_loss.item()) - - if args.track_memory: - mem_alloc_tracker.append( - format_metrics_to_gb(torch.cuda.memory_allocated()) - ) - mem_reserved_tracker.append( - format_metrics_to_gb(torch.cuda.memory_reserved()) - ) - print(f"completed save and stats zone...") - - if args.save_model and curr_val_loss < best_val_loss: - - # save - if rank == 0: - print(f"--> 
entering save model state") - - save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FSDP.state_dict_type( - model, StateDictType.FULL_STATE_DICT, save_policy - ): - cpu_state = model.state_dict() - #print(f"saving process: rank {rank} done w state_dict") - - - if rank == 0: - print(f"--> saving model ...") - currEpoch = ( - "-" + str(epoch) + "-" + str(round(curr_val_loss.item(), 4)) + ".pt" - ) - print(f"--> attempting to save model prefix {currEpoch}") - save_name = file_save_name + "-" + time_of_run + "-" + currEpoch - print(f"--> saving as model name {save_name}") - - torch.save(cpu_state, save_name) - - if curr_val_loss < best_val_loss: - - best_val_loss = curr_val_loss - if rank==0: - print(f"-->>>> New Val Loss Record: {best_val_loss}") - - dist.barrier() - cleanup() - - -2.5 Parse the arguments and set the main function: - -.. code-block:: python - - - if __name__ == '__main__': - # Training settings - parser = argparse.ArgumentParser(description='PyTorch T5 FSDP Example') - parser.add_argument('--batch-size', type=int, default=4, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=4, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=2, metavar='N', - help='number of epochs to train (default: 3)') - parser.add_argument('--lr', type=float, default=.002, metavar='LR', - help='learning rate (default: .002)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--track_memory', action='store_false', default=True, - help='track the gpu memory') - parser.add_argument('--run_validation', action='store_false', default=True, - help='running the validation') - parser.add_argument('--save-model', action='store_false', default=True, - help='For Saving the current Model') - args = parser.parse_args() - - torch.manual_seed(args.seed) - - fsdp_main(args) - - -To run the the training using torchrun: - -.. code-block:: bash - - torchrun --nnodes 1 --nproc_per_node 4 T5_training.py - -.. _transformer_wrapping_policy: - -Transformer Wrapping Policy ---------------------------- - -As discussed in the `previous tutorial -`__, -auto_wrap_policy is one of the FSDP features that make it easy to automatically -shard a given model and put the model, optimizer and gradient shards into -distinct FSDP units. - -For some architectures such as Transformer encoder-decoders, some parts of the -model such as embedding table is being shared with both encoder and decoder. In -this case, we need to place the embedding table in the outer FSDP unit so that -it could be accessed from both encoder and decoder. In addition, by registering -the layer class for a transformer, the sharding plan can be made much more -communication efficient. In PyTorch 1.12, FSDP added this support and now we -have a wrapping policy for transfomers. - -It can be created as follows, where the T5Block represents the T5 transformer -layer class (holding MHSA and FFN). - - -.. 
code-block:: python - - t5_auto_wrap_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls={ - T5Block, - }, - ) - torch.cuda.set_device(local_rank) - - - model = FSDP(model, - auto_wrap_policy=t5_auto_wrap_policy) - -To see the wrapped model, you can easily print the model and visually inspect -the sharding and FSDP units as well. - - -Mixed Precision ---------------- -FSDP supports flexible mixed precision training allowing for arbitrary reduced -precision types (such as fp16 or bfloat16). Currently BFloat16 is only available -on Ampere GPUs, so you need to confirm native support before you use it. On -V100s for example, BFloat16 can still be run but due to it running non-natively, -it can result in significant slowdowns. - -To check if BFloat16 is natively supported, you can use the following : - -.. code-block:: python - - bf16_ready = ( - torch.version.cuda - and torch.cuda.is_bf16_supported() - and LooseVersion(torch.version.cuda) >= "11.0" - and dist.is_nccl_available() - and nccl.version() >= (2, 10) - ) - -One of the advantages of mixed percision in FSDP is providing granular control -over different precision levels for parameters, gradients, and buffers as -follows: - -.. code-block:: python - - fpSixteen = MixedPrecision( - param_dtype=torch.float16, - # Gradient communication precision. - reduce_dtype=torch.float16, - # Buffer precision. - buffer_dtype=torch.float16, - ) - - bfSixteen = MixedPrecision( - param_dtype=torch.bfloat16, - # Gradient communication precision. - reduce_dtype=torch.bfloat16, - # Buffer precision. - buffer_dtype=torch.bfloat16, - ) - - fp32_policy = MixedPrecision( - param_dtype=torch.float32, - # Gradient communication precision. - reduce_dtype=torch.float32, - # Buffer precision. - buffer_dtype=torch.float32, - ) - -Note that if a certain type (parameter, reduce, buffer) is not specified, they -will not be casted at all. - -This flexibility allows users fine grained control, such as only setting -gradient communication to happen in reduced precision, and all parameters / -buffer computation to be done in full precision. This is potentially useful in -cases where intra-node communication is the main bottleneck and parameters / -buffers must be in full precision to avoid accuracy issues. This can be done -with the following policy: - -.. code-block:: bash - - grad_bf16 = MixedPrecision(reduce_dtype=torch.bfloat16) - - -In 2.4 we just add the relevant mixed precision policy to the FSDP wrapper: - - -.. code-block:: python - - model = FSDP(model, - auto_wrap_policy=t5_auto_wrap_policy, - mixed_precision=bfSixteen) - -In our experiments, we have observed up to 4x speed up by using BFloat16 for -training and memory reduction of approximately 30% in some experiments that can -be used for batch size increases. - - -Intializing FSDP Model on Device --------------------------------- -In 1.12, FSDP supports a `device_id` argument meant to initialize input CPU -module on the device given by `device_id`. This is useful when the entire model -does not fit on a single GPU, but fits in a host's CPU memory. When `device_id` -is specified, FSDP will move the model to the specified device on a per-FSDP -unit basis, avoiding GPU OOM issues while initializing several times faster than -CPU-based initialization: - -.. 
code-block:: python - - torch.cuda.set_device(local_rank) - - model = FSDP(model, - auto_wrap_policy=t5_auto_wrap_policy, - mixed_precision=bfSixteen, - device_id=torch.cuda.current_device()) - - - -Sharding Strategy ------------------ -By default, the FSDP sharding strategy fully shards the model: parameters, -gradients, and optimizer states are sharded across all ranks (also termed Zero3 -sharding). If you are interested in the Zero2 sharding strategy, where -only optimizer states and gradients are sharded, FSDP supports this feature by -passing "ShardingStrategy.SHARD_GRAD_OP", -instead of "ShardingStrategy.FULL_SHARD", to the FSDP initialization as follows: - .. code-block:: python - - torch.cuda.set_device(local_rank) - - model = FSDP(model, - auto_wrap_policy=t5_auto_wrap_policy, - mixed_precision=bfSixteen, - device_id=torch.cuda.current_device(), - sharding_strategy=ShardingStrategy.SHARD_GRAD_OP) # ZERO2 - -This will reduce the communication overhead in FSDP: in this case, full -parameters are kept after the forward pass and through the backward pass. - -This saves an all_gather during the backward pass, so there is less communication at the -cost of a higher memory footprint. Note that the full model parameters are freed at the -end of the backward pass, and the all_gather will happen again on the next forward pass. - -Backward Prefetch ------------------ -The backward prefetch setting controls the timing of when the next FSDP unit's -parameters should be requested. By setting it to `BACKWARD_PRE`, the next -FSDP unit's parameters can begin to be requested and arrive sooner, before the -computation of the current unit starts. This overlaps the `all_gather` -communication and gradient computation, which can increase the training speed in -exchange for slightly higher memory consumption. It can be utilized in the FSDP -wrapper in 2.4 as follows: - .. code-block:: python - - torch.cuda.set_device(local_rank) - - model = FSDP(model, - auto_wrap_policy=t5_auto_wrap_policy, - mixed_precision=bfSixteen, - device_id=torch.cuda.current_device(), - backward_prefetch=BackwardPrefetch.BACKWARD_PRE) - -`backward_prefetch` has two modes, `BACKWARD_PRE` and `BACKWARD_POST`. -`BACKWARD_POST` means that the next FSDP unit's parameters will not be requested -until the current FSDP unit's processing is complete, thus minimizing memory -overhead. In some cases, using `BACKWARD_PRE` can increase model training speed by -up to 2-10%, with even higher speed improvements noted for larger models. - -Model Checkpoint Saving by Streaming to the Rank0 CPU ------------------------------------------------------- -To save model checkpoints using FULL_STATE_DICT saving, which saves the model in the -same fashion as a local model, PyTorch 1.12 offers a few utilities to support -the saving of larger models. - -First, a FullStateDictConfig can be specified, allowing the state_dict to be -populated on rank 0 only and offloaded to the CPU. - -When using this configuration, FSDP will all-gather the model parameters, offloading -them to the CPU one by one, only on rank 0. When the state_dict is finally -saved, it will only be populated on rank 0 and contain CPU tensors. This avoids -potential OOM for models that are larger than a single GPU's memory and allows -users to checkpoint models whose size is roughly the available CPU RAM on the -user's machine. - -This feature can be run as follows: - - ..
code-block:: python - - save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FSDP.state_dict_type( - model, StateDictType.FULL_STATE_DICT, save_policy - ): - cpu_state = model.state_dict() - if rank == 0: - save_name = file_save_name + "-" + time_of_run + "-" + currEpoch - torch.save(cpu_state, save_name) - -Summary -------- - -In this tutorial, we have introduced many new features for FSDP available in -Pytorch 1.12 and used HF T5 as the running example. Using the proper wrapping -policy especially for transformer models, along with mixed precision and -backward prefetch should speed up your training runs. Also, features such as -initializing the model on device, and checkpoint saving via streaming to CPU -should help to avoid OOM error in dealing with large models. - -We are actively working to add new features to FSDP for the next release. If -you have feedback, feature requests, questions or are encountering issues -using FSDP, please feel free to contact us by opening an issue in the -`PyTorch Github repository `__. diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst deleted file mode 100644 index 9b9845667f..0000000000 --- a/intermediate_source/FSDP_tutorial.rst +++ /dev/null @@ -1,447 +0,0 @@ -Getting Started with Fully Sharded Data Parallel(FSDP) -====================================================== - -**Author**: `Hamid Shojanazeri `__, `Yanli Zhao `__, `Shen Li `__ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -Training AI models at a large scale is a challenging task that requires a lot of compute power and resources. -It also comes with considerable engineering complexity to handle the training of these very large models. -`PyTorch FSDP `__, released in PyTorch 1.11 makes this easier. - -In this tutorial, we show how to use `FSDP APIs `__, for simple MNIST models that can be extended to other larger models such as `HuggingFace BERT models `__, -`GPT 3 models up to 1T parameters `__ . The sample DDP MNIST code has been borrowed from `here `__. - - -How FSDP works --------------- -In `DistributedDataParallel `__, (DDP) training, each process/ worker owns a replica of the model and processes a batch of data, finally it uses all-reduce to sum up gradients over different workers. In DDP the model weights and optimizer states are replicated across all workers. FSDP is a type of data parallelism that shards model parameters, optimizer states and gradients across DDP ranks. - -When training with FSDP, the GPU memory footprint is smaller than when training with DDP across all workers. This makes the training of some very large models feasible by allowing larger models or batch sizes to fit on device. This comes with the cost of increased communication volume. The communication overhead is reduced by internal optimizations like overlapping communication and computation. - -.. 
figure:: /_static/img/distributed/fsdp_workflow.png - :width: 100% - :align: center - :alt: FSDP workflow - - FSDP Workflow - -At a high level FSDP works as follow: - -*In constructor* - -* Shard model parameters and each rank only keeps its own shard - -*In forward path* - -* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit -* Run forward computation -* Discard parameter shards it has just collected - -*In backward path* - -* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit -* Run backward computation -* Run reduce_scatter to sync gradients -* Discard parameters. - -One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards. - -.. figure:: /_static/img/distributed/fsdp_sharding.png - :width: 100% - :align: center - :alt: FSDP allreduce - - FSDP Allreduce - -How to use FSDP ---------------- -Here we use a toy model to run training on the MNIST dataset for demonstration purposes. The APIs and logic can be applied to training larger models as well. - -*Setup* - -1.1 Install PyTorch along with Torchvision - -See the `Get Started guide `__ for information on installation. - -We add the following code snippets to a python script “FSDP_mnist.py”. - -1.2 Import necessary packages - -.. note:: - This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy` and `fsdp_auto_wrap_policy` with `auto_wrap_policy`. - -.. code-block:: python - - # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py - import os - import argparse - import functools - import torch - import torch.nn as nn - import torch.nn.functional as F - import torch.optim as optim - from torchvision import datasets, transforms - - - from torch.optim.lr_scheduler import StepLR - - import torch.distributed as dist - import torch.multiprocessing as mp - from torch.nn.parallel import DistributedDataParallel as DDP - from torch.utils.data.distributed import DistributedSampler - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel import ( - CPUOffload, - BackwardPrefetch, - ) - from torch.distributed.fsdp.wrap import ( - size_based_auto_wrap_policy, - enable_wrap, - wrap, - ) - -1.3 Distributed training setup. As we mentioned FSDP is a type of data parallelism which requires a distributed training environment, so here we use two helper functions to initialize the processes for distributed training and clean up. - -.. code-block:: python - - def setup(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - - # initialize the process group - dist.init_process_group("nccl", rank=rank, world_size=world_size) - - def cleanup(): - dist.destroy_process_group() - -2.1 Define our toy model for handwritten digit classification. - -.. 
code-block:: python - - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - -2.2 Define a train function - -.. code-block:: python - - def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): - model.train() - ddp_loss = torch.zeros(2).to(rank) - if sampler: - sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(rank), target.to(rank) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target, reduction='sum') - loss.backward() - optimizer.step() - ddp_loss[0] += loss.item() - ddp_loss[1] += len(data) - - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - if rank == 0: - print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) - -2.3 Define a validation function - -.. code-block:: python - - def test(model, rank, world_size, test_loader): - model.eval() - correct = 0 - ddp_loss = torch.zeros(3).to(rank) - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(rank), target.to(rank) - output = model(data) - ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() - ddp_loss[2] += len(data) - - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - - if rank == 0: - test_loss = ddp_loss[0] / ddp_loss[2] - print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( - test_loss, int(ddp_loss[1]), int(ddp_loss[2]), - 100. * ddp_loss[1] / ddp_loss[2])) - -2.4 Define a distributed train function that wraps the model in FSDP - -**Note: to save the FSDP model, we need to call the state_dict on each rank then on Rank 0 save the overall states.** - -.. 
code-block:: python - - def fsdp_main(rank, world_size, args): - setup(rank, world_size) - - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - - dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('../data', train=False, - transform=transform) - - sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) - sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) - - train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} - test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': False} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) - test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=100 - ) - torch.cuda.set_device(rank) - - - init_start_event = torch.cuda.Event(enable_timing=True) - init_end_event = torch.cuda.Event(enable_timing=True) - - model = Net().to(rank) - - model = FSDP(model) - - optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - init_start_event.record() - for epoch in range(1, args.epochs + 1): - train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) - test(model, rank, world_size, test_loader) - scheduler.step() - - init_end_event.record() - - if rank == 0: - print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") - print(f"{model}") - - if args.save_model: - # use a barrier to make sure training is done on all ranks - dist.barrier() - states = model.state_dict() - if rank == 0: - torch.save(states, "mnist_cnn.pt") - - cleanup() - - - -2.5 Finally, parse the arguments and set the main function - -.. code-block:: python - - if __name__ == '__main__': - # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') - args = parser.parse_args() - - torch.manual_seed(args.seed) - - WORLD_SIZE = torch.cuda.device_count() - mp.spawn(fsdp_main, - args=(WORLD_SIZE, args), - nprocs=WORLD_SIZE, - join=True) - - -We have recorded cuda events to measure the time of FSDP model specifics. The CUDA event time was 110.85 seconds. - -.. 
code-block:: bash - - python FSDP_mnist.py - - CUDA event elapsed time on training loop 40.67462890625sec - -Wrapping the model with FSDP, the model will look as follows, we can see the model has been wrapped in one FSDP unit. -Alternatively, we will look at adding the auto_wrap_policy next and will discuss the differences. - -.. code-block:: bash - - FullyShardedDataParallel( - (_fsdp_wrapped_module): FlattenParamsWrapper( - (_fpw_module): Net( - (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) - (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) - (dropout1): Dropout(p=0.25, inplace=False) - (dropout2): Dropout(p=0.5, inplace=False) - (fc1): Linear(in_features=9216, out_features=128, bias=True) - (fc2): Linear(in_features=128, out_features=10, bias=True) - ) - ) - ) - -The following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. - - -.. figure:: /_static/img/distributed/FSDP_memory.gif - :width: 100% - :align: center - :alt: FSDP peak memory - - FSDP Peak Memory Usage - -Applying *auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency. -The way it works is that, suppose your model contains 100 Linear layers. If you do FSDP(model), there will only be one FSDP unit which wraps the entire model. -In that case, the allgather would collect the full parameters for all 100 linear layers, and hence won't save CUDA memory for parameter sharding. -Also, there is only one blocking allgather call for the all 100 linear layers, there will not be communication and computation overlapping between layers. - -To avoid that, you can pass in an auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit). -In that way you will have multiple FSDP units, and only one FSDP unit needs to collect full parameters at a time. E.g., suppose you have 5 FSDP units, and each wraps 20 linear layers. -Then, in the forward, the 1st FSDP unit will allgather parameters for the first 20 linear layers, do computation, discard the parameters and then move on to the next 20 linear layers. So, at any point in time, each rank only materializes parameters/grads for 20 linear layers instead of 100. - - -To do so in 2.4 we define the auto_wrap_policy and pass it to FSDP wrapper, in the following example, my_auto_wrap_policy defines that a layer could be wrapped or sharded by FSDP if the number of parameters in this layer is larger than 100. -If the number of parameters in this layer is smaller than 100, it will be wrapped with other small layers together by FSDP. -Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning for this config in the future. Without an auto tuning tool, it is good to profile your workflow using different auto wrap policies experimentally and find the optimal one. - -.. code-block:: python - - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=20000 - ) - torch.cuda.set_device(rank) - model = Net().to(rank) - - model = FSDP(model, - auto_wrap_policy=my_auto_wrap_policy) - -Applying the auto_wrap_policy, the model would be as follows: - -.. 
code-block:: bash - - FullyShardedDataParallel( - (_fsdp_wrapped_module): FlattenParamsWrapper( - (_fpw_module): Net( - (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) - (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) - (dropout1): Dropout(p=0.25, inplace=False) - (dropout2): Dropout(p=0.5, inplace=False) - (fc1): FullyShardedDataParallel( - (_fsdp_wrapped_module): FlattenParamsWrapper( - (_fpw_module): Linear(in_features=9216, out_features=128, bias=True) - ) - ) - (fc2): Linear(in_features=128, out_features=10, bias=True) - ) - ) - - -.. code-block:: bash - - python FSDP_mnist.py - - CUDA event elapsed time on training loop 41.89130859375sec - -The following is the peak memory usage from FSDP with auto_wrap policy of MNIST training on a g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. -It can be observed that the peak memory usage on each device is smaller compared to FSDP without auto wrap policy applied, from ~75 MB to 66 MB. - -.. figure:: /_static/img/distributed/FSDP_autowrap.gif - :width: 100% - :align: center - :alt: FSDP peak memory - - FSDP Peak Memory Usage using Auto_wrap policy - -*CPU Off-loading*: In case the model is very large that even with FSDP wouldn't fit into GPUs, then CPU offload can be helpful here. - -Currently, only parameter and gradient CPU offload is supported. It can be enabled via passing in cpu_offload=CPUOffload(offload_params=True). - -Note that this currently implicitly enables gradient offloading to CPU in order for params and grads to be on the same device to work with the optimizer. This API is subject to change. The default is None in which case there will be no offloading. - -Using this feature may slow down the training considerably, due to frequent copying of tensors from host to device, but it could help improve memory efficiency and train larger scale models. - -In 2.4 we just add it to the FSDP wrapper - - -.. code-block:: python - - model = FSDP(model, - auto_wrap_policy=my_auto_wrap_policy, - cpu_offload=CPUOffload(offload_params=True)) - - -Compare it with DDP, if in 2.4 we just normally wrap the model in DPP, saving the changes in “DDP_mnist.py”. - -.. code-block:: python - - model = Net().to(rank) - model = DDP(model) - - -.. code-block:: bash - - python DDP_mnist.py - - CUDA event elapsed time on training loop 39.77766015625sec - -The following is the peak memory usage from DDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch profiler. - -.. figure:: /_static/img/distributed/DDP_memory.gif - :width: 100% - :align: center - :alt: FSDP peak memory - - DDP Peak Memory Usage using Auto_wrap policy - - -Considering the toy example and tiny MNIST model we defined here, we can observe the difference between peak memory usage of DDP and FSDP. -In DDP each process holds a replica of the model, so the memory footprint is higher compared to FSDP which shards the model parameters, optimizer states and gradients over DDP ranks. -The peak memory usage using FSDP with auto_wrap policy is the lowest followed by FSDP and DDP. - -Also, looking at timings, considering the small model and running the training on a single machine, FSDP with and without auto_wrap policy performed almost as fast as DDP. -This example does not represent most of the real applications, for detailed analysis and comparison between DDP and FSDP please refer to this `blog post `__ . 
diff --git a/intermediate_source/TCPStore_libuv_backend.rst b/intermediate_source/TCPStore_libuv_backend.rst deleted file mode 100644 index 1e285eba7c..0000000000 --- a/intermediate_source/TCPStore_libuv_backend.rst +++ /dev/null @@ -1,286 +0,0 @@ -Introduction to Libuv TCPStore Backend -====================================== -**Authors**: `Xilun Wu `_ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -.. grid:: 2 - - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - * What is the new TCPStore backend - * Compare the new libuv backend against the legacy backend - * How to enable to use the legacy backend - - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - * PyTorch 2.4 or later - * Read about the `TCPStore API `__. - - -Introduction ------------- - -Recently, we have rolled out a new TCPStore server backend using `libuv `__, a third-party library for asynchronous I/O. This new server backend aims to -address scalability and robustness challenges in large-scale distributed training jobs, such as those with more than 1024 ranks. We ran a series of -benchmarks to compare the libuv backend against the old one, and the experiment results demonstrated significant improvements in store initialization -time and maintained a comparable performance in store I/O operations. - -As a result of these findings, the libuv backend has been set as the default TCPStore server backend in PyTorch 2.4. This change is expected to enhance -the performance and scalability of distributed training jobs. - -This change introduces a slight incompatibility to store initialization. For users who wish to continue using the legacy backend, the tutorial will -provide guidance on how to specify to use the previous TCPStore server backend. - - -Performance Benchmark ---------------------- - -To better demonstrate the benefit of our new libuv TCPStore backend, we set up a benchmark over a wide range of job size, from 1024 (1K) to 98304 (96K) ranks. -We first measured the TCPStore initialization time using the code snippet below: - -.. code:: python - - import logging - import os - - from time import perf_counter - - import torch - import torch.distributed as dist - - logger: logging.Logger = logging.getLogger(__name__) - - # Env var are preset when launching the benchmark - env_rank = os.environ.get("RANK", 0) - env_world_size = os.environ.get("WORLD_SIZE", 1) - env_master_addr = os.environ.get("MASTER_ADDR", "localhost") - env_master_port = os.environ.get("MASTER_PORT", "23456") - - start = perf_counter() - tcp_store = dist.TCPStore( - env_master_addr, - int(env_master_port), - world_size=int(env_world_size), - is_master=(int(env_rank) == 0), - ) - end = perf_counter() - time_elapsed = end - start - logger.info( - f"Complete TCPStore init with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." - ) - -Since the execution of the TCPStore server thread will be blocked until all clients are successfully connected, we take the time measured on rank 0 as the total -TCPStore initialization runtime. The experiment numbers are reported in the figure below: - -.. figure:: /_static/img/distributed/tcpstore_init_time.png - :width: 100% - :align: center - :alt: TCPStore Initialization Runtime Benchmark Result - -Figure 1. 
shows some significant evidence that the libuv backend is superior to the legacy backend: - -- TCPStore with libuv backend always has a faster initialization than the legacy backend, especially at super-large scale -- The legacy backend would timeout at server-client connecting at 96K scale (for example, over 30 minutes) while the libuv backend completed the initialization in 100 seconds. - -The second benchmark we did is to measure the runtime of TCPStore ``store_based_barrier`` operation: - -.. code:: python - - import logging - import os - import time - - from datetime import timedelta - from time import perf_counter - - import torch - import torch.distributed as dist - - DistStoreError = torch._C._DistStoreError - logger: logging.Logger = logging.getLogger(__name__) - - # since dist._store_based_barrier is a private function and cannot be directly called, we need to write a function which does the same - def store_based_barrier( - rank, - store, - group_name, - rendezvous_count, - timeout=dist.constants.default_pg_timeout, - logging_interval=timedelta(seconds=10), - ): - store_key = f"store_based_barrier_key:{group_name}" - store.add(store_key, 1) - - world_size = rendezvous_count - worker_count = store.add(store_key, 0) - - last_worker_key = f"{store_key}:last_worker" - if worker_count == world_size: - store.set(last_worker_key, "1") - - start = time.time() - while True: - try: - # This will throw an exception after the logging_interval in which we print out - # the status of the group or time out officially, throwing runtime error - store.wait([last_worker_key], logging_interval) - break - except RuntimeError as e: - worker_count = store.add(store_key, 0) - # Print status periodically to keep track. - logger.info( - "Waiting in store based barrier to initialize process group for " - "rank: %s, key: %s (world_size=%s, num_workers_joined=%s, timeout=%s)" - "error: %s", - rank, - store_key, - world_size, - worker_count, - timeout, - e, - ) - - if timedelta(seconds=(time.time() - start)) > timeout: - raise DistStoreError( - "Timed out initializing process group in store based barrier on " - "rank {}, for key: {} (world_size={}, num_workers_joined={}, timeout={})".format( - rank, store_key, world_size, worker_count, timeout - ) - ) - - logger.info( - "Rank %s: Completed store-based barrier for key:%s with %s nodes.", - rank, - store_key, - world_size, - ) - - # Env var are preset when launching the benchmark - env_rank = os.environ.get("RANK", 0) - env_world_size = os.environ.get("WORLD_SIZE", 1) - env_master_addr = os.environ.get("MASTER_ADDR", "localhost") - env_master_port = os.environ.get("MASTER_PORT", "23456") - - tcp_store = dist.TCPStore( - env_master_addr, - int(env_master_port), - world_size=int(env_world_size), - is_master=(int(env_rank) == 0), - ) - - # sync workers - store_based_barrier(int(env_rank), tcp_store, "tcpstore_test", int(env_world_size)) - - number_runs = 10 - start = perf_counter() - for _ in range(number_runs): - store_based_barrier( - int(env_rank), tcp_store, "tcpstore_test", int(env_world_size) - ) - end = perf_counter() - time_elapsed = end - start - logger.info( - f"Complete {number_runs} TCPStore barrier runs with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." - ) - -We compute the average by dividing the runtime measured on rank 0 by ``number_runs`` and report it in the figure below: - -.. 
figure:: /_static/img/distributed/tcpstore_barrier_time.png - :width: 100% - :align: center - :alt: TCPStore Barrier Runtime Benchmark Result - -Figure 2. shows that the I/O performance of libuv backend is comparable to the legacy backend: - -- The libuv backend has a comparable performance over the whole spectrum in terms of the number of ranks -- The libuv backend runtime is more stable than the legacy backend as the number of ranks grows - - -Impact ------- - -One incompatibility that users may need to pay attention is, TCPStore currently does not support initialization with a ``listen_fd`` when using libuv backend. -If the user wants to keep using this initialization method, the user can simply pass ``use_libuv=False`` to stay with the old TCPStore backend. - -.. code:: python - - import socket - - import torch - import torch.distributed as dist - - listen_sock: socket.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - listen_sock.bind(("localhost", 0)) - addr, port, *_ = listen_sock.getsockname() - listen_fd = listen_sock.detach() - - tcpstore = dist.TCPStore(addr, port, 1, True, master_listen_fd=listen_fd) # expect NotImplementedError - tcpstore = dist.TCPStore(addr, port, 1, True, master_listen_fd=listen_fd, use_libuv=False) # OK. Use legacy backend - - -Exit Route 1: Pass ``use_libuv=False`` to TCPStore Initialization ------------------------------------------------------------------ - -As the above code snippet shows, if user calls TCPStore init method to create a store, simply passing ``use_libuv=False`` allows user to remain using the old -TCPStore backend. This override has the highest priority over other approaches determining which backend the TCPStore server should choose. - - -Exit Route 2: Add ``use_libuv=0`` to ``init_method`` at ProcessGroup Initialization ------------------------------------------------------------------------------------ - -``ProcessGroup`` creates a TCPStore if user does not explicitly pass one to its initialization. User can add the query option ``use_libuv=0`` to ``init_method`` when -initializing the ``ProcessGroup``. This approach has lower priority than Exit Route 1. - -.. code:: python - - import torch - import torch.distributed as dist - - addr = "localhost" - port = 23456 - dist.init_process_group( - backend="cpu:gloo,cuda:nccl", - rank=0, - world_size=1, - init_method=f"tcp://{addr}:{port}?use_libuv=0", - ) - dist.destroy_process_group() - - -Exit Route 3: Set Environment Variable ``USE_LIBUV`` to ``0`` -------------------------------------------------------------- - -When ProcessGroup creates a TCPStore, it also checks the environment vairable ``USE_LIBUV`` to determine which TCPStore backend to use. User can set the environment -variable ``"USE_LIBUV"`` to ``"0"`` to specify the use of old TCPStore backend. This approach has lower priority than Exit Route 2, for example, if the user sets environment -variable ``USE_LIBUV`` to ``1`` and also passes ``use_libuv=0`` in ``init_method``, then the old store backend will be chosen. - -.. code:: python - - import os - - import torch - import torch.distributed as dist - - addr = "localhost" - port = 23456 - os.environ["USE_LIBUV"] = "0" - dist.init_process_group( - backend="cpu:gloo,cuda:nccl", - rank=0, - world_size=1, - init_method=f"tcp://{addr}:{port}", - ) - dist.destroy_process_group() - - -Conclusion ----------- -In PyTorch 2.4, we made the new libuv TCPStore backend the default. 
Although the new backend is incompatible with initialization from a ``listen_fd``, it -shows a significant performance improvement in store initialization at large scale and comparable performance on store I/O at small/medium/large scales, which -brings a major benefit to the control plane of distributed training. This tutorial explains our motivation, goes through the performance benchmark, notifies users -of the potential impact, and introduces three exit routes for remaining on the legacy backend. In the long term, we aim to eventually deprecate the legacy backend. diff --git a/intermediate_source/TP_tutorial.rst b/intermediate_source/TP_tutorial.rst deleted file mode 100644 index 91e64a8748..0000000000 --- a/intermediate_source/TP_tutorial.rst +++ /dev/null @@ -1,361 +0,0 @@ -Large Scale Transformer model training with Tensor Parallel (TP) -================================================================= - -**Author**: `Wanchao Liang `__, `Tianyu Liu `__ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -This tutorial demonstrates how to train a large Transformer-like model across hundreds to thousands of GPUs using Tensor Parallel and Fully Sharded Data Parallel. - -Prerequisites: - -- PyTorch 2.3.0 or later installed with CUDA/Linux -- `Tensor Parallel APIs `__ -- `Getting Started with DeviceMesh `__ -- `Getting Started with Fully Sharded Data Parallel `__ - - -How does Tensor Parallel work? ------------------------------- -Tensor Parallel (TP) was originally proposed in the `Megatron-LM `__ paper, -and it is an efficient model parallelism technique for training large-scale Transformer models. -`Sequence Parallel `__ (SP), which we mention in this tutorial, is a variant of Tensor -Parallel that shards on the sequence dimension for ``nn.LayerNorm`` or ``RMSNorm`` to further save activation memory -during training. As the model becomes larger, the activation memory becomes the bottleneck, so Tensor -Parallel training usually applies Sequence Parallel to the ``LayerNorm`` or ``RMSNorm`` layers. - -.. figure:: /_static/img/distributed/megatron_lm.png - :width: 100% - :align: center - :alt: Megatron-LM TP - - Figure 1. The sharding in Tensor Parallel style on a Transformer model’s MLP and Self-Attention layer, where the matrix multiplications in both attention/MLP happen through sharded computations (`image source `__) - - -At a high level, PyTorch Tensor Parallel works as follows: - -**Sharding initialization** - -* Determine which ``ParallelStyle`` to apply to each layer and shard the initialized module by calling ``parallelize_module``. -* The parallelized modules have their model parameters swapped to DTensors, and DTensor is responsible for running the parallelized module using sharded computation. - -**Runtime forward/backward** - -* Depending on the input/output DTensor layouts the user specified for each ``ParallelStyle``, it runs the proper communication operations to transform the DTensor layouts of the inputs/outputs (such as ``allreduce``, ``allgather``, and ``reduce_scatter``). -* Run sharded computation for the parallelized layers to save compute/memory (for example, ``nn.Linear``, ``nn.Embedding``). - - -When and Why you should apply Tensor Parallel ---------------------------------------------- -The PyTorch Fully Sharded Data Parallel (FSDP) already has the capability to scale model training to a certain -number of GPUs. 
However, when it comes to further scale the model training in terms of model size and GPU quantity, -many additional challenges arise that may require combining Tensor Parallel with FSDP.: - -1. As the world size (number of GPUs) is becoming excessively large (exceeding 128/256 GPUs), the FSDP collectives (such as ``allgather``) are being dominated by ring latency. - By implementing TP/SP on top of FSDP, the FSDP world size could be reduced by 8 by applying FSDP to be inter-host only, consequently decreasing the latency costs by the same amount. -2. Hit data parallelism limit where you can not raise the global batch size to be above the number of GPUs due to both convergence and GPU memory limitations, Tensor/Sequence Parallel - is the only known way to “ballpark” the global batch size and continue scaling with more GPUs. This means both model size and number of GPUs could continue to scale. -3. For certain types of models, when local batch size becomes smaller, TP/SP can yield matrix multiplication shapes that are more optimized for floating point operations (FLOPS). - -So, when pre-training, how easy is it to hit those limits? As of now, pre-training a Large Language Model (LLM) with billions or trillions of tokens could take months, even when using thousands of GPUs. - -* It will always hit limitation 1 when training LLM on a large scale. For example, Llama 2 70B trained with 2k GPUs for 35 days, multi-dimensional parallelisms are needed at 2k scale. -* When the Transformer model becomes larger (such as Llama2 70B), it will also quickly hit the limitation 2. One could not use FSDP alone with even local ``batch_size=1`` due to memory - and convergence constraints. For example, Llama 2 global batch size is 1K, so data parallelism alone can not be used at 2K GPUs. - - -How to apply Tensor Parallel ----------------------------- - -PyTorch Tensor Parallel APIs offers a set of module level primitives (``ParallelStyle``) to configure the sharding for each individual layers of the model, including: - -* ``ColwiseParallel`` and ``RowwiseParallel``: Shard the ``nn.Linear`` and ``nn.Embedding`` in the column or row fashion. -* ``SequenceParallel``: Perform sharded computations on ``nn.LayerNorm``, ``nn.Dropout``, ``RMSNormPython``, etc. -* ``PrepareModuleInput`` and ``PrepareModuleOutput``: Configure the module inputs/outputs sharding layouts with proper communication operations. - -To demonstrate how to use the PyTorch native Tensor Parallel APIs, let us look at a common Transformer model. In this tutorial, we use the most recent `Llama2 model `__ as a reference Transformer model implementation, as it is also widely used in the community. - -Since Tensor Parallel shard individual tensors over a set of devices, we would need to set up the distributed environment (such as NCCL communicators) first. -Tensor Parallelism is a Single-Program Multiple-Data (SPMD) sharding algorithm similar to PyTorch DDP/FSDP, and it under the hood leverages the PyTorch DTensor -to perform sharding. It also utilizes the DeviceMesh abstraction (which under the hood manages ProcessGroups) for device management and sharding. -To see how to utilize DeviceMesh to set up multi-dimensional parallelisms, please refer to `this tutorial `__. Tensor Parallel usually works within each host, so let us first initialize a DeviceMesh that connects 8 GPUs within a host. - -.. 
code-block:: python - - from torch.distributed.device_mesh import init_device_mesh - - tp_mesh = init_device_mesh("cuda", (8,)) - - -Now that we have initialized DeviceMesh, let us take a detailed look at the Llama 2 model architecture and see how we should perform the Tensor Parallel sharding. -Here we focus on the core ``TransformerBlock``, where the Transformer model stacks identical ``TransformerBlock`` s to scale up the model. - -The core ``TransformerBlock`` consists of an ``Attention`` layer and a ``FeedForward`` layer. Let us first look at the simpler ``FeedForward`` layer. -The ``FeedForward`` layer consists of three Linear layers and performs a SwiGLU-style MLP. Looking at its forward function: - -.. code-block:: python - - # forward in the FeedForward layer - def forward(self, x): - return self.w2(F.silu(self.w1(x)) * self.w3(x)) - - -It performs the ``w1`` and ``w3`` matmuls concurrently, followed by a ``w2`` matmul on the combined result of the ``w1``/``w3`` projections. This means we could -use the idea from the Tensor Parallelism paper to shard the ``w1``/``w3`` Linear layers in the colwise fashion and shard the ``w2`` Linear layer in the rowwise fashion, so that -there is only one ``allreduce`` communication happening at the end of all three layers. With the PyTorch native Tensor Parallel, we can simply create a ``parallelize_plan`` for the ``FeedForward`` layer like below: - -.. code-block:: python - - from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module - - layer_tp_plan = { - # by default the ColwiseParallel input layouts are replicated - # and the RowwiseParallel output layouts are replicated - "feed_forward.w1": ColwiseParallel(), - "feed_forward.w2": RowwiseParallel(), - "feed_forward.w3": ColwiseParallel(), - } - - -That is all it takes to configure the shardings for the ``FeedForward`` layer using the PyTorch Tensor Parallel APIs. Note that users only need to specify how to shard the individual layers; the communications (for example, ``allreduce``) happen under the hood. - -Moving on to the ``Attention`` layer: it consists of ``wq``, ``wk``, ``wv`` Linear layers that project the input to ``q``/ ``k`` / ``v``, and it then performs attention and output projection with the ``wo`` Linear layer. Tensor Parallelism here performs column-wise sharding for the -q/k/v projections and row-wise sharding for the ``wo`` linear projection. So we can add the Attention plan to the ``layer_tp_plan`` that we just drafted: - -.. code-block:: python - - layer_tp_plan = { - # by default the ColwiseParallel input layouts are replicated - # and the RowwiseParallel output layouts are replicated - "attention.wq": ColwiseParallel(), - "attention.wk": ColwiseParallel(), - "attention.wv": ColwiseParallel(), - "attention.wo": RowwiseParallel(), - "feed_forward.w1": ColwiseParallel(), - "feed_forward.w2": RowwiseParallel(), - "feed_forward.w3": ColwiseParallel(), - } - - -This is almost the ``layer_tp_plan`` we need to apply Tensor Parallelism to the ``TransformerBlock``. However, one thing to be aware of is that when sharding the linear layer column-wise, the output of the linear layers becomes sharded on the last tensor dimension, and the row-wise sharded linear layer directly accepts an input that is sharded on the last dimension. -If there are any more tensor operations (such as view operations) between the column-wise linear and the row-wise linear, we would need to adjust the relevant shape-related ops to the sharded shapes. 
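To make the required adjustment concrete, below is a minimal, illustrative sketch (the sizes are made up for this sketch and are not taken from the Llama code) of why a view that uses the global number of heads no longer matches the column-wise sharded projection output:

.. code-block:: python

    # Illustrative only: sizes are made up for this sketch.
    n_heads, head_dim, tp_size = 32, 128, 8

    full_dim = n_heads * head_dim       # wq output features on a single device
    local_dim = full_dim // tp_size     # wq output features per TP rank under ColwiseParallel
    local_heads = n_heads // tp_size    # a view to (bs, seq, heads, head_dim) must use this

    assert local_heads * head_dim == local_dim   # 4 * 128 == 512, matches the local output
    assert n_heads * head_dim != local_dim       # the global head count would not match

This is exactly the ``num_heads`` adjustment described next.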
- -For the Llama model, in the attention layer there are couple of view operations that are shape related. In particular, column-wise parallel for ``wq``/ ``wk``/ ``wv`` linear layers, the activation tensor is sharded on the ``num_heads`` dimension, so we would need to adjust the ``num_heads`` to local ``num_heads``. - -Finally, we need to call ``parallelize_module`` API to make the plan for each ``TransformerBlock`` effective. Under the hood, it distributes the model parameters inside ``Attention`` and ``FeedForward`` layers to DTensors, and registers communication hooks for model inputs and outputs (before and after each module respectively), if necessary: - -.. code-block:: python - - for layer_id, transformer_block in enumerate(model.layers): - layer_tp_plan = {...} # i.e. the plan we just generated - - # Adjust attention module to use the local number of heads - attn_layer = transformer_block.attention - attn_layer.n_heads = attn_layer.n_heads // tp_mesh.size() - attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_mesh.size() - - parallelize_module( - module=transformer_block, - device_mesh=tp_mesh, - parallelize_plan=layer_tp_plan, - ) - -Now that we have elaborated the sharding plan for each ``TransformerBlock``, there is usually a ``nn.Embedding`` in the first layer and a final ``nn.Linear`` projection layer, where user could choose row-wise or column-wise sharding to the first ``nn.Embedding`` and column-wise sharding to the last ``nn.Linear`` projection layer with proper input and output layouts specified. -Here is an example: - -.. code-block:: python - - model = parallelize_module( - model, - tp_mesh, - { - "tok_embeddings": RowwiseParallel( - input_layouts=Replicate(), - ), - "output": ColwiseParallel( - output_layouts=Replicate(), - ), - } - ) - -.. note:: - If the model to be partitioned is too large to fit into CPU memory, one could either use ``meta`` device initialization (for example, initialize the model on meta device first, shard the layers, and the materialize the model), or parallelize the ``TransformerBlock`` layer by layer during the Transformer model initialization. - -Apply Sequence Parallel to ``LayerNorm/RMSNorm`` layers -------------------------------------------------------- - -Sequence Parallel works on top of the Tensor Parallel illustrated above. Compared with basic Tensor Parallel, which only shards tensors within the ``Attention`` modules and ``FeedForward`` modules and keep their module inputs and outputs (namely activations in the forward pass and gradients in the backward pass) replicated, Sequence Parallel keeps them sharded on the sequence dimension. - -In a typical ``TransformerBlock``, the forward function combines norm layers (``LayerNorm`` or ``RMSNorm``), an attention layer, a feed forward layer, and residual connections. For example: - -.. code-block:: python - - # forward in a TransformerBlock - def forward(self, x): - h = x + self.attention(self.attention_norm(x)) - out = h + self.feed_forward(self.ffn_norm(h)) - return out - -In most use cases, the activations (and gradients) are of the shape ``[batch size, sequence length, hidden dimension]`` outside the ``Attention`` and ``FeedForward`` modules. In the DTensor’s language, Sequence Parallel performs activation computation using the ``Shard(1)`` layout for both forward/backward of the module. 
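As a rough mental model (a sketch with made-up sizes, assuming an 8-way Tensor Parallel mesh as in the earlier example), ``Shard(1)`` means that each rank holds only its slice of the activation along the sequence dimension:

.. code-block:: python

    # Illustrative only: sizes are made up for this sketch.
    batch, seq, hidden, tp_size = 4, 2048, 4096, 8

    replicated_shape = (batch, seq, hidden)          # plain TP: activations replicated on every rank
    shard1_shape = (batch, seq // tp_size, hidden)   # Sequence Parallel: Shard(1), one slice per rank

Each rank's norm layer then only computes on its ``1 / tp_size`` slice of the sequence, which is where the activation memory saving comes from.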
-Following the code example earlier, the code below demonstrates how we apply Sequence Parallel to the norm layers within a ``TransformerBlock``: - -First let's import the required dependencies for Sequence Parallel: - -.. code-block:: python - - from torch.distributed.tensor.parallel import ( - PrepareModuleInput, - SequenceParallel, - ) - - -Next let's adjust the ``layer_tp_plan`` to enable sequence parallel on the ``RMSNorm`` layers: - -.. code-block:: python - - layer_tp_plan = { - # Now the input and output of SequenceParallel has Shard(1) layouts, - # to represent the input/output tensors sharded on the sequence dimension - "attention_norm": SequenceParallel(), - "attention": PrepareModuleInput( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), - ), - "attention.wq": ColwiseParallel(), - "attention.wk": ColwiseParallel(), - "attention.wv": ColwiseParallel(), - "attention.wo": RowwiseParallel(output_layouts=Shard(1)), - "ffn_norm": SequenceParallel(), - "feed_forward": PrepareModuleInput( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), - ), - "feed_forward.w1": ColwiseParallel(), - "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)), - "feed_forward.w3": ColwiseParallel(), - } - - -One can see we now use ``PrepareModuleInput`` to modify the module input layouts to the Attention and FeedForward layers from ``Shard(1)`` to ``Replicate()``, and mark their output layouts as ``Shard(1)``. -Just like what happens to Tensor Parallelism, one only needs to specify the tensor sharding layouts of the inputs and outputs, and the communication between layers will happen automatically. - -Note that with Sequence Parallel, we assume the inputs and outputs of a ``TransformerBlock`` are always sharded on the sequence dimension, so that multiple ``TransformerBlocks`` can be concatenated seamlessly. -This can be facilitated by explicitly specifying the output of the beginning ``nn.Embedding`` layer and the input of the final ``nn.Linear`` projection layer to be ``Shard(1)``: - -.. code-block:: python - - model = parallelize_module( - model, - tp_mesh, - { - "tok_embeddings": RowwiseParallel( - input_layouts=Replicate(), - output_layouts=Shard(1), - ), - "norm": SequenceParallel(), - "output": ColwiseParallel( - input_layouts=Shard(1), - output_layouts=Replicate() - ), - } - ) - - -Apply Loss Parallel -------------------- - -Loss Parallel is a related technique to save memory and communication when the loss function is computed, as model outputs are usually very large. In Loss Parallel, when the model outputs are sharded on the (often huge) vocabulary dimension, the cross-entropy loss can be computed efficiently, without gathering all the model outputs to every single GPU. This not only significantly reduces the memory consumption, but also improves training speed by reducing communication overhead and doing sharded computation in parallel. The picture below briefly illustrates how Loss Parallel avoids gathering all model outputs to every GPU by doing sharded computation. - -.. figure:: /_static/img/distributed/loss_parallel.png - :width: 100% - :align: center - :alt: loss parallel - - Figure 2. Cross-entropy loss forward computation with loss parallel on one GPU. Blue represents sharded tensors; green represents replicated tensors; yellow represents tensors with partial values (to be all-reduced). Black arrows are local computations; red arrows are functional collectives among GPUs. 
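To get a rough sense of why this matters (a back-of-the-envelope sketch with made-up sizes, not measurements), consider the size of the logits tensor that would otherwise have to be gathered onto every rank before computing the loss:

.. code-block:: python

    # Illustrative only: sizes are made up for this sketch.
    batch, seq, vocab, tp_size = 4, 2048, 128_000, 8
    bytes_per_elem = 2  # for example, bfloat16

    full_logits_gib = batch * seq * vocab * bytes_per_elem / 2**30   # ~1.95 GiB if fully gathered on each rank
    sharded_logits_gib = full_logits_gib / tp_size                   # ~0.24 GiB per rank with Loss Parallel
    print(f"{full_logits_gib:.2f} GiB vs {sharded_logits_gib:.2f} GiB per rank")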
- -In the PyTorch Tensor Parallel API, Loss Parallel can be enabled via a context manager ``loss_parallel``, with which one can directly use ``torch.nn.functional.cross_entropy`` or ``torch.nn.CrossEntropyLoss`` without modifying other parts of their code. - -To apply Loss Parallel, the model predictions, usually of the shape ``[batch size, sequence length, vocabulary size]``, should be sharded on the vocabulary dimension. This can be easily done via marking the output layouts of the last linear projection layer output: - -.. code-block:: python - - model = parallelize_module( - model, - tp_mesh, - { - "tok_embeddings": RowwiseParallel( - input_layouts=Replicate(), - output_layouts=Shard(1), - ), - "norm": SequenceParallel(), - "output": ColwiseParallel( - input_layouts=Shard(1), - # use DTensor as the output - use_local_output=False, - ), - }, - ) - -In the code above, we also apply Sequence Parallel to the norm layer before output. We apply ``use_local_output=False`` to let the output stay as a DTensor, to work with the ``loss_parallel`` context manager. After that, one can simply call the cross_entropy loss function as is shown below. Note that the backward computation also needs to happen within the context. - -.. code-block:: python - - import torch.nn.functional as F - from torch.distributed.tensor.parallel import loss_parallel - - pred = model(input_ids) - with loss_parallel(): - # assuming pred and labels are of the shape [batch, seq, vocab] - loss = F.cross_entropy(pred.flatten(0, 1), labels.flatten(0, 1)) - loss.backward() - - -Combine Tensor Parallel with Fully Sharded Data Parallel together ------------------------------------------------------------------ - - -Now that we have shown how to apply Tensor/Sequence Parallel to the model, let us also take a look at how Tensor Parallel and Fully Sharded Data Parallel could work together. -Since Tensor Parallelism incurs communications that block the computation, we want to make sure it runs within a fast communication channel, such as NVLink. -In practice, we usually apply Tensor Parallel within each host, and apply Fully Sharded Data Parallel across the hosts. - -.. figure:: /_static/img/distributed/fsdp_tp.png - :width: 100% - :align: center - :alt: fsdp + tp - - Figure 3. FSDP and TP work on separate device dimensions, FSDP communication happens inter-host and TP communication happens intra-host. - - -This 2-D parallelism pattern can be easily expressed via a 2-D DeviceMesh, and we just need pass each “sub” DeviceMesh to each individual parallelism APIs: - -.. code-block:: python - - from torch.distributed.device_mesh import init_device_mesh - from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - - # i.e. 2-D mesh is [dp, tp], training on 64 GPUs that performs 8 way DP and 8 way TP - mesh_2d = init_device_mesh("cuda", (8, 8)) - tp_mesh = mesh_2d["tp"] # a submesh that connects intra-host devices - dp_mesh = mesh_2d["dp"] # a submesh that connects inter-host devices - - model = Model(...) - - tp_plan = {...} - - # apply Tensor Parallel intra-host on tp_mesh - model_tp = parallelize_module(model, tp_mesh, tp_plan) - # apply FSDP inter-host on dp_mesh - model_2d = FSDP(model_tp, device_mesh=dp_mesh, use_orig_params=True, ...) - - -This would allow us to easily apply Tensor Parallel within each host (intra-host) and apply FSDP across hosts (inter-hosts), with **0-code changes** to the Llama model. 
-The Tensor(Model) Parallel and Data Parallel techniques combined together provides the ability to continue increasing model size and training efficiently using a large number of GPUs. - -Conclusion ----------- -This tutorial demonstrates how to train a large Transformer-like model across hundreds to thousands of GPUs using Tensor Parallel in combination with Fully Sharded Data Parallel. -It explains how to apply Tensor Parallel to different parts of the model, with **no code changes** to the model itself. Tensor Parallel is a efficient model parallelism technique for large scale training. - -To see the complete end-to-end code example explained in this tutorial, please refer to the `Tensor Parallel examples `__ in the pytorch/examples repository. diff --git a/intermediate_source/_torch_export_nightly_tutorial.py b/intermediate_source/_torch_export_nightly_tutorial.py deleted file mode 100644 index fdbe18392e..0000000000 --- a/intermediate_source/_torch_export_nightly_tutorial.py +++ /dev/null @@ -1,635 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -torch.export Nightly Tutorial -================ -**Author:** William Wen, Zhengxu Chen, Angela Yi -""" - -###################################################################### -# -# .. warning:: -# -# ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility -# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.1. -# -# :func:`torch.export` is the PyTorch 2.X way to export PyTorch models into -# standardized model representations, intended -# to be run on different (i.e. Python-less) environments. -# -# In this tutorial, you will learn how to use :func:`torch.export` to extract -# ``ExportedProgram``'s (i.e. single-graph representations) from PyTorch programs. -# We also detail some considerations/modifications that you may need -# to make in order to make your model compatible with ``torch.export``. -# -# **Contents** -# -# .. contents:: -# :local: - -###################################################################### -# Basic Usage -# ----------- -# -# ``torch.export`` extracts single-graph representations from PyTorch programs -# by tracing the target function, given example inputs. -# ``torch.export.export()`` is the main entry point for ``torch.export``. -# -# In this tutorial, ``torch.export`` and ``torch.export.export()`` are practically synonymous, -# though ``torch.export`` generally refers to the PyTorch 2.X export process, and ``torch.export.export()`` -# generally refers to the actual function call. -# -# The signature of ``torch.export.export()`` is: -# -# .. code:: python -# -# export( -# f: Callable, -# args: Tuple[Any, ...], -# kwargs: Optional[Dict[str, Any]] = None, -# *, -# dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None -# ) -> ExportedProgram -# -# ``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` -# and wraps it in an ``ExportedProgram``, which can be serialized or executed later with -# different inputs. Note that while the output ``ExportedGraph`` is callable and can be -# called in the same way as the original input callable, it is not a ``torch.nn.Module``. -# We will detail the ``dynamic_shapes`` argument later in the tutorial. 
- -import torch -from torch.export import export - -class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - -mod = MyModule() -exported_mod = export(mod, (torch.randn(8, 100), torch.randn(8, 100))) -print(type(exported_mod)) -print(exported_mod(torch.randn(8, 100), torch.randn(8, 100))) - -###################################################################### -# Let's review some attributes of ``ExportedProgram`` that are of interest. -# -# The ``graph`` attribute is an `FX graph `__ -# traced from the function we exported, that is, the computation graph of all PyTorch operations. -# The FX graph has some important properties: -# -# - The operations are "ATen-level" operations. -# - The graph is "functionalized", meaning that no operations are mutations. -# -# The ``graph_module`` attribute is the ``GraphModule`` that wraps the ``graph`` attribute -# so that it can be ran as a ``torch.nn.Module``. - -print(exported_mod) -print(exported_mod.graph_module) - -###################################################################### -# The printed code shows that FX graph only contains ATen-level ops (such as ``torch.ops.aten``) -# and that mutations were removed. For example, the mutating op ``torch.nn.functional.relu(..., inplace=True)`` -# is represented in the printed code by ``torch.ops.aten.relu.default``, which does not mutate. -# Future uses of input to the original mutating ``relu`` op are replaced by the additional new output -# of the replacement non-mutating ``relu`` op. -# -# Other attributes of interest in ``ExportedProgram`` include: -# -# - ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. -# - ``range_constraints`` and ``equality_constraints`` -- constraints, covered later - -print(exported_mod.graph_signature) - -###################################################################### -# See the ``torch.export`` `documentation `__ -# for more details. - -###################################################################### -# Graph Breaks -# ------------ -# -# Although ``torch.export`` shares components with ``torch.compile``, -# the key limitation of ``torch.export``, especially when compared to ``torch.compile``, is that it does not -# support graph breaks. This is because handling graph breaks involves interpreting -# the unsupported operation with default Python evaluation, which is incompatible -# with the export use case. Therefore, in order to make your model code compatible -# with ``torch.export``, you will need to modify your code to remove graph breaks. 
-# -# A graph break is necessary in cases such as: -# -# - data-dependent control flow - -def bad1(x): - if x.sum() > 0: - return torch.sin(x) - return torch.cos(x) - -import traceback as tb -try: - export(bad1, (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# - accessing tensor data with ``.data`` - -def bad2(x): - x.data[0, 0] = 3 - return x - -try: - export(bad2, (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# - calling unsupported functions (such as many built-in functions) - -def bad3(x): - x = x + 1 - return x + id(x) - -try: - export(bad3, (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# - unsupported Python language features (e.g. throwing exceptions, match statements) - -def bad4(x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - -try: - export(bad4, (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# The sections below demonstrate some ways you can modify your code -# in order to remove graph breaks. - -###################################################################### -# Control Flow Ops -# ---------------- -# -# ``torch.export`` actually does support data-dependent control flow. -# But these need to be expressed using control flow ops. For example, -# we can fix the control flow example above using the ``cond`` op, like so: - -from functorch.experimental.control_flow import cond - -def bad1_fixed(x): - def true_fn(x): - return torch.sin(x) - def false_fn(x): - return torch.cos(x) - return cond(x.sum() > 0, true_fn, false_fn, [x]) - -exported_bad1_fixed = export(bad1_fixed, (torch.randn(3, 3),)) -print(exported_bad1_fixed(torch.ones(3, 3))) -print(exported_bad1_fixed(-torch.ones(3, 3))) - -###################################################################### -# There are limitations to ``cond`` that one should be aware of: -# -# - The predicate (i.e. ``x.sum() > 0``) must result in a boolean or a single-element tensor. -# - The operands (i.e. ``[x]``) must be tensors. -# - The branch function (i.e. ``true_fn`` and ``false_fn``) signature must match with the -# operands and they must both return a single tensor with the same metadata (for example, ``dtype``, ``shape``, etc.). -# - Branch functions cannot mutate input or global variables. -# - Branch functions cannot access closure variables, except for ``self`` if the function is -# defined in the scope of a method. -# -# For more details about ``cond``, check out the `documentation `__. - -###################################################################### -# .. -# [NOTE] map is not documented at the moment -# We can also use ``map``, which applies a function across the first dimension -# of the first tensor argument. 
-# -# from functorch.experimental.control_flow import map -# -# def map_example(xs): -# def map_fn(x, const): -# def true_fn(x): -# return x + const -# def false_fn(x): -# return x - const -# return control_flow.cond(x.sum() > 0, true_fn, false_fn, [x]) -# return control_flow.map(map_fn, xs, torch.tensor([2.0])) -# -# exported_map_example= export(map_example, (torch.randn(4, 3),)) -# inp = torch.cat((torch.ones(2, 3), -torch.ones(2, 3))) -# print(exported_map_example(inp)) - -###################################################################### -# Constraints/Dynamic Shapes -# -------------------------- -# -# Ops can have different specializations/behaviors for different tensor shapes, so by default, -# ``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective -# example inputs given to the initial ``torch.export.export()`` call. -# If we try to run the ``ExportedProgram`` in the example below with a tensor -# with a different shape, we get an error: - -class MyModule2(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - -mod2 = MyModule2() -exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) - -try: - exported_mod2(torch.randn(10, 100), torch.randn(10, 100)) -except Exception: - tb.print_exc() - -###################################################################### -# We can relax this constraint using the ``dynamic_shapes`` argument of -# ``torch.export.export()``, which allows us to specify, using ``torch.export.Dim`` -# (`documentation `__), -# which dimensions of the input tensors are dynamic. -# -# For each tensor argument of the input callable, we can specify a mapping from the dimension -# to a ``torch.export.Dim``. -# A ``torch.export.Dim`` is essentially a named symbolic integer with optional -# minimum and maximum bounds. -# -# Then, the format of ``torch.export.export()``'s ``dynamic_shapes`` argument is a mapping -# from the input callable's tensor argument names, to dimension --> dim mappings as described above. -# If there is no ``torch.export.Dim`` given to a tensor argument's dimension, then that dimension is -# assumed to be static. -# -# The first argument of ``torch.export.Dim`` is the name for the symbolic integer, used for debugging. -# Then we can specify an optional minimum and maximum bound (inclusive). Below, we show example usage. -# -# In the example below, our input -# ``inp1`` has an unconstrained first dimension, but the size of the second -# dimension must be in the interval [4, 18]. 
- -from torch.export import Dim - -inp1 = torch.randn(10, 10, 2) - -def dynamic_shapes_example1(x): - x = x[:, 2:] - return torch.relu(x) - -inp1_dim0 = Dim("inp1_dim0") -inp1_dim1 = Dim("inp1_dim1", min=4, max=18) -dynamic_shapes1 = { - "x": {0: inp1_dim0, 1: inp1_dim1}, -} - -exported_dynamic_shapes_example1 = export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1) - -print(exported_dynamic_shapes_example1(torch.randn(5, 5, 2))) - -try: - exported_dynamic_shapes_example1(torch.randn(8, 1, 2)) -except Exception: - tb.print_exc() - -try: - exported_dynamic_shapes_example1(torch.randn(8, 20, 2)) -except Exception: - tb.print_exc() - -try: - exported_dynamic_shapes_example1(torch.randn(8, 8, 3)) -except Exception: - tb.print_exc() - -###################################################################### -# Note that if our example inputs to ``torch.export`` do not satisfy the constraints -# given by ``dynamic_shapes``, then we get an error. - -inp1_dim1_bad = Dim("inp1_dim1_bad", min=11, max=18) -dynamic_shapes1_bad = { - "x": {0: inp1_dim0, 1: inp1_dim1_bad}, -} - -try: - export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1_bad) -except Exception: - tb.print_exc() - -###################################################################### -# We can enforce that equalities between dimensions of different tensors -# by using the same ``torch.export.Dim`` object, for example, in matrix multiplication: - -inp2 = torch.randn(4, 8) -inp3 = torch.randn(8, 2) - -def dynamic_shapes_example2(x, y): - return x @ y - -inp2_dim0 = Dim("inp2_dim0") -inner_dim = Dim("inner_dim") -inp3_dim1 = Dim("inp3_dim1") - -dynamic_shapes2 = { - "x": {0: inp2_dim0, 1: inner_dim}, - "y": {0: inner_dim, 1: inp3_dim1}, -} - -exported_dynamic_shapes_example2 = export(dynamic_shapes_example2, (inp2, inp3), dynamic_shapes=dynamic_shapes2) - -print(exported_dynamic_shapes_example2(torch.randn(2, 16), torch.randn(16, 4))) - -try: - exported_dynamic_shapes_example2(torch.randn(4, 8), torch.randn(4, 2)) -except Exception: - tb.print_exc() - -###################################################################### -# We can actually use ``torch.export`` to guide us as to which ``dynamic_shapes`` constraints -# are necessary. We can do this by relaxing all constraints (recall that if we -# do not provide constraints for a dimension, the default behavior is to constrain -# to the exact shape value of the example input) and letting ``torch.export`` -# error out. - -inp4 = torch.randn(8, 16) -inp5 = torch.randn(16, 32) - -def dynamic_shapes_example3(x, y): - if x.shape[0] <= 16: - return x @ y[:, :16] - return y - -dynamic_shapes3 = { - "x": {i: Dim(f"inp4_dim{i}") for i in range(inp4.dim())}, - "y": {i: Dim(f"inp5_dim{i}") for i in range(inp5.dim())}, -} - -try: - export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3) -except Exception: - tb.print_exc() - -###################################################################### -# We can see that the error message gives us suggested fixes to our -# dynamic shape constraints. 
Let us follow those suggestions (exact -# suggestions may differ slightly): - -def suggested_fixes(): - inp4_dim1 = Dim('shared_dim') - # suggested fixes below - inp4_dim0 = Dim('inp4_dim0', max=16) - inp5_dim1 = Dim('inp5_dim1', min=17) - inp5_dim0 = inp4_dim1 - # end of suggested fixes - return { - "x": {0: inp4_dim0, 1: inp4_dim1}, - "y": {0: inp5_dim0, 1: inp5_dim1}, - } - -dynamic_shapes3_fixed = suggested_fixes() -exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) -print(exported_dynamic_shapes_example3(torch.randn(4, 32), torch.randn(32, 64))) - -###################################################################### -# Note that in the example above, because we constrained the value of ``x.shape[0]`` in -# ``dynamic_shapes_example3``, the exported program is sound even though there is a -# raw ``if`` statement. -# -# If you want to see why ``torch.export`` generated these constraints, you can -# re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``, -# or use ``torch._logging.set_logs``. - -import logging -torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO) -exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) - -# reset to previous values -torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING) - -###################################################################### -# We can view an ``ExportedProgram``'s constraints using the ``range_constraints`` and -# ``equality_constraints`` attributes. The logging above reveals what the symbols ``s0, s1, ...`` -# represent. - -print(exported_dynamic_shapes_example3.range_constraints) -print(exported_dynamic_shapes_example3.equality_constraints) - -###################################################################### -# Custom Ops -# ---------- -# -# ``torch.export`` can export PyTorch programs with custom operators. -# -# -# Currently, the steps to register a custom op for use by ``torch.export`` are: -# -# - If you’re writing custom ops purely in Python, use torch.library.custom_op. - -import torch.library -import numpy as np - -@torch.library.custom_op("mylib::sin", mutates_args=()) -def sin(x): - x_np = x.numpy() - y_np = np.sin(x_np) - return torch.from_numpy(y_np) - -###################################################################### -# - You will need to provide abstract implementation so that PT2 can trace through it. - -@torch.library.register_fake("mylib::sin") -def _(x): - return torch.empty_like(x) - -# - Sometimes, the custom op you are exporting has data-dependent output, meaning -# we can't determine the shape of the output at compile time. In this case, you can do -# following: -@torch.library.custom_op("mylib::nonzero", mutates_args=()) -def nonzero(x): - x_np = x.cpu().numpy() - res = np.stack(np.nonzero(x_np), axis=1) - return torch.tensor(res, device=x.device) - -@torch.library.register_fake("mylib::nonzero") -def _(x): - # The number of nonzero-elements is data-dependent. - # Since we cannot peek at the data in an abstract implementation, - # we use the `ctx` object to construct a new ``symint`` that - # represents the data-dependent size. 
- ctx = torch.library.get_ctx() - nnz = ctx.new_dynamic_size() - shape = [nnz, x.dim()] - result = x.new_empty(shape, dtype=torch.int64) - return result - -###################################################################### -# - Call the custom op from the code you want to export using ``torch.ops`` - -def custom_op_example(x): - x = torch.sin(x) - x = torch.ops.mylib.sin(x) - x = torch.cos(x) - y = torch.ops.mylib.nonzero(x) - return x + y.sum() - -###################################################################### -# - Export the code as before - -exported_custom_op_example = export(custom_op_example, (torch.randn(3, 3),)) -exported_custom_op_example.graph_module.print_readable() -print(exported_custom_op_example(torch.randn(3, 3))) - -###################################################################### -# Note in the above outputs that the custom op is included in the exported graph. -# And when we call the exported graph as a function, the original custom op is called, -# as evidenced by the ``print`` call. -# -# If you have a custom operator implemented in C++, please refer to -# `this document `__ -# to make it compatible with ``torch.export``. - -###################################################################### -# Decompositions -# -------------- -# -# The graph produced by ``torch.export`` by default returns a graph containing -# only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000 -# operators, all of which are functional, that is, they do not -# mutate or alias inputs. You can find a list of all ATen operators -# `here `__ -# and you can inspect if an operator is functional by checking -# ``op._schema.is_mutable``, for example: - -print(torch.ops.aten.add.Tensor._schema.is_mutable) -print(torch.ops.aten.add_.Tensor._schema.is_mutable) - -###################################################################### -# By default, the environment in which you want to run the exported graph -# should support all ~2000 of these operators. -# However, you can use the following API on the exported program -# if your specific environment is only able to support a subset of -# the ~2000 operators. -# -# .. code:: python -# -# def run_decompositions( -# self: ExportedProgram, -# decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] -# ) -> ExportedProgram -# -# ``run_decompositions`` takes in a decomposition table, which is a mapping of -# operators to a function specifying how to reduce, or decompose, that operator -# into an equivalent sequence of other ATen operators. -# -# The default decomposition table for ``run_decompositions`` is the -# `Core ATen decomposition table `__ -# which will decompose the all ATen operators to the -# `Core ATen Operator Set `__ -# which consists of only ~180 operators. - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - -ep = export(M(), (torch.randn(2, 3),)) -print(ep.graph) - -core_ir_ep = ep.run_decompositions() -print(core_ir_ep.graph) - -###################################################################### -# Notice that after running ``run_decompositions`` the -# ``torch.ops.aten.t.default`` operator, which is not part of the Core ATen -# Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part -# of the Core ATen Opset. 
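######################################################################
# As a quick sanity check, we can list the distinct ATen operators called in a
# graph before and after decomposition and confirm the replacement described
# above. The small helper below is only an illustrative sketch, not part of the
# ``torch.export`` API.

def aten_ops_used(exported_program):
    # Collect the operator targets of all call_function nodes in the FX graph.
    return sorted(
        {str(node.target) for node in exported_program.graph.nodes
         if node.op == "call_function"}
    )

print(aten_ops_used(ep))          # expect to see aten.t.default in this list
print(aten_ops_used(core_ir_ep))  # expect aten.permute.default instead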
- -###################################################################### -# Most ATen operators already have decompositions, which are located -# `here `__. -# If you would like to use some of these existing decomposition functions, -# you can pass in a list of operators you would like to decompose to the -# `get_decompositions `__ -# function, which will return a decomposition table using existing -# decomposition implementations. - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - -ep = export(M(), (torch.randn(2, 3),)) -print(ep.graph) - -from torch._decomp import get_decompositions -decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) -core_ir_ep = ep.run_decompositions(decomp_table) -print(core_ir_ep.graph) - -###################################################################### -# If there is no existing decomposition function for an ATen operator that you would -# like to decompose, feel free to send a pull request into PyTorch -# implementing the decomposition! - -###################################################################### -# ExportDB -# -------- -# -# ``torch.export`` will only ever export a single computation graph from a PyTorch program. Because of this requirement, -# there will be Python or PyTorch features that are not compatible with ``torch.export``, which will require users to -# rewrite parts of their model code. We have seen examples of this earlier in the tutorial -- for example, rewriting -# if-statements using ``cond``. -# -# `ExportDB `__ is the standard reference that documents -# supported and unsupported Python/PyTorch features for ``torch.export``. It is essentially a list a program samples, each -# of which represents the usage of one particular Python/PyTorch feature and its interaction with ``torch.export``. -# Examples are also tagged by category so that they can be more easily searched. -# -# For example, let's use ExportDB to get a better understanding of how the predicate works in the ``cond`` operator. -# We can look at the example called ``cond_predicate``, which has a ``torch.cond`` tag. The example code looks like: - -def cond_predicate(x): - """ - The conditional statement (aka predicate) passed to ``cond()`` must be one of the following: - - torch.Tensor with a single element - - boolean expression - NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. - """ - pred = x.dim() > 2 and x.shape[2] > 10 - return cond(pred, lambda x: x.cos(), lambda y: y.sin(), [x]) - -###################################################################### -# More generally, ExportDB can be used as a reference when one of the following occurs: -# -# 1. Before attempting ``torch.export``, you know ahead of time that your model uses some tricky Python/PyTorch features -# and you want to know if ``torch.export`` covers that feature. -# 2. When attempting ``torch.export``, there is a failure and it's unclear how to work around it. -# -# ExportDB is not exhaustive, but is intended to cover all use cases found in typical PyTorch code. Feel free to reach -# out if there is an important Python/PyTorch feature that should be added to ExportDB or supported by ``torch.export``. 
- -###################################################################### -# Conclusion -# ---------- -# -# We introduced ``torch.export``, the new PyTorch 2.X way to export single computation -# graphs from PyTorch programs. In particular, we demonstrate several code modifications -# and considerations (control flow ops, constraints, etc.) that need to be made in order to export a graph. diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py deleted file mode 100644 index ed581426c2..0000000000 --- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py +++ /dev/null @@ -1,514 +0,0 @@ -""" -Hooks for autograd saved tensors -================================ - -""" - - -###################################################################### -# PyTorch typically computes gradients using backpropagation. However, -# certain operations require intermediary results to be saved in order to -# perform backpropagation. This tutorial walks through how these tensors -# are saved/retrieved and how you can define hooks to control the -# packing/unpacking process. -# -# This tutorial assumes you are familiar with how backpropagation works in -# theory. If not, read `this `_ first. -# - - -###################################################################### -# Saved tensors -# ------------- -# - - -###################################################################### -# Training a model usually consumes more memory than running it for -# inference. Broadly speaking, one can say that it is because “PyTorch -# needs to save the computation graph, which is needed to call -# ``backward``”, hence the additional memory usage. One goal of this -# tutorial is to finetune this understanding. -# -# In fact, the graph in itself sometimes does not consume much more memory -# as it never copies any tensors. However, the graph can keep *references* -# to tensors that would otherwise have gone out of scope: those are -# referred to as **saved tensors**. -# - - -###################################################################### -# Why does training a model (typically) requires more memory than evaluating it? -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# We start with a simple example: :math:`y = a \cdot b` , for which -# we know the gradients of :math:`y` with respect to :math:`a` and -# :math:`b`: -# -# .. math:: \frac{\partial y}{\partial a} = b -# -# .. math:: \frac{\partial y}{\partial b} = a -# - -import torch - -a = torch.randn(5, requires_grad=True) -b = torch.ones(5, requires_grad=True) -y = a * b - -################################################################# -# Using a torchviz, we can visualize the computation graph -# -# .. figure:: https://user-images.githubusercontent.com/8019486/130124513-72e016a3-c36f-42b9-88e2-53baf3e016c5.png -# :width: 300 -# :align: center - - -###################################################################### -# In this example, PyTorch saves intermediary values :math:`a` and -# :math:`b` in order to compute the gradient during the backward. -# -# .. 
figure:: https://user-images.githubusercontent.com/8019486/130124538-3da50977-6f0b-46d0-8909-5456ade9b598.png -# :width: 300 -# :align: center - - -###################################################################### -# Those intermediary values (in orange above) can be accessed (for -# debugging purposes) by looking for attributes of the ``grad_fn`` of -# ``y`` which start with the prefix ``_saved``: -# - -print(y.grad_fn._saved_self) -print(y.grad_fn._saved_other) - - -###################################################################### -# As the computation graph grows in depth, it will store more *saved -# tensors*. Meanwhile, those tensors would have gone out of scope if not -# for the graph. -# - -def f(x): - return x * x - -x = torch.randn(5, requires_grad=True) -y = f(f(f(x))) - -###################################################################### -# .. figure:: https://user-images.githubusercontent.com/8019486/130124570-f1074098-1bb3-459e-bf5a-03bf6f65b403.png -# :width: 500 -# :align: center - - -###################################################################### -# In the example above, executing without grad would only have kept ``x`` -# and ``y`` in the scope, But the graph additionally stores ``f(x)`` and -# ``f(f(x))``. Hence, running a forward pass during training will be more -# costly in memory usage than during evaluation (more precisely, when -# autograd is not required). -# - - -###################################################################### -# The concept of packing / unpacking -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# Going back to the first example: ``y.grad_fn._saved_self`` and -# ``y.grad_fn._saved_other`` point to the original tensor object, -# respectively ``a`` and ``b``. -# - -a = torch.randn(5, requires_grad=True) -b = torch.ones(5, requires_grad=True) -y = a * b - -print(y.grad_fn._saved_self is a) # True -print(y.grad_fn._saved_other is b) # True - - -###################################################################### -# However, that may not always be the case. -# - -a = torch.randn(5, requires_grad=True) -y = torch.exp(a) -print(y.grad_fn._saved_result.equal(y)) # True -print(y.grad_fn._saved_result is y) # False - - -###################################################################### -# Under the hood, PyTorch has **packed** and **unpacked** the tensor -# ``y`` to prevent reference cycles. -# -# As a rule of thumb, you should *not* rely on the fact that accessing -# the tensor saved for backward will yield the same tensor object as the -# original tensor. They will however share the same *storage*. -# - - -###################################################################### -# Saved tensors hooks -# ------------------- -# - - -###################################################################### -# PyTorch provides an API to control how saved tensors should be packed / -# unpacked. -# - -def pack_hook(x): - print("Packing", x) - return x - -def unpack_hook(x): - print("Unpacking", x) - return x -a = torch.ones(5, requires_grad=True) -b = torch.ones(5, requires_grad=True) * 2 - -with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook): - y = a * b - -y.sum().backward() - - -###################################################################### -# The ``pack_hook`` function will be called every time an operation saves -# a tensor for backward. 
-# The output of ``pack_hook`` is then stored in the computation graph -# instead of the original tensor. -# The ``unpack_hook`` uses that return value to compute a new tensor, -# which is the one actually used during the backward pass. -# In general, you want ``unpack_hook(pack_hook(t))`` to be equal to -# ``t``. -# - -x = torch.randn(5, requires_grad=True) -with torch.autograd.graph.saved_tensors_hooks(lambda x: x * 4, lambda x: x / 4): - y = torch.pow(x, 2) -y.sum().backward() -assert(x.grad.equal(2 * x)) - - -###################################################################### -# One thing to note is that the output of ``pack_hook`` can be *any Python -# object*, as long as ``unpack_hook`` can derive a tensor with the correct -# value from it. -# - - -###################################################################### -# Some unconventional examples -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# First, some silly examples to illustrate what is possible but you -# probably don’t ever want to do it. -# - -###################################################################### -# Returning an ``int`` -# ^^^^^^^^^^^^^^^^^^^^ -# -# Returning the index of a Python list -# Relatively harmless but with debatable usefulness - -storage = [] - -def pack(x): - storage.append(x) - return len(storage) - 1 - -def unpack(x): - return storage[x] - -x = torch.randn(5, requires_grad=True) -with torch.autograd.graph.saved_tensors_hooks(pack, unpack): - y = x * x -y.sum().backward() - -assert(x.grad.equal(2 * x)) - -###################################################################### -# Returning a tuple -# ^^^^^^^^^^^^^^^^^ -# -# Returning some tensor and a function how to unpack it -# Quite unlikely to be useful in its current form - -def pack(x): - delta = torch.randn(*x.size()) - return x - delta, lambda x: x + delta - -def unpack(packed): - x, f = packed - return f(x) - - -x = torch.randn(5, requires_grad=True) -with torch.autograd.graph.saved_tensors_hooks(pack, unpack): - y = x * x -y.sum().backward() - -assert(torch.allclose(x.grad, 2 * x)) - -###################################################################### -# Returning a ``str`` -# ^^^^^^^^^^^^^^^^^^^ -# -# Returning the ``__repr__ of`` the tensor -# Probably never do this - -x = torch.randn(5, requires_grad=True) -with torch.autograd.graph.saved_tensors_hooks(lambda x: repr(x), lambda x: eval("torch." + x)): - y = x * x -y.sum().backward() -assert(torch.all(x.grad - 2 * x <= 1e-4)) - - -###################################################################### -# Although those examples will not be useful in practice, they -# illustrate that the output of ``pack_hook`` can really be any Python -# object as long as it contains enough information to retrieve the -# content of the original tensor. -# In the next sections, we focus on more useful applications. -# - - -###################################################################### -# Saving tensors to CPU -# ~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# Very often, the tensors involved in the computation graph live on GPU. -# Keeping a reference to those tensors in the graph is what causes most -# models to run out of GPU memory during training while they would have -# done fine during evaluation. -# -# Hooks provide a very simple way to implement that. 
-# - -def pack_hook(x): - return (x.device, x.cpu()) - -def unpack_hook(packed): - device, tensor = packed - return tensor.to(device) - -x = torch.randn(5, requires_grad=True) -with torch.autograd.graph.saved_tensors_hooks(pack, unpack): - y = x * x -y.sum().backward() - -torch.allclose(x.grad, (2 * x)) - - -###################################################################### -# In fact, PyTorch provides an API to conveniently use those hooks (as -# well as the ability to use pinned memory). -# - -import torch.nn as nn - -class Model(nn.Module): - def __init__(self): - super().__init__() - self.w = nn.Parameter(torch.randn(5)) - - def forward(self, x): - with torch.autograd.graph.save_on_cpu(pin_memory=True): - # some computation - return self.w * x - -x = torch.randn(5) -model = Model() -loss = model(x).sum() -loss.backward() - - -###################################################################### -# In practice, on a A100 GPU, for a ResNet-152 with batch size 256, this -# corresponds to a GPU memory usage reduction from 48GB to 5GB, at the -# cost of a 6x slowdown. -# -# Of course, you can modulate the tradeoff by only saving to CPU certain -# parts of the network. -# -# For instance, you could define a special ``nn.Module`` that wraps any -# module and saves its tensors to CPU. -# - -class SaveToCpu(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - - def forward(self, *args, **kwargs): - with torch.autograd.graph.save_on_cpu(pin_memory=True): - return self.module(*args, **kwargs) - -model = nn.Sequential( - nn.Linear(10, 100), - SaveToCpu(nn.Linear(100, 100)), - nn.Linear(100, 10), -) - -x = torch.randn(10) -loss = model(x).sum() -loss.backward() - - -###################################################################### -# Saving tensors to disk -# ~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# Similarly, you may want to save those tensors to disk. Again, this is -# achievable with those hooks. -# - - -###################################################################### -# A naive version would look like this. -# - -# Naive version - HINT: Don't do this - -import uuid -tmp_dir = "temp" - -def pack_hook(tensor): - name = os.path.join(tmp_dir, str(uuid.uuid4())) - torch.save(tensor, name) - return name - -def unpack_hook(name): - return torch.load(name, weights_only=True) - - -###################################################################### -# The reason the above code is bad is that we are leaking files on the -# disk and they are never cleared. Fixing this is not as trivial as it -# seems. -# - -# Incorrect version - HINT: Don't do this - -import uuid -import os -import tempfile -tmp_dir_obj = tempfile.TemporaryDirectory() -tmp_dir = tmp_dir_obj.name - -def pack_hook(tensor): - name = os.path.join(tmp_dir, str(uuid.uuid4())) - torch.save(tensor, name) - return name - -def unpack_hook(name): - tensor = torch.load(name, weights_only=True) - os.remove(name) - return tensor - - -###################################################################### -# The reason the above code doesn’t work is that ``unpack_hook`` can be -# called multiple times. If we delete the file during unpacking the first -# time, it will not be available when the saved tensor is accessed a -# second time, which will raise an error. 
-# - -x = torch.ones(5, requires_grad=True) -with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook): - y = x.pow(2) -print(y.grad_fn._saved_self) -try: - print(y.grad_fn._saved_self) - print("Double access succeeded!") -except: - print("Double access failed!") - - -###################################################################### -# To fix this, we can write a version of those hooks that takes advantage -# of the fact that PyTorch automatically releases (deletes) the saved data -# when it is no longer needed. -# - -class SelfDeletingTempFile(): - def __init__(self): - self.name = os.path.join(tmp_dir, str(uuid.uuid4())) - - def __del__(self): - os.remove(self.name) - -def pack_hook(tensor): - temp_file = SelfDeletingTempFile() - torch.save(tensor, temp_file.name) - return temp_file - -def unpack_hook(temp_file): - return torch.load(temp_file.name, weights_only=True) - - -###################################################################### -# When we call ``backward``, the output of ``pack_hook`` will be deleted, -# which causes the file to be removed, so we’re no longer leaking the -# files. -# -# This can then be used in your model, in the following way: -# - -# Only save on disk tensors that have size >= 1000 -SAVE_ON_DISK_THRESHOLD = 1000 - -def pack_hook(x): - if x.numel() < SAVE_ON_DISK_THRESHOLD: - return x - temp_file = SelfDeletingTempFile() - torch.save(tensor, temp_file.name) - return temp_file - -def unpack_hook(tensor_or_sctf): - if isinstance(tensor_or_sctf, torch.Tensor): - return tensor_or_sctf - return torch.load(tensor_or_sctf.name) - -class SaveToDisk(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - - def forward(self, *args, **kwargs): - with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook): - return self.module(*args, **kwargs) - -net = nn.DataParallel(SaveToDisk(Model())) - - -###################################################################### -# In this last example, we also demonstrate how to filter which tensors -# should be saved (here, those whose number of elements is greater than -# 1000) and how to combine this feature with ``nn.DataParallel``. -# - - -###################################################################### -# If you’ve made it this far, congratulations! You now know how to use -# saved tensor hooks and how they can be useful in a few scenarios to -# tradeoff memory for compute. -# diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py deleted file mode 100644 index 0f1ae21a55..0000000000 --- a/intermediate_source/ax_multiobjective_nas_tutorial.py +++ /dev/null @@ -1,516 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Multi-Objective NAS with Ax -================================================== - -**Authors:** `David Eriksson `__, -`Max Balandat `__, -and the Adaptive Experimentation team at Meta. - -In this tutorial, we show how to use `Ax `__ to run -multi-objective neural architecture search (NAS) for a simple neural -network model on the popular MNIST dataset. While the underlying -methodology would typically be used for more complicated models and -larger datasets, we opt for a tutorial that is easily runnable -end-to-end on a laptop in less than 20 minutes. - -In many NAS applications, there is a natural tradeoff between multiple -objectives of interest. 
For instance, when deploying models on-device -we may want to maximize model performance (for example, accuracy), while -simultaneously minimizing competing metrics like power consumption, -inference latency, or model size in order to satisfy deployment -constraints. Often, we may be able to reduce computational requirements -or latency of predictions substantially by accepting minimally lower -model performance. Principled methods for exploring such tradeoffs -efficiently are key enablers of scalable and sustainable AI, and have -many successful applications at Meta - see for instance our -`case study `__ -on a Natural Language Understanding model. - -In our example here, we will tune the widths of two hidden layers, -the learning rate, the dropout probability, the batch size, and the -number of training epochs. The goal is to trade off performance -(accuracy on the validation set) and model size (the number of -model parameters). - -This tutorial makes use of the following PyTorch libraries: - -- `PyTorch Lightning `__ (specifying the model and training loop) -- `TorchX `__ (for running training jobs remotely / asynchronously) -- `BoTorch `__ (the Bayesian Optimization library powering Ax's algorithms) -""" - - -###################################################################### -# Defining the TorchX App -# ----------------------- -# -# Our goal is to optimize the PyTorch Lightning training job defined in -# `mnist_train_nas.py `__. -# To do this using TorchX, we write a helper function that takes in -# the values of the architecture and hyperparameters of the training -# job and creates a `TorchX AppDef `__ -# with the appropriate settings. -# - -from pathlib import Path - -import torchx - -from torchx import specs -from torchx.components import utils - - -def trainer( - log_path: str, - hidden_size_1: int, - hidden_size_2: int, - learning_rate: float, - epochs: int, - dropout: float, - batch_size: int, - trial_idx: int = -1, -) -> specs.AppDef: - - # define the log path so we can pass it to the TorchX ``AppDef`` - if trial_idx >= 0: - log_path = Path(log_path).joinpath(str(trial_idx)).absolute().as_posix() - - return utils.python( - # command line arguments to the training script - "--log_path", - log_path, - "--hidden_size_1", - str(hidden_size_1), - "--hidden_size_2", - str(hidden_size_2), - "--learning_rate", - str(learning_rate), - "--epochs", - str(epochs), - "--dropout", - str(dropout), - "--batch_size", - str(batch_size), - # other config options - name="trainer", - script="mnist_train_nas.py", - image=torchx.version.TORCHX_IMAGE, - ) - - -###################################################################### -# Setting up the Runner -# --------------------- -# -# Ax’s `Runner `__ -# abstraction allows writing interfaces to various backends. -# Ax already comes with Runner for TorchX, and so we just need to -# configure it. For the purpose of this tutorial we run jobs locally -# in a fully asynchronous fashion. -# -# In order to launch them on a cluster, you can instead specify a -# different TorchX scheduler and adjust the configuration appropriately. -# For example, if you have a Kubernetes cluster, you just need to change the -# scheduler from ``local_cwd`` to ``kubernetes``). 
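-
-######################################################################
-# For illustration only, a hypothetical Kubernetes variant of the runner
-# configured in the next cell might look like the sketch below. The exact
-# ``cfg`` entries (the namespace and queue names here are placeholders)
-# depend on your cluster setup, and ``log_dir`` refers to the logging
-# directory created in the next cell.
-#
-# .. code-block:: python
-#
-#    from ax.runners.torchx import TorchXRunner
-#
-#    k8s_runner = TorchXRunner(
-#        tracker_base="/tmp/",
-#        component=trainer,
-#        scheduler="kubernetes",          # instead of ``local_cwd``
-#        component_const_params={"log_path": log_dir},
-#        cfg={"namespace": "default", "queue": "default"},  # placeholder values
-#    )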
-# - - -import tempfile -from ax.runners.torchx import TorchXRunner - -# Make a temporary dir to log our results into -log_dir = tempfile.mkdtemp() - -ax_runner = TorchXRunner( - tracker_base="/tmp/", - component=trainer, - # NOTE: To launch this job on a cluster instead of locally you can - # specify a different scheduler and adjust arguments appropriately. - scheduler="local_cwd", - component_const_params={"log_path": log_dir}, - cfg={}, -) - -###################################################################### -# Setting up the ``SearchSpace`` -# ------------------------------ -# -# First, we define our search space. Ax supports both range parameters -# of type integer and float as well as choice parameters which can have -# non-numerical types such as strings. -# We will tune the hidden sizes, learning rate, dropout, and number of -# epochs as range parameters and tune the batch size as an ordered choice -# parameter to enforce it to be a power of 2. -# - -from ax.core import ( - ChoiceParameter, - ParameterType, - RangeParameter, - SearchSpace, -) - -parameters = [ - # NOTE: In a real-world setting, hidden_size_1 and hidden_size_2 - # should probably be powers of 2, but in our simple example this - # would mean that ``num_params`` can't take on that many values, which - # in turn makes the Pareto frontier look pretty weird. - RangeParameter( - name="hidden_size_1", - lower=16, - upper=128, - parameter_type=ParameterType.INT, - log_scale=True, - ), - RangeParameter( - name="hidden_size_2", - lower=16, - upper=128, - parameter_type=ParameterType.INT, - log_scale=True, - ), - RangeParameter( - name="learning_rate", - lower=1e-4, - upper=1e-2, - parameter_type=ParameterType.FLOAT, - log_scale=True, - ), - RangeParameter( - name="epochs", - lower=1, - upper=4, - parameter_type=ParameterType.INT, - ), - RangeParameter( - name="dropout", - lower=0.0, - upper=0.5, - parameter_type=ParameterType.FLOAT, - ), - ChoiceParameter( # NOTE: ``ChoiceParameters`` don't require log-scale - name="batch_size", - values=[32, 64, 128, 256], - parameter_type=ParameterType.INT, - is_ordered=True, - sort_values=True, - ), -] - -search_space = SearchSpace( - parameters=parameters, - # NOTE: In practice, it may make sense to add a constraint - # hidden_size_2 <= hidden_size_1 - parameter_constraints=[], -) - - -###################################################################### -# Setting up Metrics -# ------------------ -# -# Ax has the concept of a `Metric `__ -# that defines properties of outcomes and how observations are obtained -# for these outcomes. This allows e.g. encoding how data is fetched from -# some distributed execution backend and post-processed before being -# passed as input to Ax. -# -# In this tutorial we will use -# `multi-objective optimization `__ -# with the goal of maximizing the validation accuracy and minimizing -# the number of model parameters. The latter represents a simple proxy -# of model latency, which is hard to estimate accurately for small ML -# models (in an actual application we would benchmark the latency while -# running the model on-device). -# -# In our example TorchX will run the training jobs in a fully asynchronous -# fashion locally and write the results to the ``log_dir`` based on the trial -# index (see the ``trainer()`` function above). We will define a metric -# class that is aware of that logging directory. By subclassing -# `TensorboardCurveMetric `__ -# we get the logic to read and parse the TensorBoard logs for free. 
-# - -from ax.metrics.tensorboard import TensorboardMetric -from tensorboard.backend.event_processing import plugin_event_multiplexer as event_multiplexer - -class MyTensorboardMetric(TensorboardMetric): - - # NOTE: We need to tell the new TensorBoard metric how to get the id / - # file handle for the TensorBoard logs from a trial. In this case - # our convention is to just save a separate file per trial in - # the prespecified log dir. - def _get_event_multiplexer_for_trial(self, trial): - mul = event_multiplexer.EventMultiplexer(max_reload_threads=20) - mul.AddRunsFromDirectory(Path(log_dir).joinpath(str(trial.index)).as_posix(), None) - mul.Reload() - - return mul - - # This indicates whether the metric is queryable while the trial is - # still running. We don't use this in the current tutorial, but Ax - # utilizes this to implement trial-level early-stopping functionality. - @classmethod - def is_available_while_running(cls): - return False - - -###################################################################### -# Now we can instantiate the metrics for accuracy and the number of -# model parameters. Here `curve_name` is the name of the metric in the -# TensorBoard logs, while `name` is the metric name used internally -# by Ax. We also specify `lower_is_better` to indicate the favorable -# direction of the two metrics. -# - -val_acc = MyTensorboardMetric( - name="val_acc", - tag="val_acc", - lower_is_better=False, -) -model_num_params = MyTensorboardMetric( - name="num_params", - tag="num_params", - lower_is_better=True, -) - - -###################################################################### -# Setting up the ``OptimizationConfig`` -# ------------------------------------- -# -# The way to tell Ax what it should optimize is by means of an -# `OptimizationConfig `__. -# Here we use a ``MultiObjectiveOptimizationConfig`` as we will -# be performing multi-objective optimization. -# -# Additionally, Ax supports placing constraints on the different -# metrics by specifying objective thresholds, which bound the region -# of interest in the outcome space that we want to explore. For this -# example, we will constrain the validation accuracy to be at least -# 0.94 (94%) and the number of model parameters to be at most 80,000. -# - -from ax.core import MultiObjective, Objective, ObjectiveThreshold -from ax.core.optimization_config import MultiObjectiveOptimizationConfig - - -opt_config = MultiObjectiveOptimizationConfig( - objective=MultiObjective( - objectives=[ - Objective(metric=val_acc, minimize=False), - Objective(metric=model_num_params, minimize=True), - ], - ), - objective_thresholds=[ - ObjectiveThreshold(metric=val_acc, bound=0.94, relative=False), - ObjectiveThreshold(metric=model_num_params, bound=80_000, relative=False), - ], -) - - -###################################################################### -# Creating the Ax Experiment -# -------------------------- -# -# In Ax, the `Experiment `__ -# object is the object that stores all the information about the problem -# setup. -# -# .. tip: -# ``Experiment`` objects can be serialized to JSON or stored to a -# database backend such as MySQL in order to persist and be available -# to load on different machines. See the the `Ax Docs `__ -# on the storage functionality for details. 
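-
-######################################################################
-# As a sketch of that workflow (assuming the plain JSON store suffices;
-# experiments that use custom runners or metrics, like the ones in this
-# tutorial, may additionally require registering encoders as described in
-# the Ax storage docs), saving and reloading the ``experiment`` object
-# created in the next cell could look like:
-#
-# .. code-block:: python
-#
-#    from ax.storage.json_store.save import save_experiment
-#    from ax.storage.json_store.load import load_experiment
-#
-#    save_experiment(experiment, "torchx_mnist_experiment.json")
-#    experiment = load_experiment("torchx_mnist_experiment.json")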
-# - -from ax.core import Experiment - -experiment = Experiment( - name="torchx_mnist", - search_space=search_space, - optimization_config=opt_config, - runner=ax_runner, -) - -###################################################################### -# Choosing the Generation Strategy -# -------------------------------- -# -# A `GenerationStrategy `__ -# is the abstract representation of how we would like to perform the -# optimization. While this can be customized (if you’d like to do so, see -# `this tutorial `__), -# in most cases Ax can automatically determine an appropriate strategy -# based on the search space, optimization config, and the total number -# of trials we want to run. -# -# Typically, Ax chooses to evaluate a number of random configurations -# before starting a model-based Bayesian Optimization strategy. -# - - -total_trials = 48 # total evaluation budget - -from ax.modelbridge.dispatch_utils import choose_generation_strategy - -gs = choose_generation_strategy( - search_space=experiment.search_space, - optimization_config=experiment.optimization_config, - num_trials=total_trials, - ) - - -###################################################################### -# Configuring the Scheduler -# ------------------------- -# -# The ``Scheduler`` acts as the loop control for the optimization. -# It communicates with the backend to launch trials, check their status, -# and retrieve results. In the case of this tutorial, it is simply reading -# and parsing the locally saved logs. In a remote execution setting, -# it would call APIs. The following illustration from the Ax -# `Scheduler tutorial `__ -# summarizes how the Scheduler interacts with external systems used to run -# trial evaluations: -# -# .. image:: ../../_static/img/ax_scheduler_illustration.png -# -# -# The ``Scheduler`` requires the ``Experiment`` and the ``GenerationStrategy``. -# A set of options can be passed in via ``SchedulerOptions``. Here, we -# configure the number of total evaluations as well as ``max_pending_trials``, -# the maximum number of trials that should run concurrently. In our -# local setting, this is the number of training jobs running as individual -# processes, while in a remote execution setting, this would be the number -# of machines you want to use in parallel. -# - - -from ax.service.scheduler import Scheduler, SchedulerOptions - -scheduler = Scheduler( - experiment=experiment, - generation_strategy=gs, - options=SchedulerOptions( - total_trials=total_trials, max_pending_trials=4 - ), -) - - -###################################################################### -# Running the optimization -# ------------------------ -# -# Now that everything is configured, we can let Ax run the optimization -# in a fully automated fashion. The Scheduler will periodically check -# the logs for the status of all currently running trials, and if a -# trial completes the scheduler will update its status on the -# experiment and fetch the observations needed for the Bayesian -# optimization algorithm. -# - -scheduler.run_all_trials() - - -###################################################################### -# Evaluating the results -# ---------------------- -# -# We can now inspect the result of the optimization using helper -# functions and visualizations included with Ax. - -###################################################################### -# First, we generate a dataframe with a summary of the results -# of the experiment. 
Each row in this dataframe corresponds to a -# trial (that is, a training job that was run), and contains information -# on the status of the trial, the parameter configuration that was -# evaluated, and the metric values that were observed. This provides -# an easy way to sanity check the optimization. -# - -from ax.service.utils.report_utils import exp_to_df - -df = exp_to_df(experiment) -df.head(10) - - -###################################################################### -# We can also visualize the Pareto frontier of tradeoffs between the -# validation accuracy and the number of model parameters. -# -# .. tip:: -# Ax uses Plotly to produce interactive plots, which allow you to -# do things like zoom, crop, or hover in order to view details -# of components of the plot. Try it out, and take a look at the -# `visualization tutorial `__ -# if you'd like to learn more). -# -# The final optimization results are shown in the figure below where -# the color corresponds to the iteration number for each trial. -# We see that our method was able to successfully explore the -# trade-offs and found both large models with high validation -# accuracy as well as small models with comparatively lower -# validation accuracy. -# - -from ax.service.utils.report_utils import _pareto_frontier_scatter_2d_plotly - -_pareto_frontier_scatter_2d_plotly(experiment) - - -###################################################################### -# To better understand what our surrogate models have learned about -# the black box objectives, we can take a look at the leave-one-out -# cross validation results. Since our models are Gaussian Processes, -# they not only provide point predictions but also uncertainty estimates -# about these predictions. A good model means that the predicted means -# (the points in the figure) are close to the 45 degree line and that the -# confidence intervals cover the 45 degree line with the expected frequency -# (here we use 95% confidence intervals, so we would expect them to contain -# the true observation 95% of the time). -# -# As the figures below show, the model size (``num_params``) metric is -# much easier to model than the validation accuracy (``val_acc``) metric. -# - -from ax.modelbridge.cross_validation import compute_diagnostics, cross_validate -from ax.plot.diagnostic import interact_cross_validation_plotly -from ax.utils.notebook.plotting import init_notebook_plotting, render - -cv = cross_validate(model=gs.model) # The surrogate model is stored on the ``GenerationStrategy`` -compute_diagnostics(cv) - -interact_cross_validation_plotly(cv) - - -###################################################################### -# We can also make contour plots to better understand how the different -# objectives depend on two of the input parameters. In the figure below, -# we show the validation accuracy predicted by the model as a function -# of the two hidden sizes. The validation accuracy clearly increases -# as the hidden sizes increase. -# - -from ax.plot.contour import interact_contour_plotly - -interact_contour_plotly(model=gs.model, metric_name="val_acc") - - -###################################################################### -# Similarly, we show the number of model parameters as a function of -# the hidden sizes in the figure below and see that it also increases -# as a function of the hidden sizes (the dependency on ``hidden_size_1`` -# is much larger). 
- -interact_contour_plotly(model=gs.model, metric_name="num_params") - - -###################################################################### -# Acknowledgments -# ---------------- -# -# We thank the TorchX team (in particular Kiuk Chung and Tristan Rice) -# for their help with integrating TorchX with Ax. -# diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py deleted file mode 100644 index 8451f07b82..0000000000 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ /dev/null @@ -1,531 +0,0 @@ -# -*- coding: utf-8 -*- -""" -NLP From Scratch: Classifying Names with a Character-Level RNN -************************************************************** -**Author**: `Sean Robertson `_ - -We will be building and training a basic character-level Recurrent Neural -Network (RNN) to classify words. This tutorial, along with two other -Natural Language Processing (NLP) "from scratch" tutorials -:doc:`/intermediate/char_rnn_generation_tutorial` and -:doc:`/intermediate/seq2seq_translation_tutorial`, show how to -preprocess data to model NLP. In particular these tutorials do not -use many of the convenience functions of `torchtext`, so you can see how -preprocessing to model NLP works at a low level. - -A character-level RNN reads words as a series of characters - -outputting a prediction and "hidden state" at each step, feeding its -previous hidden state into each next step. We take the final prediction -to be the output, i.e. which class the word belongs to. - -Specifically, we'll train on a few thousand surnames from 18 languages -of origin, and predict which language a name is from based on the -spelling: - -.. code-block:: sh - - $ python predict.py Hinton - (-0.47) Scottish - (-1.52) English - (-3.57) Irish - - $ python predict.py Schmidhuber - (-0.19) German - (-2.48) Czech - (-2.68) Dutch - - -Recommended Preparation -======================= - -Before starting this tutorial it is recommended that you have installed PyTorch, -and have a basic understanding of Python programming language and Tensors: - -- https://pytorch.org/ For installation instructions -- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general - and learn the basics of Tensors -- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview -- :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user - -It would also be useful to know about RNNs and how they work: - -- `The Unreasonable Effectiveness of Recurrent Neural - Networks `__ - shows a bunch of real life examples -- `Understanding LSTM - Networks `__ - is about LSTMs specifically but also informative about RNNs in - general - -Preparing the Data -================== - -.. note:: - Download the data from - `here `_ - and extract it to the current directory. - -Included in the ``data/names`` directory are 18 text files named as -``[Language].txt``. Each file contains a bunch of names, one name per -line, mostly romanized (but we still need to convert from Unicode to -ASCII). - -We'll end up with a dictionary of lists of names per language, -``{language: [names ...]}``. The generic variables "category" and "line" -(for language and name in our case) are used for later extensibility. 
-""" -from io import open -import glob -import os - -def findFiles(path): return glob.glob(path) - -print(findFiles('data/names/*.txt')) - -import unicodedata -import string - -all_letters = string.ascii_letters + " .,;'" -n_letters = len(all_letters) - -# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 -def unicodeToAscii(s): - return ''.join( - c for c in unicodedata.normalize('NFD', s) - if unicodedata.category(c) != 'Mn' - and c in all_letters - ) - -print(unicodeToAscii('Ślusàrski')) - -# Build the category_lines dictionary, a list of names per language -category_lines = {} -all_categories = [] - -# Read a file and split into lines -def readLines(filename): - lines = open(filename, encoding='utf-8').read().strip().split('\n') - return [unicodeToAscii(line) for line in lines] - -for filename in findFiles('data/names/*.txt'): - category = os.path.splitext(os.path.basename(filename))[0] - all_categories.append(category) - lines = readLines(filename) - category_lines[category] = lines - -n_categories = len(all_categories) - - -###################################################################### -# Now we have ``category_lines``, a dictionary mapping each category -# (language) to a list of lines (names). We also kept track of -# ``all_categories`` (just a list of languages) and ``n_categories`` for -# later reference. -# - -print(category_lines['Italian'][:5]) - - -###################################################################### -# Turning Names into Tensors -# -------------------------- -# -# Now that we have all the names organized, we need to turn them into -# Tensors to make any use of them. -# -# To represent a single letter, we use a "one-hot vector" of size -# ``<1 x n_letters>``. A one-hot vector is filled with 0s except for a 1 -# at index of the current letter, e.g. ``"b" = <0 1 0 0 0 ...>``. -# -# To make a word we join a bunch of those into a 2D matrix -# ````. -# -# That extra 1 dimension is because PyTorch assumes everything is in -# batches - we're just using a batch size of 1 here. -# - -import torch - -# Find letter index from all_letters, e.g. "a" = 0 -def letterToIndex(letter): - return all_letters.find(letter) - -# Just for demonstration, turn a letter into a <1 x n_letters> Tensor -def letterToTensor(letter): - tensor = torch.zeros(1, n_letters) - tensor[0][letterToIndex(letter)] = 1 - return tensor - -# Turn a line into a , -# or an array of one-hot letter vectors -def lineToTensor(line): - tensor = torch.zeros(len(line), 1, n_letters) - for li, letter in enumerate(line): - tensor[li][0][letterToIndex(letter)] = 1 - return tensor - -print(letterToTensor('J')) - -print(lineToTensor('Jones').size()) - - -###################################################################### -# Creating the Network -# ==================== -# -# Before autograd, creating a recurrent neural network in Torch involved -# cloning the parameters of a layer over several timesteps. The layers -# held hidden state and gradients which are now entirely handled by the -# graph itself. This means you can implement a RNN in a very "pure" way, -# as regular feed-forward layers. -# -# This RNN module implements a "vanilla RNN" an is just 3 linear layers -# which operate on an input and hidden state, with a ``LogSoftmax`` layer -# after the output. 
-# - -import torch.nn as nn -import torch.nn.functional as F - -class RNN(nn.Module): - def __init__(self, input_size, hidden_size, output_size): - super(RNN, self).__init__() - - self.hidden_size = hidden_size - - self.i2h = nn.Linear(input_size, hidden_size) - self.h2h = nn.Linear(hidden_size, hidden_size) - self.h2o = nn.Linear(hidden_size, output_size) - self.softmax = nn.LogSoftmax(dim=1) - - def forward(self, input, hidden): - hidden = F.tanh(self.i2h(input) + self.h2h(hidden)) - output = self.h2o(hidden) - output = self.softmax(output) - return output, hidden - - def initHidden(self): - return torch.zeros(1, self.hidden_size) - -n_hidden = 128 -rnn = RNN(n_letters, n_hidden, n_categories) - - -###################################################################### -# To run a step of this network we need to pass an input (in our case, the -# Tensor for the current letter) and a previous hidden state (which we -# initialize as zeros at first). We'll get back the output (probability of -# each language) and a next hidden state (which we keep for the next -# step). -# - -input = letterToTensor('A') -hidden = torch.zeros(1, n_hidden) - -output, next_hidden = rnn(input, hidden) - - -###################################################################### -# For the sake of efficiency we don't want to be creating a new Tensor for -# every step, so we will use ``lineToTensor`` instead of -# ``letterToTensor`` and use slices. This could be further optimized by -# precomputing batches of Tensors. -# - -input = lineToTensor('Albert') -hidden = torch.zeros(1, n_hidden) - -output, next_hidden = rnn(input[0], hidden) -print(output) - - -###################################################################### -# As you can see the output is a ``<1 x n_categories>`` Tensor, where -# every item is the likelihood of that category (higher is more likely). -# - - -###################################################################### -# -# Training -# ======== -# Preparing for Training -# ---------------------- -# -# Before going into training we should make a few helper functions. The -# first is to interpret the output of the network, which we know to be a -# likelihood of each category. We can use ``Tensor.topk`` to get the index -# of the greatest value: -# - -def categoryFromOutput(output): - top_n, top_i = output.topk(1) - category_i = top_i[0].item() - return all_categories[category_i], category_i - -print(categoryFromOutput(output)) - - -###################################################################### -# We will also want a quick way to get a training example (a name and its -# language): -# - -import random - -def randomChoice(l): - return l[random.randint(0, len(l) - 1)] - -def randomTrainingExample(): - category = randomChoice(all_categories) - line = randomChoice(category_lines[category]) - category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long) - line_tensor = lineToTensor(line) - return category, line, category_tensor, line_tensor - -for i in range(10): - category, line, category_tensor, line_tensor = randomTrainingExample() - print('category =', category, '/ line =', line) - - -###################################################################### -# Training the Network -# -------------------- -# -# Now all it takes to train this network is show it a bunch of examples, -# have it make guesses, and tell it if it's wrong. -# -# For the loss function ``nn.NLLLoss`` is appropriate, since the last -# layer of the RNN is ``nn.LogSoftmax``. 
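-
-######################################################################
-# As a quick sanity check of that pairing (an aside using throwaway
-# tensors), ``LogSoftmax`` followed by ``NLLLoss`` computes the same value
-# as ``CrossEntropyLoss`` applied directly to raw scores:
-#
-
-_scores = torch.randn(1, n_categories)   # fake raw scores for one name
-_target = torch.tensor([4])              # fake ground-truth category index
-assert torch.allclose(
-    nn.NLLLoss()(nn.LogSoftmax(dim=1)(_scores), _target),
-    nn.CrossEntropyLoss()(_scores, _target),
-)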
-# - -criterion = nn.NLLLoss() - - -###################################################################### -# Each loop of training will: -# -# - Create input and target tensors -# - Create a zeroed initial hidden state -# - Read each letter in and -# -# - Keep hidden state for next letter -# -# - Compare final output to target -# - Back-propagate -# - Return the output and loss -# - -learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn - -def train(category_tensor, line_tensor): - hidden = rnn.initHidden() - - rnn.zero_grad() - - for i in range(line_tensor.size()[0]): - output, hidden = rnn(line_tensor[i], hidden) - - loss = criterion(output, category_tensor) - loss.backward() - - # Add parameters' gradients to their values, multiplied by learning rate - for p in rnn.parameters(): - p.data.add_(p.grad.data, alpha=-learning_rate) - - return output, loss.item() - - -###################################################################### -# Now we just have to run that with a bunch of examples. Since the -# ``train`` function returns both the output and loss we can print its -# guesses and also keep track of loss for plotting. Since there are 1000s -# of examples we print only every ``print_every`` examples, and take an -# average of the loss. -# - -import time -import math - -n_iters = 100000 -print_every = 5000 -plot_every = 1000 - - - -# Keep track of losses for plotting -current_loss = 0 -all_losses = [] - -def timeSince(since): - now = time.time() - s = now - since - m = math.floor(s / 60) - s -= m * 60 - return '%dm %ds' % (m, s) - -start = time.time() - -for iter in range(1, n_iters + 1): - category, line, category_tensor, line_tensor = randomTrainingExample() - output, loss = train(category_tensor, line_tensor) - current_loss += loss - - # Print ``iter`` number, loss, name and guess - if iter % print_every == 0: - guess, guess_i = categoryFromOutput(output) - correct = '✓' if guess == category else '✗ (%s)' % category - print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct)) - - # Add current loss avg to list of losses - if iter % plot_every == 0: - all_losses.append(current_loss / plot_every) - current_loss = 0 - - -###################################################################### -# Plotting the Results -# -------------------- -# -# Plotting the historical loss from ``all_losses`` shows the network -# learning: -# - -import matplotlib.pyplot as plt -import matplotlib.ticker as ticker - -plt.figure() -plt.plot(all_losses) - - -###################################################################### -# Evaluating the Results -# ====================== -# -# To see how well the network performs on different categories, we will -# create a confusion matrix, indicating for every actual language (rows) -# which language the network guesses (columns). To calculate the confusion -# matrix a bunch of samples are run through the network with -# ``evaluate()``, which is the same as ``train()`` minus the backprop. 
-# - -# Keep track of correct guesses in a confusion matrix -confusion = torch.zeros(n_categories, n_categories) -n_confusion = 10000 - -# Just return an output given a line -def evaluate(line_tensor): - hidden = rnn.initHidden() - - for i in range(line_tensor.size()[0]): - output, hidden = rnn(line_tensor[i], hidden) - - return output - -# Go through a bunch of examples and record which are correctly guessed -for i in range(n_confusion): - category, line, category_tensor, line_tensor = randomTrainingExample() - output = evaluate(line_tensor) - guess, guess_i = categoryFromOutput(output) - category_i = all_categories.index(category) - confusion[category_i][guess_i] += 1 - -# Normalize by dividing every row by its sum -for i in range(n_categories): - confusion[i] = confusion[i] / confusion[i].sum() - -# Set up plot -fig = plt.figure() -ax = fig.add_subplot(111) -cax = ax.matshow(confusion.numpy()) -fig.colorbar(cax) - -# Set up axes -ax.set_xticklabels([''] + all_categories, rotation=90) -ax.set_yticklabels([''] + all_categories) - -# Force label at every tick -ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) -ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) - -# sphinx_gallery_thumbnail_number = 2 -plt.show() - - -###################################################################### -# You can pick out bright spots off the main axis that show which -# languages it guesses incorrectly, e.g. Chinese for Korean, and Spanish -# for Italian. It seems to do very well with Greek, and very poorly with -# English (perhaps because of overlap with other languages). -# - - -###################################################################### -# Running on User Input -# --------------------- -# - -def predict(input_line, n_predictions=3): - print('\n> %s' % input_line) - with torch.no_grad(): - output = evaluate(lineToTensor(input_line)) - - # Get top N categories - topv, topi = output.topk(n_predictions, 1, True) - predictions = [] - - for i in range(n_predictions): - value = topv[0][i].item() - category_index = topi[0][i].item() - print('(%.2f) %s' % (value, all_categories[category_index])) - predictions.append([value, all_categories[category_index]]) - -predict('Dovesky') -predict('Jackson') -predict('Satoshi') - - -###################################################################### -# The final versions of the scripts `in the Practical PyTorch -# repo `__ -# split the above code into a few files: -# -# - ``data.py`` (loads files) -# - ``model.py`` (defines the RNN) -# - ``train.py`` (runs training) -# - ``predict.py`` (runs ``predict()`` with command line arguments) -# - ``server.py`` (serve prediction as a JSON API with ``bottle.py``) -# -# Run ``train.py`` to train and save the network. -# -# Run ``predict.py`` with a name to view predictions: -# -# .. code-block:: sh -# -# $ python predict.py Hazaki -# (-0.42) Japanese -# (-1.39) Polish -# (-3.51) Czech -# -# Run ``server.py`` and visit http://localhost:5533/Yourname to get JSON -# output of predictions. 
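-
-######################################################################
-# As a rough sketch of what such a ``server.py`` could look like (assuming
-# ``bottle`` is installed; it reuses ``evaluate``, ``lineToTensor``, and
-# ``all_categories`` defined above, which the split scripts would import
-# from their respective files):
-#
-# .. code-block:: python
-#
-#    import json
-#
-#    import torch
-#    from bottle import route, run
-#
-#    @route('/<name>')
-#    def serve_prediction(name, n_predictions=3):
-#        with torch.no_grad():
-#            output = evaluate(lineToTensor(name))
-#        topv, topi = output.topk(n_predictions, 1, True)
-#        predictions = [
-#            [topv[0][i].item(), all_categories[topi[0][i].item()]]
-#            for i in range(n_predictions)
-#        ]
-#        return json.dumps(predictions)
-#
-#    if __name__ == '__main__':
-#        run(host='localhost', port=5533)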
-# - - -###################################################################### -# Exercises -# ========= -# -# - Try with a different dataset of line -> category, for example: -# -# - Any word -> language -# - First name -> gender -# - Character name -> writer -# - Page title -> blog or subreddit -# -# - Get better results with a bigger and/or better shaped network -# -# - Add more linear layers -# - Try the ``nn.LSTM`` and ``nn.GRU`` layers -# - Combine multiple of these RNNs as a higher level network -# diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py deleted file mode 100644 index f7db4769ed..0000000000 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ /dev/null @@ -1,433 +0,0 @@ -# -*- coding: utf-8 -*- -""" -NLP From Scratch: Generating Names with a Character-Level RNN -************************************************************* -**Author**: `Sean Robertson `_ - -This is our second of three tutorials on "NLP From Scratch". -In the `first tutorial `_ -we used a RNN to classify names into their language of origin. This time -we'll turn around and generate names from languages. - -.. code-block:: sh - - > python sample.py Russian RUS - Rovakov - Uantov - Shavakov - - > python sample.py German GER - Gerren - Ereng - Rosher - - > python sample.py Spanish SPA - Salla - Parer - Allan - - > python sample.py Chinese CHI - Chan - Hang - Iun - -We are still hand-crafting a small RNN with a few linear layers. The big -difference is instead of predicting a category after reading in all the -letters of a name, we input a category and output one letter at a time. -Recurrently predicting characters to form language (this could also be -done with words or other higher order constructs) is often referred to -as a "language model". - -**Recommended Reading:** - -I assume you have at least installed PyTorch, know Python, and -understand Tensors: - -- https://pytorch.org/ For installation instructions -- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general -- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview -- :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user - -It would also be useful to know about RNNs and how they work: - -- `The Unreasonable Effectiveness of Recurrent Neural - Networks `__ - shows a bunch of real life examples -- `Understanding LSTM - Networks `__ - is about LSTMs specifically but also informative about RNNs in - general - -I also suggest the previous tutorial, :doc:`/intermediate/char_rnn_classification_tutorial` - - -Preparing the Data -================== - -.. note:: - Download the data from - `here `_ - and extract it to the current directory. - -See the last tutorial for more detail of this process. In short, there -are a bunch of plain text files ``data/names/[Language].txt`` with a -name per line. We split lines into an array, convert Unicode to ASCII, -and end up with a dictionary ``{language: [names ...]}``. 
- -""" -from io import open -import glob -import os -import unicodedata -import string - -all_letters = string.ascii_letters + " .,;'-" -n_letters = len(all_letters) + 1 # Plus EOS marker - -def findFiles(path): return glob.glob(path) - -# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 -def unicodeToAscii(s): - return ''.join( - c for c in unicodedata.normalize('NFD', s) - if unicodedata.category(c) != 'Mn' - and c in all_letters - ) - -# Read a file and split into lines -def readLines(filename): - with open(filename, encoding='utf-8') as some_file: - return [unicodeToAscii(line.strip()) for line in some_file] - -# Build the category_lines dictionary, a list of lines per category -category_lines = {} -all_categories = [] -for filename in findFiles('data/names/*.txt'): - category = os.path.splitext(os.path.basename(filename))[0] - all_categories.append(category) - lines = readLines(filename) - category_lines[category] = lines - -n_categories = len(all_categories) - -if n_categories == 0: - raise RuntimeError('Data not found. Make sure that you downloaded data ' - 'from https://download.pytorch.org/tutorial/data.zip and extract it to ' - 'the current directory.') - -print('# categories:', n_categories, all_categories) -print(unicodeToAscii("O'Néàl")) - - -###################################################################### -# Creating the Network -# ==================== -# -# This network extends `the last tutorial's RNN <#Creating-the-Network>`__ -# with an extra argument for the category tensor, which is concatenated -# along with the others. The category tensor is a one-hot vector just like -# the letter input. -# -# We will interpret the output as the probability of the next letter. When -# sampling, the most likely output letter is used as the next input -# letter. -# -# I added a second linear layer ``o2o`` (after combining hidden and -# output) to give it more muscle to work with. There's also a dropout -# layer, which `randomly zeros parts of its -# input `__ with a given probability -# (here 0.1) and is usually used to fuzz inputs to prevent overfitting. -# Here we're using it towards the end of the network to purposely add some -# chaos and increase sampling variety. -# -# .. 
figure:: https://i.imgur.com/jzVrf7f.png -# :alt: -# -# - -import torch -import torch.nn as nn - -class RNN(nn.Module): - def __init__(self, input_size, hidden_size, output_size): - super(RNN, self).__init__() - self.hidden_size = hidden_size - - self.i2h = nn.Linear(n_categories + input_size + hidden_size, hidden_size) - self.i2o = nn.Linear(n_categories + input_size + hidden_size, output_size) - self.o2o = nn.Linear(hidden_size + output_size, output_size) - self.dropout = nn.Dropout(0.1) - self.softmax = nn.LogSoftmax(dim=1) - - def forward(self, category, input, hidden): - input_combined = torch.cat((category, input, hidden), 1) - hidden = self.i2h(input_combined) - output = self.i2o(input_combined) - output_combined = torch.cat((hidden, output), 1) - output = self.o2o(output_combined) - output = self.dropout(output) - output = self.softmax(output) - return output, hidden - - def initHidden(self): - return torch.zeros(1, self.hidden_size) - - -###################################################################### -# Training -# ========= -# Preparing for Training -# ---------------------- -# -# First of all, helper functions to get random pairs of (category, line): -# - -import random - -# Random item from a list -def randomChoice(l): - return l[random.randint(0, len(l) - 1)] - -# Get a random category and random line from that category -def randomTrainingPair(): - category = randomChoice(all_categories) - line = randomChoice(category_lines[category]) - return category, line - - -###################################################################### -# For each timestep (that is, for each letter in a training word) the -# inputs of the network will be -# ``(category, current letter, hidden state)`` and the outputs will be -# ``(next letter, next hidden state)``. So for each training set, we'll -# need the category, a set of input letters, and a set of output/target -# letters. -# -# Since we are predicting the next letter from the current letter for each -# timestep, the letter pairs are groups of consecutive letters from the -# line - e.g. for ``"ABCD"`` we would create ("A", "B"), ("B", "C"), -# ("C", "D"), ("D", "EOS"). -# -# .. figure:: https://i.imgur.com/JH58tXY.png -# :alt: -# -# The category tensor is a `one-hot -# tensor `__ of size -# ``<1 x n_categories>``. When training we feed it to the network at every -# timestep - this is a design choice, it could have been included as part -# of initial hidden state or some other strategy. -# - -# One-hot vector for category -def categoryTensor(category): - li = all_categories.index(category) - tensor = torch.zeros(1, n_categories) - tensor[0][li] = 1 - return tensor - -# One-hot matrix of first to last letters (not including EOS) for input -def inputTensor(line): - tensor = torch.zeros(len(line), 1, n_letters) - for li in range(len(line)): - letter = line[li] - tensor[li][0][all_letters.find(letter)] = 1 - return tensor - -# ``LongTensor`` of second letter to end (EOS) for target -def targetTensor(line): - letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))] - letter_indexes.append(n_letters - 1) # EOS - return torch.LongTensor(letter_indexes) - - -###################################################################### -# For convenience during training we'll make a ``randomTrainingExample`` -# function that fetches a random (category, line) pair and turns them into -# the required (category, input, target) tensors. 
-# - -# Make category, input, and target tensors from a random category, line pair -def randomTrainingExample(): - category, line = randomTrainingPair() - category_tensor = categoryTensor(category) - input_line_tensor = inputTensor(line) - target_line_tensor = targetTensor(line) - return category_tensor, input_line_tensor, target_line_tensor - - -###################################################################### -# Training the Network -# -------------------- -# -# In contrast to classification, where only the last output is used, we -# are making a prediction at every step, so we are calculating loss at -# every step. -# -# The magic of autograd allows you to simply sum these losses at each step -# and call backward at the end. -# - -criterion = nn.NLLLoss() - -learning_rate = 0.0005 - -def train(category_tensor, input_line_tensor, target_line_tensor): - target_line_tensor.unsqueeze_(-1) - hidden = rnn.initHidden() - - rnn.zero_grad() - - loss = torch.Tensor([0]) # you can also just simply use ``loss = 0`` - - for i in range(input_line_tensor.size(0)): - output, hidden = rnn(category_tensor, input_line_tensor[i], hidden) - l = criterion(output, target_line_tensor[i]) - loss += l - - loss.backward() - - for p in rnn.parameters(): - p.data.add_(p.grad.data, alpha=-learning_rate) - - return output, loss.item() / input_line_tensor.size(0) - - -###################################################################### -# To keep track of how long training takes I am adding a -# ``timeSince(timestamp)`` function which returns a human readable string: -# - -import time -import math - -def timeSince(since): - now = time.time() - s = now - since - m = math.floor(s / 60) - s -= m * 60 - return '%dm %ds' % (m, s) - - -###################################################################### -# Training is business as usual - call train a bunch of times and wait a -# few minutes, printing the current time and loss every ``print_every`` -# examples, and keeping store of an average loss per ``plot_every`` examples -# in ``all_losses`` for plotting later. -# - -rnn = RNN(n_letters, 128, n_letters) - -n_iters = 100000 -print_every = 5000 -plot_every = 500 -all_losses = [] -total_loss = 0 # Reset every ``plot_every`` ``iters`` - -start = time.time() - -for iter in range(1, n_iters + 1): - output, loss = train(*randomTrainingExample()) - total_loss += loss - - if iter % print_every == 0: - print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss)) - - if iter % plot_every == 0: - all_losses.append(total_loss / plot_every) - total_loss = 0 - - -###################################################################### -# Plotting the Losses -# ------------------- -# -# Plotting the historical loss from all\_losses shows the network -# learning: -# - -import matplotlib.pyplot as plt - -plt.figure() -plt.plot(all_losses) - - -###################################################################### -# Sampling the Network -# ==================== -# -# To sample we give the network a letter and ask what the next one is, -# feed that in as the next letter, and repeat until the EOS token. 
-# -# - Create tensors for input category, starting letter, and empty hidden -# state -# - Create a string ``output_name`` with the starting letter -# - Up to a maximum output length, -# -# - Feed the current letter to the network -# - Get the next letter from highest output, and next hidden state -# - If the letter is EOS, stop here -# - If a regular letter, add to ``output_name`` and continue -# -# - Return the final name -# -# .. note:: -# Rather than having to give it a starting letter, another -# strategy would have been to include a "start of string" token in -# training and have the network choose its own starting letter. -# - -max_length = 20 - -# Sample from a category and starting letter -def sample(category, start_letter='A'): - with torch.no_grad(): # no need to track history in sampling - category_tensor = categoryTensor(category) - input = inputTensor(start_letter) - hidden = rnn.initHidden() - - output_name = start_letter - - for i in range(max_length): - output, hidden = rnn(category_tensor, input[0], hidden) - topv, topi = output.topk(1) - topi = topi[0][0] - if topi == n_letters - 1: - break - else: - letter = all_letters[topi] - output_name += letter - input = inputTensor(letter) - - return output_name - -# Get multiple samples from one category and multiple starting letters -def samples(category, start_letters='ABC'): - for start_letter in start_letters: - print(sample(category, start_letter)) - -samples('Russian', 'RUS') - -samples('German', 'GER') - -samples('Spanish', 'SPA') - -samples('Chinese', 'CHI') - - -###################################################################### -# Exercises -# ========= -# -# - Try with a different dataset of category -> line, for example: -# -# - Fictional series -> Character name -# - Part of speech -> Word -# - Country -> City -# -# - Use a "start of sentence" token so that sampling can be done without -# choosing a start letter -# - Get better results with a bigger and/or better shaped network -# -# - Try the ``nn.LSTM`` and ``nn.GRU`` layers -# - Combine multiple of these RNNs as a higher level network -# diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 932e930102..ff66a0a0a0 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -11,37 +11,35 @@ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn :class-card: card-prerequisites - * How compiled autograd interacts with torch.compile + * How compiled autograd interacts with ``torch.compile`` * How to use the compiled autograd API - * How to inspect logs using TORCH_LOGS + * How to inspect logs using ``TORCH_LOGS`` .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites :class-card: card-prerequisites * PyTorch 2.4 - * `torch.compile `_ familiarity + * Complete the `Introduction to torch.compile `_ """ ###################################################################### # Overview # ------------ -# Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 +# Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 # that allows the capture of a larger backward graph. # -# Doesn't torch.compile already capture the backward graph? -# ------------ -# And it does, **partially**. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# 1. Graph breaks in the forward lead to graph breaks in the backward -# 2. 
`Backward hooks `_ are not captured +# While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: +# * Graph breaks in the forward lead to graph breaks in the backward +# * `Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # -# However, Compiled Autograd has its own limitations: -# 1. Additional runtime overhead at the start of the backward -# 2. Dynamic autograd structure leads to recompiles +# However, Compiled Autograd introduces its own limitations: +# * Added runtime overhead at the start of the backward for cache lookup +# * More prone to recompiles and graph breaks in dynamo due to the larger capture # # .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. # @@ -50,8 +48,9 @@ ###################################################################### # Setup # ------------ -# In this tutorial, we'll base our examples on this toy model. -# +# In this tutorial, we will base our examples on this simple neural network model. +# It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. +# import torch @@ -67,7 +66,7 @@ def forward(self, x): ###################################################################### # Basic usage # ------------ -# .. note:: The ``torch._dynamo.config.compiled_autograd = True`` config must be enabled before calling the torch.compile API. +# Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: # model = Model() @@ -82,23 +81,30 @@ def train(model, x): train(model, x) ###################################################################### -# Inspecting the compiled autograd logs -# ------------ -# Run the script with the TORCH_LOGS environment variables: -# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` -# - To print the graph with more tensor medata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` +# In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +# We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. +# +# When ``train(model, x)`` is called: +# * Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` +# * Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph +# * AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. 
+# * Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward +# * Dynamo sets the optimized function to be evaluated next by Python Interpreter +# * Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` +# * Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` +# * Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode +# * The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward # - -@torch.compile -def train(model, x): - loss = model(x).sum() - loss.backward() - -train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stderr. Certain graph nodes will have names that are prefixed by ``aot0_``, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0 e.g. ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. +# Inspecting the compiled autograd logs +# ------------------------------------- +# Run the script with the ``TORCH_LOGS`` environment variables: +# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +# - To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` +# +# Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``, +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. # stderr_output = """ @@ -156,17 +162,19 @@ def forward(self, inputs, sizes, scalars, hooks): """ ###################################################################### -# .. note:: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd generates some python code to represent the entire C++ autograd execution. +# .. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. # ###################################################################### # Compiling the forward and backward pass using different flags -# ------------ -# +# ------------------------------------------------------------- +# You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. 
+# def train(model, x): model = torch.compile(model) loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True torch.compile(lambda: loss.backward(), fullgraph=True)() ###################################################################### @@ -182,7 +190,7 @@ def train(model, x): ###################################################################### # Compiled Autograd addresses certain limitations of AOTAutograd -# ------------ +# -------------------------------------------------------------- # 1. Graph breaks in the forward lead to graph breaks in the backward # @@ -252,7 +260,7 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # Common recompilation reasons for Compiled Autograd -# ------------ +# -------------------------------------------------- # 1. Due to change in autograd structure torch._dynamo.config.compiled_autograd = True @@ -302,7 +310,5 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # Conclusion # ---------- -# In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. -# -# For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. +# In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. # diff --git a/intermediate_source/custom_function_conv_bn_tutorial.py b/intermediate_source/custom_function_conv_bn_tutorial.py deleted file mode 100644 index a9fcd8838a..0000000000 --- a/intermediate_source/custom_function_conv_bn_tutorial.py +++ /dev/null @@ -1,394 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Fusing Convolution and Batch Norm using Custom Function -======================================================= - -Fusing adjacent convolution and batch norm layers together is typically an -inference-time optimization to improve run-time. It is usually achieved -by eliminating the batch norm layer entirely and updating the weight -and bias of the preceding convolution [0]. However, this technique is not -applicable for training models. - -In this tutorial, we will show a different technique to fuse the two layers -that can be applied during training. Rather than improved runtime, the -objective of this optimization is to reduce memory usage. - -The idea behind this optimization is to see that both convolution and -batch norm (as well as many other ops) need to save a copy of their input -during forward for the backward pass. For large -batch sizes, these saved inputs are responsible for most of your memory usage, -so being able to avoid allocating another input tensor for every -convolution batch norm pair can be a significant reduction. - -In this tutorial, we avoid this extra allocation by combining convolution -and batch norm into a single layer (as a custom function). In the forward -of this combined layer, we perform normal convolution and batch norm as-is, -with the only difference being that we will only save the inputs to the convolution. -To obtain the input of batch norm, which is necessary to backward through -it, we recompute convolution forward again during the backward pass. - -It is important to note that the usage of this optimization is situational. 
-Though (by avoiding one buffer saved) we always reduce the memory allocated at -the end of the forward pass, there are cases when the *peak* memory allocated -may not actually be reduced. See the final section for more details. - -For simplicity, in this tutorial we hardcode `bias=False`, `stride=1`, `padding=0`, `dilation=1`, -and `groups=1` for Conv2D. For BatchNorm2D, we hardcode `eps=1e-3`, `momentum=0.1`, -`affine=False`, and `track_running_statistics=False`. Another small difference -is that we add epsilon in the denominator outside of the square root in the computation -of batch norm. - -[0] https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ -""" - -###################################################################### -# Backward Formula Implementation for Convolution -# ------------------------------------------------------------------- -# Implementing a custom function requires us to implement the backward -# ourselves. In this case, we need both the backward formulas for Conv2D -# and BatchNorm2D. Eventually we'd chain them together in our unified -# backward function, but below we first implement them as their own -# custom functions so we can validate their correctness individually -import torch -from torch.autograd.function import once_differentiable -import torch.nn.functional as F - -def convolution_backward(grad_out, X, weight): - grad_input = F.conv2d(X.transpose(0, 1), grad_out.transpose(0, 1)).transpose(0, 1) - grad_X = F.conv_transpose2d(grad_out, weight) - return grad_X, grad_input - -class Conv2D(torch.autograd.Function): - @staticmethod - def forward(ctx, X, weight): - ctx.save_for_backward(X, weight) - return F.conv2d(X, weight) - - # Use @once_differentiable by default unless we intend to double backward - @staticmethod - @once_differentiable - def backward(ctx, grad_out): - X, weight = ctx.saved_tensors - return convolution_backward(grad_out, X, weight) - -###################################################################### -# When testing with ``gradcheck``, it is important to use double precision -weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double) -X = torch.rand(10, 3, 7, 7, requires_grad=True, dtype=torch.double) -torch.autograd.gradcheck(Conv2D.apply, (X, weight)) - -###################################################################### -# Backward Formula Implementation for Batch Norm -# ------------------------------------------------------------------- -# Batch Norm has two modes: training and ``eval`` mode. In training mode -# the sample statistics are a function of the inputs. In ``eval`` mode, -# we use the saved running statistics, which are not a function of the inputs. -# This makes non-training mode's backward significantly simpler. Below -# we implement and test only the training mode case. -def unsqueeze_all(t): - # Helper function to ``unsqueeze`` all the dimensions that we reduce over - return t[None, :, None, None] - -def batch_norm_backward(grad_out, X, sum, sqrt_var, N, eps): - # We use the formula: ``out = (X - mean(X)) / (sqrt(var(X)) + eps)`` - # in batch norm 2D forward. To simplify our derivation, we follow the - # chain rule and compute the gradients as follows before accumulating - # them all into a final grad_input. 
- # 1) ``grad of out wrt var(X)`` * ``grad of var(X) wrt X`` - # 2) ``grad of out wrt mean(X)`` * ``grad of mean(X) wrt X`` - # 3) ``grad of out wrt X in the numerator`` * ``grad of X wrt X`` - # We then rewrite the formulas to use as few extra buffers as possible - tmp = ((X - unsqueeze_all(sum) / N) * grad_out).sum(dim=(0, 2, 3)) - tmp *= -1 - d_denom = tmp / (sqrt_var + eps)**2 # ``d_denom = -num / denom**2`` - # It is useful to delete tensors when you no longer need them with ``del`` - # For example, we could've done ``del tmp`` here because we won't use it later - # In this case, it's not a big difference because ``tmp`` only has size of (C,) - # The important thing is avoid allocating NCHW-sized tensors unnecessarily - d_var = d_denom / (2 * sqrt_var) # ``denom = torch.sqrt(var) + eps`` - # Compute ``d_mean_dx`` before allocating the final NCHW-sized grad_input buffer - d_mean_dx = grad_out / unsqueeze_all(sqrt_var + eps) - d_mean_dx = unsqueeze_all(-d_mean_dx.sum(dim=(0, 2, 3)) / N) - # ``d_mean_dx`` has already been reassigned to a C-sized buffer so no need to worry - - # ``(1) unbiased_var(x) = ((X - unsqueeze_all(mean))**2).sum(dim=(0, 2, 3)) / (N - 1)`` - grad_input = X * unsqueeze_all(d_var * N) - grad_input += unsqueeze_all(-d_var * sum) - grad_input *= 2 / ((N - 1) * N) - # (2) mean (see above) - grad_input += d_mean_dx - # (3) Add 'grad_out / ' without allocating an extra buffer - grad_input *= unsqueeze_all(sqrt_var + eps) - grad_input += grad_out - grad_input /= unsqueeze_all(sqrt_var + eps) # ``sqrt_var + eps > 0!`` - return grad_input - -class BatchNorm(torch.autograd.Function): - @staticmethod - def forward(ctx, X, eps=1e-3): - # Don't save ``keepdim`` values for backward - sum = X.sum(dim=(0, 2, 3)) - var = X.var(unbiased=True, dim=(0, 2, 3)) - N = X.numel() / X.size(1) - sqrt_var = torch.sqrt(var) - ctx.save_for_backward(X) - ctx.eps = eps - ctx.sum = sum - ctx.N = N - ctx.sqrt_var = sqrt_var - mean = sum / N - denom = sqrt_var + eps - out = X - unsqueeze_all(mean) - out /= unsqueeze_all(denom) - return out - - @staticmethod - @once_differentiable - def backward(ctx, grad_out): - X, = ctx.saved_tensors - return batch_norm_backward(grad_out, X, ctx.sum, ctx.sqrt_var, ctx.N, ctx.eps) - -###################################################################### -# Testing with ``gradcheck`` -a = torch.rand(1, 2, 3, 4, requires_grad=True, dtype=torch.double) -torch.autograd.gradcheck(BatchNorm.apply, (a,), fast_mode=False) - -###################################################################### -# Fusing Convolution and BatchNorm -# ------------------------------------------------------------------- -# Now that the bulk of the work has been done, we can combine -# them together. Note that in (1) we only save a single buffer -# for backward, but this also means we recompute convolution forward -# in (5). Also see that in (2), (3), (4), and (6), it's the same -# exact code as the examples above. -class FusedConvBN2DFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, X, conv_weight, eps=1e-3): - assert X.ndim == 4 # N, C, H, W - # (1) Only need to save this single buffer for backward! 
- ctx.save_for_backward(X, conv_weight) - - # (2) Exact same Conv2D forward from example above - X = F.conv2d(X, conv_weight) - # (3) Exact same BatchNorm2D forward from example above - sum = X.sum(dim=(0, 2, 3)) - var = X.var(unbiased=True, dim=(0, 2, 3)) - N = X.numel() / X.size(1) - sqrt_var = torch.sqrt(var) - ctx.eps = eps - ctx.sum = sum - ctx.N = N - ctx.sqrt_var = sqrt_var - mean = sum / N - denom = sqrt_var + eps - # Try to do as many things in-place as possible - # Instead of `out = (X - a) / b`, doing `out = X - a; out /= b` - # avoids allocating one extra NCHW-sized buffer here - out = X - unsqueeze_all(mean) - out /= unsqueeze_all(denom) - return out - - @staticmethod - def backward(ctx, grad_out): - X, conv_weight, = ctx.saved_tensors - # (4) Batch norm backward - # (5) We need to recompute conv - X_conv_out = F.conv2d(X, conv_weight) - grad_out = batch_norm_backward(grad_out, X_conv_out, ctx.sum, ctx.sqrt_var, - ctx.N, ctx.eps) - # (6) Conv2d backward - grad_X, grad_input = convolution_backward(grad_out, X, conv_weight) - return grad_X, grad_input, None, None, None, None, None - -###################################################################### -# The next step is to wrap our functional variant in a stateful -# `nn.Module` -import torch.nn as nn -import math - -class FusedConvBN(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, exp_avg_factor=0.1, - eps=1e-3, device=None, dtype=None): - super(FusedConvBN, self).__init__() - factory_kwargs = {'device': device, 'dtype': dtype} - # Conv parameters - weight_shape = (out_channels, in_channels, kernel_size, kernel_size) - self.conv_weight = nn.Parameter(torch.empty(*weight_shape, **factory_kwargs)) - # Batch norm parameters - num_features = out_channels - self.num_features = num_features - self.eps = eps - # Initialize - self.reset_parameters() - - def forward(self, X): - return FusedConvBN2DFunction.apply(X, self.conv_weight, self.eps) - - def reset_parameters(self) -> None: - nn.init.kaiming_uniform_(self.conv_weight, a=math.sqrt(5)) - -###################################################################### -# Use ``gradcheck`` to validate the correctness of our backward formula -weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double) -X = torch.rand(2, 3, 4, 4, requires_grad=True, dtype=torch.double) -torch.autograd.gradcheck(FusedConvBN2DFunction.apply, (X, weight)) - -###################################################################### -# Testing out our new Layer -# ------------------------------------------------------------------- -# Use ``FusedConvBN`` to train a basic network -# The code below is after some light modifications to the example here: -# https://github.com/pytorch/examples/tree/master/mnist -import torch.optim as optim -from torchvision import datasets, transforms -from torch.optim.lr_scheduler import StepLR - -# Record memory allocated at the end of the forward pass -memory_allocated = [[],[]] - -class Net(nn.Module): - def __init__(self, fused=True): - super(Net, self).__init__() - self.fused = fused - if fused: - self.convbn1 = FusedConvBN(1, 32, 3) - self.convbn2 = FusedConvBN(32, 64, 3) - else: - self.conv1 = nn.Conv2d(1, 32, 3, 1, bias=False) - self.bn1 = nn.BatchNorm2d(32, affine=False, track_running_stats=False) - self.conv2 = nn.Conv2d(32, 64, 3, 1, bias=False) - self.bn2 = nn.BatchNorm2d(64, affine=False, track_running_stats=False) - self.fc1 = nn.Linear(9216, 128) - self.dropout = nn.Dropout(0.5) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): 
- if self.fused: - x = self.convbn1(x) - else: - x = self.conv1(x) - x = self.bn1(x) - F.relu_(x) - if self.fused: - x = self.convbn2(x) - else: - x = self.conv2(x) - x = self.bn2(x) - F.relu_(x) - x = F.max_pool2d(x, 2) - F.relu_(x) - x = x.flatten(1) - x = self.fc1(x) - x = self.dropout(x) - F.relu_(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - if fused: - memory_allocated[0].append(torch.cuda.memory_allocated()) - else: - memory_allocated[1].append(torch.cuda.memory_allocated()) - return output - -def train(model, device, train_loader, optimizer, epoch): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % 2 == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) - -def test(model, device, test_loader): - model.eval() - test_loss = 0 - correct = 0 - # Use inference mode instead of no_grad, for free improved test-time performance - with torch.inference_mode(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, reduction='sum').item() - # get the index of the max log-probability - pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) - -use_cuda = torch.cuda.is_available() -device = torch.device("cuda" if use_cuda else "cpu") -train_kwargs = {'batch_size': 2048} -test_kwargs = {'batch_size': 2048} - -if use_cuda: - cuda_kwargs = {'num_workers': 1, - 'pin_memory': True, - 'shuffle': True} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - -transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) -]) -dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) -dataset2 = datasets.MNIST('../data', train=False, - transform=transform) -train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) -test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) - -###################################################################### -# A Comparison of Memory Usage -# ------------------------------------------------------------------- -# If CUDA is enabled, print out memory usage for both `fused=True` and `fused=False` -# For an example run on NVIDIA GeForce RTX 3070, NVIDIA CUDA® Deep Neural Network library (cuDNN) 8.0.5: fused peak memory: 1.56GB, -# unfused peak memory: 2.68GB -# -# It is important to note that the *peak* memory usage for this model may vary depending -# the specific cuDNN convolution algorithm used. For shallower models, it -# may be possible for the peak memory allocated of the fused model to exceed -# that of the unfused model! This is because the memory allocated to compute -# certain cuDNN convolution algorithms can be high enough to "hide" the typical peak -# you would expect to be near the start of the backward pass. 
-# -# For this reason, we also record and display the memory allocated at the end -# of the forward pass as an approximation, and to demonstrate that we indeed -# allocate one fewer buffer per fused ``conv-bn`` pair. -from statistics import mean - -torch.backends.cudnn.enabled = True - -if use_cuda: - peak_memory_allocated = [] - - for fused in (True, False): - torch.manual_seed(123456) - - model = Net(fused=fused).to(device) - optimizer = optim.Adadelta(model.parameters(), lr=1.0) - scheduler = StepLR(optimizer, step_size=1, gamma=0.7) - - for epoch in range(1): - train(model, device, train_loader, optimizer, epoch) - test(model, device, test_loader) - scheduler.step() - peak_memory_allocated.append(torch.cuda.max_memory_allocated()) - torch.cuda.reset_peak_memory_stats() - print("cuDNN version:", torch.backends.cudnn.version()) - print() - print("Peak memory allocated:") - print(f"fused: {peak_memory_allocated[0]/1024**3:.2f}GB, unfused: {peak_memory_allocated[1]/1024**3:.2f}GB") - print("Memory allocated at end of forward pass:") - print(f"fused: {mean(memory_allocated[0])/1024**3:.2f}GB, unfused: {mean(memory_allocated[1])/1024**3:.2f}GB") - - diff --git a/intermediate_source/custom_function_double_backward_tutorial.rst b/intermediate_source/custom_function_double_backward_tutorial.rst deleted file mode 100644 index bbb7701f18..0000000000 --- a/intermediate_source/custom_function_double_backward_tutorial.rst +++ /dev/null @@ -1,301 +0,0 @@ -Double Backward with Custom Functions -===================================== - -It is sometimes useful to run backwards twice through backward graph, for -example to compute higher-order gradients. It takes an understanding of -autograd and some care to support double backwards, however. Functions -that support performing backward a single time are not necessarily -equipped to support double backward. In this tutorial we show how to -write a custom autograd function that supports double backward, and -point out some things to look out for. - - -When writing a custom autograd function to backward through twice, -it is important to know when operations performed in a custom function -are recorded by autograd, when they aren't, and most importantly, how -`save_for_backward` works with all of this. - -Custom functions implicitly affects grad mode in two ways: - -- During forward, autograd does not record any the graph for any - operations performed within the forward function. When forward - completes, the backward function of the custom function - becomes the `grad_fn` of each of the forward's outputs - -- During backward, autograd records the computation graph used to - compute the backward pass if create_graph is specified - -Next, to understand how `save_for_backward` interacts with the above, -we can explore a couple examples: - - -Saving the Inputs -------------------------------------------------------------------- -Consider this simple squaring function. It saves an input tensor -for backward. Double backward works automatically when autograd -is able to record operations in the backward pass, so there is usually -nothing to worry about when we save an input for backward as -the input should have grad_fn if it is a function of any tensor -that requires grad. This allows the gradients to be properly propagated. - -.. 
code:: python - - import torch - - class Square(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - # Because we are saving one of the inputs use `save_for_backward` - # Save non-tensors and non-inputs/non-outputs directly on ctx - ctx.save_for_backward(x) - return x**2 - - @staticmethod - def backward(ctx, grad_out): - # A function support double backward automatically if autograd - # is able to record the computations performed in backward - x, = ctx.saved_tensors - return grad_out * 2 * x - - # Use double precision because finite differencing method magnifies errors - x = torch.rand(3, 3, requires_grad=True, dtype=torch.double) - torch.autograd.gradcheck(Square.apply, x) - # Use gradcheck to verify second-order derivatives - torch.autograd.gradgradcheck(Square.apply, x) - - -We can use torchviz to visualize the graph to see why this works - -.. code-block:: python - - import torchviz - - x = torch.tensor(1., requires_grad=True).clone() - out = Square.apply(x) - grad_x, = torch.autograd.grad(out, x, create_graph=True) - torchviz.make_dot((grad_x, x, out), {"grad_x": grad_x, "x": x, "out": out}) - -We can see that the gradient wrt to x, is itself a function of x (dout/dx = 2x) -And the graph of this function has been properly constructed - -.. image:: https://user-images.githubusercontent.com/13428986/126559699-e04f3cb1-aaf2-4a9a-a83d-b8767d04fbd9.png - :width: 400 - - -Saving the Outputs -------------------------------------------------------------------- -A slight variation on the previous example is to save an output -instead of input. The mechanics are similar because outputs are also -associated with a grad_fn. - -.. code-block:: python - - class Exp(torch.autograd.Function): - # Simple case where everything goes well - @staticmethod - def forward(ctx, x): - # This time we save the output - result = torch.exp(x) - # Note that we should use `save_for_backward` here when - # the tensor saved is an ouptut (or an input). - ctx.save_for_backward(result) - return result - - @staticmethod - def backward(ctx, grad_out): - result, = ctx.saved_tensors - return result * grad_out - - x = torch.tensor(1., requires_grad=True, dtype=torch.double).clone() - # Validate our gradients using gradcheck - torch.autograd.gradcheck(Exp.apply, x) - torch.autograd.gradgradcheck(Exp.apply, x) - -Use torchviz to visualize the graph: - -.. code-block:: python - - out = Exp.apply(x) - grad_x, = torch.autograd.grad(out, x, create_graph=True) - torchviz.make_dot((grad_x, x, out), {"grad_x": grad_x, "x": x, "out": out}) - -.. image:: https://user-images.githubusercontent.com/13428986/126559780-d141f2ba-1ee8-4c33-b4eb-c9877b27a954.png - :width: 332 - - -Saving Intermediate Results -------------------------------------------------------------------- -A more tricky case is when we need to save an intermediate result. -We demonstrate this case by implementing: - -.. math:: - sinh(x) := \frac{e^x - e^{-x}}{2} - -Since the derivative of sinh is cosh, it might be useful to reuse -`exp(x)` and `exp(-x)`, the two intermediate results in forward -in the backward computation. - -Intermediate results should not be directly saved and used in backward though. -Because forward is performed in no-grad mode, if an intermediate result -of the forward pass is used to compute gradients in the backward pass -the backward graph of the gradients would not include the operations -that computed the intermediate result. This leads to incorrect gradients. - -.. 
code-block:: python - - class Sinh(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - expx = torch.exp(x) - expnegx = torch.exp(-x) - ctx.save_for_backward(expx, expnegx) - # In order to be able to save the intermediate results, a trick is to - # include them as our outputs, so that the backward graph is constructed - return (expx - expnegx) / 2, expx, expnegx - - @staticmethod - def backward(ctx, grad_out, _grad_out_exp, _grad_out_negexp): - expx, expnegx = ctx.saved_tensors - grad_input = grad_out * (expx + expnegx) / 2 - # We cannot skip accumulating these even though we won't use the outputs - # directly. They will be used later in the second backward. - grad_input += _grad_out_exp * expx - grad_input -= _grad_out_negexp * expnegx - return grad_input - - def sinh(x): - # Create a wrapper that only returns the first output - return Sinh.apply(x)[0] - - x = torch.rand(3, 3, requires_grad=True, dtype=torch.double) - torch.autograd.gradcheck(sinh, x) - torch.autograd.gradgradcheck(sinh, x) - - -Use torchviz to visualize the graph: - -.. code-block:: python - - out = sinh(x) - grad_x, = torch.autograd.grad(out.sum(), x, create_graph=True) - torchviz.make_dot((grad_x, x, out), params={"grad_x": grad_x, "x": x, "out": out}) - -.. image:: https://user-images.githubusercontent.com/13428986/126560494-e48eba62-be84-4b29-8c90-a7f6f40b1438.png - :width: 460 - - -Saving Intermediate Results: What not to do -------------------------------------------------------------------- -Now we show what happens when we don't also return our intermediate -results as outputs: `grad_x` would not even have a backward graph -because it is purely a function `exp` and `expnegx`, which don't -require grad. - -.. code-block:: python - - class SinhBad(torch.autograd.Function): - # This is an example of what NOT to do! - @staticmethod - def forward(ctx, x): - expx = torch.exp(x) - expnegx = torch.exp(-x) - ctx.expx = expx - ctx.expnegx = expnegx - return (expx - expnegx) / 2 - - @staticmethod - def backward(ctx, grad_out): - expx = ctx.expx - expnegx = ctx.expnegx - grad_input = grad_out * (expx + expnegx) / 2 - return grad_input - - -Use torchviz to visualize the graph. Notice that `grad_x` is not -part of the graph! - -.. code-block:: python - - out = SinhBad.apply(x) - grad_x, = torch.autograd.grad(out.sum(), x, create_graph=True) - torchviz.make_dot((grad_x, x, out), params={"grad_x": grad_x, "x": x, "out": out}) - -.. image:: https://user-images.githubusercontent.com/13428986/126565889-13992f01-55bc-411a-8aee-05b721fe064a.png - :width: 232 - - - -When Backward is not Tracked -------------------------------------------------------------------- -Finally, let's consider an example when it may not be possible for -autograd to track gradients for a functions backward at all. -We can imagine cube_backward to be a function that may require a -non-PyTorch library like SciPy or NumPy, or written as a -C++ extension. The workaround demonstrated here is to create another -custom function CubeBackward where you also manually specify the -backward of cube_backward! - - -.. 
code-block:: python - - def cube_forward(x): - return x**3 - - def cube_backward(grad_out, x): - return grad_out * 3 * x**2 - - def cube_backward_backward(grad_out, sav_grad_out, x): - return grad_out * sav_grad_out * 6 * x - - def cube_backward_backward_grad_out(grad_out, x): - return grad_out * 3 * x**2 - - class Cube(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return cube_forward(x) - - @staticmethod - def backward(ctx, grad_out): - x, = ctx.saved_tensors - return CubeBackward.apply(grad_out, x) - - class CubeBackward(torch.autograd.Function): - @staticmethod - def forward(ctx, grad_out, x): - ctx.save_for_backward(x, grad_out) - return cube_backward(grad_out, x) - - @staticmethod - def backward(ctx, grad_out): - x, sav_grad_out = ctx.saved_tensors - dx = cube_backward_backward(grad_out, sav_grad_out, x) - dgrad_out = cube_backward_backward_grad_out(grad_out, x) - return dgrad_out, dx - - x = torch.tensor(2., requires_grad=True, dtype=torch.double) - - torch.autograd.gradcheck(Cube.apply, x) - torch.autograd.gradgradcheck(Cube.apply, x) - - -Use torchviz to visualize the graph: - -.. code-block:: python - - out = Cube.apply(x) - grad_x, = torch.autograd.grad(out, x, create_graph=True) - torchviz.make_dot((grad_x, x, out), params={"grad_x": grad_x, "x": x, "out": out}) - -.. image:: https://user-images.githubusercontent.com/13428986/126559935-74526b4d-d419-4983-b1f0-a6ee99428531.png - :width: 352 - - -To conclude, whether double backward works for your custom function -simply depends on whether the backward pass can be tracked by autograd. -With the first two examples we show situations where double backward -works out of the box. With the third and fourth examples, we demonstrate -techniques that enable a backward function to be tracked, when they -otherwise would not be. - diff --git a/intermediate_source/ddp_series_minGPT.rst b/intermediate_source/ddp_series_minGPT.rst deleted file mode 100644 index 259db3623c..0000000000 --- a/intermediate_source/ddp_series_minGPT.rst +++ /dev/null @@ -1,87 +0,0 @@ -`Introduction <../beginner/ddp_series_intro.html>`__ \|\| `What is DDP <../beginner/ddp_series_theory.html>`__ \|\| `Single-Node -Multi-GPU Training <../beginner/ddp_series_multigpu.html>`__ \|\| `Fault -Tolerance <../beginner/ddp_series_fault_tolerance.html>`__ \|\| `Multi-Node -training `__ \|\| **minGPT Training** - -Training “real-world” models with DDP -===================================== - -Authors: `Suraj Subramanian `__ - -.. grid:: 2 - - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - - Best practices when writing a distributed training script - - Increased flexibility with saving/loading artifacts in the cloud - - When DDP is NOT suitable - - .. grid:: 1 - - .. grid-item:: - - :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - - Familiarity with `multi-GPU training <../beginner/ddp_series_multigpu.html>`__ and `torchrun <../beginner/ddp_series_fault_tolerance.html>`__ - - [Optional] Familiarity with `multinode training `__ - - 2 or more TCP-reachable GPU machines (this tutorial uses AWS p3.2xlarge instances) - - PyTorch `installed `__ with CUDA on all machines - -Follow along with the video below or on `youtube `__. - -.. raw:: html - -
- -
- -In this video, we will review the process of training a GPT model in multinode DDP. -We first clone the `minGPT repo `__ and refactor the Trainer -to resemble the structure we have used in this series. Watch the video for details on these changes. - -We use `hydra `__ to centrally manage all the configurations for our training run. -Once the code has been refactored, we run it first on a single-node with 4 GPUs, and then on a slurm cluster. - -Files used for training -~~~~~~~~~~~~~~~~~~~~~~~~ -- `trainer.py `__ includes the Trainer class that runs the distributed training iterations on the model with the provided dataset. -- `model.py `__ defines the model architecture. -- `char_dataset.py `__ contains the ``Dataset`` class for a character-level dataset. -- `gpt2_train_cfg.yaml `__ contains the configurations for data, model, optimizer, and training run. -- `main.py `__ is the entry point to the training job. It sets up the DDP process group, reads all the configurations and runs the training job. - - -Saving and Loading from the cloud -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the video above, we save training snapshots directly to the cloud. This gives us the flexibility to continue training -from any node that has access to the cloud bucket. - - -Using Mixed Precision -~~~~~~~~~~~~~~~~~~~~~~~~ -To speed things up, you might be able to use `Mixed Precision `__ to train your models. -In Mixed Precision, some parts of the training process are carried out in reduced precision, while other steps -that are more sensitive to precision drops are maintained in FP32 precision. - - -When is DDP not enough? -~~~~~~~~~~~~~~~~~~~~~~~~ -A typical training run's memory footprint consists of model weights, activations, gradients, the input batch, and the optimizer state. -Since DDP replicates the model on each GPU, it only works when GPUs have sufficient capacity to accomodate the full footprint. -When models grow larger, more aggressive techniques might be useful: - -- `activation checkpointing `__: Instead of saving intermediate activations during the forward pass, the activations are recomputed during the backward pass. In this approach, we run more compute but save on memory footprint. -- `Fully-Sharded Data Parallel `__: Here the model is not replicated but "sharded" across all the GPUs, and computation is overlapped with communication in the forward and backward passes. Read our `blog `__ to learn how we trained a 1 Trillion parameter model with FSDP. - - -Further Reading ---------------- -- `Multi-Node training with DDP `__ (previous tutorial in this series) -- `Mixed Precision training `__ -- `Fully-Sharded Data Parallel `__ -- `Training a 1T parameter model with FSDP `__ -- `FSDP Video Tutorial Series `__ diff --git a/intermediate_source/ddp_series_multinode.rst b/intermediate_source/ddp_series_multinode.rst deleted file mode 100644 index 5717589bda..0000000000 --- a/intermediate_source/ddp_series_multinode.rst +++ /dev/null @@ -1,94 +0,0 @@ -`Introduction <../beginner/ddp_series_intro.html>`__ \|\| `What is DDP <../beginner/ddp_series_theory.html>`__ \|\| `Single-Node -Multi-GPU Training <../beginner/ddp_series_multigpu.html>`__ \|\| `Fault -Tolerance <../beginner/ddp_series_fault_tolerance.html>`__ \|\| **Multi-Node -training** \|\| `minGPT Training `__ - -Multinode Training -================== - -Authors: `Suraj Subramanian `__ - -.. grid:: 2 - - .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - - Launching multinode training jobs with ``torchrun`` - - Code changes (and things to keep in mind) when moving from single-node to multinode training. - - .. grid:: 1 - - .. grid-item:: - - :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - - Familiarity with `multi-GPU training <../beginner/ddp_series_multigpu.html>`__ and `torchrun <../beginner/ddp_series_fault_tolerance.html>`__ - - 2 or more TCP-reachable GPU machines (this tutorial uses AWS p3.2xlarge instances) - - PyTorch `installed `__ with CUDA on all machines - -Follow along with the video below or on `youtube `__. - -.. raw:: html - -
- -
- -Multinode training involves deploying a training job across several -machines. There are two ways to do this: - -- running a ``torchrun`` command on each machine with identical rendezvous arguments, or -- deploying it on a compute cluster using a workload manager (like SLURM) - -In this video we will go over the (minimal) code changes required to move from single-node multigpu to -multinode training, and run our training script in both of the above ways. - -Note that multinode training is bottlenecked by inter-node communication latencies. Running a training job -on 4 GPUs on a single node will be faster than running it on 4 nodes with 1 GPU each. - -Local and Global ranks -~~~~~~~~~~~~~~~~~~~~~~~~ -In single-node settings, we were tracking the -``gpu_id`` of each device running our training process. ``torchrun`` tracks this value in an environment variable ``LOCAL_RANK`` -which uniquely identifies each GPU-process on a node. For a unique identifier across all the nodes, ``torchrun`` provides another variable -``RANK`` which refers to the global rank of a process. - -.. warning:: - Do not use ``RANK`` for critical logic in your training job. When ``torchrun`` restarts processes after a failure or membership changes, there is no guarantee - that the processes will hold the same ``LOCAL_RANK`` and ``RANKS``. - - -Heteregeneous Scaling -~~~~~~~~~~~~~~~~~~~~~~ -Torchrun supports *heteregenous scaling* i.e. each of your multinode machines can have different number of -GPUs participating in the training job. In the video, I deployed the code on 2 machines where one machine has 4 GPUs and the -other used only 2 GPUs. - - -Troubleshooting -~~~~~~~~~~~~~~~~~~ - -- Ensure that your nodes are able to communicate with each other over - TCP. -- Set env variable ``NCCL_DEBUG`` to ``INFO`` (using - ``export NCCL_DEBUG=INFO``) to print verbose logs that can help - diagnose the issue. -- Sometimes you might need to explicitly set the network interface for - the distributed backend (``export NCCL_SOCKET_IFNAME=eth0``). Read - more about this - `here `__. - - -Further Reading ---------------- -- `Training a GPT model with DDP `__ (next tutorial in this series) -- `Fault Tolerant distributed training <../beginner/ddp_series_fault_tolerance.html>`__ (previous tutorial in this series) -- `torchrun `__ -- `Rendezvous - arguments `__ -- `Setting up a cluster on - AWS `__ -- `Slurm docs `__ diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst deleted file mode 100644 index cff5105fa5..0000000000 --- a/intermediate_source/ddp_tutorial.rst +++ /dev/null @@ -1,375 +0,0 @@ -Getting Started with Distributed Data Parallel -================================================= -**Author**: `Shen Li `_ - -**Edited by**: `Joe Zhu `_ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ -- `DistributedDataParallel API documents `__ -- `DistributedDataParallel notes `__ - - -`DistributedDataParallel `__ -(DDP) implements data parallelism at the module level which can run across -multiple machines. Applications using DDP should spawn multiple processes and -create a single DDP instance per process. DDP uses collective communications in the -`torch.distributed `__ -package to synchronize gradients and buffers. 
More specifically, DDP registers -an autograd hook for each parameter given by ``model.parameters()`` and the -hook will fire when the corresponding gradient is computed in the backward -pass. Then DDP uses that signal to trigger gradient synchronization across -processes. Please refer to -`DDP design note `__ for more details. - - -The recommended way to use DDP is to spawn one process for each model replica, -where a model replica can span multiple devices. DDP processes can be -placed on the same machine or across machines, but GPU devices cannot be -shared across processes. This tutorial starts from a basic DDP use case and -then demonstrates more advanced use cases including checkpointing models and -combining DDP with model parallel. - - -.. note:: - The code in this tutorial runs on an 8-GPU server, but it can be easily - generalized to other environments. - - -Comparison between ``DataParallel`` and ``DistributedDataParallel`` -------------------------------------------------------------------- - -Before we dive in, let's clarify why, despite the added complexity, you would -consider using ``DistributedDataParallel`` over ``DataParallel``: - -- First, ``DataParallel`` is single-process, multi-thread, and only works on a - single machine, while ``DistributedDataParallel`` is multi-process and works - for both single- and multi- machine training. ``DataParallel`` is usually - slower than ``DistributedDataParallel`` even on a single machine due to GIL - contention across threads, per-iteration replicated model, and additional - overhead introduced by scattering inputs and gathering outputs. -- Recall from the - `prior tutorial `__ - that if your model is too large to fit on a single GPU, you must use **model parallel** - to split it across multiple GPUs. ``DistributedDataParallel`` works with - **model parallel**; ``DataParallel`` does not at this time. When DDP is combined - with model parallel, each DDP process would use model parallel, and all processes - collectively would use data parallel. -- If your model needs to span multiple machines or if your use case does not fit - into data parallelism paradigm, please see `the RPC API `__ - for more generic distributed training support. - -Basic Use Case --------------- - -To create a DDP module, you must first set up process groups properly. More details can -be found in -`Writing Distributed Applications with PyTorch `__. - -.. code:: python - - import os - import sys - import tempfile - import torch - import torch.distributed as dist - import torch.nn as nn - import torch.optim as optim - import torch.multiprocessing as mp - - from torch.nn.parallel import DistributedDataParallel as DDP - - # On Windows platform, the torch.distributed package only - # supports Gloo backend, FileStore and TcpStore. - # For FileStore, set init_method parameter in init_process_group - # to a local file. Example as follow: - # init_method="file:///f:/libtmp/some_file" - # dist.init_process_group( - # "gloo", - # rank=rank, - # init_method=init_method, - # world_size=world_size) - # For TcpStore, same way as on Linux. - - def setup(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - - # initialize the process group - dist.init_process_group("gloo", rank=rank, world_size=world_size) - - def cleanup(): - dist.destroy_process_group() - -Now, let's create a toy module, wrap it with DDP, and feed it some dummy -input data. 
Please note, as DDP broadcasts model states from rank 0 process to -all other processes in the DDP constructor, you do not need to worry about -different DDP processes starting from different initial model parameter values. - -.. code:: python - - class ToyModel(nn.Module): - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = nn.Linear(10, 10) - self.relu = nn.ReLU() - self.net2 = nn.Linear(10, 5) - - def forward(self, x): - return self.net2(self.relu(self.net1(x))) - - - def demo_basic(rank, world_size): - print(f"Running basic DDP example on rank {rank}.") - setup(rank, world_size) - - # create model and move it to GPU with id rank - model = ToyModel().to(rank) - ddp_model = DDP(model, device_ids=[rank]) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - outputs = ddp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(rank) - loss_fn(outputs, labels).backward() - optimizer.step() - - cleanup() - - - def run_demo(demo_fn, world_size): - mp.spawn(demo_fn, - args=(world_size,), - nprocs=world_size, - join=True) - -As you can see, DDP wraps lower-level distributed communication details and -provides a clean API as if it were a local model. Gradient synchronization -communications take place during the backward pass and overlap with the -backward computation. When the ``backward()`` returns, ``param.grad`` already -contains the synchronized gradient tensor. For basic use cases, DDP only -requires a few more LoCs to set up the process group. When applying DDP to more -advanced use cases, some caveats require caution. - -Skewed Processing Speeds ------------------------- - -In DDP, the constructor, the forward pass, and the backward pass are -distributed synchronization points. Different processes are expected to launch -the same number of synchronizations and reach these synchronization points in -the same order and enter each synchronization point at roughly the same time. -Otherwise, fast processes might arrive early and timeout while waiting for -stragglers. Hence, users are responsible for balancing workload distributions -across processes. Sometimes, skewed processing speeds are inevitable due to, -e.g., network delays, resource contentions, or unpredictable workload spikes. To -avoid timeouts in these situations, make sure that you pass a sufficiently -large ``timeout`` value when calling -`init_process_group `__. - -Save and Load Checkpoints -------------------------- - -It's common to use ``torch.save`` and ``torch.load`` to checkpoint modules -during training and recover from checkpoints. See -`SAVING AND LOADING MODELS `__ -for more details. When using DDP, one optimization is to save the model in -only one process and then load it to all processes, reducing write overhead. -This is correct because all processes start from the same parameters and -gradients are synchronized in backward passes, and hence optimizers should keep -setting parameters to the same values. If you use this optimization, make sure no process starts -loading before the saving is finished. Additionally, when -loading the module, you need to provide an appropriate ``map_location`` -argument to prevent a process from stepping into others' devices. If ``map_location`` -is missing, ``torch.load`` will first load the module to CPU and then copy each -parameter to where it was saved, which would result in all processes on the -same machine using the same set of devices. 
For more advanced failure recovery -and elasticity support, please refer to `TorchElastic `__. - -.. code:: python - - def demo_checkpoint(rank, world_size): - print(f"Running DDP checkpoint example on rank {rank}.") - setup(rank, world_size) - - model = ToyModel().to(rank) - ddp_model = DDP(model, device_ids=[rank]) - - - CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint" - if rank == 0: - # All processes should see same parameters as they all start from same - # random parameters and gradients are synchronized in backward passes. - # Therefore, saving it in one process is sufficient. - torch.save(ddp_model.state_dict(), CHECKPOINT_PATH) - - # Use a barrier() to make sure that process 1 loads the model after process - # 0 saves it. - dist.barrier() - # configure map_location properly - map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} - ddp_model.load_state_dict( - torch.load(CHECKPOINT_PATH, map_location=map_location, weights_only=True)) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - outputs = ddp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(rank) - - loss_fn(outputs, labels).backward() - optimizer.step() - - # Not necessary to use a dist.barrier() to guard the file deletion below - # as the AllReduce ops in the backward pass of DDP already served as - # a synchronization. - - if rank == 0: - os.remove(CHECKPOINT_PATH) - - cleanup() - -Combining DDP with Model Parallelism ------------------------------------- - -DDP also works with multi-GPU models. DDP wrapping multi-GPU models is especially -helpful when training large models with a huge amount of data. - -.. code:: python - - class ToyMpModel(nn.Module): - def __init__(self, dev0, dev1): - super(ToyMpModel, self).__init__() - self.dev0 = dev0 - self.dev1 = dev1 - self.net1 = torch.nn.Linear(10, 10).to(dev0) - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(10, 5).to(dev1) - - def forward(self, x): - x = x.to(self.dev0) - x = self.relu(self.net1(x)) - x = x.to(self.dev1) - return self.net2(x) - -When passing a multi-GPU model to DDP, ``device_ids`` and ``output_device`` -must NOT be set. Input and output data will be placed in proper devices by -either the application or the model ``forward()`` method. - -.. code:: python - - def demo_model_parallel(rank, world_size): - print(f"Running DDP with model parallel example on rank {rank}.") - setup(rank, world_size) - - # setup mp_model and devices for this process - dev0 = rank * 2 - dev1 = rank * 2 + 1 - mp_model = ToyMpModel(dev0, dev1) - ddp_mp_model = DDP(mp_model) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - # outputs will be on dev1 - outputs = ddp_mp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(dev1) - loss_fn(outputs, labels).backward() - optimizer.step() - - cleanup() - - - if __name__ == "__main__": - n_gpus = torch.cuda.device_count() - assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}" - world_size = n_gpus - run_demo(demo_basic, world_size) - run_demo(demo_checkpoint, world_size) - world_size = n_gpus//2 - run_demo(demo_model_parallel, world_size) - -Initialize DDP with torch.distributed.run/torchrun ---------------------------------------------------- - -We can leverage PyTorch Elastic to simplify the DDP code and initialize the job more easily. -Let's still use the Toymodel example and create a file named ``elastic_ddp.py``. - -.. 
code:: python - - import torch - import torch.distributed as dist - import torch.nn as nn - import torch.optim as optim - - from torch.nn.parallel import DistributedDataParallel as DDP - - class ToyModel(nn.Module): - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = nn.Linear(10, 10) - self.relu = nn.ReLU() - self.net2 = nn.Linear(10, 5) - - def forward(self, x): - return self.net2(self.relu(self.net1(x))) - - - def demo_basic(): - dist.init_process_group("nccl") - rank = dist.get_rank() - print(f"Start running basic DDP example on rank {rank}.") - - # create model and move it to GPU with id rank - device_id = rank % torch.cuda.device_count() - model = ToyModel().to(device_id) - ddp_model = DDP(model, device_ids=[device_id]) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - outputs = ddp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(device_id) - loss_fn(outputs, labels).backward() - optimizer.step() - dist.destroy_process_group() - - if __name__ == "__main__": - demo_basic() - -One can then run a `torch elastic/torchrun `__ command -on all nodes to initialize the DDP job created above: - -.. code:: bash - - torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py - -We are running the DDP script on two hosts, and each host we run with 8 processes, aka, we -are running it on 16 GPUs. Note that ``$MASTER_ADDR`` must be the same across all nodes. - -Here torchrun will launch 8 process and invoke ``elastic_ddp.py`` -on each process on the node it is launched on, but user also needs to apply cluster -management tools like slurm to actually run this command on 2 nodes. - -For example, on a SLURM enabled cluster, we can write a script to run the command above -and set ``MASTER_ADDR`` as: - -.. code:: bash - - export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) - - -Then we can just run this script using the SLURM command: ``srun --nodes=2 ./torchrun_script.sh``. -Of course, this is just an example; you can choose your own cluster scheduling tools -to initiate the torchrun job. - -For more information about Elastic run, one can check this -`quick start document `__ to learn more. diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst deleted file mode 100644 index 35f6341395..0000000000 --- a/intermediate_source/dist_tuto.rst +++ /dev/null @@ -1,636 +0,0 @@ -Writing Distributed Applications with PyTorch -============================================= -**Author**: `Séb Arnold `_ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ - -In this short tutorial, we will be going over the distributed package -of PyTorch. We'll see how to set up the distributed setting, use the -different communication strategies, and go over some of the internals of -the package. - -Setup ------ - -.. raw:: html - - - -The distributed package included in PyTorch (i.e., -``torch.distributed``) enables researchers and practitioners to easily -parallelize their computations across processes and clusters of -machines. To do so, it leverages message passing semantics -allowing each process to communicate data to any of the other processes. 
-As opposed to the multiprocessing (``torch.multiprocessing``) package, -processes can use different communication backends and are not -restricted to being executed on the same machine. - -In order to get started we need the ability to run multiple processes -simultaneously. If you have access to compute cluster you should check -with your local sysadmin or use your favorite coordination tool (e.g., -`pdsh `__, -`clustershell `__, or -`others `__). For the purpose of this -tutorial, we will use a single machine and spawn multiple processes using -the following template. - -.. code:: python - - """run.py:""" - #!/usr/bin/env python - import os - import torch - import torch.distributed as dist - import torch.multiprocessing as mp - - def run(rank, size): - """ Distributed function to be implemented later. """ - pass - - def init_process(rank, size, fn, backend='gloo'): - """ Initialize the distributed environment. """ - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = '29500' - dist.init_process_group(backend, rank=rank, world_size=size) - fn(rank, size) - - - if __name__ == "__main__": - size = 2 - processes = [] - mp.set_start_method("spawn") - for rank in range(size): - p = mp.Process(target=init_process, args=(rank, size, run)) - p.start() - processes.append(p) - - for p in processes: - p.join() - -The above script spawns two processes who will each setup the -distributed environment, initialize the process group -(``dist.init_process_group``), and finally execute the given ``run`` -function. - -Let's have a look at the ``init_process`` function. It ensures that -every process will be able to coordinate through a master, using the -same ip address and port. Note that we used the ``gloo`` backend but -other backends are available. (c.f. -`Section 5.1 <#communication-backends>`__) We will go over the magic -happening in ``dist.init_process_group`` at the end of this tutorial, -but it essentially allows processes to communicate with each other by -sharing their locations. - -Point-to-Point Communication ----------------------------- - -.. figure:: /_static/img/distributed/send_recv.png - :width: 100% - :align: center - :alt: Send and Recv - - Send and Recv - - -A transfer of data from one process to another is called a -point-to-point communication. These are achieved through the ``send`` -and ``recv`` functions or their *immediate* counter-parts, ``isend`` and -``irecv``. - -.. code:: python - - """Blocking point-to-point communication.""" - - def run(rank, size): - tensor = torch.zeros(1) - if rank == 0: - tensor += 1 - # Send the tensor to process 1 - dist.send(tensor=tensor, dst=1) - else: - # Receive tensor from process 0 - dist.recv(tensor=tensor, src=0) - print('Rank ', rank, ' has data ', tensor[0]) - -In the above example, both processes start with a zero tensor, then -process 0 increments the tensor and sends it to process 1 so that they -both end up with 1.0. Notice that process 1 needs to allocate memory in -order to store the data it will receive. - -Also notice that ``send``/``recv`` are **blocking**: both processes stop -until the communication is completed. On the other hand immediates are -**non-blocking**; the script continues its execution and the methods -return a ``Work`` object upon which we can choose to -``wait()``. - -.. 
code:: python - - """Non-blocking point-to-point communication.""" - - def run(rank, size): - tensor = torch.zeros(1) - req = None - if rank == 0: - tensor += 1 - # Send the tensor to process 1 - req = dist.isend(tensor=tensor, dst=1) - print('Rank 0 started sending') - else: - # Receive tensor from process 0 - req = dist.irecv(tensor=tensor, src=0) - print('Rank 1 started receiving') - req.wait() - print('Rank ', rank, ' has data ', tensor[0]) - -When using immediates we have to be careful about how we use the sent and received tensors. -Since we do not know when the data will be communicated to the other process, -we should not modify the sent tensor nor access the received tensor before ``req.wait()`` has completed. -In other words, - -- writing to ``tensor`` after ``dist.isend()`` will result in undefined behaviour. -- reading from ``tensor`` after ``dist.irecv()`` will result in undefined behaviour. - -However, after ``req.wait()`` -has been executed we are guaranteed that the communication took place, -and that the value stored in ``tensor[0]`` is 1.0. - -Point-to-point communication is useful when we want more fine-grained -control over the communication of our processes. They can be used to -implement fancy algorithms, such as the one used in `Baidu's -DeepSpeech `__ or -`Facebook's large-scale -experiments `__.(c.f. -`Section 4.1 <#our-own-ring-allreduce>`__) - -Collective Communication ------------------------- - -+----------------------------------------------------+-----------------------------------------------------+ -| .. figure:: /_static/img/distributed/scatter.png | .. figure:: /_static/img/distributed/gather.png | -| :alt: Scatter | :alt: Gather | -| :width: 100% | :width: 100% | -| :align: center | :align: center | -| | | -| Scatter | Gather | -+----------------------------------------------------+-----------------------------------------------------+ -| .. figure:: /_static/img/distributed/reduce.png | .. figure:: /_static/img/distributed/all_reduce.png | -| :alt: Reduce | :alt: All-Reduce | -| :width: 100% | :width: 100% | -| :align: center | :align: center | -| | | -| Reduce | All-Reduce | -+----------------------------------------------------+-----------------------------------------------------+ -| .. figure:: /_static/img/distributed/broadcast.png | .. figure:: /_static/img/distributed/all_gather.png | -| :alt: Broadcast | :alt: All-Gather | -| :width: 100% | :width: 100% | -| :align: center | :align: center | -| | | -| Broadcast | All-Gather | -+----------------------------------------------------+-----------------------------------------------------+ - - - -As opposed to point-to-point communcation, collectives allow for -communication patterns across all processes in a **group**. A group is a -subset of all our processes. To create a group, we can pass a list of -ranks to ``dist.new_group(group)``. By default, collectives are executed -on all processes, also known as the **world**. For example, in order -to obtain the sum of all tensors on all processes, we can use the -``dist.all_reduce(tensor, op, group)`` collective. - -.. code:: python - - """ All-Reduce example.""" - def run(rank, size): - """ Simple collective communication. """ - group = dist.new_group([0, 1]) - tensor = torch.ones(1) - dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group) - print('Rank ', rank, ' has data ', tensor[0]) - -Since we want the sum of all tensors in the group, we use -``dist.ReduceOp.SUM`` as the reduce operator. 
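As a quick aside, the same pattern works with the other element-wise operators listed below. Here is a small illustrative sketch (not part of the original tutorial, and assuming the ``init_process``/``run`` template from the Setup section) that takes the element-wise maximum across the group:

.. code:: python

    """ All-Reduce with MAX (illustrative sketch). """
    def run(rank, size):
        # Every rank contributes its own rank id; after the all-reduce,
        # every rank holds the largest rank id in the world.
        tensor = torch.tensor([float(rank)])
        dist.all_reduce(tensor, op=dist.ReduceOp.MAX)
        print('Rank ', rank, ' has data ', tensor[0])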
Generally speaking, any -commutative mathematical operation can be used as an operator. -Out-of-the-box, PyTorch comes with 4 such operators, all working at the -element-wise level: - -- ``dist.ReduceOp.SUM``, -- ``dist.ReduceOp.PRODUCT``, -- ``dist.ReduceOp.MAX``, -- ``dist.ReduceOp.MIN``. - -In addition to ``dist.all_reduce(tensor, op, group)``, there are a total -of 6 collectives currently implemented in PyTorch. - -- ``dist.broadcast(tensor, src, group)``: Copies ``tensor`` from - ``src`` to all other processes. -- ``dist.reduce(tensor, dst, op, group)``: Applies ``op`` to every - ``tensor`` and stores the result in ``dst``. -- ``dist.all_reduce(tensor, op, group)``: Same as reduce, but the - result is stored in all processes. -- ``dist.scatter(tensor, scatter_list, src, group)``: Copies the - :math:`i^{\text{th}}` tensor ``scatter_list[i]`` to the - :math:`i^{\text{th}}` process. -- ``dist.gather(tensor, gather_list, dst, group)``: Copies ``tensor`` - from all processes in ``dst``. -- ``dist.all_gather(tensor_list, tensor, group)``: Copies ``tensor`` - from all processes to ``tensor_list``, on all processes. -- ``dist.barrier(group)``: Blocks all processes in `group` until each one has entered this function. - -Distributed Training --------------------- - -.. raw:: html - - - -**Note:** You can find the example script of this section in `this -GitHub repository `__. - -Now that we understand how the distributed module works, let us write -something useful with it. Our goal will be to replicate the -functionality of -`DistributedDataParallel `__. -Of course, this will be a didactic example and in a real-world -situation you should use the official, well-tested and well-optimized -version linked above. - -Quite simply we want to implement a distributed version of stochastic -gradient descent. Our script will let all processes compute the -gradients of their model on their batch of data and then average their -gradients. In order to ensure similar convergence results when changing -the number of processes, we will first have to partition our dataset. -(You could also use -`tnt.dataset.SplitDataset `__, -instead of the snippet below.) - -.. code:: python - - """ Dataset partitioning helper """ - class Partition(object): - - def __init__(self, data, index): - self.data = data - self.index = index - - def __len__(self): - return len(self.index) - - def __getitem__(self, index): - data_idx = self.index[index] - return self.data[data_idx] - - - class DataPartitioner(object): - - def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234): - self.data = data - self.partitions = [] - rng = Random() # from random import Random - rng.seed(seed) - data_len = len(data) - indexes = [x for x in range(0, data_len)] - rng.shuffle(indexes) - - for frac in sizes: - part_len = int(frac * data_len) - self.partitions.append(indexes[0:part_len]) - indexes = indexes[part_len:] - - def use(self, partition): - return Partition(self.data, self.partitions[partition]) - -With the above snippet, we can now simply partition any dataset using -the following few lines: - -.. 
code:: python - - """ Partitioning MNIST """ - def partition_dataset(): - dataset = datasets.MNIST('./data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - size = dist.get_world_size() - bsz = 128 // size - partition_sizes = [1.0 / size for _ in range(size)] - partition = DataPartitioner(dataset, partition_sizes) - partition = partition.use(dist.get_rank()) - train_set = torch.utils.data.DataLoader(partition, - batch_size=bsz, - shuffle=True) - return train_set, bsz - -Assuming we have 2 replicas, then each process will have a ``train_set`` -of 60000 / 2 = 30000 samples. We also divide the batch size by the -number of replicas in order to maintain the *overall* batch size of 128. - -We can now write our usual forward-backward-optimize training code, and -add a function call to average the gradients of our models. (The -following is largely inspired by the official `PyTorch MNIST -example `__.) - -.. code:: python - - """ Distributed Synchronous SGD Example """ - def run(rank, size): - torch.manual_seed(1234) - train_set, bsz = partition_dataset() - model = Net() - optimizer = optim.SGD(model.parameters(), - lr=0.01, momentum=0.5) - - num_batches = ceil(len(train_set.dataset) / float(bsz)) - for epoch in range(10): - epoch_loss = 0.0 - for data, target in train_set: - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - epoch_loss += loss.item() - loss.backward() - average_gradients(model) - optimizer.step() - print('Rank ', dist.get_rank(), ', epoch ', - epoch, ': ', epoch_loss / num_batches) - -It remains to implement the ``average_gradients(model)`` function, which -simply takes in a model and averages its gradients across the whole -world. - -.. code:: python - - """ Gradient averaging. """ - def average_gradients(model): - size = float(dist.get_world_size()) - for param in model.parameters(): - dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) - param.grad.data /= size - -*Et voilà*! We successfully implemented distributed synchronous SGD and -could train any model on a large computer cluster. - -**Note:** While the last sentence is *technically* true, there are `a -lot more tricks `__ required to -implement a production-level implementation of synchronous SGD. Again, -use what `has been tested and -optimized `__. - -Our Own Ring-Allreduce -~~~~~~~~~~~~~~~~~~~~~~ - -As an additional challenge, imagine that we wanted to implement -DeepSpeech's efficient ring allreduce. This is fairly easy to implement -using point-to-point collectives. - -.. code:: python - - """ Implementation of a ring-reduce with addition. """ - def allreduce(send, recv): - rank = dist.get_rank() - size = dist.get_world_size() - send_buff = send.clone() - recv_buff = send.clone() - accum = send.clone() - - left = ((rank - 1) + size) % size - right = (rank + 1) % size - - for i in range(size - 1): - if i % 2 == 0: - # Send send_buff - send_req = dist.isend(send_buff, right) - dist.recv(recv_buff, left) - accum[:] += recv_buff[:] - else: - # Send recv_buff - send_req = dist.isend(recv_buff, right) - dist.recv(send_buff, left) - accum[:] += send_buff[:] - send_req.wait() - recv[:] = accum[:] - -In the above script, the ``allreduce(send, recv)`` function has a -slightly different signature than the ones in PyTorch. It takes a -``recv`` tensor and will store the sum of all ``send`` tensors in it. 
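To see how this could plug into the training loop above, here is a minimal sketch of a
gradient-averaging helper built on top of the ring version. The helper name
``ring_average_gradients`` and the temporary receive buffer are our own additions for
illustration; they are not part of the original script.

.. code:: python

    """ Gradient averaging using the ring-reduce implementation above (illustrative sketch). """
    def ring_average_gradients(model):
        size = float(dist.get_world_size())
        for param in model.parameters():
            recv = torch.zeros_like(param.grad.data)
            # allreduce() writes the sum of all ranks' gradients into recv.
            allreduce(param.grad.data, recv)
            param.grad.data.copy_(recv / size)
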
As -an exercise left to the reader, there is still one difference between -our version and the one in DeepSpeech: their implementation divides the -gradient tensor into *chunks*, so as to optimally utilize the -communication bandwidth. (Hint: -`torch.chunk `__) - -Advanced Topics ---------------- - -We are now ready to discover some of the more advanced functionalities -of ``torch.distributed``. Since there is a lot to cover, this section is -divided into two subsections: - -1. Communication Backends: where we learn how to use MPI and Gloo for - GPU-GPU communication. -2. Initialization Methods: where we understand how to best set up the - initial coordination phase in ``dist.init_process_group()``. - -Communication Backends -~~~~~~~~~~~~~~~~~~~~~~ - -One of the most elegant aspects of ``torch.distributed`` is its ability -to abstract and build on top of different backends. As mentioned before, -there are currently three backends implemented in PyTorch: Gloo, NCCL, and -MPI. They each have different specifications and tradeoffs, depending -on the desired use case. A comparative table of supported functions can -be found -`here `__. - -**Gloo Backend** - -So far we have made extensive usage of the `Gloo backend `__. -It is quite handy as a development platform, as it is included in -the pre-compiled PyTorch binaries and works on both Linux (since 0.2) -and macOS (since 1.3). It supports all point-to-point and collective -operations on CPU, and all collective operations on GPU. The -implementation of the collective operations for CUDA tensors is not as -optimized as the ones provided by the NCCL backend. - -As you have surely noticed, our -distributed SGD example does not work if you put ``model`` on the GPU. -In order to use multiple GPUs, let us also make the following -modifications: - -1. Use ``device = torch.device("cuda:{}".format(rank))`` -2. ``model = Net()`` :math:`\rightarrow` ``model = Net().to(device)`` -3. Use ``data, target = data.to(device), target.to(device)`` - -With the above modifications, our model is now training on two GPUs and -you can monitor their utilization with ``watch nvidia-smi``. - -**MPI Backend** - -The Message Passing Interface (MPI) is a standardized tool from the -field of high-performance computing. It allows to do point-to-point and -collective communications and was the main inspiration for the API of -``torch.distributed``. Several implementations of MPI exist (e.g. -`Open-MPI `__, -`MVAPICH2 `__, `Intel -MPI `__) each -optimized for different purposes. The advantage of using the MPI backend -lies in MPI's wide availability - and high-level of optimization - on -large computer clusters. `Some `__ -`recent `__ -`implementations `__ are also able to take -advantage of CUDA IPC and GPU Direct technologies in order to avoid -memory copies through the CPU. - -Unfortunately, PyTorch's binaries cannot include an MPI implementation -and we'll have to recompile it by hand. Fortunately, this process is -fairly simple given that upon compilation, PyTorch will look *by itself* -for an available MPI implementation. The following steps install the MPI -backend, by installing PyTorch `from -source `__. - -1. Create and activate your Anaconda environment, install all the - pre-requisites following `the - guide `__, but do - **not** run ``python setup.py install`` yet. -2. Choose and install your favorite MPI implementation. Note that - enabling CUDA-aware MPI might require some additional steps. 
In our - case, we'll stick to Open-MPI *without* GPU support: - ``conda install -c conda-forge openmpi`` -3. Now, go to your cloned PyTorch repo and execute - ``python setup.py install``. - -In order to test our newly installed backend, a few modifications are -required. - -1. Replace the content under ``if __name__ == '__main__':`` with - ``init_process(0, 0, run, backend='mpi')``. -2. Run ``mpirun -n 4 python myscript.py``. - -The reason for these changes is that MPI needs to create its own -environment before spawning the processes. MPI will also spawn its own -processes and perform the handshake described in `Initialization -Methods <#initialization-methods>`__, making the ``rank`` and ``size`` -arguments of ``init_process_group`` superfluous. This is actually quite -powerful as you can pass additional arguments to ``mpirun`` in order to -tailor computational resources for each process. (Things like the number of -cores per process, hand-assigning machines to specific ranks, and `some -more `__.) -Doing so, you should obtain the same familiar output as with the other -communication backends. - -**NCCL Backend** - -The `NCCL backend `__ provides an -optimized implementation of collective operations against CUDA -tensors. If you only use CUDA tensors for your collective operations, -consider using this backend for best-in-class performance. The -NCCL backend is included in the pre-built binaries with CUDA support. - -Initialization Methods - -~~~~~~~~~~~~~~~~~~~~~~ - -To finish this tutorial, let's talk about the very first function we -called: ``dist.init_process_group(backend, init_method)``. In -particular, we will go over the different initialization methods which -are responsible for the initial coordination step between each process. -Those methods allow you to define how this coordination is done. -Depending on your hardware setup, one of these methods should be -naturally more suitable than the others. In addition to the following -sections, you should also have a look at the `official -documentation `__. - -**Environment Variable** - -We have been using the environment variable initialization method -throughout this tutorial. By setting the following four environment -variables on all machines, all processes will be able to properly -connect to the master, obtain information about the other processes, and -finally handshake with them. - -- ``MASTER_PORT``: A free port on the machine that will host the - process with rank 0. -- ``MASTER_ADDR``: IP address of the machine that will host the process - with rank 0. -- ``WORLD_SIZE``: The total number of processes, so that the master - knows how many workers to wait for. -- ``RANK``: Rank of each process, so they will know whether it is the - master or a worker. - -**Shared File System** - -The shared file system method requires all processes to have access to a shared -file system, and will coordinate them through a shared file. This means -that each process will open the file, write its information, and wait -until everybody has done so. After that, all required information will be -readily available to all processes. In order to avoid race conditions, -the file system must support locking through -`fcntl `__. - -.. code:: python - - dist.init_process_group( - init_method='file:///mnt/nfs/sharedfile', - rank=args.rank, - world_size=4) - -**TCP** - -Initializing via TCP can be achieved by providing the IP address of the process with rank 0 and a reachable port number. 
-Here, all workers will be able to connect to the process -with rank 0 and exchange information on how to reach each other. - -.. code:: python - - dist.init_process_group( - init_method='tcp://10.1.1.20:23456', - rank=args.rank, - world_size=4) -
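For completeness, here is a minimal sketch of the environment-variable method described
above, selected explicitly through ``init_method='env://'``. It assumes that
``MASTER_ADDR``, ``MASTER_PORT``, ``RANK``, and ``WORLD_SIZE`` have already been exported
on every machine; this snippet is our own illustration and does not appear in the
original tutorial.

.. code:: python

    # Rank and world size are read from the RANK and WORLD_SIZE environment
    # variables; the master location comes from MASTER_ADDR and MASTER_PORT.
    dist.init_process_group(backend='gloo', init_method='env://')
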
- -**Acknowledgements** -
- -I'd like to thank the PyTorch developers for doing such a good job on -their implementation, documentation, and tests. When the code was -unclear, I could always count on the -`docs `__ or the -`tests `__ -to find an answer. In particular, I'd like to thank Soumith Chintala, -Adam Paszke, and Natalia Gimelshein for providing insightful comments -and answering questions on early drafts. diff --git a/intermediate_source/dqn_with_rnn_tutorial.py b/intermediate_source/dqn_with_rnn_tutorial.py deleted file mode 100644 index 6ea0955939..0000000000 --- a/intermediate_source/dqn_with_rnn_tutorial.py +++ /dev/null @@ -1,468 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Recurrent DQN: Training recurrent policies -========================================== - -**Author**: `Vincent Moens `_ - -.. grid:: 2 - - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - * How to incorporating an RNN in an actor in TorchRL - * How to use that memory-based policy with a replay buffer and a loss module - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - * PyTorch v2.0.0 - * gym[mujoco] - * tqdm -""" - -######################################################################### -# Overview -# -------- -# -# Memory-based policies are crucial not only when the observations are partially -# observable but also when the time dimension must be taken into account to -# make informed decisions. -# -# Recurrent neural network have long been a popular tool for memory-based -# policies. The idea is to keep a recurrent state in memory between two -# consecutive steps, and use this as an input to the policy along with the -# current observation. -# -# This tutorial shows how to incorporate an RNN in a policy using TorchRL. -# -# Key learnings: -# -# - Incorporating an RNN in an actor in TorchRL; -# - Using that memory-based policy with a replay buffer and a loss module. -# -# The core idea of using RNNs in TorchRL is to use TensorDict as a data carrier -# for the hidden states from one step to another. We'll build a policy that -# reads the previous recurrent state from the current TensorDict, and writes the -# current recurrent states in the TensorDict of the next state: -# -# .. figure:: /_static/img/rollout_recurrent.png -# :alt: Data collection with a recurrent policy -# -# As this figure shows, our environment populates the TensorDict with zeroed recurrent -# states which are read by the policy together with the observation to produce an -# action, and recurrent states that will be used for the next step. -# When the :func:`~torchrl.envs.utils.step_mdp` function is called, the recurrent states -# from the next state are brought to the current TensorDict. Let's see how this -# is implemented in practice. - -###################################################################### -# If you are running this in Google Colab, make sure you install the following dependencies: -# -# .. 
code-block:: bash -# -# !pip3 install torchrl -# !pip3 install gym[mujoco] -# !pip3 install tqdm -# -# Setup -# ----- -# - -# sphinx_gallery_start_ignore -import warnings - -warnings.filterwarnings("ignore") -from torch import multiprocessing - -# TorchRL prefers spawn method, that restricts creation of ``~torchrl.envs.ParallelEnv`` inside -# `__main__` method call, but for the easy of reading the code switch to fork -# which is also a default spawn method in Google's Colaboratory -try: - multiprocessing.set_start_method("fork") -except RuntimeError: - pass - -# sphinx_gallery_end_ignore - -import torch -import tqdm -from tensordict.nn import TensorDictModule as Mod, TensorDictSequential as Seq -from torch import nn -from torchrl.collectors import SyncDataCollector -from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer -from torchrl.envs import ( - Compose, - ExplorationType, - GrayScale, - InitTracker, - ObservationNorm, - Resize, - RewardScaling, - set_exploration_type, - StepCounter, - ToTensorImage, - TransformedEnv, -) -from torchrl.envs.libs.gym import GymEnv -from torchrl.modules import ConvNet, EGreedyModule, LSTMModule, MLP, QValueModule -from torchrl.objectives import DQNLoss, SoftUpdate - -is_fork = multiprocessing.get_start_method() == "fork" -device = ( - torch.device(0) - if torch.cuda.is_available() and not is_fork - else torch.device("cpu") -) - -###################################################################### -# Environment -# ----------- -# -# As usual, the first step is to build our environment: it helps us -# define the problem and build the policy network accordingly. For this tutorial, -# we'll be running a single pixel-based instance of the CartPole gym -# environment with some custom transforms: turning to grayscale, resizing to -# 84x84, scaling down the rewards and normalizing the observations. -# -# .. note:: -# The :class:`~torchrl.envs.transforms.StepCounter` transform is accessory. Since the CartPole -# task goal is to make trajectories as long as possible, counting the steps -# can help us track the performance of our policy. -# -# Two transforms are important for the purpose of this tutorial: -# -# - :class:`~torchrl.envs.transforms.InitTracker` will stamp the -# calls to :meth:`~torchrl.envs.EnvBase.reset` by adding a ``"is_init"`` -# boolean mask in the TensorDict that will track which steps require a reset -# of the RNN hidden states. -# - The :class:`~torchrl.envs.transforms.TensorDictPrimer` transform is a bit more -# technical. It is not required to use RNN policies. However, it -# instructs the environment (and subsequently the collector) that some extra -# keys are to be expected. Once added, a call to `env.reset()` will populate -# the entries indicated in the primer with zeroed tensors. Knowing that -# these tensors are expected by the policy, the collector will pass them on -# during collection. Eventually, we'll be storing our hidden states in the -# replay buffer, which will help us bootstrap the computation of the -# RNN operations in the loss module (which would otherwise be initiated -# with 0s). In summary: not including this transform will not impact hugely -# the training of our policy, but it will make the recurrent keys disappear -# from the collected data and the replay buffer, which will in turn lead to -# a slightly less optimal training. -# Fortunately, the :class:`~torchrl.modules.LSTMModule` we propose is -# equipped with a helper method to build just that transform for us, so -# we can wait until we build it! 
-# - -env = TransformedEnv( - GymEnv("CartPole-v1", from_pixels=True, device=device), - Compose( - ToTensorImage(), - GrayScale(), - Resize(84, 84), - StepCounter(), - InitTracker(), - RewardScaling(loc=0.0, scale=0.1), - ObservationNorm(standard_normal=True, in_keys=["pixels"]), - ), -) - -###################################################################### -# As always, we need to initialize manually our normalization constants: -# -env.transform[-1].init_stats(1000, reduce_dim=[0, 1, 2], cat_dim=0, keep_dims=[0]) -td = env.reset() - -###################################################################### -# Policy -# ------ -# -# Our policy will have 3 components: a :class:`~torchrl.modules.ConvNet` -# backbone, an :class:`~torchrl.modules.LSTMModule` memory layer and a shallow -# :class:`~torchrl.modules.MLP` block that will map the LSTM output onto the -# action values. -# -# Convolutional network -# ~~~~~~~~~~~~~~~~~~~~~ -# -# We build a convolutional network flanked with a :class:`torch.nn.AdaptiveAvgPool2d` -# that will squash the output in a vector of size 64. The :class:`~torchrl.modules.ConvNet` -# can assist us with this: -# - -feature = Mod( - ConvNet( - num_cells=[32, 32, 64], - squeeze_output=True, - aggregator_class=nn.AdaptiveAvgPool2d, - aggregator_kwargs={"output_size": (1, 1)}, - device=device, - ), - in_keys=["pixels"], - out_keys=["embed"], -) -###################################################################### -# we execute the first module on a batch of data to gather the size of the -# output vector: -# -n_cells = feature(env.reset())["embed"].shape[-1] - -###################################################################### -# LSTM Module -# ~~~~~~~~~~~ -# -# TorchRL provides a specialized :class:`~torchrl.modules.LSTMModule` class -# to incorporate LSTMs in your code-base. It is a :class:`~tensordict.nn.TensorDictModuleBase` -# subclass: as such, it has a set of ``in_keys`` and ``out_keys`` that indicate -# what values should be expected to be read and written/updated during the -# execution of the module. The class comes with customizable predefined -# values for these attributes to facilitate its construction. -# -# .. note:: -# *Usage limitations*: The class supports almost all LSTM features such as -# dropout or multi-layered LSTMs. -# However, to respect TorchRL's conventions, this LSTM must have the ``batch_first`` -# attribute set to ``True`` which is **not** the default in PyTorch. However, -# our :class:`~torchrl.modules.LSTMModule` changes this default -# behavior, so we're good with a native call. -# -# Also, the LSTM cannot have a ``bidirectional`` attribute set to ``True`` as -# this wouldn't be usable in online settings. In this case, the default value -# is the correct one. -# - -lstm = LSTMModule( - input_size=n_cells, - hidden_size=128, - device=device, - in_key="embed", - out_key="embed", -) - -###################################################################### -# Let us look at the LSTM Module class, specifically its in and out_keys: -print("in_keys", lstm.in_keys) -print("out_keys", lstm.out_keys) - -###################################################################### -# We can see that these values contain the key we indicated as the in_key (and out_key) -# as well as recurrent key names. The out_keys are preceded by a "next" prefix -# that indicates that they will need to be written in the "next" TensorDict. 
-# We use this convention (which can be overridden by passing the in_keys/out_keys -# arguments) to make sure that a call to :func:`~torchrl.envs.utils.step_mdp` will -# move the recurrent state to the root TensorDict, making it available to the -# RNN during the following call (see figure in the intro). -# -# As mentioned earlier, we have one more optional transform to add to our -# environment to make sure that the recurrent states are passed to the buffer. -# The :meth:`~torchrl.modules.LSTMModule.make_tensordict_primer` method does -# exactly that: -# -env.append_transform(lstm.make_tensordict_primer()) - -###################################################################### -# and that's it! We can print the environment to check that everything looks good now -# that we have added the primer: -print(env) - -###################################################################### -# MLP -# ~~~ -# -# We use a single-layer MLP to represent the action values we'll be using for -# our policy. -# -mlp = MLP( - out_features=2, - num_cells=[ - 64, - ], - device=device, -) -###################################################################### -# and fill the bias with zeros: - -mlp[-1].bias.data.fill_(0.0) -mlp = Mod(mlp, in_keys=["embed"], out_keys=["action_value"]) - -###################################################################### -# Using the Q-Values to select an action -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# The last part of our policy is the Q-Value Module. -# The Q-Value module :class:`~torchrl.modules.tensordict_module.QValueModule` -# will read the ``"action_values"`` key that is produced by our MLP and -# from it, gather the action that has the maximum value. -# The only thing we need to do is to specify the action space, which can be done -# either by passing a string or an action-spec. This allows us to use -# Categorical (sometimes called "sparse") encoding or the one-hot version of it. -# -qval = QValueModule(spec=env.action_spec) - -###################################################################### -# .. note:: -# TorchRL also provides a wrapper class :class:`torchrl.modules.QValueActor` that -# wraps a module in a Sequential together with a :class:`~torchrl.modules.tensordict_module.QValueModule` -# like we are doing explicitly here. There is little advantage to do this -# and the process is less transparent, but the end results will be similar to -# what we do here. -# -# We can now put things together in a :class:`~tensordict.nn.TensorDictSequential` -# -stoch_policy = Seq(feature, lstm, mlp, qval) - -###################################################################### -# DQN being a deterministic algorithm, exploration is a crucial part of it. -# We'll be using an :math:`\epsilon`-greedy policy with an epsilon of 0.2 decaying -# progressively to 0. -# This decay is achieved via a call to :meth:`~torchrl.modules.EGreedyModule.step` -# (see training loop below). -# -exploration_module = EGreedyModule( - annealing_num_steps=1_000_000, spec=env.action_spec, eps_init=0.2 -) -stoch_policy = Seq( - stoch_policy, - exploration_module, -) - -###################################################################### -# Using the model for the loss -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# The model as we've built it is well equipped to be used in sequential settings. -# However, the class :class:`torch.nn.LSTM` can use a cuDNN-optimized backend -# to run the RNN sequence faster on GPU device. We would not want to miss -# such an opportunity to speed up our training loop! 
-# To use it, we just need to tell the LSTM module to run on "recurrent-mode" -# when used by the loss. -# As we'll usually want to have two copies of the LSTM module, we do this by -# calling a :meth:`~torchrl.modules.LSTMModule.set_recurrent_mode` method that -# will return a new instance of the LSTM (with shared weights) that will -# assume that the input data is sequential in nature. -# -policy = Seq(feature, lstm.set_recurrent_mode(True), mlp, qval) - -###################################################################### -# Because we still have a couple of uninitialized parameters we should -# initialize them before creating an optimizer and such. -# -policy(env.reset()) - -###################################################################### -# DQN Loss -# -------- -# -# Out DQN loss requires us to pass the policy and, again, the action-space. -# While this may seem redundant, it is important as we want to make sure that -# the :class:`~torchrl.objectives.DQNLoss` and the :class:`~torchrl.modules.tensordict_module.QValueModule` -# classes are compatible, but aren't strongly dependent on each other. -# -# To use the Double-DQN, we ask for a ``delay_value`` argument that will -# create a non-differentiable copy of the network parameters to be used -# as a target network. -loss_fn = DQNLoss(policy, action_space=env.action_spec, delay_value=True) - -###################################################################### -# Since we are using a double DQN, we need to update the target parameters. -# We'll use a :class:`~torchrl.objectives.SoftUpdate` instance to carry out -# this work. -# -updater = SoftUpdate(loss_fn, eps=0.95) - -optim = torch.optim.Adam(policy.parameters(), lr=3e-4) - -###################################################################### -# Collector and replay buffer -# --------------------------- -# -# We build the simplest data collector there is. We'll try to train our algorithm -# with a million frames, extending the buffer with 50 frames at a time. The buffer -# will be designed to store 20 thousands trajectories of 50 steps each. -# At each optimization step (16 per data collection), we'll collect 4 items -# from our buffer, for a total of 200 transitions. -# We'll use a :class:`~torchrl.data.replay_buffers.LazyMemmapStorage` storage to keep the data -# on disk. -# -# .. note:: -# For the sake of efficiency, we're only running a few thousands iterations -# here. In a real setting, the total number of frames should be set to 1M. -# -collector = SyncDataCollector(env, stoch_policy, frames_per_batch=50, total_frames=200, device=device) -rb = TensorDictReplayBuffer( - storage=LazyMemmapStorage(20_000), batch_size=4, prefetch=10 -) - -###################################################################### -# Training loop -# ------------- -# -# To keep track of the progress, we will run the policy in the environment once -# every 50 data collection, and plot the results after training. 
-# - -utd = 16 -pbar = tqdm.tqdm(total=1_000_000) -longest = 0 - -traj_lens = [] -for i, data in enumerate(collector): - if i == 0: - print( - "Let us print the first batch of data.\nPay attention to the key names " - "which will reflect what can be found in this data structure, in particular: " - "the output of the QValueModule (action_values, action and chosen_action_value)," - "the 'is_init' key that will tell us if a step is initial or not, and the " - "recurrent_state keys.\n", - data, - ) - pbar.update(data.numel()) - # it is important to pass data that is not flattened - rb.extend(data.unsqueeze(0).to_tensordict().cpu()) - for _ in range(utd): - s = rb.sample().to(device, non_blocking=True) - loss_vals = loss_fn(s) - loss_vals["loss"].backward() - optim.step() - optim.zero_grad() - longest = max(longest, data["step_count"].max().item()) - pbar.set_description( - f"steps: {longest}, loss_val: {loss_vals['loss'].item(): 4.4f}, action_spread: {data['action'].sum(0)}" - ) - exploration_module.step(data.numel()) - updater.step() - - with set_exploration_type(ExplorationType.MODE), torch.no_grad(): - rollout = env.rollout(10000, stoch_policy) - traj_lens.append(rollout.get(("next", "step_count")).max().item()) - -###################################################################### -# Let's plot our results: -# -if traj_lens: - from matplotlib import pyplot as plt - - plt.plot(traj_lens) - plt.xlabel("Test collection") - plt.title("Test trajectory lengths") - -###################################################################### -# Conclusion -# ---------- -# -# We have seen how an RNN can be incorporated in a policy in TorchRL. -# You should now be able: -# -# - Create an LSTM module that acts as a :class:`~tensordict.nn.TensorDictModule` -# - Indicate to the LSTM module that a reset is needed via an :class:`~torchrl.envs.transforms.InitTracker` -# transform -# - Incorporate this module in a policy and in a loss module -# - Make sure that the collector is made aware of the recurrent state entries -# such that they can be stored in the replay buffer along with the rest of -# the data -# -# Further Reading -# --------------- -# -# - The TorchRL documentation can be found `here `_. diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst deleted file mode 100644 index e515f53a1d..0000000000 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ /dev/null @@ -1,568 +0,0 @@ -(beta) Dynamic Quantization on BERT -=========================================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. This will allow you to experiment with the information presented below. - -**Author**: `Jianyu Huang `_ - -**Reviewed by**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Jessica Lin `_ - - -Introduction ------------- - - -In this tutorial, we will apply the dynamic quantization on a BERT -model, closely following the BERT model from `the HuggingFace -Transformers examples `_. -With this step-by-step journey, we would like to demonstrate how to -convert a well-known state-of-the-art model like BERT into dynamic -quantized model. - -- BERT, or Bidirectional Embedding Representations from Transformers, - is a new method of pre-training language representations which - achieves the state-of-the-art accuracy results on many popular - Natural Language Processing (NLP) tasks, such as question answering, - text classification, and others. 
The original paper can be found - `here `_. - -- Dynamic quantization support in PyTorch converts a float model to a - quantized model with static int8 or float16 data types for the - weights and dynamic quantization for the activations. The activations - are quantized dynamically (per batch) to int8 when the weights are - quantized to int8. In PyTorch, we have `torch.quantization.quantize_dynamic API - `_, - which replaces specified modules with dynamic weight-only quantized - versions and output the quantized model. - -- We demonstrate the accuracy and inference performance results on the - `Microsoft Research Paraphrase Corpus (MRPC) task `_ - in the General Language Understanding Evaluation benchmark `(GLUE) - `_. The MRPC (Dolan and Brockett, 2005) is - a corpus of sentence pairs automatically extracted from online news - sources, with human annotations of whether the sentences in the pair - are semantically equivalent. As the classes are imbalanced (68% - positive, 32% negative), we follow the common practice and report - `F1 score `_. - MRPC is a common NLP task for language pair classification, as shown - below. - -.. image:: /_static/img/bert.png - - -1. Setup --------- - -1.1 Install PyTorch and HuggingFace Transformers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To start this tutorial, let’s first follow the installation instructions -in PyTorch `here `_ and HuggingFace Github Repo `here `_. -In addition, we also install `scikit-learn `_ package, as we will reuse its -built-in F1 score calculation helper function. - -.. code:: shell - - pip install sklearn - pip install transformers==4.29.2 - - -Because we will be using the beta parts of the PyTorch, it is -recommended to install the latest version of torch and torchvision. You -can find the most recent instructions on local installation `here -`_. For example, to install on -Mac: - -.. code:: shell - - yes y | pip uninstall torch torchvision - yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html - - - - -1.2 Import the necessary modules -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In this step we import the necessary Python modules for the tutorial. - -.. code:: python - - import logging - import numpy as np - import os - import random - import sys - import time - import torch - - from argparse import Namespace - from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) - from tqdm import tqdm - from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,) - from transformers import glue_compute_metrics as compute_metrics - from transformers import glue_output_modes as output_modes - from transformers import glue_processors as processors - from transformers import glue_convert_examples_to_features as convert_examples_to_features - - # Setup logging - logger = logging.getLogger(__name__) - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) - - logging.getLogger("transformers.modeling_utils").setLevel( - logging.WARN) # Reduce logging - - print(torch.__version__) - -We set the number of threads to compare the single thread performance between FP32 and INT8 performance. -In the end of the tutorial, the user can set other number of threads by building PyTorch with right parallel backend. - -.. 
code:: python - - torch.set_num_threads(1) - print(torch.__config__.parallel_info()) - - -1.3 Learn about helper functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The helper functions are built-in in transformers library. We mainly use -the following helper functions: one for converting the text examples -into the feature vectors; The other one for measuring the F1 score of -the predicted result. - -The `glue_convert_examples_to_features `_ function converts the texts into input features: - -- Tokenize the input sequences; -- Insert [CLS] in the beginning; -- Insert [SEP] between the first sentence and the second sentence, and - in the end; -- Generate token type ids to indicate whether a token belongs to the - first sequence or the second sequence. - -The `glue_compute_metrics `_ function has the compute metrics with -the `F1 score `_, which -can be interpreted as a weighted average of the precision and recall, -where an F1 score reaches its best value at 1 and worst score at 0. The -relative contribution of precision and recall to the F1 score are equal. - -- The equation for the F1 score is: -.. math:: F1 = 2 * (\text{precision} * \text{recall}) / (\text{precision} + \text{recall}) - - -1.4 Download the dataset -^^^^^^^^^^^^^^^^^^^^^^^^ - -Before running MRPC tasks we download the `GLUE data -`_ by running `this script -`_ -and unpack it to a directory ``glue_data``. - - -.. code:: shell - - python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' - - -2. Fine-tune the BERT model ---------------------------- - -The spirit of BERT is to pre-train the language representations and then -to fine-tune the deep bi-directional representations on a wide range of -tasks with minimal task-dependent parameters, and achieves -state-of-the-art results. In this tutorial, we will focus on fine-tuning -with the pre-trained BERT model to classify semantically equivalent -sentence pairs on MRPC task. - -To fine-tune the pre-trained BERT model (``bert-base-uncased`` model in -HuggingFace transformers) for the MRPC task, you can follow the command -in `examples `_: - -.. code:: python - - export GLUE_DIR=./glue_data - export TASK_NAME=MRPC - export OUT_DIR=./$TASK_NAME/ - python ./run_glue.py \ - --model_type bert \ - --model_name_or_path bert-base-uncased \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/$TASK_NAME \ - --max_seq_length 128 \ - --per_gpu_eval_batch_size=8 \ - --per_gpu_train_batch_size=8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --save_steps 100000 \ - --output_dir $OUT_DIR - -We provide the fine-tuned BERT model for MRPC task `here `_. -To save time, you can download the model file (~400 MB) directly into your local folder ``$OUT_DIR``. - -2.1 Set global configurations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Here we set the global configurations for evaluating the fine-tuned BERT -model before and after the dynamic quantization. - -.. code:: python - - configs = Namespace() - - # The output directory for the fine-tuned model, $OUT_DIR. - configs.output_dir = "./MRPC/" - - # The data directory for the MRPC task in the GLUE benchmark, $GLUE_DIR/$TASK_NAME. - configs.data_dir = "./glue_data/MRPC" - - # The model name or path for the pre-trained model. - configs.model_name_or_path = "bert-base-uncased" - # The maximum length of an input sequence - configs.max_seq_length = 128 - - # Prepare GLUE task. 
- configs.task_name = "MRPC".lower() - configs.processor = processors[configs.task_name]() - configs.output_mode = output_modes[configs.task_name] - configs.label_list = configs.processor.get_labels() - configs.model_type = "bert".lower() - configs.do_lower_case = True - - # Set the device, batch size, topology, and caching flags. - configs.device = "cpu" - configs.per_gpu_eval_batch_size = 8 - configs.n_gpu = 0 - configs.local_rank = -1 - configs.overwrite_cache = False - - - # Set random seed for reproducibility. - def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - set_seed(42) - - - -2.2 Load the fine-tuned BERT model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We load the tokenizer and fine-tuned BERT sequence classifier model -(FP32) from the ``configs.output_dir``. - -.. code:: python - - tokenizer = BertTokenizer.from_pretrained( - configs.output_dir, do_lower_case=configs.do_lower_case) - - model = BertForSequenceClassification.from_pretrained(configs.output_dir) - model.to(configs.device) - - -2.3 Define the tokenize and evaluation function -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We reuse the tokenize and evaluation function from `HuggingFace `_. - -.. code:: python - - # coding=utf-8 - # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu eval - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Eval! 
- logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return results - - - def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - label_list = processor.get_labels() - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: - # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not evaluate: - torch.distributed.barrier() # Make sure only 
the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - if output_mode == "classification": - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - elif output_mode == "regression": - all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - return dataset - - -3. Apply the dynamic quantization ---------------------------------- - -We call ``torch.quantization.quantize_dynamic`` on the model to apply -the dynamic quantization on the HuggingFace BERT model. Specifically, - -- We specify that we want the torch.nn.Linear modules in our model to - be quantized; -- We specify that we want weights to be converted to quantized int8 - values. - -.. code:: python - - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - - -3.1 Check the model size -^^^^^^^^^^^^^^^^^^^^^^^^ - -Let’s first check the model size. We can observe a significant reduction -in model size (FP32 total size: 438 MB; INT8 total size: 181 MB): - -.. code:: python - - def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - - print_size_of_model(model) - print_size_of_model(quantized_model) - - -The BERT model used in this tutorial (``bert-base-uncased``) has a -vocabulary size V of 30522. With the embedding size of 768, the total -size of the word embedding table is ~ 4 (Bytes/FP32) \* 30522 \* 768 = -90 MB. So with the help of quantization, the model size of the -non-embedding table part is reduced from 350 MB (FP32 model) to 90 MB -(INT8 model). - - -3.2 Evaluate the inference accuracy and time -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Next, let’s compare the inference time as well as the evaluation -accuracy between the original FP32 model and the INT8 model after the -dynamic quantization. - -.. code:: python - - def time_model_evaluation(model, configs, tokenizer): - eval_start_time = time.time() - result = evaluate(configs, model, tokenizer, prefix="") - eval_end_time = time.time() - eval_duration_time = eval_end_time - eval_start_time - print(result) - print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time)) - - # Evaluate the original FP32 BERT model - time_model_evaluation(model, configs, tokenizer) - - # Evaluate the INT8 BERT model after the dynamic quantization - time_model_evaluation(quantized_model, configs, tokenizer) - - -Running this locally on a MacBook Pro, without quantization, inference -(for all 408 examples in MRPC dataset) takes about 160 seconds, and with -quantization it takes just about 90 seconds. We summarize the results -for running the quantized BERT model inference on a Macbook Pro as the -follows: - -.. code:: - - | Prec | F1 score | Model Size | 1 thread | 4 threads | - | FP32 | 0.9019 | 438 MB | 160 sec | 85 sec | - | INT8 | 0.902 | 181 MB | 90 sec | 46 sec | - -We have 0.6% lower F1 score accuracy after applying the post-training dynamic -quantization on the fine-tuned BERT model on the MRPC task. 
As a -comparison, in a `recent paper `_ (Table 1), -it achieved 0.8788 by -applying the post-training dynamic quantization and 0.8956 by applying -the quantization-aware training. The main difference is that we support the -asymmetric quantization in PyTorch while that paper supports the -symmetric quantization only. - -Note that we set the number of threads to 1 for the single-thread -comparison in this tutorial. We also support the intra-op -parallelization for these quantized INT8 operators. Users can now -enable multi-threading with ``torch.set_num_threads(N)`` (``N`` is the number of -intra-op parallelization threads). One preliminary requirement to enable -the intra-op parallelization support is to build PyTorch with the right -`backend `_ -such as OpenMP, Native or TBB. -You can use ``torch.__config__.parallel_info()`` to check the -parallelization settings. On the same MacBook Pro using PyTorch with -Native backend for parallelization, we can get about 46 seconds for -processing the evaluation of the MRPC dataset. - - -3.3 Serialize the quantized model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We can serialize and save the quantized model for future use using -`torch.jit.save` after tracing the model. - -.. code:: python - - def ids_tensor(shape, vocab_size): - # Creates a random int32 tensor of the shape within the vocab size - return torch.randint(0, vocab_size, shape, dtype=torch.int, device='cpu') - - input_ids = ids_tensor([8, 128], 2) - token_type_ids = ids_tensor([8, 128], 2) - attention_mask = ids_tensor([8, 128], vocab_size=2) - dummy_input = (input_ids, attention_mask, token_type_ids) - traced_model = torch.jit.trace(quantized_model, dummy_input) - torch.jit.save(traced_model, "bert_traced_eager_quant.pt") - -To load the quantized model, we can use `torch.jit.load`: - -.. code:: python - - loaded_quantized_model = torch.jit.load("bert_traced_eager_quant.pt") - -Conclusion - ----------- - -In this tutorial, we demonstrated how to convert a -well-known state-of-the-art NLP model like BERT into a dynamically quantized -model. Dynamic quantization can reduce the size of the model while only -having a limited impact on accuracy. - -Thanks for reading! As always, we welcome any feedback, so please create -an issue `here `_ if you have -any. - - - -References - ------------ - -[1] J. Devlin, M. Chang, K. Lee and K. Toutanova, `BERT: Pre-training of -Deep Bidirectional Transformers for Language Understanding (2018) -`_. - -[2] `HuggingFace Transformers `_. - -[3] O. Zafrir, G. Boudoukh, P. Izsak, and M. Wasserblat (2019). `Q8BERT: -Quantized 8bit BERT `_. diff --git a/intermediate_source/ensembling.py b/intermediate_source/ensembling.py deleted file mode 100644 index 9199daf13a..0000000000 --- a/intermediate_source/ensembling.py +++ /dev/null @@ -1,175 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Model ensembling -================ - -This tutorial illustrates how to vectorize model ensembling using ``torch.vmap``. - -What is model ensembling? -------------------------- -Model ensembling combines the predictions from multiple models. -Traditionally this is done by running each model on some inputs separately -and then combining the predictions. However, if you're running models with -the same architecture, then it may be possible to combine them together -using ``torch.vmap``. ``vmap`` is a function transform that maps functions across -dimensions of the input tensors. One of its use cases is eliminating -for-loops and speeding them up through vectorization. 
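As a tiny illustrative sketch (separate from the ensemble example that follows, and
assuming only ``import torch``), ``vmap`` turns a function written for single samples
into one that works on a whole batch:

.. code-block:: python

    import torch

    xs = torch.randn(16, 5)
    ys = torch.randn(16, 5)
    # Map torch.dot over the leading dimension: 16 dot products, no Python loop.
    batched_dot = torch.vmap(torch.dot)
    assert batched_dot(xs, ys).shape == (16,)
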
- -Let's demonstrate how to do this using an ensemble of simple MLPs. - -.. note:: - - This tutorial requires PyTorch 2.0.0 or later. -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -torch.manual_seed(0) - -# Here's a simple MLP -class SimpleMLP(nn.Module): - def __init__(self): - super(SimpleMLP, self).__init__() - self.fc1 = nn.Linear(784, 128) - self.fc2 = nn.Linear(128, 128) - self.fc3 = nn.Linear(128, 10) - - def forward(self, x): - x = x.flatten(1) - x = self.fc1(x) - x = F.relu(x) - x = self.fc2(x) - x = F.relu(x) - x = self.fc3(x) - return x - -###################################################################### -# Let’s generate a batch of dummy data and pretend that we’re working with -# an MNIST dataset. Thus, the dummy images are 28 by 28, and we have a -# minibatch of size 64. Furthermore, lets say we want to combine the predictions -# from 10 different models. - -device = 'cuda' -num_models = 10 - -data = torch.randn(100, 64, 1, 28, 28, device=device) -targets = torch.randint(10, (6400,), device=device) - -models = [SimpleMLP().to(device) for _ in range(num_models)] - -###################################################################### -# We have a couple of options for generating predictions. Maybe we want to -# give each model a different randomized minibatch of data. Alternatively, -# maybe we want to run the same minibatch of data through each model (e.g. -# if we were testing the effect of different model initializations). - -###################################################################### -# Option 1: different minibatch for each model - -minibatches = data[:num_models] -predictions_diff_minibatch_loop = [model(minibatch) for model, minibatch in zip(models, minibatches)] - -###################################################################### -# Option 2: Same minibatch - -minibatch = data[0] -predictions2 = [model(minibatch) for model in models] - -###################################################################### -# Using ``vmap`` to vectorize the ensemble -# ---------------------------------------- -# -# Let's use ``vmap`` to speed up the for-loop. We must first prepare the models -# for use with ``vmap``. -# -# First, let’s combine the states of the model together by stacking each -# parameter. For example, ``model[i].fc1.weight`` has shape ``[784, 128]``; we are -# going to stack the ``.fc1.weight`` of each of the 10 models to produce a big -# weight of shape ``[10, 784, 128]``. -# -# PyTorch offers the ``torch.func.stack_module_state`` convenience function to do -# this. -from torch.func import stack_module_state - -params, buffers = stack_module_state(models) - -###################################################################### -# Next, we need to define a function to ``vmap`` over. The function should, -# given parameters and buffers and inputs, run the model using those -# parameters, buffers, and inputs. We'll use ``torch.func.functional_call`` -# to help out: - -from torch.func import functional_call -import copy - -# Construct a "stateless" version of one of the models. It is "stateless" in -# the sense that the parameters are meta Tensors and do not have storage. -base_model = copy.deepcopy(models[0]) -base_model = base_model.to('meta') - -def fmodel(params, buffers, x): - return functional_call(base_model, (params, buffers), (x,)) - -###################################################################### -# Option 1: get predictions using a different minibatch for each model. 
-# -# By default, ``vmap`` maps a function across the first dimension of all inputs to -# the passed-in function. After using ``stack_module_state``, each of -# the ``params`` and buffers have an additional dimension of size 'num_models' at -# the front, and minibatches has a dimension of size 'num_models'. - -print([p.size(0) for p in params.values()]) # show the leading 'num_models' dimension - -assert minibatches.shape == (num_models, 64, 1, 28, 28) # verify minibatch has leading dimension of size 'num_models' - -from torch import vmap - -predictions1_vmap = vmap(fmodel)(params, buffers, minibatches) - -# verify the ``vmap`` predictions match the -assert torch.allclose(predictions1_vmap, torch.stack(predictions_diff_minibatch_loop), atol=1e-3, rtol=1e-5) - -###################################################################### -# Option 2: get predictions using the same minibatch of data. -# -# ``vmap`` has an ``in_dims`` argument that specifies which dimensions to map over. -# By using ``None``, we tell ``vmap`` we want the same minibatch to apply for all of -# the 10 models. - -predictions2_vmap = vmap(fmodel, in_dims=(0, 0, None))(params, buffers, minibatch) - -assert torch.allclose(predictions2_vmap, torch.stack(predictions2), atol=1e-3, rtol=1e-5) - -###################################################################### -# A quick note: there are limitations around what types of functions can be -# transformed by ``vmap``. The best functions to transform are ones that are pure -# functions: a function where the outputs are only determined by the inputs -# that have no side effects (e.g. mutation). ``vmap`` is unable to handle mutation -# of arbitrary Python data structures, but it is able to handle many in-place -# PyTorch operations. - -###################################################################### -# Performance -# ----------- -# Curious about performance numbers? Here's how the numbers look. - -from torch.utils.benchmark import Timer -without_vmap = Timer( - stmt="[model(minibatch) for model, minibatch in zip(models, minibatches)]", - globals=globals()) -with_vmap = Timer( - stmt="vmap(fmodel)(params, buffers, minibatches)", - globals=globals()) -print(f'Predictions without vmap {without_vmap.timeit(100)}') -print(f'Predictions with vmap {with_vmap.timeit(100)}') - -###################################################################### -# There's a large speedup using ``vmap``! -# -# In general, vectorization with ``vmap`` should be faster than running a function -# in a for-loop and competitive with manual batching. There are some exceptions -# though, like if we haven’t implemented the ``vmap`` rule for a particular -# operation or if the underlying kernels weren’t optimized for older hardware -# (GPUs). If you see any of these cases, please let us know by opening an issue -# on GitHub. diff --git a/intermediate_source/flask_rest_api_tutorial.py b/intermediate_source/flask_rest_api_tutorial.py deleted file mode 100644 index 8b0162a9e8..0000000000 --- a/intermediate_source/flask_rest_api_tutorial.py +++ /dev/null @@ -1,335 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Deploying PyTorch in Python via a REST API with Flask -======================================================== -**Author**: `Avinash Sajjanshetty `_ - -In this tutorial, we will deploy a PyTorch model using Flask and expose a -REST API for model inference. In particular, we will deploy a pretrained -DenseNet 121 model which detects the image. - -.. 
tip:: All the code used here is released under MIT license and is available on `Github `_. - -This represents the first in a series of tutorials on deploying PyTorch models -in production. Using Flask in this way is by far the easiest way to start -serving your PyTorch models, but it will not work for a use case -with high performance requirements. For that: - - - If you're already familiar with TorchScript, you can jump straight into our - `Loading a TorchScript Model in C++ `_ tutorial. - - - If you first need a refresher on TorchScript, check out our - `Intro a TorchScript `_ tutorial. -""" - - -###################################################################### -# API Definition -# -------------- -# -# We will first define our API endpoints, the request and response types. Our -# API endpoint will be at ``/predict`` which takes HTTP POST requests with a -# ``file`` parameter which contains the image. The response will be of JSON -# response containing the prediction: -# -# .. code-block:: sh -# -# {"class_id": "n02124075", "class_name": "Egyptian_cat"} -# -# - -###################################################################### -# Dependencies -# ------------ -# -# Install the required dependencies by running the following command: -# -# .. code-block:: sh -# -# pip install Flask==2.0.1 torchvision==0.10.0 - - -###################################################################### -# Simple Web Server -# ----------------- -# -# Following is a simple web server, taken from Flask's documentation - - -from flask import Flask -app = Flask(__name__) - - -@app.route('/') -def hello(): - return 'Hello World!' - -############################################################################### -# We will also change the response type, so that it returns a JSON response -# containing ImageNet class id and name. The updated ``app.py`` file will -# be now: - -from flask import Flask, jsonify -app = Flask(__name__) - -@app.route('/predict', methods=['POST']) -def predict(): - return jsonify({'class_id': 'IMAGE_NET_XXX', 'class_name': 'Cat'}) - - -###################################################################### -# Inference -# ----------------- -# -# In the next sections we will focus on writing the inference code. This will -# involve two parts, one where we prepare the image so that it can be fed -# to DenseNet and next, we will write the code to get the actual prediction -# from the model. -# -# Preparing the image -# ~~~~~~~~~~~~~~~~~~~ -# -# DenseNet model requires the image to be of 3 channel RGB image of size -# 224 x 224. We will also normalize the image tensor with the required mean -# and standard deviation values. You can read more about it -# `here `_. -# -# We will use ``transforms`` from ``torchvision`` library and build a -# transform pipeline, which transforms our images as required. You -# can read more about transforms `here `_. - -import io - -import torchvision.transforms as transforms -from PIL import Image - -def transform_image(image_bytes): - my_transforms = transforms.Compose([transforms.Resize(255), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - [0.485, 0.456, 0.406], - [0.229, 0.224, 0.225])]) - image = Image.open(io.BytesIO(image_bytes)) - return my_transforms(image).unsqueeze(0) - -###################################################################### -# The above method takes image data in bytes, applies the series of transforms -# and returns a tensor. 
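#
# .. note::
#
#    The following aside is ours, not part of the original tutorial.
#    ``Image.open`` returns whatever mode the file was saved in (for example,
#    grayscale or RGBA), while DenseNet expects 3-channel RGB input. A
#    hypothetical, more defensive variant of ``transform_image`` could force
#    the conversion first:
#
#    .. code-block:: python
#
#       def transform_image_rgb(image_bytes):
#           my_transforms = transforms.Compose([transforms.Resize(255),
#                                               transforms.CenterCrop(224),
#                                               transforms.ToTensor(),
#                                               transforms.Normalize(
#                                                   [0.485, 0.456, 0.406],
#                                                   [0.229, 0.224, 0.225])])
#           image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
#           return my_transforms(image).unsqueeze(0)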
To test the above method, read an image file in -# bytes mode (first replacing `../_static/img/sample_file.jpeg` with the actual -# path to the file on your computer) and see if you get a tensor back: - -with open("../_static/img/sample_file.jpeg", 'rb') as f: - image_bytes = f.read() - tensor = transform_image(image_bytes=image_bytes) - print(tensor) - -###################################################################### -# Prediction -# ~~~~~~~~~~~~~~~~~~~ -# -# Now will use a pretrained DenseNet 121 model to predict the image class. We -# will use one from ``torchvision`` library, load the model and get an -# inference. While we'll be using a pretrained model in this example, you can -# use this same approach for your own models. See more about loading your -# models in this :doc:`tutorial `. - -from torchvision import models - -# Make sure to set `weights` as `'IMAGENET1K_V1'` to use the pretrained weights: -model = models.densenet121(weights='IMAGENET1K_V1') -# Since we are using our model only for inference, switch to `eval` mode: -model.eval() - - -def get_prediction(image_bytes): - tensor = transform_image(image_bytes=image_bytes) - outputs = model.forward(tensor) - _, y_hat = outputs.max(1) - return y_hat - -###################################################################### -# The tensor ``y_hat`` will contain the index of the predicted class id. -# However, we need a human readable class name. For that we need a class id -# to name mapping. Download -# `this file `_ -# as ``imagenet_class_index.json`` and remember where you saved it (or, if you -# are following the exact steps in this tutorial, save it in -# `tutorials/_static`). This file contains the mapping of ImageNet class id to -# ImageNet class name. We will load this JSON file and get the class name of -# the predicted index. - -import json - -imagenet_class_index = json.load(open('../_static/imagenet_class_index.json')) - -def get_prediction(image_bytes): - tensor = transform_image(image_bytes=image_bytes) - outputs = model.forward(tensor) - _, y_hat = outputs.max(1) - predicted_idx = str(y_hat.item()) - return imagenet_class_index[predicted_idx] - - -###################################################################### -# Before using ``imagenet_class_index`` dictionary, first we will convert -# tensor value to a string value, since the keys in the -# ``imagenet_class_index`` dictionary are strings. -# We will test our above method: - - -with open("../_static/img/sample_file.jpeg", 'rb') as f: - image_bytes = f.read() - print(get_prediction(image_bytes=image_bytes)) - -###################################################################### -# You should get a response like this: - -['n02124075', 'Egyptian_cat'] - -###################################################################### -# The first item in array is ImageNet class id and second item is the human -# readable name. -# - -###################################################################### -# Integrating the model in our API Server -# --------------------------------------- -# -# In this final part we will add our model to our Flask API server. Since -# our API server is supposed to take an image file, we will update our ``predict`` -# method to read files from the requests: -# -# .. 
code-block:: python -# -# from flask import request -# -# @app.route('/predict', methods=['POST']) -# def predict(): -# if request.method == 'POST': -# # we will get the file from the request -# file = request.files['file'] -# # convert that to bytes -# img_bytes = file.read() -# class_id, class_name = get_prediction(image_bytes=img_bytes) -# return jsonify({'class_id': class_id, 'class_name': class_name}) -# -# -###################################################################### -# The ``app.py`` file is now complete. Following is the full version; replace -# the paths with the paths where you saved your files and it should run: -# -# .. code-block:: python -# -# import io -# import json -# -# from torchvision import models -# import torchvision.transforms as transforms -# from PIL import Image -# from flask import Flask, jsonify, request -# -# -# app = Flask(__name__) -# imagenet_class_index = json.load(open('/imagenet_class_index.json')) -# model = models.densenet121(weights='IMAGENET1K_V1') -# model.eval() -# -# -# def transform_image(image_bytes): -# my_transforms = transforms.Compose([transforms.Resize(255), -# transforms.CenterCrop(224), -# transforms.ToTensor(), -# transforms.Normalize( -# [0.485, 0.456, 0.406], -# [0.229, 0.224, 0.225])]) -# image = Image.open(io.BytesIO(image_bytes)) -# return my_transforms(image).unsqueeze(0) -# -# -# def get_prediction(image_bytes): -# tensor = transform_image(image_bytes=image_bytes) -# outputs = model.forward(tensor) -# _, y_hat = outputs.max(1) -# predicted_idx = str(y_hat.item()) -# return imagenet_class_index[predicted_idx] -# -# -# @app.route('/predict', methods=['POST']) -# def predict(): -# if request.method == 'POST': -# file = request.files['file'] -# img_bytes = file.read() -# class_id, class_name = get_prediction(image_bytes=img_bytes) -# return jsonify({'class_id': class_id, 'class_name': class_name}) -# -# -# if __name__ == '__main__': -# app.run() -# -# -###################################################################### -# Let's test our web server! Run: -# -# .. code-block:: sh -# -# FLASK_ENV=development FLASK_APP=app.py flask run -# -####################################################################### -# We can use the -# `requests `_ -# library to send a POST request to our app: -# -# .. code-block:: python -# -# import requests -# -# resp = requests.post("http://localhost:5000/predict", -# files={"file": open('/cat.jpg','rb')}) -# - -####################################################################### -# Printing `resp.json()` will now show the following: -# -# .. code-block:: sh -# -# {"class_id": "n02124075", "class_name": "Egyptian_cat"} -# -###################################################################### -# Next steps -# -------------- -# -# The server we wrote is quite trivial and may not do everything -# you need for your production application. So, here are some things you -# can do to make it better: -# -# - The endpoint ``/predict`` assumes that always there will be a image file -# in the request. This may not hold true for all requests. Our user may -# send image with a different parameter or send no images at all. -# -# - The user may send non-image type files too. Since we are not handling -# errors, this will break our server. Adding an explicit error handing -# path that will throw an exception would allow us to better handle -# the bad inputs -# -# - Even though the model can recognize a large number of classes of images, -# it may not be able to recognize all images. 
Enhance the implementation -# to handle cases when the model does not recognize anything in the image. -# -# - We run the Flask server in the development mode, which is not suitable for -# deploying in production. You can check out `this tutorial `_ -# for deploying a Flask server in production. -# -# - You can also add a UI by creating a page with a form which takes the image and -# displays the prediction. Check out the `demo `_ -# of a similar project and its `source code `_. -# -# - In this tutorial, we only showed how to build a service that could return predictions for -# a single image at a time. We could modify our service to be able to return predictions for -# multiple images at once. In addition, the `service-streamer `_ -# library automatically queues requests to your service and samples them into mini-batches -# that can be fed into your model. You can check out `this tutorial `_. -# -# - Finally, we encourage you to check out our other tutorials on deploying PyTorch models -# linked-to at the top of the page. -# diff --git a/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst b/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst deleted file mode 100644 index 4c9752d016..0000000000 --- a/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst +++ /dev/null @@ -1,11 +0,0 @@ -Forced Alignment with Wav2Vec2 -============================== - -This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/forced_alignment_tutorial.html - -It will redirect in 3 seconds. - -.. raw:: html - - - diff --git a/intermediate_source/forward_ad_usage.py b/intermediate_source/forward_ad_usage.py deleted file mode 100644 index 10965d64ab..0000000000 --- a/intermediate_source/forward_ad_usage.py +++ /dev/null @@ -1,246 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Forward-mode Automatic Differentiation (Beta) -============================================= - -This tutorial demonstrates how to use forward-mode AD to compute -directional derivatives (or equivalently, Jacobian-vector products). - -The tutorial below uses some APIs only available in versions >= 1.11 -(or nightly builds). - -Also note that forward-mode AD is currently in beta. The API is -subject to change and operator coverage is still incomplete. - -Basic Usage --------------------------------------------------------------------- -Unlike reverse-mode AD, forward-mode AD computes gradients eagerly -alongside the forward pass. We can use forward-mode AD to compute a -directional derivative by performing the forward pass as before, -except we first associate our input with another tensor representing -the direction of the directional derivative (or equivalently, the ``v`` -in a Jacobian-vector product). When an input, which we call "primal", is -associated with a "direction" tensor, which we call "tangent", the -resultant new tensor object is called a "dual tensor" for its connection -to dual numbers[0]. - -As the forward pass is performed, if any input tensors are dual tensors, -extra computation is performed to propagate this "sensitivity" of the -function. - -""" - -import torch -import torch.autograd.forward_ad as fwAD - -primal = torch.randn(10, 10) -tangent = torch.randn(10, 10) - -def fn(x, y): - return x ** 2 + y ** 2 - -# All forward AD computation must be performed in the context of -# a ``dual_level`` context. All dual tensors created in such a context -# will have their tangents destroyed upon exit. 
This is to ensure that -# if the output or intermediate results of this computation are reused -# in a future forward AD computation, their tangents (which are associated -# with this computation) won't be confused with tangents from the later -# computation. -with fwAD.dual_level(): - # To create a dual tensor we associate a tensor, which we call the - # primal with another tensor of the same size, which we call the tangent. - # If the layout of the tangent is different from that of the primal, - # The values of the tangent are copied into a new tensor with the same - # metadata as the primal. Otherwise, the tangent itself is used as-is. - # - # It is also important to note that the dual tensor created by - # ``make_dual`` is a view of the primal. - dual_input = fwAD.make_dual(primal, tangent) - assert fwAD.unpack_dual(dual_input).tangent is tangent - - # To demonstrate the case where the copy of the tangent happens, - # we pass in a tangent with a layout different from that of the primal - dual_input_alt = fwAD.make_dual(primal, tangent.T) - assert fwAD.unpack_dual(dual_input_alt).tangent is not tangent - - # Tensors that do not have an associated tangent are automatically - # considered to have a zero-filled tangent of the same shape. - plain_tensor = torch.randn(10, 10) - dual_output = fn(dual_input, plain_tensor) - - # Unpacking the dual returns a ``namedtuple`` with ``primal`` and ``tangent`` - # as attributes - jvp = fwAD.unpack_dual(dual_output).tangent - -assert fwAD.unpack_dual(dual_output).tangent is None - -###################################################################### -# Usage with Modules -# -------------------------------------------------------------------- -# To use ``nn.Module`` with forward AD, replace the parameters of your -# model with dual tensors before performing the forward pass. At the -# time of writing, it is not possible to create dual tensor -# `nn.Parameter`s. As a workaround, one must register the dual tensor -# as a non-parameter attribute of the module. - -import torch.nn as nn - -model = nn.Linear(5, 5) -input = torch.randn(16, 5) - -params = {name: p for name, p in model.named_parameters()} -tangents = {name: torch.rand_like(p) for name, p in params.items()} - -with fwAD.dual_level(): - for name, p in params.items(): - delattr(model, name) - setattr(model, name, fwAD.make_dual(p, tangents[name])) - - out = model(input) - jvp = fwAD.unpack_dual(out).tangent - -###################################################################### -# Using the functional Module API (beta) -# -------------------------------------------------------------------- -# Another way to use ``nn.Module`` with forward AD is to utilize -# the functional Module API (also known as the stateless Module API). - -from torch.func import functional_call - -# We need a fresh module because the functional call requires the -# the model to have parameters registered. -model = nn.Linear(5, 5) - -dual_params = {} -with fwAD.dual_level(): - for name, p in params.items(): - # Using the same ``tangents`` from the above section - dual_params[name] = fwAD.make_dual(p, tangents[name]) - out = functional_call(model, dual_params, input) - jvp2 = fwAD.unpack_dual(out).tangent - -# Check our results -assert torch.allclose(jvp, jvp2) - -###################################################################### -# Custom autograd Function -# -------------------------------------------------------------------- -# Custom Functions also support forward-mode AD. 
To create custom Function -# supporting forward-mode AD, register the ``jvp()`` static method. It is -# possible, but not mandatory for custom Functions to support both forward -# and backward AD. See the -# `documentation `_ -# for more information. - -class Fn(torch.autograd.Function): - @staticmethod - def forward(ctx, foo): - result = torch.exp(foo) - # Tensors stored in ``ctx`` can be used in the subsequent forward grad - # computation. - ctx.result = result - return result - - @staticmethod - def jvp(ctx, gI): - gO = gI * ctx.result - # If the tensor stored in`` ctx`` will not also be used in the backward pass, - # one can manually free it using ``del`` - del ctx.result - return gO - -fn = Fn.apply - -primal = torch.randn(10, 10, dtype=torch.double, requires_grad=True) -tangent = torch.randn(10, 10) - -with fwAD.dual_level(): - dual_input = fwAD.make_dual(primal, tangent) - dual_output = fn(dual_input) - jvp = fwAD.unpack_dual(dual_output).tangent - -# It is important to use ``autograd.gradcheck`` to verify that your -# custom autograd Function computes the gradients correctly. By default, -# ``gradcheck`` only checks the backward-mode (reverse-mode) AD gradients. Specify -# ``check_forward_ad=True`` to also check forward grads. If you did not -# implement the backward formula for your function, you can also tell ``gradcheck`` -# to skip the tests that require backward-mode AD by specifying -# ``check_backward_ad=False``, ``check_undefined_grad=False``, and -# ``check_batched_grad=False``. -torch.autograd.gradcheck(Fn.apply, (primal,), check_forward_ad=True, - check_backward_ad=False, check_undefined_grad=False, - check_batched_grad=False) - -###################################################################### -# Functional API (beta) -# -------------------------------------------------------------------- -# We also offer a higher-level functional API in functorch -# for computing Jacobian-vector products that you may find simpler to use -# depending on your use case. -# -# The benefit of the functional API is that there isn't a need to understand -# or use the lower-level dual tensor API and that you can compose it with -# other `functorch transforms (like vmap) `_; -# the downside is that it offers you less control. -# -# Note that the remainder of this tutorial will require functorch -# (https://github.com/pytorch/functorch) to run. Please find installation -# instructions at the specified link. - -import functorch as ft - -primal0 = torch.randn(10, 10) -tangent0 = torch.randn(10, 10) -primal1 = torch.randn(10, 10) -tangent1 = torch.randn(10, 10) - -def fn(x, y): - return x ** 2 + y ** 2 - -# Here is a basic example to compute the JVP of the above function. -# The ``jvp(func, primals, tangents)`` returns ``func(*primals)`` as well as the -# computed Jacobian-vector product (JVP). Each primal must be associated with a tangent of the same shape. -primal_out, tangent_out = ft.jvp(fn, (primal0, primal1), (tangent0, tangent1)) - -# ``functorch.jvp`` requires every primal to be associated with a tangent. 
-# If we only want to associate certain inputs to `fn` with tangents, -# then we'll need to create a new function that captures inputs without tangents: -primal = torch.randn(10, 10) -tangent = torch.randn(10, 10) -y = torch.randn(10, 10) - -import functools -new_fn = functools.partial(fn, y=y) -primal_out, tangent_out = ft.jvp(new_fn, (primal,), (tangent,)) - -###################################################################### -# Using the functional API with Modules -# -------------------------------------------------------------------- -# To use ``nn.Module`` with ``functorch.jvp`` to compute Jacobian-vector products -# with respect to the model parameters, we need to reformulate the -# ``nn.Module`` as a function that accepts both the model parameters and inputs -# to the module. - -model = nn.Linear(5, 5) -input = torch.randn(16, 5) -tangents = tuple([torch.rand_like(p) for p in model.parameters()]) - -# Given a ``torch.nn.Module``, ``ft.make_functional_with_buffers`` extracts the state -# (``params`` and buffers) and returns a functional version of the model that -# can be invoked like a function. -# That is, the returned ``func`` can be invoked like -# ``func(params, buffers, input)``. -# ``ft.make_functional_with_buffers`` is analogous to the ``nn.Modules`` stateless API -# that you saw previously and we're working on consolidating the two. -func, params, buffers = ft.make_functional_with_buffers(model) - -# Because ``jvp`` requires every input to be associated with a tangent, we need to -# create a new function that, when given the parameters, produces the output -def func_params_only(params): - return func(params, buffers, input) - -model_output, jvp_out = ft.jvp(func_params_only, (params,), (tangents,)) - - -###################################################################### -# [0] https://en.wikipedia.org/wiki/Dual_number diff --git a/intermediate_source/fx_conv_bn_fuser.py b/intermediate_source/fx_conv_bn_fuser.py deleted file mode 100644 index 547f93fb7f..0000000000 --- a/intermediate_source/fx_conv_bn_fuser.py +++ /dev/null @@ -1,262 +0,0 @@ -# -*- coding: utf-8 -*- -""" -(beta) Building a Convolution/Batch Norm fuser in FX -******************************************************* -**Author**: `Horace He `_ - -In this tutorial, we are going to use FX, a toolkit for composable function -transformations of PyTorch, to do the following: - -1) Find patterns of conv/batch norm in the data dependencies. -2) For the patterns found in 1), fold the batch norm statistics into the convolution weights. - -Note that this optimization only works for models in inference mode (i.e. `mode.eval()`) - -We will be building the fuser that exists here: -https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/fx/experimental/fuser.py - -""" - - -###################################################################### -# First, let's get some imports out of the way (we will be using all -# of these later in the code). - -from typing import Type, Dict, Any, Tuple, Iterable -import copy -import torch.fx as fx -import torch -import torch.nn as nn - -###################################################################### -# For this tutorial, we are going to create a model consisting of convolutions -# and batch norms. Note that this model has some tricky components - some of -# the conv/batch norm patterns are hidden within Sequentials and one of the -# ``BatchNorms`` is wrapped in another Module. 
- -class WrappedBatchNorm(nn.Module): - def __init__(self): - super().__init__() - self.mod = nn.BatchNorm2d(1) - def forward(self, x): - return self.mod(x) - -class M(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 1, 1) - self.bn1 = nn.BatchNorm2d(1) - self.conv2 = nn.Conv2d(1, 1, 1) - self.nested = nn.Sequential( - nn.BatchNorm2d(1), - nn.Conv2d(1, 1, 1), - ) - self.wrapped = WrappedBatchNorm() - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.conv2(x) - x = self.nested(x) - x = self.wrapped(x) - return x - -model = M() - -model.eval() - -###################################################################### -# Fusing Convolution with Batch Norm -# ----------------------------------------- -# One of the primary challenges with trying to automatically fuse convolution -# and batch norm in PyTorch is that PyTorch does not provide an easy way of -# accessing the computational graph. FX resolves this problem by symbolically -# tracing the actual operations called, so that we can track the computations -# through the `forward` call, nested within Sequential modules, or wrapped in -# an user-defined module. - -traced_model = torch.fx.symbolic_trace(model) -print(traced_model.graph) - -###################################################################### -# This gives us a graph representation of our model. Note that both the modules -# hidden within the sequential as well as the wrapped Module have been inlined -# into the graph. This is the default level of abstraction, but it can be -# configured by the pass writer. More information can be found at the FX -# overview https://pytorch.org/docs/master/fx.html#module-torch.fx - - -#################################### -# Fusing Convolution with Batch Norm -# ---------------------------------- -# Unlike some other fusions, fusion of convolution with batch norm does not -# require any new operators. Instead, as batch norm during inference -# consists of a pointwise add and multiply, these operations can be "baked" -# into the preceding convolution's weights. This allows us to remove the batch -# norm entirely from our model! Read -# https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ for further details. The -# code here is copied from -# https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/nn/utils/fusion.py -# clarity purposes. -def fuse_conv_bn_eval(conv, bn): - """ - Given a conv Module `A` and an batch_norm module `B`, returns a conv - module `C` such that C(x) == B(A(x)) in inference mode. - """ - assert(not (conv.training or bn.training)), "Fusion only for eval!" 
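    # (Added note) At inference time, batch norm applies the affine map
    #   y = (x - running_mean) / sqrt(running_var + eps) * weight + bias,
    # so it can be folded into the preceding convolution by rescaling the conv
    # weights and adjusting the conv bias; ``fuse_conv_bn_weights`` below does
    # exactly that.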
- fused_conv = copy.deepcopy(conv) - - fused_conv.weight, fused_conv.bias = \ - fuse_conv_bn_weights(fused_conv.weight, fused_conv.bias, - bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias) - - return fused_conv - -def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): - if conv_b is None: - conv_b = torch.zeros_like(bn_rm) - if bn_w is None: - bn_w = torch.ones_like(bn_rm) - if bn_b is None: - bn_b = torch.zeros_like(bn_rm) - bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) - - conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) - conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b - - return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b) - - -#################################### -# FX Fusion Pass -# ---------------------------------- -# Now that we have our computational graph as well as a method for fusing -# convolution and batch norm, all that remains is to iterate over the FX graph -# and apply the desired fusions. - - -def _parent_name(target : str) -> Tuple[str, str]: - """ - Splits a ``qualname`` into parent path and last atom. - For example, `foo.bar.baz` -> (`foo.bar`, `baz`) - """ - *parent, name = target.rsplit('.', 1) - return parent[0] if parent else '', name - -def replace_node_module(node: fx.Node, modules: Dict[str, Any], new_module: torch.nn.Module): - assert(isinstance(node.target, str)) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, new_module) - - -def fuse(model: torch.nn.Module) -> torch.nn.Module: - model = copy.deepcopy(model) - # The first step of most FX passes is to symbolically trace our model to - # obtain a `GraphModule`. This is a representation of our original model - # that is functionally identical to our original model, except that we now - # also have a graph representation of our forward pass. - fx_model: fx.GraphModule = fx.symbolic_trace(model) - modules = dict(fx_model.named_modules()) - - # The primary representation for working with FX are the `Graph` and the - # `Node`. Each `GraphModule` has a `Graph` associated with it - this - # `Graph` is also what generates `GraphModule.code`. - # The `Graph` itself is represented as a list of `Node` objects. Thus, to - # iterate through all of the operations in our graph, we iterate over each - # `Node` in our `Graph`. - for node in fx_model.graph.nodes: - # The FX IR contains several types of nodes, which generally represent - # call sites to modules, functions, or methods. The type of node is - # determined by `Node.op`. - if node.op != 'call_module': # If our current node isn't calling a Module then we can ignore it. - continue - # For call sites, `Node.target` represents the module/function/method - # that's being called. Here, we check `Node.target` to see if it's a - # batch norm module, and then check `Node.args[0].target` to see if the - # input `Node` is a convolution. - if type(modules[node.target]) is nn.BatchNorm2d and type(modules[node.args[0].target]) is nn.Conv2d: - if len(node.args[0].users) > 1: # Output of conv is used by other nodes - continue - conv = modules[node.args[0].target] - bn = modules[node.target] - fused_conv = fuse_conv_bn_eval(conv, bn) - replace_node_module(node.args[0], modules, fused_conv) - # As we've folded the batch nor into the conv, we need to replace all uses - # of the batch norm with the conv. - node.replace_all_uses_with(node.args[0]) - # Now that all uses of the batch norm have been replaced, we can - # safely remove the batch norm. 
- fx_model.graph.erase_node(node) - fx_model.graph.lint() - # After we've modified our graph, we need to recompile our graph in order - # to keep the generated code in sync. - fx_model.recompile() - return fx_model - - -###################################################################### -# .. note:: -# We make some simplifications here for demonstration purposes, such as only -# matching 2D convolutions. View -# https://github.com/pytorch/pytorch/blob/master/torch/fx/experimental/fuser.py -# for a more usable pass. - -###################################################################### -# Testing out our Fusion Pass -# ----------------------------------------- -# We can now run this fusion pass on our initial toy model and verify that our -# results are identical. In addition, we can print out the code for our fused -# model and verify that there are no more batch norms. - - -fused_model = fuse(model) -print(fused_model.code) -inp = torch.randn(5, 1, 1, 1) -torch.testing.assert_allclose(fused_model(inp), model(inp)) - - -###################################################################### -# Benchmarking our Fusion on ResNet18 -# ----------------------------------- -# We can test our fusion pass on a larger model like ResNet18 and see how much -# this pass improves inference performance. -import torchvision.models as models -import time - -rn18 = models.resnet18() -rn18.eval() - -inp = torch.randn(10, 3, 224, 224) -output = rn18(inp) - -def benchmark(model, iters=20): - for _ in range(10): - model(inp) - begin = time.time() - for _ in range(iters): - model(inp) - return str(time.time()-begin) - -fused_rn18 = fuse(rn18) -print("Unfused time: ", benchmark(rn18)) -print("Fused time: ", benchmark(fused_rn18)) -###################################################################### -# As we previously saw, the output of our FX transformation is -# ("torchscriptable") PyTorch code, we can easily ``jit.script`` the output to try -# and increase our performance even more. In this way, our FX model -# transformation composes with TorchScript with no issues. -jit_rn18 = torch.jit.script(fused_rn18) -print("jit time: ", benchmark(jit_rn18)) - - -############ -# Conclusion -# ---------- -# As we can see, using FX we can easily write static graph transformations on -# PyTorch code. -# -# Since FX is still in beta, we would be happy to hear any -# feedback you have about using it. Please feel free to use the -# PyTorch Forums (https://discuss.pytorch.org/) and the issue tracker -# (https://github.com/pytorch/pytorch/issues) to provide any feedback -# you might have. diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py deleted file mode 100644 index 8caaf7be39..0000000000 --- a/intermediate_source/fx_profiling_tutorial.py +++ /dev/null @@ -1,236 +0,0 @@ -# -*- coding: utf-8 -*- -""" -(beta) Building a Simple CPU Performance Profiler with FX -********************************************************* -**Author**: `James Reed `_ - -In this tutorial, we are going to use FX to do the following: - -1) Capture PyTorch Python code in a way that we can inspect and gather - statistics about the structure and execution of the code -2) Build out a small class that will serve as a simple performance "profiler", - collecting runtime statistics about each part of the model from actual - runs. 
- -""" - -###################################################################### -# For this tutorial, we are going to use the torchvision ResNet18 model -# for demonstration purposes. - -import torch -import torch.fx -import torchvision.models as models - -rn18 = models.resnet18() -rn18.eval() - -###################################################################### -# Now that we have our model, we want to inspect deeper into its -# performance. That is, for the following invocation, which parts -# of the model are taking the longest? -input = torch.randn(5, 3, 224, 224) -output = rn18(input) - -###################################################################### -# A common way of answering that question is to go through the program -# source, add code that collects timestamps at various points in the -# program, and compare the difference between those timestamps to see -# how long the regions between the timestamps take. -# -# That technique is certainly applicable to PyTorch code, however it -# would be nicer if we didn't have to copy over model code and edit it, -# especially code we haven't written (like this torchvision model). -# Instead, we are going to use FX to automate this "instrumentation" -# process without needing to modify any source. - -###################################################################### -# First, let's get some imports out of the way (we will be using all -# of these later in the code). - -import statistics, tabulate, time -from typing import Any, Dict, List -from torch.fx import Interpreter - -###################################################################### -# .. note:: -# ``tabulate`` is an external library that is not a dependency of PyTorch. -# We will be using it to more easily visualize performance data. Please -# make sure you've installed it from your favorite Python package source. - -###################################################################### -# Capturing the Model with Symbolic Tracing -# ----------------------------------------- -# Next, we are going to use FX's symbolic tracing mechanism to capture -# the definition of our model in a data structure we can manipulate -# and examine. - -traced_rn18 = torch.fx.symbolic_trace(rn18) -print(traced_rn18.graph) - -###################################################################### -# This gives us a Graph representation of the ResNet18 model. A Graph -# consists of a series of Nodes connected to each other. Each Node -# represents a call-site in the Python code (whether to a function, -# a module, or a method) and the edges (represented as ``args`` and ``kwargs`` -# on each node) represent the values passed between these call-sites. More -# information about the Graph representation and the rest of FX's APIs ca -# be found at the FX documentation https://pytorch.org/docs/master/fx.html. - - -###################################################################### -# Creating a Profiling Interpreter -# -------------------------------- -# Next, we are going to create a class that inherits from ``torch.fx.Interpreter``. -# Though the ``GraphModule`` that ``symbolic_trace`` produces compiles Python code -# that is run when you call a ``GraphModule``, an alternative way to run a -# ``GraphModule`` is by executing each ``Node`` in the ``Graph`` one by one. That is -# the functionality that ``Interpreter`` provides: It interprets the graph node- -# by-node. -# -# By inheriting from ``Interpreter``, we can override various functionality and -# install the profiling behavior we want. 
The goal is to have an object to which -# we can pass a model, invoke the model 1 or more times, then get statistics about -# how long the model and each part of the model took during those runs. -# -# Let's define our ``ProfilingInterpreter`` class: - -class ProfilingInterpreter(Interpreter): - def __init__(self, mod : torch.nn.Module): - # Rather than have the user symbolically trace their model, - # we're going to do it in the constructor. As a result, the - # user can pass in any ``Module`` without having to worry about - # symbolic tracing APIs - gm = torch.fx.symbolic_trace(mod) - super().__init__(gm) - - # We are going to store away two things here: - # - # 1. A list of total runtimes for ``mod``. In other words, we are - # storing away the time ``mod(...)`` took each time this - # interpreter is called. - self.total_runtime_sec : List[float] = [] - # 2. A map from ``Node`` to a list of times (in seconds) that - # node took to run. This can be seen as similar to (1) but - # for specific sub-parts of the model. - self.runtimes_sec : Dict[torch.fx.Node, List[float]] = {} - - ###################################################################### - # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run`` - # method is the top-level entry point for execution of the model. We will - # want to intercept this so that we can record the total runtime of the - # model. - - def run(self, *args) -> Any: - # Record the time we started running the model - t_start = time.time() - # Run the model by delegating back into Interpreter.run() - return_val = super().run(*args) - # Record the time we finished running the model - t_end = time.time() - # Store the total elapsed time this model execution took in the - # ``ProfilingInterpreter`` - self.total_runtime_sec.append(t_end - t_start) - return return_val - - ###################################################################### - # Now, let's override ``run_node``. ``Interpreter`` calls ``run_node`` each - # time it executes a single node. We will intercept this so that we - # can measure and record the time taken for each individual call in - # the model. - - def run_node(self, n : torch.fx.Node) -> Any: - # Record the time we started running the op - t_start = time.time() - # Run the op by delegating back into Interpreter.run_node() - return_val = super().run_node(n) - # Record the time we finished running the op - t_end = time.time() - # If we don't have an entry for this node in our runtimes_sec - # data structure, add one with an empty list value. - self.runtimes_sec.setdefault(n, []) - # Record the total elapsed time for this single invocation - # in the runtimes_sec data structure - self.runtimes_sec[n].append(t_end - t_start) - return return_val - - ###################################################################### - # Finally, we are going to define a method (one which doesn't override - # any ``Interpreter`` method) that provides us a nice, organized view of - # the data we have collected. - - def summary(self, should_sort : bool = False) -> str: - # Build up a list of summary information for each node - node_summaries : List[List[Any]] = [] - # Calculate the mean runtime for the whole network. Because the - # network may have been called multiple times during profiling, - # we need to summarize the runtimes. We choose to use the - # arithmetic mean for this. 
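        # (Added note) ``statistics.mean`` raises ``StatisticsError`` on an
        # empty sequence, so call ``run()`` at least once before ``summary()``.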
- mean_total_runtime = statistics.mean(self.total_runtime_sec) - - # For each node, record summary statistics - for node, runtimes in self.runtimes_sec.items(): - # Similarly, compute the mean runtime for ``node`` - mean_runtime = statistics.mean(runtimes) - # For easier understanding, we also compute the percentage - # time each node took with respect to the whole network. - pct_total = mean_runtime / mean_total_runtime * 100 - # Record the node's type, name of the node, mean runtime, and - # percent runtime. - node_summaries.append( - [node.op, str(node), mean_runtime, pct_total]) - - # One of the most important questions to answer when doing performance - # profiling is "Which op(s) took the longest?". We can make this easy - # to see by providing sorting functionality in our summary view - if should_sort: - node_summaries.sort(key=lambda s: s[2], reverse=True) - - # Use the ``tabulate`` library to create a well-formatted table - # presenting our summary information - headers : List[str] = [ - 'Op type', 'Op', 'Average runtime (s)', 'Pct total runtime' - ] - return tabulate.tabulate(node_summaries, headers=headers) - -###################################################################### -# .. note:: -# We use Python's ``time.time`` function to pull wall clock -# timestamps and compare them. This is not the most accurate -# way to measure performance, and will only give us a first- -# order approximation. We use this simple technique only for the -# purpose of demonstration in this tutorial. - -###################################################################### -# Investigating the Performance of ResNet18 -# ----------------------------------------- -# We can now use ``ProfilingInterpreter`` to inspect the performance -# characteristics of our ResNet18 model; - -interp = ProfilingInterpreter(rn18) -interp.run(input) -print(interp.summary(True)) - -###################################################################### -# There are two things we should call out here: -# -# * ``MaxPool2d`` takes up the most time. This is a known issue: -# https://github.com/pytorch/pytorch/issues/51393 -# * BatchNorm2d also takes up significant time. We can continue this -# line of thinking and optimize this in the Conv-BN Fusion with FX -# `tutorial `_. -# -# -# Conclusion -# ---------- -# As we can see, using FX we can easily capture PyTorch programs (even -# ones we don't have the source code for!) in a machine-interpretable -# format and use that for analysis, such as the performance analysis -# we've done here. FX opens up an exciting world of possibilities for -# working with PyTorch programs. -# -# Finally, since FX is still in beta, we would be happy to hear any -# feedback you have about using it. Please feel free to use the -# PyTorch Forums (https://discuss.pytorch.org/) and the issue tracker -# (https://github.com/pytorch/pytorch/issues) to provide any feedback -# you might have. diff --git a/intermediate_source/inductor_debug_cpu.py b/intermediate_source/inductor_debug_cpu.py deleted file mode 100644 index f44d5bd76b..0000000000 --- a/intermediate_source/inductor_debug_cpu.py +++ /dev/null @@ -1,637 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Inductor CPU backend debugging and profiling -============================================ - -**Authors**: `Xuan Liao `_, `Haozhe Zhu `_, `Jiong Gong `_, `Weihan Wang `_ -""" - -######################################################################### -# Overview -# -------- -# -# PyTorch 2.0 introduced the compilation API called ``torch.compile``. 
-# This new feature offers a significant speedup over eager mode execution through graph-level optimization powered by the default Inductor backend. -# -# This tutorial is intended to provide an in-depth introduction on the debugging -# and performance profiling on Inductor CPU backend by delving into the intricacies of ``torch.compile``. -# -# Meanwhile, you may also find related tutorials about ``torch.compile`` -# around `basic usage `_, -# comprehensive `troubleshooting `_ -# and GPU-specific knowledge like `GPU performance profiling `_. -# -# We will start debugging with a motivating example that triggers compilation issues and accuracy problems -# by demonstrating the process of debugging to pinpoint the problems. -# -# By enabling logging and exploring the underlying generated code, -# you can learn how to narrow down the failure step by step and finally figure out the route cause. -# -# Following that, we will proceed to discuss how to profile the compiled code and, -# through a performance comparison with eager mode, -# elaborate on the reasons why ``torch.compile`` can provide an additional performance boost compared to its eager counterpart. - - -###################################################################### -# Debugging -# --------- -# -# Here is a simple example to run the ``torch.compile`` using Inductor and compare its result with eager mode: - -import torch - -def foo1(x1, x2): - a = torch.neg(x1) - b = torch.maximum(x2, a) - y = torch.cat([b], dim=0) - return y - -x1 = torch.randint(256, (1, 8), dtype=torch.uint8) -x2 = torch.randint(256, (8390, 8), dtype=torch.uint8) - -compiled_foo1 = torch.compile(foo1) -result = compiled_foo1(x1, x2) - -###################################################################### -# The correct implementation of ``neg`` in the ``cpp`` codegen is as follows: - -def neg1(x): - return f"decltype({x})(-{x})" - -###################################################################### -# In order to demonstrate the debugging, we will modify the function to a wrong one later. -# -# -# Get more logging information -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# No debugging information would be provided if you run this simple example by default. In order to get more useful debugging and logging information, we usually add a ``TORCH_COMPILE_DEBUG`` environment variable like below: -# -# .. code-block:: shell -# -# TORCH_COMPILE_DEBUG=1 python xx.py -# -# This would print more debug information in the output logs and also dump the intermediate IRs generated during the codegen process. You can find the dumped file paths in the log like below: -# -# .. 
code-block:: shell -# -# torch._inductor.debug: [WARNING] model___20 debug trace: /tmp/torchinductor_root/rx/crxfi2ybd7yp5sbj2pnhw33wfhtdw7wumvrobyp5sjvdui5ktjc2.debug -# -# In this directory, the following files are saved for debugging purposes: -# -# +-----------------------------+----------------------------------------------------------------+ -# | File | Description | -# +=============================+================================================================+ -# | ``fx_graph_runnable.py`` | Executable FX graph, after decomposition, before pattern match | -# +-----------------------------+----------------------------------------------------------------+ -# | ``fx_graph_transformed.py`` | Transformed FX graph, after pattern match | -# +-----------------------------+----------------------------------------------------------------+ -# | ``ir_pre_fusion.txt`` | Inductor IR before fusion | -# +-----------------------------+----------------------------------------------------------------+ -# | ``ir_post_fusion.txt`` | Inductor IR after fusion | -# +-----------------------------+----------------------------------------------------------------+ -# | ``output_code.py`` | Generated Python code for graph, with C++/Triton kernels | -# +-----------------------------+----------------------------------------------------------------+ -# -# Note that ``fx_graph_runnable.py`` and ``output_code.py`` are both runnable and editable in order to make debugging easier. -# Here are the main parts of code extracted from the files and we correlate the C++ generated line with the FX code line. -# -# ``fx_graph_runnable``: -# - -def forward1(self, arg0_1, arg1_1): - neg = torch.ops.aten.neg.default(arg0_1); arg0_1 = None - maximum = torch.ops.aten.maximum.default(arg1_1, neg); arg1_1 = neg = None - clone = torch.ops.aten.clone.default(maximum); maximum = None - return (clone,) - -###################################################################### -# C++ kernel in ``output_code``: -# - -import torch -from torch._inductor.async_compile import AsyncCompile -async_compile = AsyncCompile() - -cpp_fused_cat_maximum_neg_0 = async_compile.cpp(''' -#include "/tmp/torchinductor_root/gv/cgv6n5aotqjo5w4vknjibhengeycuattfto532hkxpozszcgxr3x.h" -extern "C" void kernel(const unsigned char* in_ptr0, - const unsigned char* in_ptr1, - unsigned char* out_ptr0) -{ - { - #pragma GCC ivdep - for(long i0=static_cast(0L); i0(8390L); i0+=static_cast(1L)) - { - #pragma GCC ivdep - for(long i1=static_cast(0L); i1(8L); i1+=static_cast(1L)) - { - auto tmp0 = in_ptr0[static_cast(i1 + (8L*i0))]; - auto tmp1 = in_ptr1[static_cast(i1)]; - // Corresponding FX code line: neg = torch.ops.aten.neg.default(arg0_1); arg0_1 = None - auto tmp2 = decltype(tmp1)(-tmp1); - // Corresponding FX code line: maximum = torch.ops.aten.maximum.default(arg1_1, neg); arg1_1 = neg = None - auto tmp3 = max_propagate_nan(tmp0, tmp2); - // Corresponding FX code line: clone = torch.ops.aten.clone.default(maximum); maximum = None - out_ptr0[static_cast(i1 + (8L*i0))] = tmp3; - } - } - } -}''') - - -###################################################################### -# Determine component of error -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# When encountering errors or accuracy problems, a straightforward solution to find the bug is to narrow down the problem. The first thing to do is to determine the component where the error occurs. Luckily, it can be simply achieved by changing the backend of ``torch.compile``. 
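#
# As a minimal bisection sketch (ours, not from the original text; it reuses
# ``foo1``, ``x1`` and ``x2`` defined above), you could try each backend in
# turn and report which one fails:
#
# .. code-block:: python
#
#    import torch._dynamo
#
#    for backend in ("eager", "aot_eager", "inductor"):
#        torch._dynamo.reset()  # clear caches so the backend switch takes effect
#        try:
#            torch.compile(foo1, backend=backend)(x1, x2)
#            print(f"{backend}: OK")
#        except Exception as exc:
#            print(f"{backend}: failed with {type(exc).__name__}")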
-# -# +--------------------------------------------+-----------------------------------------+ -# | Code | Description | -# +============================================+=========================================+ -# | ``torch.compile(fn, backend="eager")`` | Enable Dynamo | -# +--------------------------------------------+-----------------------------------------+ -# | ``torch.compile(fn, backend="aot_eager")`` | Enable Dynamo + AOT Autograd | -# +--------------------------------------------+-----------------------------------------+ -# | ``torch.compile(fn, backend="inductor")`` | Enable Dynamo + AOT Autograd + Inductor | -# +--------------------------------------------+-----------------------------------------+ -# -# If the model can successfully run when the backend is set to ``eager`` or ``aot_eager`` while it fails with ``inductor``, we can narrow down the failure to Inductor. -# -# -# Compilation error -# ^^^^^^^^^^^^^^^^^ -# -# As we know, the evolved chain of graph-level optimization is like: -# -# .. code-block:: sh -# -# torch.neg (Python) -> torch.ops.aten.neg.default (within FX graph) -> ops.neg (within IR node) -> tmp2 = -tmp1 (within C++ kernel) -# -# If you encounter a compilation error, there is something wrong when compiling C++ kernels in the output code. -# This type of error indicates that bugs are introduced when lowering IR nodes to output code. -# The root cause of compilation error is usually shown in the traceback log. -# -# For example, the ``neg`` function is modified like this: - -def neg2(x): - return f"-{x}" - -###################################################################### -# The logging gives the following compile error with a rather clear reason. -# -# .. code-block:: -# -# torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: -# CppCompileError: C++ compile error -# /tmp/torchinductor_root/xg/cxga5tk3b4lkwoxyigrtocjp5s7vc5cg2ikuscf6bk6pjqip2bhx.cpp: In function ‘void kernel(const unsigned char*, const unsigned char*, unsigned char*)’: -# /tmp/torchinductor_root/xg/cxga5tk3b4lkwoxyigrtocjp5s7vc5cg2ikuscf6bk6pjqip2bhx.cpp:17:57: error: no matching function for call to ‘max_propagate_nan(unsigned char&, int&)’ -# 17 | auto tmp3 = max_propagate_nan(tmp0, tmp2); -# | ^ -# In file included from /tmp/torchinductor_root/xg/cxga5tk3b4lkwoxyigrtocjp5s7vc5cg2ikuscf6bk6pjqip2bhx.cpp:2: -# /tmp/torchinductor_root/gv/cgv6n5aotqjo5w4vknjibhengeycuattfto532hkxpozszcgxr3x.h:27:17: note: candidate: ‘template scalar_t max_propagate_nan(scalar_t, scalar_t)’ -# 27 | inline scalar_t max_propagate_nan(scalar_t a, scalar_t b) { -# | ^~~~~~~~~~~~~~~~~ -# /tmp/torchinductor_root/gv/cgv6n5aotqjo5w4vknjibhengeycuattfto532hkxpozszcgxr3x.h:27:17: note: template argument deduction/substitution failed: -# /tmp/torchinductor_root/xg/cxga5tk3b4lkwoxyigrtocjp5s7vc5cg2ikuscf6bk6pjqip2bhx.cpp:17:57: note: deduced conflicting types for parameter ‘scalar_t’ (‘unsigned char’ and ‘int’) -# 17 | auto tmp3 = max_propagate_nan(tmp0, tmp2); -# | ^ -# -# -# Let us also see the corresponding C++ kernel in output code and IR node. -# -# C++ kernel: -# -# .. 
code:: c -# -# include "/tmp/torchinductor_root/gv/cgv6n5aotqjo5w4vknjibhengeycuattfto532hkxpozszcgxr3x.h" -# extern "C" void kernel(const unsigned char* in_ptr0, -# const unsigned char* in_ptr1, -# unsigned char* out_ptr0) -# { -# { -# #pragma GCC ivdep -# for(long i0=static_cast(0L); i0(8390L); i0+=static_cast(1L)) -# { -# #pragma GCC ivdep -# for(long i1=static_cast(0L); i1(8L); i1+=static_cast(1L)) -# { -# auto tmp0 = in_ptr0[static_cast(i1 + (8L*i0))]; -# auto tmp1 = in_ptr1[static_cast(i1)]; -# auto tmp2 = -tmp1; -# auto tmp3 = max_propagate_nan(tmp0, tmp2); -# out_ptr0[static_cast(i1 + (8L*i0))] = tmp3; -# } -# } -# } -# } -# - -###################################################################### -# IR node: -# -# .. code-block:: sh -# -# buf0: SchedulerNode(ComputedBuffer) -# buf0.writes = [MemoryDep('buf0', c0, {c0: 67120})] -# buf0.unmet_dependencies = [] -# buf0.met_dependencies = -# [ MemoryDep('arg0_1', c1, {c0: 8390, c1: 8}), -# MemoryDep('arg1_1', c0, {c0: 67120})] -# buf0.users = [NodeUser(node=OUTPUT, can_inplace=False)] -# buf0.group.device = cpu -# buf0.group.iteration = ((8390, 8), ()) -# buf0.sizes = ([8390, 8], []) -# class buf0_loop_body: -# var_ranges = {z0: 8390, z1: 8} -# index0 = 8*z0 + z1 -# index1 = z1 -# def body(self, ops): -# get_index = self.get_index('index0') -# load = ops.load('arg1_1', get_index) -# get_index_1 = self.get_index('index1') -# load_1 = ops.load('arg0_1', get_index_1) -# neg = ops.neg(load_1) -# maximum = ops.maximum(load, neg) -# get_index_2 = self.get_index('index0') -# store = ops.store('buf0', get_index_2, maximum, None) -# return store -# - -###################################################################### -# According to the traceback logging, the compilation error is caused by the data type inconsistency of ``max_propagate_nan``'s inputs. -# By checking the C++ kernel, we know that ``tmp2`` is no longer ``long`` after doing ``-`` as ``tmp0`` is ``long``. -# We can easily match ``-`` and ``max_propagate_nan`` in C++ kernel with ``ops.neg`` and ``ops.maximum`` in IR node respectively. -# -# Now we successfully find that the root cause is the implementation of ``ops.neg`` in ``cpp`` codegen, which silently changes the data type when doing ``neg``. -# -# -# Accuracy debugging -# ^^^^^^^^^^^^^^^^^^^ -# -# Otherwise, if the model runs with other errors or accuracy problem, you can use the PyTorch debugging tool called `Minifier `_. -# -# The core idea of ``Minifier`` is to keep removing the nodes and inputs of graph until finding the minimal graph with problem. -# It helps to automatically generate a minified problematic graph through 4 strategies: truncating suffix, delta debugging, eliminating dead code and removing unused inputs. -# -# -# We will now show the debugging process for the accuracy problem with the help of ``Minifer``. -# The accuracy problem refers to the case where the outputs of backends eager and inductor are different. 
-# -# For instance, we modify the example like this: - -from torch._dynamo.utils import same - -def foo2(x1, x2): - a = torch.neg(x1) - b = torch.maximum(x2, a) - y = torch.cat([b], dim=0) - return y - -x1 = torch.randn((1, 8), dtype=torch.float32) -x2 = torch.randn((8390, 8), dtype=torch.float32) - -expected_result = foo2(x1, x2) - -compiled_foo2 = torch.compile(foo2) -actual_result = compiled_foo2(x1, x2) - -assert same(expected_result, actual_result) == True - -###################################################################### -# And also modify the ``neg`` function: - -def neg3(x): - return f"decltype({x})(2 * {x})" - -###################################################################### -# An accuracy problem would be raised as follows: -# -# .. code-block:: sh -# -# torch._dynamo.utils: [ERROR] Accuracy failed: allclose not within tol=0.0001 -# Traceback (most recent call last): -# File "test_script.py", line 18, in -# assert same(expected_result, actual_result) == True -# AssertionError -# -# To debug an accuracy problem with Minifier, two environment variables are needed: -# -# .. code-block:: sh -# -# TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4 python xx.py -# -# Which gives us logging information that demonstrates the steps of minifying: -# -# .. code-block:: sh -# -# Started off with 6 nodes -# -# Trying granularity 2 -# Strategy: Truncate suffix (G: 2) (6 nodes, 2 inputs) -# SUCCESS: Went from 6 to 4 nodes -# -# Trying granularity 4 -# Strategy: Remove unused inputs (G: 4) (4 nodes, 2 inputs) -# SUCCESS: Went from 4 to 3 nodes -# -# After running, we get the final minified graph with the target node ``neg``: - -def forward2(self, arg0_1): - neg = torch.ops.aten.neg.default(arg0_1); arg0_1 = None - return (neg,) - -###################################################################### -# For more usage details about Minifier, please refer to `Troubleshooting `_. - - -###################################################################### -# Performance profiling -# --------------------- -# -# Within this section, we will demonstrate the process of conducting performance analysis for a model that has been compiled using the Inductor CPU backend. -# In the example below, we benchmark a Hugging Face Transformer model ``MobileBertForQuestionAnswering`` with both the eager mode and the Inductor graph mode. -# The execution time and the speedup ratio of Inductor are printed after the benchmark. -# We use Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz and run benchmark on the first socket to demonstrate the optimization within this section. -# We set following environment variable as a best practice to benchmark on Intel(R) CPU. - -######################################################### -# .. 
code-block:: shell -# -# export KMP_BLOCKTIME=1 -# export KMP_SETTINGS=1 -# export KMP_AFFINITY=granularity=fine,compact,1,0 -# export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libiomp5.so:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libjemalloc.so -# export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" -# numactl -C 0-31 -m 0 python bench.py -# - -# bench.py -from transformers import MobileBertForQuestionAnswering -# Initialize an eager model -model = MobileBertForQuestionAnswering.from_pretrained("csarron/mobilebert-uncased-squad-v2") -seq_length = 128 -bs = 128 -vocab_size = model.config.vocab_size -input = torch.randint(0, vocab_size, (bs, seq_length), dtype=torch.int64) -input_dict = {"input_ids": input} - -# Initialize the inductor model -compiled_model = torch.compile(model) -with torch.no_grad(): - compiled_model(**input_dict) - -NUM_ITERS=50 -import timeit -with torch.no_grad(): - # warmup - for _ in range(10): - model(**input_dict) - eager_t = timeit.timeit("model(**input_dict)", number=NUM_ITERS, globals=globals()) - -with torch.no_grad(): - # warmup - for _ in range(10): - compiled_model(**input_dict) - inductor_t = timeit.timeit("compiled_model(**input_dict)", number=NUM_ITERS, globals=globals()) -# print(f"eager use: {eager_t * 1000 / NUM_ITERS} ms/iter") -# print(f"inductor use: {inductor_t * 1000 / NUM_ITERS} ms/iter") -# print(f"speed up ratio: {eager_t / inductor_t}") - - -###################################################################### -# Output: -# -# .. code-block:: shell -# -# eager use: 802.1023553796113 ms/iter -# inductor use: 339.95180135127157 ms/iter -# speed up ratio: 2.359459053287382 -# -# In our own testing, we find the Inductor CPU backend speed up the model by around 2.355x. -# -# -# Next, let's dive deep into the performance at the operation level to understand where the speed-up comes from. -# `Pytorch Profiler `_ is a good tool to help us. -# Inductor CPU backend has the support to report the time of the fusion kernels to the profiler with the ``enable_kernel_profile`` configuration option: - -from torch._inductor import config -config.cpp.enable_kernel_profile = True - -###################################################################### -# Following the steps in `Pytorch Profiler `_ -# We are able to get the profiling table and trace files. - -# bench.py -from torch.profiler import profile, schedule, ProfilerActivity -RESULT_DIR = "./prof_trace" -my_schedule = schedule( - skip_first=10, - wait=5, - warmup=5, - active=1, - repeat=5) - -def trace_handler(p): - output = p.key_averages().table(sort_by="self_cpu_time_total", row_limit=20) - # print(output) - p.export_chrome_trace(f"{RESULT_DIR}/{p.step_num}.json") - -for _ in range(10): - model(**input_dict) # compiled_model(**input_dict) to get inductor model profiling - -total = 0 -with profile( - activities=[ProfilerActivity.CPU], - schedule=my_schedule, - on_trace_ready=trace_handler -) as p: - for _ in range(50): - model(**input_dict) # compiled_model(**input_dict) to get inductor model profiling - p.step() - -###################################################################### -# We get the following performance profiling table for the eager-mode model (omitting some columns): -# -# .. 
code-block:: shell -# -# ------------------------- ------------ ------------ ------------ -# Name CPU total % CPU total # of Calls -# ------------------------- ------------ ------------ ------------ -# aten::addmm 45.73% 370.814ms 362 -# aten::add 19.89% 161.276ms 363 -# aten::copy_ 14.97% 121.416ms 488 -# aten::mul 9.02% 73.154ms 194 -# aten::clamp_min 8.81% 71.444ms 96 -# aten::bmm 5.46% 44.258ms 48 -# ProfilerStep* 100.00% 810.920ms 1 -# aten::div 2.89% 23.447ms 24 -# aten::_softmax 1.00% 8.087ms 24 -# aten::linear 46.48% 376.888ms 362 -# aten::clone 2.77% 22.430ms 98 -# aten::t 0.31% 2.502ms 362 -# aten::view 0.14% 1.161ms 850 -# aten::transpose 0.17% 1.377ms 386 -# aten::index_select 0.12% 952.000us 3 -# aten::expand 0.12% 986.000us 458 -# aten::matmul 8.31% 67.420ms 48 -# aten::cat 0.09% 703.000us 1 -# aten::as_strided 0.08% 656.000us 963 -# aten::relu 8.86% 71.864ms 96 -# ------------------------- ------------ ------------ ------------ -# Self CPU time total: 810.920ms -# - -###################################################################### -# -# Similarly, we also get the table for the compiled model with Inductor (omitting some columns): -# -# .. code-block:: shell -# -# ----------------------------------------------- ------------ ------------ ------------ -# Name CPU total % CPU total # of Calls -# ----------------------------------------------- ------------ ------------ ------------ -# mkl::_mkl_linear 68.79% 231.573ms 362 -# aten::bmm 8.02% 26.992ms 48 -# ProfilerStep* 100.00% 336.642ms 1 -# graph_0_cpp_fused_constant_pad_nd_embedding_0 0.27% 915.000us 1 -# aten::empty 0.27% 911.000us 362 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_151 0.27% 901.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_226 0.27% 899.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_361 0.27% 898.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_121 0.27% 895.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_31 0.27% 893.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_76 0.26% 892.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_256 0.26% 892.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_346 0.26% 892.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_241 0.26% 891.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_316 0.26% 891.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_91 0.26% 890.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_106 0.26% 890.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_211 0.26% 890.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_61 0.26% 889.000us 1 -# graph_0_cpp_fused__mkl_linear_add_mul_relu_286 0.26% 889.000us 1 -# ----------------------------------------------- ------------ ------------ ------------ -# Self CPU time total: 336.642ms -# -# From the profiling table of the eager model, we can see the most time consumption ops are [``aten::addmm``, ``aten::add``, ``aten::copy_``, ``aten::mul``, ``aten::clamp_min``, ``aten::bmm``]. -# Comparing with the inductor model profiling table, we notice an ``mkl::_mkl_linear`` entry and multiple fused kernels in the form ``graph_0_cpp_fused_*``. They are the major -# optimizations that the inductor model is doing. Let us discuss them separately. -# -# (1) Regarding ``mkl::_mkl_linear``: You may notice the number of calls to this kernel is 362, which is exactly the same as ``aten::linear`` in the eager model profiling table. -# The CPU total of ``aten::linear`` is 376.888ms, while it is 231.573ms for ``mkl::_mkl_linear``. 
This suggests a ~1.63x for the "linear" part. -# The speedup mainly comes from `packing the weight tensor to block memory format `_ -# and invoking `cblas_sgemm_compute `_ within the Inductor CPU backend -# to have a better cache behavior during GEMM computation. -# -# (2) Regarding other memory-intensive ops: The end-to-end latency for the eager/inductor model is 802/339ms in our testing. So we can roughly infer that the speed up for the other memory-intensive ops is around 3.94x. -# Let's read the generated code to understand how the inductor achieves this impressive optimization. You can find the generated code by -# searching ``cpp_fused__mkl_linear_add_mul_relu_151`` in ``output_code.py`` -# - - -cpp_fused__mkl_linear_add_mul_relu_151 = async_compile.cpp(''' -#include -#include "/tmp/torchinductor_root/lr/clrlgu27q4ggd472umdzwsu6qcpqxcuusjxqvx2hwitjbujiiz7z.h" -extern "C" void kernel(float* in_out_ptr0, - const float* in_ptr0, - const float* in_ptr1, - const float* in_ptr2, - const float* in_ptr3) -{ - RECORD_FUNCTION("graph_0_cpp_fused__mkl_linear_add_mul_relu_151", c10::ArrayRef({})); - #pragma omp parallel num_threads(32) - { - { - #pragma omp for - for(long i0=static_cast(0L); i0(16384L); i0+=static_cast(1L)) - { - for(long i1=static_cast(0L); i1(512L); i1+=static_cast(8L)) - { - auto tmp0 = at::vec::Vectorized::loadu(in_ptr0 + static_cast(i1 + (512L*i0))); - auto tmp1 = at::vec::Vectorized::loadu(in_ptr1 + static_cast(i1)); - auto tmp3 = at::vec::Vectorized::loadu(in_out_ptr0 + static_cast(i1 + (512L*i0))); - auto tmp5 = at::vec::Vectorized::loadu(in_ptr2 + static_cast(i1)); - auto tmp7 = at::vec::Vectorized::loadu(in_ptr3 + static_cast(i1)); - auto tmp2 = tmp0 + tmp1; - auto tmp4 = tmp2 + tmp3; - auto tmp6 = tmp4 * tmp5; - auto tmp8 = tmp6 + tmp7; - tmp8.store(in_out_ptr0 + static_cast(i1 + (512L*i0))); - } - } - } - } -}''') - -###################################################################### -# From the generated code above, we can see this kernel has done a typical `Loop Fusion `_ on ``[add, add, mul, add]``. -# This is a memory-bound bottle neck preventing good performance. To get a more intuitive feeling about this optimization, -# we can infer the sizes and stride of the inputs and further benchmark this ``[add, add, mul, add]`` pattern. - -# bench.py -def func(arg_0, arg_1, arg_2, arg_3, arg_4): - add_0 = arg_0 + arg_1 - add_1 = add_0 + arg_2 - mul_1 = add_1 * arg_3 - add_2 = mul_1 + arg_4 - arg_2 = add_2 - return arg_2 - -arg_0 = torch.rand(16384, 512) -arg_1 = torch.rand(1, 512) -arg_2 = torch.zeros(16384, 512) -arg_3 = torch.rand(1, 512) -arg_4 = torch.rand(1, 512) - -input = (arg_0, arg_1, arg_2, arg_3, arg_4) -inductor_func = torch.compile(func) -with torch.no_grad(): - inductor_func(*input) - -import timeit -NUM_ITERS=100 -with torch.no_grad(): - # warmup - for _ in range(10): - func(*input) - eager_t = timeit.timeit("func(*input)", number=NUM_ITERS, globals=globals()) - -with torch.no_grad(): - # warmup - for _ in range(10): - inductor_func(*input) - inductor_t = timeit.timeit("inductor_func(*input)", number=NUM_ITERS, globals=globals()) -# print(f"eager use: {eager_t * 1000 / NUM_ITERS} ms/iter") -# print(f"inductor use: {inductor_t * 1000 / NUM_ITERS} ms/iter") -# print(f"speed up ratio: {eager_t / inductor_t}") - -###################################################################### -# Output: -# -# .. 
code-block:: shell -# -# eager use: 5.780875144992024 ms/iter -# inductor use: 0.9588955780491233 ms/iter -# speed up ratio: 6.0286805751604735 -# -# -# This is just one example. The profiling table shows that all element-wise ops in this model are fused automatically by Inductor. You can read more of the generated kernels in -# ``output_code.py``. - - -######################################################################### -# Conclusion -# ---------- -# -# This document gives an in-depth tutorial on the Inductor CPU backend. -# -# With motivating examples, we walk through the process of debugging and profiling. -# The main idea is to narrow down the problem. -# -# We demonstrate, step by step, how to delve deeper into an issue and find the root cause of a failure with the help of debug logging and the Minifier tool. -# First, determine which component the failure occurs in, and then try to generate the smallest snippet of code that can reproduce the failure. -# -# When the performance with Inductor is better than that of eager mode, we provide a solid analytical method for performance profiling. -# We show how to find the time-consuming hotspots with PyTorch Profiler and how to figure out the operator-level or kernel-level reasons that explain the observed speedup. diff --git a/intermediate_source/jacobians_hessians.py b/intermediate_source/jacobians_hessians.py deleted file mode 100644 index b8b96c30a3..0000000000 --- a/intermediate_source/jacobians_hessians.py +++ /dev/null @@ -1,349 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Jacobians, Hessians, hvp, vhp, and more: composing function transforms -====================================================================== - -Computing Jacobians or Hessians is useful in a number of non-traditional -deep learning models. It is difficult (or annoying) to compute these quantities -efficiently using PyTorch's regular autodiff APIs -(``Tensor.backward()``, ``torch.autograd.grad``). PyTorch's -`JAX-inspired `_ -`function transforms API `_ -provides ways of computing various higher-order autodiff quantities -efficiently. - -.. note:: - - This tutorial requires PyTorch 2.0.0 or later. - -Computing the Jacobian ----------------------- -""" - -import torch -import torch.nn.functional as F -from functools import partial -_ = torch.manual_seed(0) - -###################################################################### -# Let's start with a function that we'd like to compute the Jacobian of. -# This is a simple linear function with a non-linear activation. - -def predict(weight, bias, x): - return F.linear(x, weight, bias).tanh() - -###################################################################### -# Let's add some dummy data: a weight, a bias, and a feature vector x. - -D = 16 -weight = torch.randn(D, D) -bias = torch.randn(D) -x = torch.randn(D) # feature vector - -###################################################################### -# Let's think of ``predict`` as a function that maps the input ``x`` from :math:`R^D \to R^D`. -# PyTorch Autograd computes vector-Jacobian products. In order to compute the full -# Jacobian of this :math:`R^D \to R^D` function, we would have to compute it row-by-row -# by using a different unit vector each time.
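#
# To spell out why unit vectors recover rows (a standard identity, added here
# only for clarity): for :math:`f: R^D \to R^D` with Jacobian :math:`J`, a
# vector-Jacobian product computes
#
# .. math::
#
#    v^\top J
#
# so choosing :math:`v = e_i` (the :math:`i`-th unit vector) yields the
# :math:`i`-th row of :math:`J`.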
- -def compute_jac(xp): - jacobian_rows = [torch.autograd.grad(predict(weight, bias, xp), xp, vec)[0] - for vec in unit_vectors] - return torch.stack(jacobian_rows) - -xp = x.clone().requires_grad_() -unit_vectors = torch.eye(D) - -jacobian = compute_jac(xp) - -print(jacobian.shape) -print(jacobian[0]) # show first row - -###################################################################### -# Instead of computing the jacobian row-by-row, we can use PyTorch's -# ``torch.vmap`` function transform to get rid of the for-loop and vectorize the -# computation. We can’t directly apply ``vmap`` to ``torch.autograd.grad``; -# instead, PyTorch provides a ``torch.func.vjp`` transform that composes with -# ``torch.vmap``: - -from torch.func import vmap, vjp - -_, vjp_fn = vjp(partial(predict, weight, bias), x) - -ft_jacobian, = vmap(vjp_fn)(unit_vectors) - -# let's confirm both methods compute the same result -assert torch.allclose(ft_jacobian, jacobian) - -###################################################################### -# In a later tutorial a composition of reverse-mode AD and ``vmap`` will give us -# per-sample-gradients. -# In this tutorial, composing reverse-mode AD and ``vmap`` gives us Jacobian -# computation! -# Various compositions of ``vmap`` and autodiff transforms can give us different -# interesting quantities. -# -# PyTorch provides ``torch.func.jacrev`` as a convenience function that performs -# the ``vmap-vjp`` composition to compute jacobians. ``jacrev`` accepts an ``argnums`` -# argument that says which argument we would like to compute Jacobians with -# respect to. - -from torch.func import jacrev - -ft_jacobian = jacrev(predict, argnums=2)(weight, bias, x) - -# Confirm by running the following: -assert torch.allclose(ft_jacobian, jacobian) - -###################################################################### -# Let's compare the performance of the two ways to compute the jacobian. -# The function transform version is much faster (and becomes even faster the -# more outputs there are). -# -# In general, we expect that vectorization via ``vmap`` can help eliminate overhead -# and give better utilization of your hardware. -# -# ``vmap`` does this magic by pushing the outer loop down into the function's -# primitive operations in order to obtain better performance. 
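######################################################################
# As a quick standalone illustration of that idea (this snippet is ours and is
# not part of the benchmark below), ``vmap`` over a per-example function gives
# the same result as an explicit Python loop:

import torch
from torch.func import vmap

xs = torch.randn(8, 5)
ys = torch.randn(8, 5)
looped = torch.stack([torch.dot(a, b) for a, b in zip(xs, ys)])  # explicit loop
vmapped = vmap(torch.dot)(xs, ys)  # the loop is pushed down into the kernel
assert torch.allclose(looped, vmapped)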
-# -# Let's make a quick function to evaluate performance and deal with -# microseconds and milliseconds measurements: - -def get_perf(first, first_descriptor, second, second_descriptor): - """takes torch.benchmark objects and compares delta of second vs first.""" - faster = second.times[0] - slower = first.times[0] - gain = (slower-faster)/slower - if gain < 0: gain *=-1 - final_gain = gain*100 - print(f" Performance delta: {final_gain:.4f} percent improvement with {second_descriptor} ") - -###################################################################### -# And then run the performance comparison: - -from torch.utils.benchmark import Timer - -without_vmap = Timer(stmt="compute_jac(xp)", globals=globals()) -with_vmap = Timer(stmt="jacrev(predict, argnums=2)(weight, bias, x)", globals=globals()) - -no_vmap_timer = without_vmap.timeit(500) -with_vmap_timer = with_vmap.timeit(500) - -print(no_vmap_timer) -print(with_vmap_timer) - -###################################################################### -# Let's do a relative performance comparison of the above with our ``get_perf`` function: - -get_perf(no_vmap_timer, "without vmap", with_vmap_timer, "vmap") - -###################################################################### -# Furthermore, it’s pretty easy to flip the problem around and say we want to -# compute Jacobians of the parameters to our model (weight, bias) instead of the input - -# note the change in input via ``argnums`` parameters of 0,1 to map to weight and bias -ft_jac_weight, ft_jac_bias = jacrev(predict, argnums=(0, 1))(weight, bias, x) - -###################################################################### -# Reverse-mode Jacobian (``jacrev``) vs forward-mode Jacobian (``jacfwd``) -# ------------------------------------------------------------------------ -# -# We offer two APIs to compute jacobians: ``jacrev`` and ``jacfwd``: -# -# - ``jacrev`` uses reverse-mode AD. As you saw above it is a composition of our -# ``vjp`` and ``vmap`` transforms. -# - ``jacfwd`` uses forward-mode AD. It is implemented as a composition of our -# ``jvp`` and ``vmap`` transforms. -# -# ``jacfwd`` and ``jacrev`` can be substituted for each other but they have different -# performance characteristics. -# -# As a general rule of thumb, if you’re computing the jacobian of an :math:`R^N \to R^M` -# function, and there are many more outputs than inputs (for example, :math:`M > N`) then -# ``jacfwd`` is preferred, otherwise use ``jacrev``. There are exceptions to this rule, -# but a non-rigorous argument for this follows: -# -# In reverse-mode AD, we are computing the jacobian row-by-row, while in -# forward-mode AD (which computes Jacobian-vector products), we are computing -# it column-by-column. The Jacobian matrix has M rows and N columns, so if it -# is taller or wider one way we may prefer the method that deals with fewer -# rows or columns. - -from torch.func import jacrev, jacfwd - -###################################################################### -# First, let's benchmark with more inputs than outputs: - -Din = 32 -Dout = 2048 -weight = torch.randn(Dout, Din) - -bias = torch.randn(Dout) -x = torch.randn(Din) - -# remember the general rule about taller vs wider... 
here we have a taller matrix: -print(weight.shape) - -using_fwd = Timer(stmt="jacfwd(predict, argnums=2)(weight, bias, x)", globals=globals()) -using_bwd = Timer(stmt="jacrev(predict, argnums=2)(weight, bias, x)", globals=globals()) - -jacfwd_timing = using_fwd.timeit(500) -jacrev_timing = using_bwd.timeit(500) - -print(f'jacfwd time: {jacfwd_timing}') -print(f'jacrev time: {jacrev_timing}') - -###################################################################### -# and then do a relative benchmark: - -get_perf(jacfwd_timing, "jacfwd", jacrev_timing, "jacrev") - -####################################################################### -# and now the reverse - more outputs (M) than inputs (N): - -Din = 2048 -Dout = 32 -weight = torch.randn(Dout, Din) -bias = torch.randn(Dout) -x = torch.randn(Din) - -using_fwd = Timer(stmt="jacfwd(predict, argnums=2)(weight, bias, x)", globals=globals()) -using_bwd = Timer(stmt="jacrev(predict, argnums=2)(weight, bias, x)", globals=globals()) - -jacfwd_timing = using_fwd.timeit(500) -jacrev_timing = using_bwd.timeit(500) - -print(f'jacfwd time: {jacfwd_timing}') -print(f'jacrev time: {jacrev_timing}') - -####################################################################### -# and a relative performance comparison: - -get_perf(jacrev_timing, "jacrev", jacfwd_timing, "jacfwd") - -####################################################################### -# Hessian computation with functorch.hessian -# ------------------------------------------ -# We offer a convenience API to compute Hessians: ``torch.func.hessian``. -# Hessians are the Jacobian of the Jacobian (or the partial derivative of -# the partial derivative, aka second order). -# -# This suggests that one can just compose the Jacobian transforms to -# compute the Hessian. -# Indeed, under the hood, ``hessian(f)`` is simply ``jacfwd(jacrev(f))``. -# -# Note: to boost performance, depending on your model, you may also want to -# use ``jacfwd(jacfwd(f))`` or ``jacrev(jacrev(f))`` instead to compute Hessians, -# leveraging the rule of thumb above regarding wider vs taller matrices. - -from torch.func import hessian - -# let's reduce the size so we don't overwhelm Colab. Hessians require -# significant memory: -Din = 512 -Dout = 32 -weight = torch.randn(Dout, Din) -bias = torch.randn(Dout) -x = torch.randn(Din) - -hess_api = hessian(predict, argnums=2)(weight, bias, x) -hess_fwdfwd = jacfwd(jacfwd(predict, argnums=2), argnums=2)(weight, bias, x) -hess_revrev = jacrev(jacrev(predict, argnums=2), argnums=2)(weight, bias, x) - -####################################################################### -# Let's verify we get the same result whether we use the ``hessian`` API or -# ``jacfwd(jacfwd())``. - -torch.allclose(hess_api, hess_fwdfwd) - -####################################################################### -# Batch Jacobian and Batch Hessian -# -------------------------------- -# In the above examples we’ve been operating with a single feature vector. -# In some cases you might want to take the Jacobian of a batch of outputs -# with respect to a batch of inputs. That is, given a batch of inputs of -# shape ``(B, N)`` and a function that goes from :math:`R^N \to R^M`, we would like -# a Jacobian of shape ``(B, M, N)``.
-# -# The easiest way to do this is to use ``vmap``: - -batch_size = 64 -Din = 31 -Dout = 33 - -weight = torch.randn(Dout, Din) -print(f"weight shape = {weight.shape}") - -bias = torch.randn(Dout) - -x = torch.randn(batch_size, Din) - -compute_batch_jacobian = vmap(jacrev(predict, argnums=2), in_dims=(None, None, 0)) -batch_jacobian0 = compute_batch_jacobian(weight, bias, x) - -####################################################################### -# If you have a function that goes from (B, N) -> (B, M) instead and are -# certain that each input produces an independent output, then it's also -# sometimes possible to do this without using ``vmap`` by summing the outputs -# and then computing the Jacobian of that function: - -def predict_with_output_summed(weight, bias, x): - return predict(weight, bias, x).sum(0) - -batch_jacobian1 = jacrev(predict_with_output_summed, argnums=2)(weight, bias, x).movedim(1, 0) -assert torch.allclose(batch_jacobian0, batch_jacobian1) - -####################################################################### -# If you instead have a function that goes from :math:`R^N \to R^M` but inputs that -# are batched, you compose ``vmap`` with ``jacrev`` to compute batched jacobians: -# -# Finally, batch hessians can be computed similarly. It's easiest to think -# about them by using ``vmap`` to batch over hessian computation, but in some -# cases the sum trick also works. - -compute_batch_hessian = vmap(hessian(predict, argnums=2), in_dims=(None, None, 0)) - -batch_hess = compute_batch_hessian(weight, bias, x) -batch_hess.shape - -####################################################################### -# Computing Hessian-vector products -# --------------------------------- -# The naive way to compute a Hessian-vector product (hvp) is to materialize -# the full Hessian and perform a dot-product with a vector. We can do better: -# it turns out we don't need to materialize the full Hessian to do this. We'll -# go through two (of many) different strategies to compute Hessian-vector products: -# - composing reverse-mode AD with reverse-mode AD -# - composing reverse-mode AD with forward-mode AD -# -# Composing reverse-mode AD with forward-mode AD (as opposed to reverse-mode -# with reverse-mode) is generally the more memory efficient way to compute a -# hvp because forward-mode AD doesn't need to construct an Autograd graph and -# save intermediates for backward: - -from torch.func import jvp, grad, vjp - -def hvp(f, primals, tangents): - return jvp(grad(f), primals, tangents)[1] - -####################################################################### -# Here's some sample usage. 
- -def f(x): - return x.sin().sum() - -x = torch.randn(2048) -tangent = torch.randn(2048) - -result = hvp(f, (x,), (tangent,)) - -####################################################################### -# If PyTorch forward-AD does not have coverage for your operations, then we can -# instead compose reverse-mode AD with reverse-mode AD: - -def hvp_revrev(f, primals, tangents): - _, vjp_fn = vjp(grad(f), *primals) - return vjp_fn(*tangents) - -result_hvp_revrev = hvp_revrev(f, (x,), (tangent,)) -assert torch.allclose(result, result_hvp_revrev[0]) diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py deleted file mode 100755 index 03d6396a47..0000000000 --- a/intermediate_source/mario_rl_tutorial.py +++ /dev/null @@ -1,791 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Train a Mario-playing RL Agent -=============================== - -**Authors:** `Yuansong Feng `__, `Suraj Subramanian `__, `Howard Wang `__, `Steven Guo `__. - - -This tutorial walks you through the fundamentals of Deep Reinforcement -Learning. At the end, you will implement an AI-powered Mario (using -`Double Deep Q-Networks `__) that -can play the game by itself. - -Although no prior knowledge of RL is necessary for this tutorial, you -can familiarize yourself with these RL -`concepts `__, -and have this handy -`cheatsheet `__ -as your companion. The full code is available -`here `__. - -.. figure:: /_static/img/mario.gif - :alt: mario - -""" - - -###################################################################### -# -# -# .. code-block:: bash -# -# %%bash -# pip install gym-super-mario-bros==7.4.0 -# pip install tensordict==0.3.0 -# pip install torchrl==0.3.0 -# - -import torch -from torch import nn -from torchvision import transforms as T -from PIL import Image -import numpy as np -from pathlib import Path -from collections import deque -import random, datetime, os - -# Gym is an OpenAI toolkit for RL -import gym -from gym.spaces import Box -from gym.wrappers import FrameStack - -# NES Emulator for OpenAI Gym -from nes_py.wrappers import JoypadSpace - -# Super Mario environment for OpenAI Gym -import gym_super_mario_bros - -from tensordict import TensorDict -from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage - -###################################################################### -# RL Definitions -# """""""""""""""""" -# -# **Environment** The world that an agent interacts with and learns from. -# -# **Action** :math:`a` : How the Agent responds to the Environment. The -# set of all possible Actions is called *action-space*. -# -# **State** :math:`s` : The current characteristic of the Environment. The -# set of all possible States the Environment can be in is called -# *state-space*. -# -# **Reward** :math:`r` : Reward is the key feedback from Environment to -# Agent. It is what drives the Agent to learn and to change its future -# action. An aggregation of rewards over multiple time steps is called -# **Return**. -# -# **Optimal Action-Value function** :math:`Q^*(s,a)` : Gives the expected -# return if you start in state :math:`s`, take an arbitrary action -# :math:`a`, and then for each future time step take the action that -# maximizes returns. :math:`Q` can be said to stand for the “quality” of -# the action in a state. We try to approximate this function. 
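#
# For reference (a standard identity, not derived in this tutorial), the optimal
# action-value function satisfies the Bellman optimality equation
#
# .. math::
#
#    Q^*(s,a) = \mathbb{E}\left[r + \gamma \max_{a'} Q^*(s',a')\right]
#
# which is the recursion the TD Target approximates later in the Learn section.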
-# - - -###################################################################### -# Environment -# """""""""""""""" -# -# Initialize Environment -# ------------------------ -# -# In Mario, the environment consists of tubes, mushrooms and other -# components. -# -# When Mario makes an action, the environment responds with the changed -# (next) state, reward and other info. -# - -# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen) -if gym.__version__ < '0.26': - env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True) -else: - env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='rgb', apply_api_compatibility=True) - -# Limit the action-space to -# 0. walk right -# 1. jump right -env = JoypadSpace(env, [["right"], ["right", "A"]]) - -env.reset() -next_state, reward, done, trunc, info = env.step(action=0) -print(f"{next_state.shape},\n {reward},\n {done},\n {info}") - - -###################################################################### -# Preprocess Environment -# ------------------------ -# -# Environment data is returned to the agent in ``next_state``. As you saw -# above, each state is represented by a ``[3, 240, 256]`` size array. -# Often that is more information than our agent needs; for instance, -# Mario’s actions do not depend on the color of the pipes or the sky! -# -# We use **Wrappers** to preprocess environment data before sending it to -# the agent. -# -# ``GrayScaleObservation`` is a common wrapper to transform an RGB image -# to grayscale; doing so reduces the size of the state representation -# without losing useful information. Now the size of each state: -# ``[1, 240, 256]`` -# -# ``ResizeObservation`` downsamples each observation into a square image. -# New size: ``[1, 84, 84]`` -# -# ``SkipFrame`` is a custom wrapper that inherits from ``gym.Wrapper`` and -# implements the ``step()`` function. Because consecutive frames don’t -# vary much, we can skip n-intermediate frames without losing much -# information. The n-th frame aggregates rewards accumulated over each -# skipped frame. -# -# ``FrameStack`` is a wrapper that allows us to squash consecutive frames -# of the environment into a single observation point to feed to our -# learning model. This way, we can identify if Mario was landing or -# jumping based on the direction of his movement in the previous several -# frames. 
-# - - -class SkipFrame(gym.Wrapper): - def __init__(self, env, skip): - """Return only every `skip`-th frame""" - super().__init__(env) - self._skip = skip - - def step(self, action): - """Repeat action, and sum reward""" - total_reward = 0.0 - for i in range(self._skip): - # Accumulate reward and repeat the same action - obs, reward, done, trunk, info = self.env.step(action) - total_reward += reward - if done: - break - return obs, total_reward, done, trunk, info - - -class GrayScaleObservation(gym.ObservationWrapper): - def __init__(self, env): - super().__init__(env) - obs_shape = self.observation_space.shape[:2] - self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) - - def permute_orientation(self, observation): - # permute [H, W, C] array to [C, H, W] tensor - observation = np.transpose(observation, (2, 0, 1)) - observation = torch.tensor(observation.copy(), dtype=torch.float) - return observation - - def observation(self, observation): - observation = self.permute_orientation(observation) - transform = T.Grayscale() - observation = transform(observation) - return observation - - -class ResizeObservation(gym.ObservationWrapper): - def __init__(self, env, shape): - super().__init__(env) - if isinstance(shape, int): - self.shape = (shape, shape) - else: - self.shape = tuple(shape) - - obs_shape = self.shape + self.observation_space.shape[2:] - self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) - - def observation(self, observation): - transforms = T.Compose( - [T.Resize(self.shape, antialias=True), T.Normalize(0, 255)] - ) - observation = transforms(observation).squeeze(0) - return observation - - -# Apply Wrappers to environment -env = SkipFrame(env, skip=4) -env = GrayScaleObservation(env) -env = ResizeObservation(env, shape=84) -if gym.__version__ < '0.26': - env = FrameStack(env, num_stack=4, new_step_api=True) -else: - env = FrameStack(env, num_stack=4) - - -###################################################################### -# After applying the above wrappers to the environment, the final wrapped -# state consists of 4 gray-scaled consecutive frames stacked together, as -# shown above in the image on the left. Each time Mario makes an action, -# the environment responds with a state of this structure. The structure -# is represented by a 3-D array of size ``[4, 84, 84]``. -# -# .. figure:: /_static/img/mario_env.png -# :alt: picture -# -# - - -###################################################################### -# Agent -# """"""""" -# -# We create a class ``Mario`` to represent our agent in the game. Mario -# should be able to: -# -# - **Act** according to the optimal action policy based on the current -# state (of the environment). -# -# - **Remember** experiences. Experience = (current state, current -# action, reward, next state). Mario *caches* and later *recalls* his -# experiences to update his action policy. -# -# - **Learn** a better action policy over time -# - - -class Mario: - def __init__(): - pass - - def act(self, state): - """Given a state, choose an epsilon-greedy action""" - pass - - def cache(self, experience): - """Add the experience to memory""" - pass - - def recall(self): - """Sample experiences from memory""" - pass - - def learn(self): - """Update online action value (Q) function with a batch of experiences""" - pass - - -###################################################################### -# In the following sections, we will populate Mario’s parameters and -# define his functions. 
-# - - -###################################################################### -# Act -# -------------- -# -# For any given state, an agent can choose to do the most optimal action -# (**exploit**) or a random action (**explore**). -# -# Mario randomly explores with a chance of ``self.exploration_rate``; when -# he chooses to exploit, he relies on ``MarioNet`` (implemented in -# ``Learn`` section) to provide the most optimal action. -# - - -class Mario: - def __init__(self, state_dim, action_dim, save_dir): - self.state_dim = state_dim - self.action_dim = action_dim - self.save_dir = save_dir - - self.device = "cuda" if torch.cuda.is_available() else "cpu" - - # Mario's DNN to predict the most optimal action - we implement this in the Learn section - self.net = MarioNet(self.state_dim, self.action_dim).float() - self.net = self.net.to(device=self.device) - - self.exploration_rate = 1 - self.exploration_rate_decay = 0.99999975 - self.exploration_rate_min = 0.1 - self.curr_step = 0 - - self.save_every = 5e5 # no. of experiences between saving Mario Net - - def act(self, state): - """ - Given a state, choose an epsilon-greedy action and update value of step. - - Inputs: - state(``LazyFrame``): A single observation of the current state, dimension is (state_dim) - Outputs: - ``action_idx`` (``int``): An integer representing which action Mario will perform - """ - # EXPLORE - if np.random.rand() < self.exploration_rate: - action_idx = np.random.randint(self.action_dim) - - # EXPLOIT - else: - state = state[0].__array__() if isinstance(state, tuple) else state.__array__() - state = torch.tensor(state, device=self.device).unsqueeze(0) - action_values = self.net(state, model="online") - action_idx = torch.argmax(action_values, axis=1).item() - - # decrease exploration_rate - self.exploration_rate *= self.exploration_rate_decay - self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate) - - # increment step - self.curr_step += 1 - return action_idx - - -###################################################################### -# Cache and Recall -# ---------------------- -# -# These two functions serve as Mario’s “memory” process. -# -# ``cache()``: Each time Mario performs an action, he stores the -# ``experience`` to his memory. His experience includes the current -# *state*, *action* performed, *reward* from the action, the *next state*, -# and whether the game is *done*. -# -# ``recall()``: Mario randomly samples a batch of experiences from his -# memory, and uses that to learn the game. 
-# - - -class Mario(Mario): # subclassing for continuity - def __init__(self, state_dim, action_dim, save_dir): - super().__init__(state_dim, action_dim, save_dir) - self.memory = TensorDictReplayBuffer(storage=LazyMemmapStorage(100000, device=torch.device("cpu"))) - self.batch_size = 32 - - def cache(self, state, next_state, action, reward, done): - """ - Store the experience to self.memory (replay buffer) - - Inputs: - state (``LazyFrame``), - next_state (``LazyFrame``), - action (``int``), - reward (``float``), - done(``bool``)) - """ - def first_if_tuple(x): - return x[0] if isinstance(x, tuple) else x - state = first_if_tuple(state).__array__() - next_state = first_if_tuple(next_state).__array__() - - state = torch.tensor(state) - next_state = torch.tensor(next_state) - action = torch.tensor([action]) - reward = torch.tensor([reward]) - done = torch.tensor([done]) - - # self.memory.append((state, next_state, action, reward, done,)) - self.memory.add(TensorDict({"state": state, "next_state": next_state, "action": action, "reward": reward, "done": done}, batch_size=[])) - - def recall(self): - """ - Retrieve a batch of experiences from memory - """ - batch = self.memory.sample(self.batch_size).to(self.device) - state, next_state, action, reward, done = (batch.get(key) for key in ("state", "next_state", "action", "reward", "done")) - return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze() - - -###################################################################### -# Learn -# -------------- -# -# Mario uses the `DDQN algorithm `__ -# under the hood. DDQN uses two ConvNets - :math:`Q_{online}` and -# :math:`Q_{target}` - that independently approximate the optimal -# action-value function. -# -# In our implementation, we share feature generator ``features`` across -# :math:`Q_{online}` and :math:`Q_{target}`, but maintain separate FC -# classifiers for each. :math:`\theta_{target}` (the parameters of -# :math:`Q_{target}`) is frozen to prevent updating by backprop. Instead, -# it is periodically synced with :math:`\theta_{online}` (more on this -# later). -# -# Neural Network -# ~~~~~~~~~~~~~~~~~~ - - -class MarioNet(nn.Module): - """mini CNN structure - input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output - """ - - def __init__(self, input_dim, output_dim): - super().__init__() - c, h, w = input_dim - - if h != 84: - raise ValueError(f"Expecting input height: 84, got: {h}") - if w != 84: - raise ValueError(f"Expecting input width: 84, got: {w}") - - self.online = self.__build_cnn(c, output_dim) - - self.target = self.__build_cnn(c, output_dim) - self.target.load_state_dict(self.online.state_dict()) - - # Q_target parameters are frozen. 
- for p in self.target.parameters(): - p.requires_grad = False - - def forward(self, input, model): - if model == "online": - return self.online(input) - elif model == "target": - return self.target(input) - - def __build_cnn(self, c, output_dim): - return nn.Sequential( - nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4), - nn.ReLU(), - nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), - nn.ReLU(), - nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), - nn.ReLU(), - nn.Flatten(), - nn.Linear(3136, 512), - nn.ReLU(), - nn.Linear(512, output_dim), - ) - - -###################################################################### -# TD Estimate & TD Target -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Two values are involved in learning: -# -# **TD Estimate** - the predicted optimal :math:`Q^*` for a given state -# :math:`s` -# -# .. math:: -# -# -# {TD}_e = Q_{online}^*(s,a) -# -# **TD Target** - aggregation of current reward and the estimated -# :math:`Q^*` in the next state :math:`s'` -# -# .. math:: -# -# -# a' = argmax_{a} Q_{online}(s', a) -# -# .. math:: -# -# -# {TD}_t = r + \gamma Q_{target}^*(s',a') -# -# Because we don’t know what next action :math:`a'` will be, we use the -# action :math:`a'` maximizes :math:`Q_{online}` in the next state -# :math:`s'`. -# -# Notice we use the -# `@torch.no_grad() `__ -# decorator on ``td_target()`` to disable gradient calculations here -# (because we don’t need to backpropagate on :math:`\theta_{target}`). -# - - -class Mario(Mario): - def __init__(self, state_dim, action_dim, save_dir): - super().__init__(state_dim, action_dim, save_dir) - self.gamma = 0.9 - - def td_estimate(self, state, action): - current_Q = self.net(state, model="online")[ - np.arange(0, self.batch_size), action - ] # Q_online(s,a) - return current_Q - - @torch.no_grad() - def td_target(self, reward, next_state, done): - next_state_Q = self.net(next_state, model="online") - best_action = torch.argmax(next_state_Q, axis=1) - next_Q = self.net(next_state, model="target")[ - np.arange(0, self.batch_size), best_action - ] - return (reward + (1 - done.float()) * self.gamma * next_Q).float() - - -###################################################################### -# Updating the model -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# As Mario samples inputs from his replay buffer, we compute :math:`TD_t` -# and :math:`TD_e` and backpropagate this loss down :math:`Q_{online}` to -# update its parameters :math:`\theta_{online}` (:math:`\alpha` is the -# learning rate ``lr`` passed to the ``optimizer``) -# -# .. math:: -# -# -# \theta_{online} \leftarrow \theta_{online} + \alpha \nabla(TD_e - TD_t) -# -# :math:`\theta_{target}` does not update through backpropagation. -# Instead, we periodically copy :math:`\theta_{online}` to -# :math:`\theta_{target}` -# -# .. 
math:: -# -# -# \theta_{target} \leftarrow \theta_{online} -# -# - - -class Mario(Mario): - def __init__(self, state_dim, action_dim, save_dir): - super().__init__(state_dim, action_dim, save_dir) - self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025) - self.loss_fn = torch.nn.SmoothL1Loss() - - def update_Q_online(self, td_estimate, td_target): - loss = self.loss_fn(td_estimate, td_target) - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - return loss.item() - - def sync_Q_target(self): - self.net.target.load_state_dict(self.net.online.state_dict()) - - -###################################################################### -# Save checkpoint -# ~~~~~~~~~~~~~~~~~~ -# - - -class Mario(Mario): - def save(self): - save_path = ( - self.save_dir / f"mario_net_{int(self.curr_step // self.save_every)}.chkpt" - ) - torch.save( - dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate), - save_path, - ) - print(f"MarioNet saved to {save_path} at step {self.curr_step}") - - -###################################################################### -# Putting it all together -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -class Mario(Mario): - def __init__(self, state_dim, action_dim, save_dir): - super().__init__(state_dim, action_dim, save_dir) - self.burnin = 1e4 # min. experiences before training - self.learn_every = 3 # no. of experiences between updates to Q_online - self.sync_every = 1e4 # no. of experiences between Q_target & Q_online sync - - def learn(self): - if self.curr_step % self.sync_every == 0: - self.sync_Q_target() - - if self.curr_step % self.save_every == 0: - self.save() - - if self.curr_step < self.burnin: - return None, None - - if self.curr_step % self.learn_every != 0: - return None, None - - # Sample from memory - state, next_state, action, reward, done = self.recall() - - # Get TD Estimate - td_est = self.td_estimate(state, action) - - # Get TD Target - td_tgt = self.td_target(reward, next_state, done) - - # Backpropagate loss through Q_online - loss = self.update_Q_online(td_est, td_tgt) - - return (td_est.mean().item(), loss) - - -###################################################################### -# Logging -# -------------- -# - -import numpy as np -import time, datetime -import matplotlib.pyplot as plt - - -class MetricLogger: - def __init__(self, save_dir): - self.save_log = save_dir / "log" - with open(self.save_log, "w") as f: - f.write( - f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}" - f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}" - f"{'TimeDelta':>15}{'Time':>20}\n" - ) - self.ep_rewards_plot = save_dir / "reward_plot.jpg" - self.ep_lengths_plot = save_dir / "length_plot.jpg" - self.ep_avg_losses_plot = save_dir / "loss_plot.jpg" - self.ep_avg_qs_plot = save_dir / "q_plot.jpg" - - # History metrics - self.ep_rewards = [] - self.ep_lengths = [] - self.ep_avg_losses = [] - self.ep_avg_qs = [] - - # Moving averages, added for every call to record() - self.moving_avg_ep_rewards = [] - self.moving_avg_ep_lengths = [] - self.moving_avg_ep_avg_losses = [] - self.moving_avg_ep_avg_qs = [] - - # Current episode metric - self.init_episode() - - # Timing - self.record_time = time.time() - - def log_step(self, reward, loss, q): - self.curr_ep_reward += reward - self.curr_ep_length += 1 - if loss: - self.curr_ep_loss += loss - self.curr_ep_q += q - self.curr_ep_loss_length += 1 - - def log_episode(self): - "Mark end of episode" - self.ep_rewards.append(self.curr_ep_reward) - 
self.ep_lengths.append(self.curr_ep_length) - if self.curr_ep_loss_length == 0: - ep_avg_loss = 0 - ep_avg_q = 0 - else: - ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5) - ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5) - self.ep_avg_losses.append(ep_avg_loss) - self.ep_avg_qs.append(ep_avg_q) - - self.init_episode() - - def init_episode(self): - self.curr_ep_reward = 0.0 - self.curr_ep_length = 0 - self.curr_ep_loss = 0.0 - self.curr_ep_q = 0.0 - self.curr_ep_loss_length = 0 - - def record(self, episode, epsilon, step): - mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3) - mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3) - mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3) - mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3) - self.moving_avg_ep_rewards.append(mean_ep_reward) - self.moving_avg_ep_lengths.append(mean_ep_length) - self.moving_avg_ep_avg_losses.append(mean_ep_loss) - self.moving_avg_ep_avg_qs.append(mean_ep_q) - - last_record_time = self.record_time - self.record_time = time.time() - time_since_last_record = np.round(self.record_time - last_record_time, 3) - - print( - f"Episode {episode} - " - f"Step {step} - " - f"Epsilon {epsilon} - " - f"Mean Reward {mean_ep_reward} - " - f"Mean Length {mean_ep_length} - " - f"Mean Loss {mean_ep_loss} - " - f"Mean Q Value {mean_ep_q} - " - f"Time Delta {time_since_last_record} - " - f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}" - ) - - with open(self.save_log, "a") as f: - f.write( - f"{episode:8d}{step:8d}{epsilon:10.3f}" - f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}" - f"{time_since_last_record:15.3f}" - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n" - ) - - for metric in ["ep_lengths", "ep_avg_losses", "ep_avg_qs", "ep_rewards"]: - plt.clf() - plt.plot(getattr(self, f"moving_avg_{metric}"), label=f"moving_avg_{metric}") - plt.legend() - plt.savefig(getattr(self, f"{metric}_plot")) - - -###################################################################### -# Let’s play! -# """"""""""""""" -# -# In this example we run the training loop for 40 episodes, but for Mario to truly learn the ways of -# his world, we suggest running the loop for at least 40,000 episodes! -# -use_cuda = torch.cuda.is_available() -print(f"Using CUDA: {use_cuda}") -print() - -save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") -save_dir.mkdir(parents=True) - -mario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir) - -logger = MetricLogger(save_dir) - -episodes = 40 -for e in range(episodes): - - state = env.reset() - - # Play the game! - while True: - - # Run agent on the state - action = mario.act(state) - - # Agent performs action - next_state, reward, done, trunc, info = env.step(action) - - # Remember - mario.cache(state, next_state, action, reward, done) - - # Learn - q, loss = mario.learn() - - # Logging - logger.log_step(reward, loss, q) - - # Update state - state = next_state - - # Check if end of game - if done or info["flag_get"]: - break - - logger.log_episode() - - if (e % 20 == 0) or (e == episodes - 1): - logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step) - - -###################################################################### -# Conclusion -# """"""""""""""" -# -# In this tutorial, we saw how we can use PyTorch to train a game-playing AI. 
You can use the same methods -# to train an AI to play any of the games at the `OpenAI gym `__. Hope you enjoyed this tutorial, feel free to reach us at -# `our github `__! diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py deleted file mode 100644 index 26bc5c9d53..0000000000 --- a/intermediate_source/memory_format_tutorial.py +++ /dev/null @@ -1,389 +0,0 @@ -# -*- coding: utf-8 -*- -""" -(beta) Channels Last Memory Format in PyTorch -******************************************************* -**Author**: `Vitaly Fedyunin `_ - -What is Channels Last ---------------------- - -Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). - -For example, classic (contiguous) storage of NCHW tensor (in our case it is two 4x4 images with 3 color channels) look like this: - -.. figure:: /_static/img/classic_memory_format.png - :alt: classic_memory_format - -Channels last memory format orders data differently: - -.. figure:: /_static/img/channels_last_memory_format.png - :alt: channels_last_memory_format - -Pytorch supports memory formats (and provides back compatibility with existing models including eager, JIT, and TorchScript) by utilizing existing strides structure. -For example, 10x3x16x16 batch in Channels last format will have strides equal to (768, 1, 48, 3). -""" - -###################################################################### -# Channels last memory format is implemented for 4D NCHW Tensors only. -# - -###################################################################### -# Memory Format API -# ----------------------- -# -# Here is how to convert tensors between contiguous and channels -# last memory formats. - -###################################################################### -# Classic PyTorch contiguous tensor -import torch - -N, C, H, W = 10, 3, 32, 32 -x = torch.empty(N, C, H, W) -print(x.stride()) # Outputs: (3072, 1024, 32, 1) - -###################################################################### -# Conversion operator -x = x.to(memory_format=torch.channels_last) -print(x.shape) # Outputs: (10, 3, 32, 32) as dimensions order preserved -print(x.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# Back to contiguous -x = x.to(memory_format=torch.contiguous_format) -print(x.stride()) # Outputs: (3072, 1024, 32, 1) - -###################################################################### -# Alternative option -x = x.contiguous(memory_format=torch.channels_last) -print(x.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# Format checks -print(x.is_contiguous(memory_format=torch.channels_last)) # Outputs: True - -###################################################################### -# There are minor difference between the two APIs ``to`` and -# ``contiguous``. We suggest to stick with ``to`` when explicitly -# converting memory format of tensor. -# -# For general cases the two APIs behave the same. However in special -# cases for a 4D tensor with size ``NCHW`` when either: ``C==1`` or -# ``H==1 && W==1``, only ``to`` would generate a proper stride to -# represent channels last memory format. -# -# This is because in either of the two cases above, the memory format -# of a tensor is ambiguous, i.e. 
a contiguous tensor with size -# ``N1HW`` is both ``contiguous`` and channels last in memory storage. -# Therefore, they are already considered as ``is_contiguous`` -# for the given memory format and hence ``contiguous`` call becomes a -# no-op and would not update the stride. On the contrary, ``to`` -# would restride tensor with a meaningful stride on dimensions whose -# sizes are 1 in order to properly represent the intended memory -# format -special_x = torch.empty(4, 1, 4, 4) -print(special_x.is_contiguous(memory_format=torch.channels_last)) # Outputs: True -print(special_x.is_contiguous(memory_format=torch.contiguous_format)) # Outputs: True - -###################################################################### -# Same thing applies to explicit permutation API ``permute``. In -# special case where ambiguity could occur, ``permute`` does not -# guarantee to produce a stride that properly carry the intended -# memory format. We suggest to use ``to`` with explicit memory format -# to avoid unintended behavior. -# -# And a side note that in the extreme case, where three non-batch -# dimensions are all equal to ``1`` (``C==1 && H==1 && W==1``), -# current implementation cannot mark a tensor as channels last memory -# format. - -###################################################################### -# Create as channels last -x = torch.empty(N, C, H, W, memory_format=torch.channels_last) -print(x.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# ``clone`` preserves memory format -y = x.clone() -print(y.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# ``to``, ``cuda``, ``float`` ... preserves memory format -if torch.cuda.is_available(): - y = x.cuda() - print(y.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# ``empty_like``, ``*_like`` operators preserves memory format -y = torch.empty_like(x) -print(y.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# Pointwise operators preserves memory format -z = x + y -print(z.stride()) # Outputs: (3072, 1, 96, 3) - -###################################################################### -# ``Conv``, ``Batchnorm`` modules using ``cudnn`` backends support channels last -# (only works for cuDNN >= 7.6). Convolution modules, unlike binary -# p-wise operator, have channels last as the dominating memory format. -# If all inputs are in contiguous memory format, the operator -# produces output in contiguous memory format. Otherwise, output will -# be in channels last memory format. - -if torch.backends.cudnn.is_available() and torch.backends.cudnn.version() >= 7603: - model = torch.nn.Conv2d(8, 4, 3).cuda().half() - model = model.to(memory_format=torch.channels_last) # Module parameters need to be channels last - - input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, requires_grad=True) - input = input.to(device="cuda", memory_format=torch.channels_last, dtype=torch.float16) - - out = model(input) - print(out.is_contiguous(memory_format=torch.channels_last)) # Outputs: True - -###################################################################### -# When input tensor reaches a operator without channels last support, -# a permutation should automatically apply in the kernel to restore -# contiguous on input tensor. 
This introduces overhead and stops the -# channels last memory format propagation. Nevertheless, it guarantees -# correct output. - -###################################################################### -# Performance Gains -# -------------------------------------------------------------------- -# Channels last memory format optimizations are available on both GPU and CPU. -# On GPU, the most significant performance gains are observed on NVIDIA's -# hardware with Tensor Cores support running on reduced precision -# (``torch.float16``). -# We were able to archive over 22% performance gains with channels last -# comparing to contiguous format, both while utilizing -# 'AMP (Automated Mixed Precision)' training scripts. -# Our scripts uses AMP supplied by NVIDIA -# https://github.com/NVIDIA/apex. -# -# ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 ./data`` - -# opt_level = O2 -# keep_batchnorm_fp32 = None -# loss_scale = None -# CUDNN VERSION: 7603 -# => creating model 'resnet50' -# Selected optimization level O2: FP16 training with FP32 batchnorm and FP32 master weights. -# Defaults for this optimization level are: -# enabled : True -# opt_level : O2 -# cast_model_type : torch.float16 -# patch_torch_functions : False -# keep_batchnorm_fp32 : True -# master_weights : True -# loss_scale : dynamic -# Processing user overrides (additional kwargs that are not None)... -# After processing overrides, optimization options are: -# enabled : True -# opt_level : O2 -# cast_model_type : torch.float16 -# patch_torch_functions : False -# keep_batchnorm_fp32 : True -# master_weights : True -# loss_scale : dynamic -# Epoch: [0][10/125] Time 0.866 (0.866) Speed 230.949 (230.949) Loss 0.6735125184 (0.6735) Prec@1 61.000 (61.000) Prec@5 100.000 (100.000) -# Epoch: [0][20/125] Time 0.259 (0.562) Speed 773.481 (355.693) Loss 0.6968704462 (0.6852) Prec@1 55.000 (58.000) Prec@5 100.000 (100.000) -# Epoch: [0][30/125] Time 0.258 (0.461) Speed 775.089 (433.965) Loss 0.7877287269 (0.7194) Prec@1 51.500 (55.833) Prec@5 100.000 (100.000) -# Epoch: [0][40/125] Time 0.259 (0.410) Speed 771.710 (487.281) Loss 0.8285319805 (0.7467) Prec@1 48.500 (54.000) Prec@5 100.000 (100.000) -# Epoch: [0][50/125] Time 0.260 (0.380) Speed 770.090 (525.908) Loss 0.7370464802 (0.7447) Prec@1 56.500 (54.500) Prec@5 100.000 (100.000) -# Epoch: [0][60/125] Time 0.258 (0.360) Speed 775.623 (555.728) Loss 0.7592862844 (0.7472) Prec@1 51.000 (53.917) Prec@5 100.000 (100.000) -# Epoch: [0][70/125] Time 0.258 (0.345) Speed 774.746 (579.115) Loss 1.9698858261 (0.9218) Prec@1 49.500 (53.286) Prec@5 100.000 (100.000) -# Epoch: [0][80/125] Time 0.260 (0.335) Speed 770.324 (597.659) Loss 2.2505953312 (1.0879) Prec@1 50.500 (52.938) Prec@5 100.000 (100.000) - -###################################################################### -# Passing ``--channels-last true`` allows running a model in Channels last format with observed 22% performance gain. -# -# ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 --channels-last true ./data`` - -# opt_level = O2 -# keep_batchnorm_fp32 = None -# loss_scale = None -# -# CUDNN VERSION: 7603 -# -# => creating model 'resnet50' -# Selected optimization level O2: FP16 training with FP32 batchnorm and FP32 master weights. 
-# -# Defaults for this optimization level are: -# enabled : True -# opt_level : O2 -# cast_model_type : torch.float16 -# patch_torch_functions : False -# keep_batchnorm_fp32 : True -# master_weights : True -# loss_scale : dynamic -# Processing user overrides (additional kwargs that are not None)... -# After processing overrides, optimization options are: -# enabled : True -# opt_level : O2 -# cast_model_type : torch.float16 -# patch_torch_functions : False -# keep_batchnorm_fp32 : True -# master_weights : True -# loss_scale : dynamic -# -# Epoch: [0][10/125] Time 0.767 (0.767) Speed 260.785 (260.785) Loss 0.7579724789 (0.7580) Prec@1 53.500 (53.500) Prec@5 100.000 (100.000) -# Epoch: [0][20/125] Time 0.198 (0.482) Speed 1012.135 (414.716) Loss 0.7007197738 (0.7293) Prec@1 49.000 (51.250) Prec@5 100.000 (100.000) -# Epoch: [0][30/125] Time 0.198 (0.387) Speed 1010.977 (516.198) Loss 0.7113101482 (0.7233) Prec@1 55.500 (52.667) Prec@5 100.000 (100.000) -# Epoch: [0][40/125] Time 0.197 (0.340) Speed 1013.023 (588.333) Loss 0.8943189979 (0.7661) Prec@1 54.000 (53.000) Prec@5 100.000 (100.000) -# Epoch: [0][50/125] Time 0.198 (0.312) Speed 1010.541 (641.977) Loss 1.7113249302 (0.9551) Prec@1 51.000 (52.600) Prec@5 100.000 (100.000) -# Epoch: [0][60/125] Time 0.198 (0.293) Speed 1011.163 (683.574) Loss 5.8537774086 (1.7716) Prec@1 50.500 (52.250) Prec@5 100.000 (100.000) -# Epoch: [0][70/125] Time 0.198 (0.279) Speed 1011.453 (716.767) Loss 5.7595844269 (2.3413) Prec@1 46.500 (51.429) Prec@5 100.000 (100.000) -# Epoch: [0][80/125] Time 0.198 (0.269) Speed 1011.827 (743.883) Loss 2.8196096420 (2.4011) Prec@1 47.500 (50.938) Prec@5 100.000 (100.000) - -###################################################################### -# The following list of models has the full support of Channels last and showing 8%-35% performance gains on Volta devices: -# ``alexnet``, ``mnasnet0_5``, ``mnasnet0_75``, ``mnasnet1_0``, ``mnasnet1_3``, ``mobilenet_v2``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``shufflenet_v2_x1_5``, ``shufflenet_v2_x2_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` -# - -###################################################################### -# The following list of models has the full support of Channels last and showing 26%-76% performance gains on Intel(R) Xeon(R) Ice Lake (or newer) CPUs: -# ``alexnet``, ``densenet121``, ``densenet161``, ``densenet169``, ``googlenet``, ``inception_v3``, ``mnasnet0_5``, ``mnasnet1_0``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext101_32x8d``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` -# - -###################################################################### -# Converting existing models -# -------------------------- -# -# Channels last support is not limited by existing models, as any -# model can be converted to channels last and propagate format through -# the graph as soon as input (or certain weight) is formatted -# correctly. 
-# - -# Need to be done once, after model initialization (or load) -model = model.to(memory_format=torch.channels_last) # Replace with your model - -# Need to be done for every input -input = input.to(memory_format=torch.channels_last) # Replace with your input -output = model(input) - -####################################################################### -# However, not all operators fully converted to support channels last -# (usually returning contiguous output instead). In the example posted -# above, layers that does not support channels last will stop the -# memory format propagation. In spite of that, as we have converted the -# model to channels last format, that means each convolution layer, -# which has its 4 dimensional weight in channels last memory format, -# will restore channels last memory format and benefit from faster -# kernels. -# -# But operators that does not support channels last does introduce -# overhead by permutation. Optionally, you can investigate and identify -# operators in your model that does not support channels last, if you -# want to improve the performance of converted model. -# -# That means you need to verify the list of used operators -# against supported operators list https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support, -# or introduce memory format checks into eager execution mode and run your model. -# -# After running the code below, operators will raise an exception if the output of the -# operator doesn't match the memory format of the input. -# -# -def contains_cl(args): - for t in args: - if isinstance(t, torch.Tensor): - if t.is_contiguous(memory_format=torch.channels_last) and not t.is_contiguous(): - return True - elif isinstance(t, list) or isinstance(t, tuple): - if contains_cl(list(t)): - return True - return False - - -def print_inputs(args, indent=""): - for t in args: - if isinstance(t, torch.Tensor): - print(indent, t.stride(), t.shape, t.device, t.dtype) - elif isinstance(t, list) or isinstance(t, tuple): - print(indent, type(t)) - print_inputs(list(t), indent=indent + " ") - else: - print(indent, t) - - -def check_wrapper(fn): - name = fn.__name__ - - def check_cl(*args, **kwargs): - was_cl = contains_cl(args) - try: - result = fn(*args, **kwargs) - except Exception as e: - print("`{}` inputs are:".format(name)) - print_inputs(args) - print("-------------------") - raise e - failed = False - if was_cl: - if isinstance(result, torch.Tensor): - if result.dim() == 4 and not result.is_contiguous(memory_format=torch.channels_last): - print( - "`{}` got channels_last input, but output is not channels_last:".format(name), - result.shape, - result.stride(), - result.device, - result.dtype, - ) - failed = True - if failed and True: - print("`{}` inputs are:".format(name)) - print_inputs(args) - raise Exception("Operator `{}` lost channels_last property".format(name)) - return result - - return check_cl - - -old_attrs = dict() - - -def attribute(m): - old_attrs[m] = dict() - for i in dir(m): - e = getattr(m, i) - exclude_functions = ["is_cuda", "has_names", "numel", "stride", "Tensor", "is_contiguous", "__class__"] - if i not in exclude_functions and not i.startswith("_") and "__call__" in dir(e): - try: - old_attrs[m][i] = e - setattr(m, i, check_wrapper(e)) - except Exception as e: - print(i) - print(e) - - -attribute(torch.Tensor) -attribute(torch.nn.functional) -attribute(torch) - - -###################################################################### -# If you found an operator that doesn't support 
channels last tensors -# and you want to contribute, feel free to use following developers -# guide https://github.com/pytorch/pytorch/wiki/Writing-memory-format-aware-operators. -# - -###################################################################### -# Code below is to recover the attributes of torch. - -for (m, attrs) in old_attrs.items(): - for (k, v) in attrs.items(): - setattr(m, k, v) - -###################################################################### -# Work to do -# ---------- -# There are still many things to do, such as: -# -# - Resolving ambiguity of ``N1HW`` and ``NC11`` Tensors; -# - Testing of Distributed Training support; -# - Improving operators coverage. -# -# If you have feedback and/or suggestions for improvement, please let us -# know by creating `an issue `_. diff --git a/intermediate_source/mnist_train_nas.py b/intermediate_source/mnist_train_nas.py deleted file mode 100644 index 4ae6d894fc..0000000000 --- a/intermediate_source/mnist_train_nas.py +++ /dev/null @@ -1,171 +0,0 @@ -""" -Example training code for ``ax_multiobjective_nas_tutorial.py`` -""" - -import argparse -import logging -import os -import sys -import time -import warnings - -import torch -from IPython.utils import io -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning import loggers as pl_loggers -from torch import nn -from torch.nn import functional as F -from torch.utils.data import DataLoader -from torchmetrics.functional.classification.accuracy import multiclass_accuracy -from torchvision import transforms -from torchvision.datasets import MNIST - -warnings.filterwarnings("ignore") # Disable data logger warnings -logging.getLogger("pytorch_lightning").setLevel(logging.ERROR) # Disable GPU/TPU prints - - -def parse_args(): - parser = argparse.ArgumentParser(description="train mnist") - parser.add_argument( - "--log_path", type=str, required=True, help="dir to place tensorboard logs from all trials" - ) - parser.add_argument( - "--hidden_size_1", type=int, required=True, help="hidden size layer 1" - ) - parser.add_argument( - "--hidden_size_2", type=int, required=True, help="hidden size layer 2" - ) - parser.add_argument("--learning_rate", type=float, required=True, help="learning rate") - parser.add_argument("--epochs", type=int, required=True, help="number of epochs") - parser.add_argument("--dropout", type=float, required=True, help="dropout probability") - parser.add_argument("--batch_size", type=int, required=True, help="batch size") - return parser.parse_args() - -args = parse_args() - -PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") - - -class MnistModel(LightningModule): - def __init__(self): - super().__init__() - - # Tunable parameters - self.hidden_size_1 = args.hidden_size_1 - self.hidden_size_2 = args.hidden_size_2 - self.learning_rate = args.learning_rate - self.dropout = args.dropout - self.batch_size = args.batch_size - - # Set class attributes - self.data_dir = PATH_DATASETS - - # Hardcode some dataset specific attributes - self.num_classes = 10 - self.dims = (1, 28, 28) - channels, width, height = self.dims - self.transform = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)), - ] - ) - - # Create a PyTorch model - layers = [nn.Flatten()] - width = channels * width * height - hidden_layers = [self.hidden_size_1, self.hidden_size_2] - num_params = 0 - for hidden_size in hidden_layers: - if hidden_size > 0: - layers.append(nn.Linear(width, hidden_size)) - layers.append(nn.ReLU()) - 
layers.append(nn.Dropout(self.dropout)) - num_params += width * hidden_size - width = hidden_size - layers.append(nn.Linear(width, self.num_classes)) - num_params += width * self.num_classes - - # Save the model and parameter counts - self.num_params = num_params - self.model = nn.Sequential(*layers) # No need to use Relu for the last layer - - def forward(self, x): - x = self.model(x) - return F.log_softmax(x, dim=1) - - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return loss - - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - preds = torch.argmax(logits, dim=1) - acc = multiclass_accuracy(preds, y, num_classes=self.num_classes) - self.log("val_acc", acc, prog_bar=False) - return loss - - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) - return optimizer - - def prepare_data(self): - MNIST(self.data_dir, train=True, download=True) - MNIST(self.data_dir, train=False, download=True) - - def setup(self, stage=None): - self.mnist_train = MNIST(self.data_dir, train=True, transform=self.transform) - self.mnist_val = MNIST(self.data_dir, train=False, transform=self.transform) - - def train_dataloader(self): - return DataLoader(self.mnist_train, batch_size=self.batch_size) - - def val_dataloader(self): - return DataLoader(self.mnist_val, batch_size=self.batch_size) - - -def run_training_job(): - - mnist_model = MnistModel() - - # Initialize a trainer (don't log anything since things get so slow...) - trainer = Trainer( - logger=False, - max_epochs=args.epochs, - enable_progress_bar=False, - deterministic=True, # Do we want a bit of noise? - default_root_dir=args.log_path, - ) - - logger = pl_loggers.TensorBoardLogger(args.log_path) - - print(f"Logging to path: {args.log_path}.") - - # Train the model and log time ⚡ - start = time.time() - trainer.fit(model=mnist_model) - end = time.time() - train_time = end - start - logger.log_metrics({"train_time": end - start}) - - # Compute the validation accuracy once and log the score - with io.capture_output() as captured: - val_accuracy = trainer.validate()[0]["val_acc"] - logger.log_metrics({"val_acc": val_accuracy}) - - # Log the number of model parameters - num_params = trainer.model.num_params - logger.log_metrics({"num_params": num_params}) - - logger.save() - - # Print outputs - print(f"train time: {train_time}, val acc: {val_accuracy}, num_params: {num_params}") - - -if __name__ == "__main__": - run_training_job() diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py deleted file mode 100644 index 562064614b..0000000000 --- a/intermediate_source/model_parallel_tutorial.py +++ /dev/null @@ -1,357 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Single-Machine Model Parallel Best Practices -============================================ -**Author**: `Shen Li `_ - -Model parallel is widely-used in distributed training -techniques. Previous posts have explained how to use -`DataParallel `_ -to train a neural network on multiple GPUs; this feature replicates the -same model to all GPUs, where each GPU consumes a different partition of the -input data. Although it can significantly accelerate the training process, it -does not work for some use cases where the model is too large to fit into a -single GPU. 
This post shows how to solve that problem by using **model parallel**, -which, in contrast to ``DataParallel``, splits a single model onto different GPUs, -rather than replicating the entire model on each GPU (to be concrete, say a model -``m`` contains 10 layers: when using ``DataParallel``, each GPU will have a -replica of each of these 10 layers, whereas when using model parallel on two GPUs, -each GPU could host 5 layers). - -The high-level idea of model parallel is to place different sub-networks of a -model onto different devices, and implement the ``forward`` method accordingly -to move intermediate outputs across devices. As only part of a model operates -on any individual device, a set of devices can collectively serve a larger -model. In this post, we will not try to construct huge models and squeeze them -into a limited number of GPUs. Instead, this post focuses on showing the idea -of model parallel. It is up to the readers to apply the ideas to real-world -applications. - -.. note:: - - For distributed model parallel training where a model spans multiple - servers, please refer to - `Getting Started With Distributed RPC Framework `__ - for examples and details. - -Basic Usage ------------ -""" - -###################################################################### -# Let us start with a toy model that contains two linear layers. To run this -# model on two GPUs, simply put each linear layer on a different GPU, and move -# inputs and intermediate outputs to match the layer devices accordingly. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -class ToyModel(nn.Module): - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = torch.nn.Linear(10, 10).to('cuda:0') - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(10, 5).to('cuda:1') - - def forward(self, x): - x = self.relu(self.net1(x.to('cuda:0'))) - return self.net2(x.to('cuda:1')) - -###################################################################### -# Note that, the above ``ToyModel`` looks very similar to how one would -# implement it on a single GPU, except the four ``to(device)`` calls which -# place linear layers and tensors on proper devices. That is the only place in -# the model that requires changes. The ``backward()`` and ``torch.optim`` will -# automatically take care of gradients as if the model is on one GPU. You only -# need to make sure that the labels are on the same device as the outputs when -# calling the loss function. - - -model = ToyModel() -loss_fn = nn.MSELoss() -optimizer = optim.SGD(model.parameters(), lr=0.001) - -optimizer.zero_grad() -outputs = model(torch.randn(20, 10)) -labels = torch.randn(20, 5).to('cuda:1') -loss_fn(outputs, labels).backward() -optimizer.step() - -###################################################################### -# Apply Model Parallel to Existing Modules -# ---------------------------------------- -# -# It is also possible to run an existing single-GPU module on multiple GPUs -# with just a few lines of changes. The code below shows how to decompose -# ``torchvision.models.resnet50()`` to two GPUs. The idea is to inherit from -# the existing ``ResNet`` module, and split the layers to two GPUs during -# construction. Then, override the ``forward`` method to stitch two -# sub-networks by moving the intermediate outputs accordingly. 
- - -from torchvision.models.resnet import ResNet, Bottleneck - -num_classes = 1000 - - -class ModelParallelResNet50(ResNet): - def __init__(self, *args, **kwargs): - super(ModelParallelResNet50, self).__init__( - Bottleneck, [3, 4, 6, 3], num_classes=num_classes, *args, **kwargs) - - self.seq1 = nn.Sequential( - self.conv1, - self.bn1, - self.relu, - self.maxpool, - - self.layer1, - self.layer2 - ).to('cuda:0') - - self.seq2 = nn.Sequential( - self.layer3, - self.layer4, - self.avgpool, - ).to('cuda:1') - - self.fc.to('cuda:1') - - def forward(self, x): - x = self.seq2(self.seq1(x).to('cuda:1')) - return self.fc(x.view(x.size(0), -1)) - - -###################################################################### -# The above implementation solves the problem for cases where the model is too -# large to fit into a single GPU. However, you might have already noticed that -# it will be slower than running it on a single GPU if your model fits. It is -# because, at any point in time, only one of the two GPUs are working, while -# the other one is sitting there doing nothing. The performance further -# deteriorates as the intermediate outputs need to be copied from ``cuda:0`` to -# ``cuda:1`` between ``layer2`` and ``layer3``. -# -# Let us run an experiment to get a more quantitative view of the execution -# time. In this experiment, we train ``ModelParallelResNet50`` and the existing -# ``torchvision.models.resnet50()`` by running random inputs and labels through -# them. After the training, the models will not produce any useful predictions, -# but we can get a reasonable understanding of the execution times. - - -import torchvision.models as models - -num_batches = 3 -batch_size = 120 -image_w = 128 -image_h = 128 - - -def train(model): - model.train(True) - loss_fn = nn.MSELoss() - optimizer = optim.SGD(model.parameters(), lr=0.001) - - one_hot_indices = torch.LongTensor(batch_size) \ - .random_(0, num_classes) \ - .view(batch_size, 1) - - for _ in range(num_batches): - # generate random inputs and labels - inputs = torch.randn(batch_size, 3, image_w, image_h) - labels = torch.zeros(batch_size, num_classes) \ - .scatter_(1, one_hot_indices, 1) - - # run forward pass - optimizer.zero_grad() - outputs = model(inputs.to('cuda:0')) - - # run backward pass - labels = labels.to(outputs.device) - loss_fn(outputs, labels).backward() - optimizer.step() - - -###################################################################### -# The ``train(model)`` method above uses ``nn.MSELoss`` as the loss function, -# and ``optim.SGD`` as the optimizer. It mimics training on ``128 X 128`` -# images which are organized into 3 batches where each batch contains 120 -# images. Then, we use ``timeit`` to run the ``train(model)`` method 10 times -# and plot the execution times with standard deviations. 
- - -import matplotlib.pyplot as plt -plt.switch_backend('Agg') -import numpy as np -import timeit - -num_repeat = 10 - -stmt = "train(model)" - -setup = "model = ModelParallelResNet50()" -mp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -mp_mean, mp_std = np.mean(mp_run_times), np.std(mp_run_times) - -setup = "import torchvision.models as models;" + \ - "model = models.resnet50(num_classes=num_classes).to('cuda:0')" -rn_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -rn_mean, rn_std = np.mean(rn_run_times), np.std(rn_run_times) - - -def plot(means, stds, labels, fig_name): - fig, ax = plt.subplots() - ax.bar(np.arange(len(means)), means, yerr=stds, - align='center', alpha=0.5, ecolor='red', capsize=10, width=0.6) - ax.set_ylabel('ResNet50 Execution Time (Second)') - ax.set_xticks(np.arange(len(means))) - ax.set_xticklabels(labels) - ax.yaxis.grid(True) - plt.tight_layout() - plt.savefig(fig_name) - plt.close(fig) - - -plot([mp_mean, rn_mean], - [mp_std, rn_std], - ['Model Parallel', 'Single GPU'], - 'mp_vs_rn.png') - - -###################################################################### -# -# .. figure:: /_static/img/model-parallel-images/mp_vs_rn.png -# :alt: -# -# The result shows that the execution time of model parallel implementation is -# ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. So we -# can conclude there is roughly 7% overhead in copying tensors back and forth -# across the GPUs. There are rooms for improvements, as we know one of the two -# GPUs is sitting idle throughout the execution. One option is to further -# divide each batch into a pipeline of splits, such that when one split reaches -# the second sub-network, the following split can be fed into the first -# sub-network. In this way, two consecutive splits can run concurrently on two -# GPUs. - -###################################################################### -# Speed Up by Pipelining Inputs -# ----------------------------- -# -# In the following experiments, we further divide each 120-image batch into -# 20-image splits. As PyTorch launches CUDA operations asynchronously, the -# implementation does not need to spawn multiple threads to achieve -# concurrency. - - -class PipelineParallelResNet50(ModelParallelResNet50): - def __init__(self, split_size=20, *args, **kwargs): - super(PipelineParallelResNet50, self).__init__(*args, **kwargs) - self.split_size = split_size - - def forward(self, x): - splits = iter(x.split(self.split_size, dim=0)) - s_next = next(splits) - s_prev = self.seq1(s_next).to('cuda:1') - ret = [] - - for s_next in splits: - # A. ``s_prev`` runs on ``cuda:1`` - s_prev = self.seq2(s_prev) - ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - - # B. 
``s_next`` runs on ``cuda:0``, which can run concurrently with A - s_prev = self.seq1(s_next).to('cuda:1') - - s_prev = self.seq2(s_prev) - ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - - return torch.cat(ret) - - -setup = "model = PipelineParallelResNet50()" -pp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -pp_mean, pp_std = np.mean(pp_run_times), np.std(pp_run_times) - -plot([mp_mean, rn_mean, pp_mean], - [mp_std, rn_std, pp_std], - ['Model Parallel', 'Single GPU', 'Pipelining Model Parallel'], - 'mp_vs_rn_vs_pp.png') - -###################################################################### -# Please note, device-to-device tensor copy operations are synchronized on -# current streams on the source and the destination devices. If you create -# multiple streams, you have to make sure that copy operations are properly -# synchronized. Writing the source tensor or reading/writing the destination -# tensor before finishing the copy operation can lead to undefined behavior. -# The above implementation only uses default streams on both source and -# destination devices, hence it is not necessary to enforce additional -# synchronizations. -# -# .. figure:: /_static/img/model-parallel-images/mp_vs_rn_vs_pp.png -# :alt: -# -# The experiment result shows that, pipelining inputs to model parallel -# ResNet50 speeds up the training process by roughly ``3.75/2.51-1=49%``. It is -# still quite far away from the ideal 100% speedup. As we have introduced a new -# parameter ``split_sizes`` in our pipeline parallel implementation, it is -# unclear how the new parameter affects the overall training time. Intuitively -# speaking, using small ``split_size`` leads to many tiny CUDA kernel launch, -# while using large ``split_size`` results to relatively long idle times during -# the first and last splits. Neither are optimal. There might be an optimal -# ``split_size`` configuration for this specific experiment. Let us try to find -# it by running experiments using several different ``split_size`` values. - - -means = [] -stds = [] -split_sizes = [1, 3, 5, 8, 10, 12, 20, 40, 60] - -for split_size in split_sizes: - setup = "model = PipelineParallelResNet50(split_size=%d)" % split_size - pp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) - means.append(np.mean(pp_run_times)) - stds.append(np.std(pp_run_times)) - -fig, ax = plt.subplots() -ax.plot(split_sizes, means) -ax.errorbar(split_sizes, means, yerr=stds, ecolor='red', fmt='ro') -ax.set_ylabel('ResNet50 Execution Time (Second)') -ax.set_xlabel('Pipeline Split Size') -ax.set_xticks(split_sizes) -ax.yaxis.grid(True) -plt.tight_layout() -plt.savefig("split_size_tradeoff.png") -plt.close(fig) - -###################################################################### -# -# .. figure:: /_static/img/model-parallel-images/split_size_tradeoff.png -# :alt: -# -# The result shows that setting ``split_size`` to 12 achieves the fastest -# training speed, which leads to ``3.75/2.43-1=54%`` speedup. There are -# still opportunities to further accelerate the training process. For example, -# all operations on ``cuda:0`` is placed on its default stream. It means that -# computations on the next split cannot overlap with the copy operation of the -# ``prev`` split. However, as ``prev`` and next splits are different tensors, there is -# no problem to overlap one's computation with the other one's copy. 
The -# implementation need to use multiple streams on both GPUs, and different -# sub-network structures require different stream management strategies. As no -# general multi-stream solution works for all model parallel use cases, we will -# not discuss it in this tutorial. -# -# **Note:** -# -# This post shows several performance measurements. You might see different -# numbers when running the same code on your own machine, because the result -# depends on the underlying hardware and software. To get the best performance -# for your environment, a proper approach is to first generate the curve to -# figure out the best split size, and then use that split size to pipeline -# inputs. -# diff --git a/intermediate_source/neural_tangent_kernels.py b/intermediate_source/neural_tangent_kernels.py deleted file mode 100644 index 62a49794af..0000000000 --- a/intermediate_source/neural_tangent_kernels.py +++ /dev/null @@ -1,251 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Neural Tangent Kernels -====================== - -The neural tangent kernel (NTK) is a kernel that describes -`how a neural network evolves during training `_. -There has been a lot of research around it `in recent years `_. -This tutorial, inspired by the implementation of `NTKs in JAX `_ -(see `Fast Finite Width Neural Tangent Kernel `_ for details), -demonstrates how to easily compute this quantity using ``torch.func``, -composable function transforms for PyTorch. - -.. note:: - - This tutorial requires PyTorch 2.0.0 or later. - -Setup ------ - -First, some setup. Let's define a simple CNN that we wish to compute the NTK of. -""" - -import torch -import torch.nn as nn -from torch.func import functional_call, vmap, vjp, jvp, jacrev -device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu' - -class CNN(nn.Module): - def __init__(self): - super(CNN, self).__init__() - self.conv1 = nn.Conv2d(3, 32, (3, 3)) - self.conv2 = nn.Conv2d(32, 32, (3, 3)) - self.conv3 = nn.Conv2d(32, 32, (3, 3)) - self.fc = nn.Linear(21632, 10) - - def forward(self, x): - x = self.conv1(x) - x = x.relu() - x = self.conv2(x) - x = x.relu() - x = self.conv3(x) - x = x.flatten(1) - x = self.fc(x) - return x - -###################################################################### -# And let's generate some random data - -x_train = torch.randn(20, 3, 32, 32, device=device) -x_test = torch.randn(5, 3, 32, 32, device=device) - -###################################################################### -# Create a function version of the model -# -------------------------------------- -# -# ``torch.func`` transforms operate on functions. In particular, to compute the NTK, -# we will need a function that accepts the parameters of the model and a single -# input (as opposed to a batch of inputs!) and returns a single output. -# -# We'll use ``torch.func.functional_call``, which allows us to call an ``nn.Module`` -# using different parameters/buffers, to help accomplish the first step. -# -# Keep in mind that the model was originally written to accept a batch of input -# data points. In our CNN example, there are no inter-batch operations. That -# is, each data point in the batch is independent of other data points. With -# this assumption in mind, we can easily generate a function that evaluates the -# model on a single data point: - - -net = CNN().to(device) - -# Detaching the parameters because we won't be calling Tensor.backward(). 
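-# (Note: ``jacrev``, ``vjp``, and ``jvp`` below differentiate ``fnet_single`` with
-# respect to the ``params`` argument that is passed in, so detaching the parameters
-# from their original autograd graph is safe here.)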
-params = {k: v.detach() for k, v in net.named_parameters()} - -def fnet_single(params, x): - return functional_call(net, params, (x.unsqueeze(0),)).squeeze(0) - -###################################################################### -# Compute the NTK: method 1 (Jacobian contraction) -# ------------------------------------------------ -# We're ready to compute the empirical NTK. The empirical NTK for two data -# points :math:`x_1` and :math:`x_2` is defined as the matrix product between the Jacobian -# of the model evaluated at :math:`x_1` and the Jacobian of the model evaluated at -# :math:`x_2`: -# -# .. math:: -# -# J_{net}(x_1) J_{net}^T(x_2) -# -# In the batched case where :math:`x_1` is a batch of data points and :math:`x_2` is a -# batch of data points, then we want the matrix product between the Jacobians -# of all combinations of data points from :math:`x_1` and :math:`x_2`. -# -# The first method consists of doing just that - computing the two Jacobians, -# and contracting them. Here's how to compute the NTK in the batched case: - -def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2): - # Compute J(x1) - jac1 = vmap(jacrev(fnet_single), (None, 0))(params, x1) - jac1 = jac1.values() - jac1 = [j.flatten(2) for j in jac1] - - # Compute J(x2) - jac2 = vmap(jacrev(fnet_single), (None, 0))(params, x2) - jac2 = jac2.values() - jac2 = [j.flatten(2) for j in jac2] - - # Compute J(x1) @ J(x2).T - result = torch.stack([torch.einsum('Naf,Mbf->NMab', j1, j2) for j1, j2 in zip(jac1, jac2)]) - result = result.sum(0) - return result - -result = empirical_ntk_jacobian_contraction(fnet_single, params, x_train, x_test) -print(result.shape) - -###################################################################### -# In some cases, you may only want the diagonal or the trace of this quantity, -# especially if you know beforehand that the network architecture results in an -# NTK where the non-diagonal elements can be approximated by zero. It's easy to -# adjust the above function to do that: - -def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2, compute='full'): - # Compute J(x1) - jac1 = vmap(jacrev(fnet_single), (None, 0))(params, x1) - jac1 = jac1.values() - jac1 = [j.flatten(2) for j in jac1] - - # Compute J(x2) - jac2 = vmap(jacrev(fnet_single), (None, 0))(params, x2) - jac2 = jac2.values() - jac2 = [j.flatten(2) for j in jac2] - - # Compute J(x1) @ J(x2).T - einsum_expr = None - if compute == 'full': - einsum_expr = 'Naf,Mbf->NMab' - elif compute == 'trace': - einsum_expr = 'Naf,Maf->NM' - elif compute == 'diagonal': - einsum_expr = 'Naf,Maf->NMa' - else: - assert False - - result = torch.stack([torch.einsum(einsum_expr, j1, j2) for j1, j2 in zip(jac1, jac2)]) - result = result.sum(0) - return result - -result = empirical_ntk_jacobian_contraction(fnet_single, params, x_train, x_test, 'trace') -print(result.shape) - -###################################################################### -# The asymptotic time complexity of this method is :math:`N O [FP]` (time to -# compute the Jacobians) + :math:`N^2 O^2 P` (time to contract the Jacobians), -# where :math:`N` is the batch size of :math:`x_1` and :math:`x_2`, :math:`O` -# is the model's output size, :math:`P` is the total number of parameters, and -# :math:`[FP]` is the cost of a single forward pass through the model. See -# section 3.2 in -# `Fast Finite Width Neural Tangent Kernel `_ -# for details. 
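-#
-# As a quick illustration (a minimal sketch, assuming the ``x_train``/``x_test``
-# batches of size 20 and 5 and the 10-output CNN defined above), the ``compute``
-# modes only differ in the trailing output dimensions of the result:
-#
-# .. code-block:: python
-#
-#    ntk_full = empirical_ntk_jacobian_contraction(fnet_single, params, x_train, x_test, 'full')
-#    ntk_diag = empirical_ntk_jacobian_contraction(fnet_single, params, x_train, x_test, 'diagonal')
-#    print(ntk_full.shape)  # torch.Size([20, 5, 10, 10])
-#    print(ntk_diag.shape)  # torch.Size([20, 5, 10])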
-# -# Compute the NTK: method 2 (NTK-vector products) -# ----------------------------------------------- -# -# The next method we will discuss is a way to compute the NTK using NTK-vector -# products. -# -# This method reformulates NTK as a stack of NTK-vector products applied to -# columns of an identity matrix :math:`I_O` of size :math:`O\times O` -# (where :math:`O` is the output size of the model): -# -# .. math:: -# -# J_{net}(x_1) J_{net}^T(x_2) = J_{net}(x_1) J_{net}^T(x_2) I_{O} = \left[J_{net}(x_1) \left[J_{net}^T(x_2) e_o\right]\right]_{o=1}^{O}, -# -# where :math:`e_o\in \mathbb{R}^O` are column vectors of the identity matrix -# :math:`I_O`. -# -# - Let :math:`\textrm{vjp}_o = J_{net}^T(x_2) e_o`. We can use -# a vector-Jacobian product to compute this. -# - Now, consider :math:`J_{net}(x_1) \textrm{vjp}_o`. This is a -# Jacobian-vector product! -# - Finally, we can run the above computation in parallel over all -# columns :math:`e_o` of :math:`I_O` using ``vmap``. -# -# This suggests that we can use a combination of reverse-mode AD (to compute -# the vector-Jacobian product) and forward-mode AD (to compute the -# Jacobian-vector product) to compute the NTK. -# -# Let's code that up: - -def empirical_ntk_ntk_vps(func, params, x1, x2, compute='full'): - def get_ntk(x1, x2): - def func_x1(params): - return func(params, x1) - - def func_x2(params): - return func(params, x2) - - output, vjp_fn = vjp(func_x1, params) - - def get_ntk_slice(vec): - # This computes ``vec @ J(x2).T`` - # `vec` is some unit vector (a single slice of the Identity matrix) - vjps = vjp_fn(vec) - # This computes ``J(X1) @ vjps`` - _, jvps = jvp(func_x2, (params,), vjps) - return jvps - - # Here's our identity matrix - basis = torch.eye(output.numel(), dtype=output.dtype, device=output.device).view(output.numel(), -1) - return vmap(get_ntk_slice)(basis) - - # ``get_ntk(x1, x2)`` computes the NTK for a single data point x1, x2 - # Since the x1, x2 inputs to ``empirical_ntk_ntk_vps`` are batched, - # we actually wish to compute the NTK between every pair of data points - # between {x1} and {x2}. That's what the ``vmaps`` here do. - result = vmap(vmap(get_ntk, (None, 0)), (0, None))(x1, x2) - - if compute == 'full': - return result - if compute == 'trace': - return torch.einsum('NMKK->NM', result) - if compute == 'diagonal': - return torch.einsum('NMKK->NMK', result) - -# Disable TensorFloat-32 for convolutions on Ampere+ GPUs to sacrifice performance in favor of accuracy -with torch.backends.cudnn.flags(allow_tf32=False): - result_from_jacobian_contraction = empirical_ntk_jacobian_contraction(fnet_single, params, x_test, x_train) - result_from_ntk_vps = empirical_ntk_ntk_vps(fnet_single, params, x_test, x_train) - -assert torch.allclose(result_from_jacobian_contraction, result_from_ntk_vps, atol=1e-5) - -###################################################################### -# Our code for ``empirical_ntk_ntk_vps`` looks like a direct translation from -# the math above! This showcases the power of function transforms: good luck -# trying to write an efficient version of the above by only using -# ``torch.autograd.grad``. -# -# The asymptotic time complexity of this method is :math:`N^2 O [FP]`, where -# :math:`N` is the batch size of :math:`x_1` and :math:`x_2`, :math:`O` is the -# model's output size, and :math:`[FP]` is the cost of a single forward pass -# through the model. 
Hence this method performs more forward passes through the -# network than method 1, Jacobian contraction (:math:`N^2 O` instead of -# :math:`N O`), but avoids the contraction cost altogether (no :math:`N^2 O^2 P` -# term, where :math:`P` is the total number of model's parameters). Therefore, -# this method is preferable when :math:`O P` is large relative to :math:`[FP]`, -# such as fully-connected (not convolutional) models with many outputs :math:`O`. -# Memory-wise, both methods should be comparable. See section 3.3 in -# `Fast Finite Width Neural Tangent Kernel `_ -# for details. diff --git a/intermediate_source/nvfuser_intro_tutorial.rst b/intermediate_source/nvfuser_intro_tutorial.rst deleted file mode 100644 index 965500d71e..0000000000 --- a/intermediate_source/nvfuser_intro_tutorial.rst +++ /dev/null @@ -1,8 +0,0 @@ -Getting Started - Accelerate Your Scripts with nvFuser -====================================================== - -This tutorial has been deprecated. Redirecting to homepage in 3 seconds... - -.. raw:: html - - diff --git a/intermediate_source/optimizer_step_in_backward_tutorial.py b/intermediate_source/optimizer_step_in_backward_tutorial.py deleted file mode 100644 index fd72f733c5..0000000000 --- a/intermediate_source/optimizer_step_in_backward_tutorial.py +++ /dev/null @@ -1,268 +0,0 @@ -""" - -How to save memory by fusing the optimizer step into the backward pass -====================================================================== - -Hello there! This tutorial aims to showcase one way of reducing the -memory footprint of a training loop by reducing the memory taken by -the *gradients*. Say you have a model and you're interested in ways to -optimize memory to avoid ``Out of Memory`` (OOM) errors or simply to ooze -more out of your GPU. Well, you _might_ be in luck (if gradients take up -a portion of your memory and you do not need to do gradient accumulation). -We will explore the following: - -1. What takes up memory during your training or finetuning loop, -2. How to capture and visualize memory snapshots to determine the bottleneck, -3. The new ``Tensor.register_post_accumulate_grad_hook(hook)`` API, and finally, -4. How everything fits together in 10 lines to achieve memory savings. - -To run this tutorial, you will need: - -* PyTorch 2.1.0 or newer with ``torchvision`` -* 1 CUDA GPU if you'd like to run the memory visualizations locally. - Otherwise, this technique would benefit similarly on any device. - -Let us start by importing the required modules and models. We will use a -vision transformer model from torchvision, but feel free to substitute -with your own model. We will also use ``torch.optim.Adam`` as our optimizer, -but, again, feel free to substitute with your own optimizer. - -""" - -import torch -from torchvision import models -from pickle import dump - -model = models.vit_l_16(weights='DEFAULT').cuda() -optimizer = torch.optim.Adam(model.parameters()) - -############################################################################### -# Now let's define our typical training loop. You should use real images when -# training, but for the purposes of this tutorial, we are passing in fake -# inputs and not worrying about loading any actual data. 
- -IMAGE_SIZE = 224 - -def train(model, optimizer): - # create our fake image input: tensor shape is batch_size, channels, height, width - fake_image = torch.rand(1, 3, IMAGE_SIZE, IMAGE_SIZE).cuda() - - # call our forward and backward - loss = model.forward(fake_image) - loss.sum().backward() - - # optimizer update - optimizer.step() - optimizer.zero_grad() - -############################################################################### -# Memory usage during training -# """""""""""""""""""""""""""" -# We are about to look at some memory snapshots, so we should be prepared to -# analyze them properly. Typically, training memory consists of: -# -# * Model parameters (size P) -# * Activations that are saved for the backward pass (size A) -# * Gradients, which are the same size as the model parameters, so size G = P. -# * Optimizer state, which is proportional to the size of the parameters. In -# this case, the state for Adam requires 2x the model parameters, so size O = 2P. -# * Intermediate tensors, which are allocated throughout the compute. We will -# not worry about them for now as they are usually small and ephemeral. -# -# Capturing and visualizing memory snapshots -# """""""""""""""""""""""""""""""""""""""""" -# Let's get us a memory snapshot! As your code runs, consider what you may expect -# the CUDA memory timeline to look like. - -# tell CUDA to start recording memory allocations -torch.cuda.memory._record_memory_history(enabled='all') - -# train 3 steps -for _ in range(3): - train(model, optimizer) - -# save a snapshot of the memory allocations -s = torch.cuda.memory._snapshot() -with open(f"snapshot.pickle", "wb") as f: - dump(s, f) - -# tell CUDA to stop recording memory allocations now -torch.cuda.memory._record_memory_history(enabled=None) - -############################################################################### -# Now open up the snapshot in the CUDA Memory Visualizer at -# https://pytorch.org/memory_viz by dragging and dropping the -# ``snapshot.pickle`` file. Does the memory timeline match your expectations? -# -# .. figure:: /_static/img/optim_step_in_bwd/snapshot.jpg -# :alt: snapshot.png loaded into CUDA Memory Visualizer -# -# The model parameters have already been loaded in memory before the training -# step, so we see a chunk of memory devoted to the weights right off the bat. -# As we start our forward pass, memory is allocated gradually for the activations, -# or the tensors we are saving to be able to compute gradients in the backward pass. -# Once we start the backward pass, the activations are gradually freed while memory -# of the gradients starts building up. -# -# Lastly, as the optimizer kicks in, its state will be lazily initialized, so we -# should see the optimizer state memory gradually increase during the optimizer -# step of the first training loop only. In future loops, the optimizer memory -# will remain and be updated in-place. The memory for the gradients is then -# freed accordingly at the end of every training loop when ``zero_grad`` is called. -# -# Where is the memory bottleneck in this training loop? Or, in other words, -# where is the peak memory? -# -# The peak memory usage is during the optimizer step! Note the memory then -# consists of ~1.2GB of parameters, ~1.2GB of gradients, and ~2.4GB=2*1.2GB of -# the optimizer state as expected. The last ~1.2GB comes from Adam optimizer -# requiring memory for intermediates, totaling to ~6GB of peak memory. 
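-#
-# As a rough sanity check of these numbers (a minimal sketch, assuming the fp32
-# ``vit_l_16`` model created above, which has roughly 304M parameters):
-#
-# .. code-block:: python
-#
-#    n_params = sum(p.numel() for p in model.parameters())  # ~304M for vit_l_16
-#    gb = n_params * 4 / 1024**3  # 4 bytes per fp32 element
-#    print(f"params ~{gb:.1f} GB, grads ~{gb:.1f} GB, Adam exp_avg + exp_avg_sq ~{2 * gb:.1f} GB")
-#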
-# Technically, you can remove the need for the last 1.2GB for optimizer -# intermediates if you set ``Adam(model.parameters(), foreach=False)`` which -# would trade off runtime for memory. If switching off the ``foreach`` runtime -# optimization is sufficient in memory savings for you, nice, but please -# read on if you're curious how this tutorial can help you do better! -# With the technique we will soon introduce, we will reduce peak memory by -# removing the need for the ~1.2GB of **gradients memory** as well as **optimizer -# intermediates memory**. Now, what would you expect the new peak memory to be? -# The answer will be revealed in the `next` snapshot. -# -# DISCLAIMER: This technique is **not** for all -# """"""""""""""""""""""""""""""""""""""""""""" -# Before we get too excited, we have to consider whether this technique is applicable -# for `your` use case. This is NOT a silver bullet! The technique of fusing the -# optimizer step into the backward only targets reducing *gradient* memory (and as a side effect also optimizer intermediates -# memory). Thus, the more sizable the memory taken up by the gradients, the more -# tantamount the memory reduction. In our example above, the gradients eat up 20% -# of the memory pie, which is quite sizable! -# -# This may not be the case for you, for example, if your weights are already tiny, -# (say, due to applying LoRa,) then the gradients do not take much space in your -# training loop and the wins are way less exciting. In that case, you should -# first try other techniques like activations checkpointing, distributed -# training, quantization, or reducing the batch size. Then, when the gradients -# are part of the bottleneck again, come back to this tutorial! -# -# Still here? Cool, let's introduce our new ``register_post_accumulate_grad_hook(hook)`` -# API on Tensor. -# -# ``Tensor.register_post_accumulate_grad_hook(hook)`` API and our technique -# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -# Our technique relies on not having to save the gradients during ``backward()``. Instead, -# once a gradient has been accumulated, we will immediately apply the optimizer to -# the corresponding parameter and drop that gradient entirely! This removes the need -# for holding onto a big buffer of gradients until the optimizer step. -# -# So how can we unlock the behavior of applying the optimizer more eagerly? In our 2.1 -# release, we've added a new API :func:`torch.Tensor.register_post_accumulate_grad_hook` -# that would allow us to add a hook onto a Tensor once its ``.grad`` field has been -# accumulated. We will encapsulate the optimizer step into this hook. How? -# -# How everything fits together in 10 lines -# """""""""""""""""""""""""""""""""""""""" -# Remember our model and optimizer setup from the beginning? I'll leave them commented -# out below so we don't spend resources rerunning the code. -# -# .. code-block:: python -# -# model = models.vit_l_16(weights='DEFAULT').cuda() -# optimizer = torch.optim.Adam(model.parameters()) - -# Instead of having just *one* optimizer, we will have a ``dict`` of optimizers -# for every parameter so we could reference them in our hook. 
-optimizer_dict = {p: torch.optim.Adam([p], foreach=False) for p in model.parameters()} - -# Define our hook, which will call the optimizer ``step()`` and ``zero_grad()`` -def optimizer_hook(parameter) -> None: - optimizer_dict[parameter].step() - optimizer_dict[parameter].zero_grad() - -# Register the hook onto every parameter -for p in model.parameters(): - p.register_post_accumulate_grad_hook(optimizer_hook) - -# Now remember our previous ``train()`` function? Since the optimizer has been -# fused into the backward, we can remove the optimizer step and zero_grad calls. -def train(model): - # create our fake image input: tensor shape is batch_size, channels, height, width - fake_image = torch.rand(1, 3, IMAGE_SIZE, IMAGE_SIZE).cuda() - - # call our forward and backward - loss = model.forward(fake_image) - loss.sum().backward() - - # optimizer update --> no longer needed! - # optimizer.step() - # optimizer.zero_grad() - -######################################################################## -# That took about 10 lines of changes in our sample model, which is neat. -# However, for real models, it could be a fairly intrusive change to switch -# out the optimizer for an optimizer dictionary, especially for those who use -# ``LRScheduler``s or manipulate optimizer configuration throughout the -# training epochs. Working out this API with those changes will be more -# involved and will likely require moving more configuration into global -# state but should not be impossible. That said, a next step for PyTorch -# is to make this API easier to adopt with LRSchedulers and other features -# you are already used to. -# -# But let me get back to convincing you that this technique is worth it. -# We will consult our friend, the memory snapshot. - -# delete optimizer memory from before to get a clean slate for the next -# memory snapshot -del optimizer - -# tell CUDA to start recording memory allocations -torch.cuda.memory._record_memory_history(enabled='all') - -# train 3 steps. note that we no longer pass the optimizer into train() -for _ in range(3): - train(model) - -# save a snapshot of the memory allocations -s = torch.cuda.memory._snapshot() -with open(f"snapshot-opt-in-bwd.pickle", "wb") as f: - dump(s, f) - -# tell CUDA to stop recording memory allocations now -torch.cuda.memory._record_memory_history(enabled=None) - -############################################################################### -# Yes, take some time to drag your snapshot into the CUDA Memory Visualizer. -# -# .. figure:: /_static/img/optim_step_in_bwd/snapshot_opt_in_bwd.jpg -# :alt: snapshot.png loaded into CUDA Memory Visualizer -# -# Several major observations: -# 1. There is no more optimizer step! Right...we fused that into the backward. -# 2. Likewise, the backward drags longer and there are more random allocations -# for intermediates. This is expected, as the optimizer step requires -# intermediates. -# 3. Most importantly! The peak memory is lower! It is now ~4GB (which I -# hope maps closely to your earlier expectation). -# -# Note that there is no longer any big chunk of memory allocated for the gradients -# compared to before, accounting for ~1.2GB of memory savings. Instead, we've freed -# each gradient very quickly after they've been computed by moving the optimizer -# step as far ahead as we can. Woohoo! By the way, the other ~1.2GB of memory savings -# comes from breaking apart the optimizer into per-parameter optimizers, so the -# intermediates have proportionally shrunk. 
This detail is `less important` than -# the gradient memory savings, as you can get optimizer intermediates savings -# from just turning ``foreach=False`` without this technique. -# -# You may be correctly wondering: if we saved 2.4GB of memory, why is the peak memory -# NOT 6GB - 2.4GB = 3.6GB? Well, the peak has moved! The peak is now near the start -# of the backward step, when we still have activations in memory, where before, the peak -# was during the optimizer step when the activations had been freed. The ~0.4GB difference -# accounting for ~4.0GB - ~3.6GB is thus due to the activations memory. One can then -# imagine that this technique can be coupled with activations checkpointing for more -# memory wins. -# -# Conclusion -# """""""""" -# In this tutorial, we learned about the memory saving technique of -# fusing the optimizer into the backward step through the new -# ``Tensor.register_post_accumulate_grad_hook()`` API and *when* to apply this -# technique (when gradients memory is significant). Along the way, we also learned -# about memory snapshots, which are generally useful in memory optimization. diff --git a/intermediate_source/parametrizations.py b/intermediate_source/parametrizations.py deleted file mode 100644 index 59cff1d241..0000000000 --- a/intermediate_source/parametrizations.py +++ /dev/null @@ -1,393 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Parametrizations Tutorial -========================= -**Author**: `Mario Lezcano `_ - -Regularizing deep-learning models is a surprisingly challenging task. -Classical techniques such as penalty methods often fall short when applied -on deep models due to the complexity of the function being optimized. -This is particularly problematic when working with ill-conditioned models. -Examples of these are RNNs trained on long sequences and GANs. A number -of techniques have been proposed in recent years to regularize these -models and improve their convergence. On recurrent models, it has been -proposed to control the singular values of the recurrent kernel for the -RNN to be well-conditioned. This can be achieved, for example, by making -the recurrent kernel `orthogonal `_. -Another way to regularize recurrent models is via -"`weight normalization `_". -This approach proposes to decouple the learning of the parameters from the -learning of their norms. To do so, the parameter is divided by its -`Frobenius norm `_ -and a separate parameter encoding its norm is learned. -A similar regularization was proposed for GANs under the name of -"`spectral normalization `_". This method -controls the Lipschitz constant of the network by dividing its parameters by -their `spectral norm `_, -rather than their Frobenius norm. - -All these methods have a common pattern: they all transform a parameter -in an appropriate way before using it. In the first case, they make it orthogonal by -using a function that maps matrices to orthogonal matrices. In the case of weight -and spectral normalization, they divide the original parameter by its norm. - -More generally, all these examples use a function to put extra structure on the parameters. -In other words, they use a function to constrain the parameters. - -In this tutorial, you will learn how to implement and use this pattern to put -constraints on your model. Doing so is as easy as writing your own ``nn.Module``. 
- -Requirements: ``torch>=1.9.0`` - -Implementing parametrizations by hand -------------------------------------- - -Assume that we want to have a square linear layer with symmetric weights, that is, -with weights ``X`` such that ``X = Xᵀ``. One way to do so is -to copy the upper-triangular part of the matrix into its lower-triangular part -""" - -import torch -import torch.nn as nn -import torch.nn.utils.parametrize as parametrize - -def symmetric(X): - return X.triu() + X.triu(1).transpose(-1, -2) - -X = torch.rand(3, 3) -A = symmetric(X) -assert torch.allclose(A, A.T) # A is symmetric -print(A) # Quick visual check - -############################################################################### -# We can then use this idea to implement a linear layer with symmetric weights -class LinearSymmetric(nn.Module): - def __init__(self, n_features): - super().__init__() - self.weight = nn.Parameter(torch.rand(n_features, n_features)) - - def forward(self, x): - A = symmetric(self.weight) - return x @ A - -############################################################################### -# The layer can be then used as a regular linear layer -layer = LinearSymmetric(3) -out = layer(torch.rand(8, 3)) - -############################################################################### -# This implementation, although correct and self-contained, presents a number of problems: -# -# 1) It reimplements the layer. We had to implement the linear layer as ``x @ A``. This is -# not very problematic for a linear layer, but imagine having to reimplement a CNN or a -# Transformer... -# 2) It does not separate the layer and the parametrization. If the parametrization were -# more difficult, we would have to rewrite its code for each layer that we want to use it -# in. -# 3) It recomputes the parametrization every time we use the layer. If we use the layer -# several times during the forward pass, (imagine the recurrent kernel of an RNN), it -# would compute the same ``A`` every time that the layer is called. -# -# Introduction to parametrizations -# -------------------------------- -# -# Parametrizations can solve all these problems as well as others. -# -# Let's start by reimplementing the code above using ``torch.nn.utils.parametrize``. -# The only thing that we have to do is to write the parametrization as a regular ``nn.Module`` -class Symmetric(nn.Module): - def forward(self, X): - return X.triu() + X.triu(1).transpose(-1, -2) - -############################################################################### -# This is all we need to do. Once we have this, we can transform any regular layer into a -# symmetric layer by doing -layer = nn.Linear(3, 3) -parametrize.register_parametrization(layer, "weight", Symmetric()) - -############################################################################### -# Now, the matrix of the linear layer is symmetric -A = layer.weight -assert torch.allclose(A, A.T) # A is symmetric -print(A) # Quick visual check - -############################################################################### -# We can do the same thing with any other layer. For example, we can create a CNN with -# `skew-symmetric `_ kernels. 
-# We use a similar parametrization, copying the upper-triangular part with signs -# reversed into the lower-triangular part -class Skew(nn.Module): - def forward(self, X): - A = X.triu(1) - return A - A.transpose(-1, -2) - - -cnn = nn.Conv2d(in_channels=5, out_channels=8, kernel_size=3) -parametrize.register_parametrization(cnn, "weight", Skew()) -# Print a few kernels -print(cnn.weight[0, 1]) -print(cnn.weight[2, 2]) - -############################################################################### -# Inspecting a parametrized module -# -------------------------------- -# -# When a module is parametrized, we find that the module has changed in three ways: -# -# 1) ``model.weight`` is now a property -# -# 2) It has a new ``module.parametrizations`` attribute -# -# 3) The unparametrized weight has been moved to ``module.parametrizations.weight.original`` -# -# | -# After parametrizing ``weight``, ``layer.weight`` is turned into a -# `Python property `_. -# This property computes ``parametrization(weight)`` every time we request ``layer.weight`` -# just as we did in our implementation of ``LinearSymmetric`` above. -# -# Registered parametrizations are stored under a ``parametrizations`` attribute within the module. -layer = nn.Linear(3, 3) -print(f"Unparametrized:\n{layer}") -parametrize.register_parametrization(layer, "weight", Symmetric()) -print(f"\nParametrized:\n{layer}") - -############################################################################### -# This ``parametrizations`` attribute is an ``nn.ModuleDict``, and it can be accessed as such -print(layer.parametrizations) -print(layer.parametrizations.weight) - -############################################################################### -# Each element of this ``nn.ModuleDict`` is a ``ParametrizationList``, which behaves like an -# ``nn.Sequential``. This list will allow us to concatenate parametrizations on one weight. -# Since this is a list, we can access the parametrizations by indexing it. Here's -# where our ``Symmetric`` parametrization sits -print(layer.parametrizations.weight[0]) - -############################################################################### -# The other thing that we notice is that, if we print the parameters, we see that the -# parameter ``weight`` has been moved -print(dict(layer.named_parameters())) - -############################################################################### -# It now sits under ``layer.parametrizations.weight.original`` -print(layer.parametrizations.weight.original) - -############################################################################### -# Besides these three small differences, the parametrization is doing exactly the same -# as our manual implementation -symmetric = Symmetric() -weight_orig = layer.parametrizations.weight.original -print(torch.dist(layer.weight, symmetric(weight_orig))) - -############################################################################### -# Parametrizations are first-class citizens -# ----------------------------------------- -# -# Since ``layer.parametrizations`` is an ``nn.ModuleDict``, it means that the parametrizations -# are properly registered as submodules of the original module. As such, the same rules -# for registering parameters in a module apply to registering a parametrization. -# For example, if a parametrization has parameters, these will be moved from CPU -# to CUDA when calling ``model = model.cuda()``.
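###############################################################################
# As a minimal sketch of this point (the ``LearnableScale`` module below is a
# hypothetical example, not part of ``torch.nn``), a parametrization may own
# learnable parameters of its own; these are registered under the parametrized
# layer and travel with it when the module is moved or cast:

import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize

class LearnableScale(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        # A parameter owned by the parametrization itself
        self.scale = nn.Parameter(torch.ones(n_features))

    def forward(self, X):
        # Rescale each row of the weight; the shape of X is preserved
        return self.scale[:, None] * X

layer = nn.Linear(3, 3)
parametrize.register_parametrization(layer, "weight", LearnableScale(3))
# The parametrization's parameter appears among the layer's parameters, so it is
# optimized, saved, and moved (for example, by ``layer.cuda()``) with the module
print([name for name, _ in layer.named_parameters()])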
-# -# Caching the value of a parametrization -# -------------------------------------- -# -# Parametrizations come with an inbuilt caching system via the context manager -# ``parametrize.cached()`` -class NoisyParametrization(nn.Module): - def forward(self, X): - print("Computing the Parametrization") - return X - -layer = nn.Linear(4, 4) -parametrize.register_parametrization(layer, "weight", NoisyParametrization()) -print("Here, layer.weight is recomputed every time we call it") -foo = layer.weight + layer.weight.T -bar = layer.weight.sum() -with parametrize.cached(): - print("Here, it is computed just the first time layer.weight is called") - foo = layer.weight + layer.weight.T - bar = layer.weight.sum() - -############################################################################### -# Concatenating parametrizations -# ------------------------------ -# -# Concatenating two parametrizations is as easy as registering them on the same tensor. -# We may use this to create more complex parametrizations from simpler ones. For example, the -# `Cayley map `_ -# maps the skew-symmetric matrices to the orthogonal matrices of positive determinant. We can -# concatenate ``Skew`` and a parametrization that implements the Cayley map to get a layer with -# orthogonal weights -class CayleyMap(nn.Module): - def __init__(self, n): - super().__init__() - self.register_buffer("Id", torch.eye(n)) - - def forward(self, X): - # (I + X)(I - X)^{-1} - return torch.linalg.solve(self.Id - X, self.Id + X) - -layer = nn.Linear(3, 3) -parametrize.register_parametrization(layer, "weight", Skew()) -parametrize.register_parametrization(layer, "weight", CayleyMap(3)) -X = layer.weight -print(torch.dist(X.T @ X, torch.eye(3))) # X is orthogonal - -############################################################################### -# This may also be used to prune a parametrized module, or to reuse parametrizations. For example, -# the matrix exponential maps the symmetric matrices to the Symmetric Positive Definite (SPD) matrices -# But the matrix exponential also maps the skew-symmetric matrices to the orthogonal matrices. -# Using these two facts, we may reuse the parametrizations before to our advantage -class MatrixExponential(nn.Module): - def forward(self, X): - return torch.matrix_exp(X) - -layer_orthogonal = nn.Linear(3, 3) -parametrize.register_parametrization(layer_orthogonal, "weight", Skew()) -parametrize.register_parametrization(layer_orthogonal, "weight", MatrixExponential()) -X = layer_orthogonal.weight -print(torch.dist(X.T @ X, torch.eye(3))) # X is orthogonal - -layer_spd = nn.Linear(3, 3) -parametrize.register_parametrization(layer_spd, "weight", Symmetric()) -parametrize.register_parametrization(layer_spd, "weight", MatrixExponential()) -X = layer_spd.weight -print(torch.dist(X, X.T)) # X is symmetric -print((torch.linalg.eigvalsh(X) > 0.).all()) # X is positive definite - -############################################################################### -# Initializing parametrizations -# ----------------------------- -# -# Parametrizations come with a mechanism to initialize them. If we implement a method -# ``right_inverse`` with signature -# -# .. code-block:: python -# -# def right_inverse(self, X: Tensor) -> Tensor -# -# it will be used when assigning to the parametrized tensor. 
-# -# Let's upgrade our implementation of the ``Skew`` class to support this -class Skew(nn.Module): - def forward(self, X): - A = X.triu(1) - return A - A.transpose(-1, -2) - - def right_inverse(self, A): - # We assume that A is skew-symmetric - # We take the upper-triangular elements, as these are those used in the forward - return A.triu(1) - -############################################################################### -# We may now initialize a layer that is parametrized with ``Skew`` -layer = nn.Linear(3, 3) -parametrize.register_parametrization(layer, "weight", Skew()) -X = torch.rand(3, 3) -X = X - X.T # X is now skew-symmetric -layer.weight = X # Initialize layer.weight to be X -print(torch.dist(layer.weight, X)) # layer.weight == X - -############################################################################### -# This ``right_inverse`` works as expected when we concatenate parametrizations. -# To see this, let's upgrade the Cayley parametrization to also support being initialized -class CayleyMap(nn.Module): - def __init__(self, n): - super().__init__() - self.register_buffer("Id", torch.eye(n)) - - def forward(self, X): - # Assume X skew-symmetric - # (I + X)(I - X)^{-1} - return torch.linalg.solve(self.Id - X, self.Id + X) - - def right_inverse(self, A): - # Assume A orthogonal - # See https://en.wikipedia.org/wiki/Cayley_transform#Matrix_map - # (A - I)(A + I)^{-1} - return torch.linalg.solve(A + self.Id, self.Id - A) - -layer_orthogonal = nn.Linear(3, 3) -parametrize.register_parametrization(layer_orthogonal, "weight", Skew()) -parametrize.register_parametrization(layer_orthogonal, "weight", CayleyMap(3)) -# Sample an orthogonal matrix with positive determinant -X = torch.empty(3, 3) -nn.init.orthogonal_(X) -if X.det() < 0.: - X[0].neg_() -layer_orthogonal.weight = X -print(torch.dist(layer_orthogonal.weight, X)) # layer_orthogonal.weight == X - -############################################################################### -# This initialization step can be written more succinctly as -layer_orthogonal.weight = nn.init.orthogonal_(layer_orthogonal.weight) - -############################################################################### -# The name of this method comes from the fact that we would often expect -# that ``forward(right_inverse(X)) == X``. This is a direct way of rewriting that -# the forward after the initialization with value ``X`` should return the value ``X``. -# This constraint is not strongly enforced in practice. In fact, at times, it might be of -# interest to relax this relation. For example, consider the following implementation -# of a randomized pruning method: -class PruningParametrization(nn.Module): - def __init__(self, X, p_drop=0.2): - super().__init__() - # sample zeros with probability p_drop - mask = torch.full_like(X, 1.0 - p_drop) - self.mask = torch.bernoulli(mask) - - def forward(self, X): - return X * self.mask - - def right_inverse(self, A): - return A - -############################################################################### -# In this case, it is not true that for every matrix A ``forward(right_inverse(A)) == A``. -# This is only true when the matrix ``A`` has zeros in the same positions as the mask. 
-# Even then, if we assign a tensor to a pruned parameter, it will come as no surprise -# that the tensor will, in fact, be pruned -layer = nn.Linear(3, 4) -X = torch.rand_like(layer.weight) -print(f"Initialization matrix:\n{X}") -parametrize.register_parametrization(layer, "weight", PruningParametrization(layer.weight)) -layer.weight = X -print(f"\nInitialized weight:\n{layer.weight}") - -############################################################################### -# Removing parametrizations -# ------------------------- -# -# We may remove all the parametrizations from a parameter or a buffer in a module -# by using ``parametrize.remove_parametrizations()`` -layer = nn.Linear(3, 3) -print("Before:") -print(layer) -print(layer.weight) -parametrize.register_parametrization(layer, "weight", Skew()) -print("\nParametrized:") -print(layer) -print(layer.weight) -parametrize.remove_parametrizations(layer, "weight") -print("\nAfter. Weight has skew-symmetric values but it is unconstrained:") -print(layer) -print(layer.weight) - -############################################################################### -# When removing a parametrization, we may choose to leave the original parameter (i.e. that in -# ``layer.parametrizations.weight.original``) rather than its parametrized version by setting -# the flag ``leave_parametrized=False`` -layer = nn.Linear(3, 3) -print("Before:") -print(layer) -print(layer.weight) -parametrize.register_parametrization(layer, "weight", Skew()) -print("\nParametrized:") -print(layer) -print(layer.weight) -parametrize.remove_parametrizations(layer, "weight", leave_parametrized=False) -print("\nAfter. Same as Before:") -print(layer) -print(layer.weight) diff --git a/intermediate_source/per_sample_grads.py b/intermediate_source/per_sample_grads.py deleted file mode 100644 index ece80d3f94..0000000000 --- a/intermediate_source/per_sample_grads.py +++ /dev/null @@ -1,225 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Per-sample-gradients -==================== - -What is it? ------------ - -Per-sample-gradient computation is computing the gradient for each and every -sample in a batch of data. It is a useful quantity in differential privacy, -meta-learning, and optimization research. - -.. note:: - - This tutorial requires PyTorch 2.0.0 or later. - -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -torch.manual_seed(0) - -# Here's a simple CNN and loss function: - -class SimpleCNN(nn.Module): - def __init__(self): - super(SimpleCNN, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - -def loss_fn(predictions, targets): - return F.nll_loss(predictions, targets) - - -###################################################################### -# Let’s generate a batch of dummy data and pretend that we’re working with an MNIST dataset. -# The dummy images are 28 by 28 and we use a minibatch of size 64.
- -device = 'cuda' - -num_models = 10 -batch_size = 64 -data = torch.randn(batch_size, 1, 28, 28, device=device) - -targets = torch.randint(10, (64,), device=device) - -###################################################################### -# In regular model training, one would forward the minibatch through the model, -# and then call .backward() to compute gradients. This would generate an -# 'average' gradient of the entire mini-batch: - -model = SimpleCNN().to(device=device) -predictions = model(data) # move the entire mini-batch through the model - -loss = loss_fn(predictions, targets) -loss.backward() # back propagate the 'average' gradient of this mini-batch - -###################################################################### -# In contrast to the above approach, per-sample-gradient computation is -# equivalent to: -# -# - for each individual sample of the data, perform a forward and a backward -# pass to get an individual (per-sample) gradient. - -def compute_grad(sample, target): - sample = sample.unsqueeze(0) # prepend batch dimension for processing - target = target.unsqueeze(0) - - prediction = model(sample) - loss = loss_fn(prediction, target) - - return torch.autograd.grad(loss, list(model.parameters())) - - -def compute_sample_grads(data, targets): - """ manually process each sample with per sample gradient """ - sample_grads = [compute_grad(data[i], targets[i]) for i in range(batch_size)] - sample_grads = zip(*sample_grads) - sample_grads = [torch.stack(shards) for shards in sample_grads] - return sample_grads - -per_sample_grads = compute_sample_grads(data, targets) - -###################################################################### -# ``sample_grads[0]`` is the per-sample-grad for model.conv1.weight. -# ``model.conv1.weight.shape`` is ``[32, 1, 3, 3]``; notice how there is one -# gradient, per sample, in the batch for a total of 64. - -print(per_sample_grads[0].shape) - -###################################################################### -# Per-sample-grads, *the efficient way*, using function transforms -# ---------------------------------------------------------------- -# We can compute per-sample-gradients efficiently by using function transforms. -# -# The ``torch.func`` function transform API transforms over functions. -# Our strategy is to define a function that computes the loss and then apply -# transforms to construct a function that computes per-sample-gradients. -# -# We'll use the ``torch.func.functional_call`` function to treat an ``nn.Module`` -# like a function. -# -# First, let’s extract the state from ``model`` into two dictionaries, -# parameters and buffers. We'll be detaching them because we won't use -# regular PyTorch autograd (e.g. Tensor.backward(), torch.autograd.grad). - -from torch.func import functional_call, vmap, grad - -params = {k: v.detach() for k, v in model.named_parameters()} -buffers = {k: v.detach() for k, v in model.named_buffers()} - -###################################################################### -# Next, let's define a function to compute the loss of the model given a -# single input rather than a batch of inputs. It is important that this -# function accepts the parameters, the input, and the target, because we will -# be transforming over them. -# -# Note - because the model was originally written to handle batches, we’ll -# use ``torch.unsqueeze`` to add a batch dimension. 
- -def compute_loss(params, buffers, sample, target): - batch = sample.unsqueeze(0) - targets = target.unsqueeze(0) - - predictions = functional_call(model, (params, buffers), (batch,)) - loss = loss_fn(predictions, targets) - return loss - -###################################################################### -# Now, let’s use the ``grad`` transform to create a new function that computes -# the gradient with respect to the first argument of ``compute_loss`` -# (i.e. the ``params``). - -ft_compute_grad = grad(compute_loss) - -###################################################################### -# The ``ft_compute_grad`` function computes the gradient for a single -# (sample, target) pair. We can use ``vmap`` to get it to compute the gradient -# over an entire batch of samples and targets. Note that -# ``in_dims=(None, None, 0, 0)`` because we wish to map ``ft_compute_grad`` over -# the 0th dimension of the data and targets, and use the same ``params`` and -# buffers for each. - -ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0)) - -###################################################################### -# Finally, let's use our transformed function to compute per-sample-gradients: - -ft_per_sample_grads = ft_compute_sample_grad(params, buffers, data, targets) - -###################################################################### -# We can double-check that the results using ``grad`` and ``vmap`` match the -# results of hand processing each one individually: - -for per_sample_grad, ft_per_sample_grad in zip(per_sample_grads, ft_per_sample_grads.values()): - assert torch.allclose(per_sample_grad, ft_per_sample_grad, atol=3e-3, rtol=1e-5) - -###################################################################### -# A quick note: there are limitations around what types of functions can be -# transformed by ``vmap``. The best functions to transform are ones that are pure -# functions: a function where the outputs are only determined by the inputs, -# and that have no side effects (e.g. mutation). ``vmap`` is unable to handle -# mutation of arbitrary Python data structures, but it is able to handle many -# in-place PyTorch operations. -# -# Performance comparison -# ---------------------- -# -# Curious about how the performance of ``vmap`` compares?
-# -# Currently the best results are obtained on newer GPU's such as the A100 -# (Ampere) where we've seen up to 25x speedups on this example, but here are -# some results on our build machines: - -def get_perf(first, first_descriptor, second, second_descriptor): - """takes torch.benchmark objects and compares delta of second vs first.""" - second_res = second.times[0] - first_res = first.times[0] - - gain = (first_res-second_res)/first_res - if gain < 0: gain *=-1 - final_gain = gain*100 - - print(f"Performance delta: {final_gain:.4f} percent improvement with {first_descriptor} ") - -from torch.utils.benchmark import Timer - -without_vmap = Timer(stmt="compute_sample_grads(data, targets)", globals=globals()) -with_vmap = Timer(stmt="ft_compute_sample_grad(params, buffers, data, targets)",globals=globals()) -no_vmap_timing = without_vmap.timeit(100) -with_vmap_timing = with_vmap.timeit(100) - -print(f'Per-sample-grads without vmap {no_vmap_timing}') -print(f'Per-sample-grads with vmap {with_vmap_timing}') - -get_perf(with_vmap_timing, "vmap", no_vmap_timing, "no vmap") - -###################################################################### -# There are other optimized solutions (like in https://github.com/pytorch/opacus) -# to computing per-sample-gradients in PyTorch that also perform better than -# the naive method. But it’s cool that composing ``vmap`` and ``grad`` give us a -# nice speedup. -# -# In general, vectorization with ``vmap`` should be faster than running a function -# in a for-loop and competitive with manual batching. There are some exceptions -# though, like if we haven’t implemented the ``vmap`` rule for a particular -# operation or if the underlying kernels weren’t optimized for older hardware -# (GPUs). If you see any of these cases, please let us know by opening an issue -# at on GitHub. diff --git a/intermediate_source/pinmem_nonblock.py b/intermediate_source/pinmem_nonblock.py deleted file mode 100644 index fa69507a0e..0000000000 --- a/intermediate_source/pinmem_nonblock.py +++ /dev/null @@ -1,728 +0,0 @@ -# -*- coding: utf-8 -*- -""" -A guide on good usage of ``non_blocking`` and ``pin_memory()`` in PyTorch -========================================================================= - -**Author**: `Vincent Moens `_ - -Introduction ------------- - -Transferring data from the CPU to the GPU is fundamental in many PyTorch applications. -It's crucial for users to understand the most effective tools and options available for moving data between devices. -This tutorial examines two key methods for device-to-device data transfer in PyTorch: -:meth:`~torch.Tensor.pin_memory` and :meth:`~torch.Tensor.to` with the ``non_blocking=True`` option. - -What you will learn -~~~~~~~~~~~~~~~~~~~ - -Optimizing the transfer of tensors from the CPU to the GPU can be achieved through asynchronous transfers and memory -pinning. However, there are important considerations: - -- Using ``tensor.pin_memory().to(device, non_blocking=True)`` can be up to twice as slow as a straightforward ``tensor.to(device)``. -- Generally, ``tensor.to(device, non_blocking=True)`` is an effective choice for enhancing transfer speed. -- While ``cpu_tensor.to("cuda", non_blocking=True).mean()`` executes correctly, attempting - ``cuda_tensor.to("cpu", non_blocking=True).mean()`` will result in erroneous outputs. - -Preamble -~~~~~~~~ - -The performance reported in this tutorial are conditioned on the system used to build the tutorial. 
-Although the conclusions are applicable across different systems, the specific observations may vary slightly -depending on the hardware available, especially on older hardware. -The primary objective of this tutorial is to offer a theoretical framework for understanding CPU to GPU data transfers. -However, any design decisions should be tailored to individual cases and guided by benchmarked throughput measurements, -as well as the specific requirements of the task at hand. - -""" - -import torch - -assert torch.cuda.is_available(), "A cuda device is required to run this tutorial" - - -###################################################################### -# -# This tutorial requires tensordict to be installed. If you don't have tensordict in your environment yet, install it -# by running the following command in a separate cell: -# -# .. code-block:: bash -# -# # Install tensordict with the following command -# !pip3 install tensordict -# -# We start by outlining the theory surrounding these concepts, and then move to concrete test examples of the features. -# -# -# Background -# ---------- -# -# .. _pinned_memory_background: -# -# Memory management basics -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. _pinned_memory_memory: -# -# When one creates a CPU tensor in PyTorch, the content of this tensor needs to be placed -# in memory. The memory we talk about here is a rather complex concept worth looking at carefully. -# We distinguish two types of memory that are handled by the Memory Management Unit: the RAM (for simplicity) -# and the swap space on disk (which may or may not be the hard drive). Together, the available space in disk and RAM (physical memory) -# make up the virtual memory, which is an abstraction of the total resources available. -# In short, the virtual memory makes it so that the available space is larger than what can be found on RAM in isolation -# and creates the illusion that the main memory is larger than it actually is. -# -# In normal circumstances, a regular CPU tensor is pageable which means that it is divided in blocks called pages that -# can live anywhere in the virtual memory (both in RAM or on disk). As mentioned earlier, this has the advantage that -# the memory seems larger than what the main memory actually is. -# -# Typically, when a program accesses a page that is not in RAM, a "page fault" occurs and the operating system (OS) then brings -# back this page into RAM ("swap in" or "page in"). -# In turn, the OS may have to swap out (or "page out") another page to make room for the new page. -# -# In contrast to pageable memory, a pinned (or page-locked or non-pageable) memory is a type of memory that cannot -# be swapped out to disk. -# It allows for faster and more predictable access times, but has the downside that it is more limited than the -# pageable memory (aka the main memory). -# -# .. figure:: /_static/img/pinmem/pinmem.png -# :alt: -# -# CUDA and (non-)pageable memory -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. _pinned_memory_cuda_pageable_memory: -# -# To understand how CUDA copies a tensor from CPU to CUDA, let's consider the two scenarios above: -# -# - If the memory is page-locked, the device can access the memory directly in the main memory. The memory addresses are well -# defined and functions that need to read these data can be significantly accelerated. -# - If the memory is pageable, all the pages will have to be brought to the main memory before being sent to the GPU. 
-# This operation may take time and is less predictable than when executed on page-locked tensors. -# -# More precisely, when CUDA sends pageable data from CPU to GPU, it must first create a page-locked copy of that data -# before making the transfer. -# -# Asynchronous vs. Synchronous Operations with ``non_blocking=True`` (CUDA ``cudaMemcpyAsync``) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. _pinned_memory_async_sync: -# -# When executing a copy from a host (e.g., CPU) to a device (e.g., GPU), the CUDA toolkit offers modalities to do these -# operations synchronously or asynchronously with respect to the host. -# -# In practice, when calling :meth:`~torch.Tensor.to`, PyTorch always makes a call to -# `cudaMemcpyAsync `_. -# If ``non_blocking=False`` (default), a ``cudaStreamSynchronize`` will be called after each and every ``cudaMemcpyAsync``, making -# the call to :meth:`~torch.Tensor.to` blocking in the main thread. -# If ``non_blocking=True``, no synchronization is triggered, and the main thread on the host is not blocked. -# Therefore, from the host perspective, multiple tensors can be sent to the device simultaneously, -# as the thread does not need to wait for one transfer to be completed to initiate the other. -# -# .. note:: In general, the transfer is blocking on the device side (even if it isn't on the host side): -# the copy on the device cannot occur while another operation is being executed. -# However, in some advanced scenarios, a copy and a kernel execution can be done simultaneously on the GPU side. -# As the following example will show, three requirements must be met to enable this: -# -# 1. The device must have at least one free DMA (Direct Memory Access) engine. Modern GPU architectures such as Volta, -# Tesla, or H100 devices have more than one DMA engine. -# -# 2. The transfer must be done on a separate, non-default cuda stream. In PyTorch, cuda streams can be handled using -# :class:`~torch.cuda.Stream`. -# -# 3. The source data must be in pinned memory. -# -# We demonstrate this by running profiles on the following script.
-# - -import contextlib - -from torch.cuda import Stream - - -s = Stream() - -torch.manual_seed(42) -t1_cpu_pinned = torch.randn(1024**2 * 5, pin_memory=True) -t2_cpu_paged = torch.randn(1024**2 * 5, pin_memory=False) -t3_cuda = torch.randn(1024**2 * 5, device="cuda:0") - -assert torch.cuda.is_available() -device = torch.device("cuda", torch.cuda.current_device()) - - -# The function we want to profile -def inner(pinned: bool, streamed: bool): - with torch.cuda.stream(s) if streamed else contextlib.nullcontext(): - if pinned: - t1_cuda = t1_cpu_pinned.to(device, non_blocking=True) - else: - t2_cuda = t2_cpu_paged.to(device, non_blocking=True) - t_star_cuda_h2d_event = s.record_event() - # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is - # done in the other stream - t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda - t3_cuda_h2d_event = torch.cuda.current_stream().record_event() - t_star_cuda_h2d_event.synchronize() - t3_cuda_h2d_event.synchronize() - - -# Our profiler: profiles the `inner` function and stores the results in a .json file -def benchmark_with_profiler( - pinned, - streamed, -) -> None: - torch._C._profiler._set_cuda_sync_enabled_val(True) - wait, warmup, active = 1, 1, 2 - num_steps = wait + warmup + active - rank = 0 - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule( - wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1 - ), - ) as prof: - for step_idx in range(1, num_steps + 1): - inner(streamed=streamed, pinned=pinned) - if rank is None or rank == 0: - prof.step() - prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json") - - -###################################################################### -# Loading these profile traces in chrome (``chrome://tracing``) shows the following results: first, let's see -# what happens if both the arithmetic operation on ``t3_cuda`` is executed after the pageable tensor is sent to GPU -# in the main stream: -# - -benchmark_with_profiler(streamed=False, pinned=False) - -###################################################################### -# .. figure:: /_static/img/pinmem/trace_streamed0_pinned0.png -# :alt: -# -# Using a pinned tensor doesn't change the trace much, both operations are still executed consecutively: - -benchmark_with_profiler(streamed=False, pinned=True) - -###################################################################### -# -# .. figure:: /_static/img/pinmem/trace_streamed0_pinned1.png -# :alt: -# -# Sending a pageable tensor to GPU on a separate stream is also a blocking operation: - -benchmark_with_profiler(streamed=True, pinned=False) - -###################################################################### -# -# .. figure:: /_static/img/pinmem/trace_streamed1_pinned0.png -# :alt: -# -# Only pinned tensors copies to GPU on a separate stream overlap with another cuda kernel executed on -# the main stream: - -benchmark_with_profiler(streamed=True, pinned=True) - -###################################################################### -# -# .. figure:: /_static/img/pinmem/trace_streamed1_pinned1.png -# :alt: -# -# A PyTorch perspective -# --------------------- -# -# .. _pinned_memory_pt_perspective: -# -# ``pin_memory()`` -# ~~~~~~~~~~~~~~~~ -# -# .. 
_pinned_memory_pinned: -# -# PyTorch offers the possibility to create and send tensors to page-locked memory through the -# :meth:`~torch.Tensor.pin_memory` method and constructor arguments. -# CPU tensors on a machine where CUDA is initialized can be cast to pinned memory through the :meth:`~torch.Tensor.pin_memory` -# method. Importantly, ``pin_memory`` is blocking on the main thread of the host: it will wait for the tensor to be copied to -# page-locked memory before executing the next operation. -# New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones` and other -# constructors. -# -# Let us check the speed of pinning memory and sending tensors to CUDA: - - -import torch -import gc -from torch.utils.benchmark import Timer -import matplotlib.pyplot as plt - - -def timer(cmd): - median = ( - Timer(cmd, globals=globals()) - .adaptive_autorange(min_run_time=1.0, max_run_time=20.0) - .median - * 1000 - ) - print(f"{cmd}: {median: 4.4f} ms") - return median - - -# A tensor in pageable memory -pageable_tensor = torch.randn(1_000_000) - -# A tensor in page-locked (pinned) memory -pinned_tensor = torch.randn(1_000_000, pin_memory=True) - -# Runtimes: -pageable_to_device = timer("pageable_tensor.to('cuda:0')") -pinned_to_device = timer("pinned_tensor.to('cuda:0')") -pin_mem = timer("pageable_tensor.pin_memory()") -pin_mem_to_device = timer("pageable_tensor.pin_memory().to('cuda:0')") - -# Ratios: -r1 = pinned_to_device / pageable_to_device -r2 = pin_mem_to_device / pageable_to_device - -# Create a figure with the results -fig, ax = plt.subplots() - -xlabels = [0, 1, 2] -bar_labels = [ - "pageable_tensor.to(device) (1x)", - f"pinned_tensor.to(device) ({r1:4.2f}x)", - f"pageable_tensor.pin_memory().to(device) ({r2:4.2f}x)" - f"\npin_memory()={100*pin_mem/pin_mem_to_device:.2f}% of runtime.", -] -values = [pageable_to_device, pinned_to_device, pin_mem_to_device] -colors = ["tab:blue", "tab:red", "tab:orange"] -ax.bar(xlabels, values, label=bar_labels, color=colors) - -ax.set_ylabel("Runtime (ms)") -ax.set_title("Device casting runtime (pin-memory)") -ax.set_xticks([]) -ax.legend() - -plt.show() - -# Clear tensors -del pageable_tensor, pinned_tensor -_ = gc.collect() - -###################################################################### -# -# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under -# the hood, a pageable tensor must be copied to pinned memory before being sent to GPU. -# -# However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory()` on a pageable tensor before -# casting it to GPU should not bring any significant speed-up, on the contrary this call is usually slower than just -# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will -# perform anyway before copying the data from host to device. -# -# .. note:: The PyTorch implementation of -# `pin_memory `_ -# which relies on creating a brand new storage in pinned memory through `cudaHostAlloc `_ -# could be, in rare cases, faster than transitioning data in chunks as ``cudaMemcpy`` does. -# Here too, the observation may vary depending on the available hardware, the size of the tensors being sent or -# the amount of available RAM. -# -# ``non_blocking=True`` -# ~~~~~~~~~~~~~~~~~~~~~ -# -# .. 
_pinned_memory_non_blocking: -# -# As mentioned earlier, many PyTorch operations have the option of being executed asynchronously with respect to the host -# through the ``non_blocking`` argument. -# -# Here, to account accurately of the benefits of using ``non_blocking``, we will design a slightly more complex -# experiment since we want to assess how fast it is to send multiple tensors to GPU with and without calling -# ``non_blocking``. -# - - -# A simple loop that copies all tensors to cuda -def copy_to_device(*tensors): - result = [] - for tensor in tensors: - result.append(tensor.to("cuda:0")) - return result - - -# A loop that copies all tensors to cuda asynchronously -def copy_to_device_nonblocking(*tensors): - result = [] - for tensor in tensors: - result.append(tensor.to("cuda:0", non_blocking=True)) - # We need to synchronize - torch.cuda.synchronize() - return result - - -# Create a list of tensors -tensors = [torch.randn(1000) for _ in range(1000)] -to_device = timer("copy_to_device(*tensors)") -to_device_nonblocking = timer("copy_to_device_nonblocking(*tensors)") - -# Ratio -r1 = to_device_nonblocking / to_device - -# Plot the results -fig, ax = plt.subplots() - -xlabels = [0, 1] -bar_labels = [f"to(device) (1x)", f"to(device, non_blocking=True) ({r1:4.2f}x)"] -colors = ["tab:blue", "tab:red"] -values = [to_device, to_device_nonblocking] - -ax.bar(xlabels, values, label=bar_labels, color=colors) - -ax.set_ylabel("Runtime (ms)") -ax.set_title("Device casting runtime (non-blocking)") -ax.set_xticks([]) -ax.legend() - -plt.show() - - -###################################################################### -# To get a better sense of what is happening here, let us profile these two functions: - - -from torch.profiler import profile, ProfilerActivity - - -def profile_mem(cmd): - with profile(activities=[ProfilerActivity.CPU]) as prof: - exec(cmd) - print(cmd) - print(prof.key_averages().table(row_limit=10)) - - -###################################################################### -# Let's see the call stack with a regular ``to(device)`` first: -# - -print("Call to `to(device)`", profile_mem("copy_to_device(*tensors)")) - -###################################################################### -# and now the ``non_blocking`` version: -# - -print( - "Call to `to(device, non_blocking=True)`", - profile_mem("copy_to_device_nonblocking(*tensors)"), -) - - -###################################################################### -# The results are without any doubt better when using ``non_blocking=True``, as all transfers are initiated simultaneously -# on the host side and only one synchronization is done. -# -# The benefit will vary depending on the number and the size of the tensors as well as depending on the hardware being -# used. -# -# .. note:: Interestingly, the blocking ``to("cuda")`` actually performs the same asynchronous device casting operation -# (``cudaMemcpyAsync``) as the one with ``non_blocking=True`` with a synchronization point after each copy. -# -# Synergies -# ~~~~~~~~~ -# -# .. _pinned_memory_synergies: -# -# Now that we have made the point that data transfer of tensors already in pinned memory to GPU is faster than from -# pageable memory, and that we know that doing these transfers asynchronously is also faster than synchronously, we can -# benchmark combinations of these approaches. 
First, let's write a couple of new functions that will call ``pin_memory`` -# and ``to(device)`` on each tensor: -# - - -def pin_copy_to_device(*tensors): - result = [] - for tensor in tensors: - result.append(tensor.pin_memory().to("cuda:0")) - return result - - -def pin_copy_to_device_nonblocking(*tensors): - result = [] - for tensor in tensors: - result.append(tensor.pin_memory().to("cuda:0", non_blocking=True)) - # We need to synchronize - torch.cuda.synchronize() - return result - - -###################################################################### -# The benefits of using :meth:`~torch.Tensor.pin_memory` are more pronounced for -# somewhat large batches of large tensors: -# - -tensors = [torch.randn(1_000_000) for _ in range(1000)] -page_copy = timer("copy_to_device(*tensors)") -page_copy_nb = timer("copy_to_device_nonblocking(*tensors)") - -tensors_pinned = [torch.randn(1_000_000, pin_memory=True) for _ in range(1000)] -pinned_copy = timer("copy_to_device(*tensors_pinned)") -pinned_copy_nb = timer("copy_to_device_nonblocking(*tensors_pinned)") - -pin_and_copy = timer("pin_copy_to_device(*tensors)") -pin_and_copy_nb = timer("pin_copy_to_device_nonblocking(*tensors)") - -# Plot -strategies = ("pageable copy", "pinned copy", "pin and copy") -blocking = { - "blocking": [page_copy, pinned_copy, pin_and_copy], - "non-blocking": [page_copy_nb, pinned_copy_nb, pin_and_copy_nb], -} - -x = torch.arange(3) -width = 0.25 -multiplier = 0 - - -fig, ax = plt.subplots(layout="constrained") - -for attribute, runtimes in blocking.items(): - offset = width * multiplier - rects = ax.bar(x + offset, runtimes, width, label=attribute) - ax.bar_label(rects, padding=3, fmt="%.2f") - multiplier += 1 - -# Add some text for labels, title and custom x-axis tick labels, etc. -ax.set_ylabel("Runtime (ms)") -ax.set_title("Runtime (pin-mem and non-blocking)") -ax.set_xticks([0, 1, 2]) -ax.set_xticklabels(strategies) -plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") -ax.legend(loc="upper left", ncols=3) - -plt.show() - -del tensors, tensors_pinned -_ = gc.collect() - - -###################################################################### -# Other copy directions (GPU -> CPU, CPU -> MPS) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. _pinned_memory_other_direction: -# -# Until now, we have operated under the assumption that asynchronous copies from the CPU to the GPU are safe. -# This is generally true because CUDA automatically handles synchronization to ensure that the data being accessed is -# valid at read time. -# However, this guarantee does not extend to transfers in the opposite direction, from GPU to CPU. -# Without explicit synchronization, these transfers offer no assurance that the copy will be complete at the time of -# data access. Consequently, the data on the host might be incomplete or incorrect, effectively rendering it garbage: -# - - -tensor = ( - torch.arange(1, 1_000_000, dtype=torch.double, device="cuda") - .expand(100, 999999) - .clone() -) -torch.testing.assert_close( - tensor.mean(), torch.tensor(500_000, dtype=torch.double, device="cuda") -), tensor.mean() -try: - i = -1 - for i in range(100): - cpu_tensor = tensor.to("cpu", non_blocking=True) - torch.testing.assert_close( - cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double) - ) - print("No test failed with non_blocking") -except AssertionError: - print(f"{i}th test failed with non_blocking. 
Skipping remaining tests") -try: - i = -1 - for i in range(100): - cpu_tensor = tensor.to("cpu", non_blocking=True) - torch.cuda.synchronize() - torch.testing.assert_close( - cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double) - ) - print("No test failed with synchronize") -except AssertionError: - print(f"One test failed with synchronize: {i}th assertion!") - - -###################################################################### -# The same considerations apply to copies from the CPU to non-CUDA devices, such as MPS. -# Generally, asynchronous copies to a device are safe without explicit synchronization only when the target is a -# CUDA-enabled device. -# -# In summary, copying data from CPU to GPU is safe when using ``non_blocking=True``, but for any other direction, -# ``non_blocking=True`` can still be used but the user must make sure that a device synchronization is executed before -# the data is accessed. -# -# Practical recommendations -# ------------------------- -# -# .. _pinned_memory_recommendations: -# -# We can now wrap up some early recommendations based on our observations: -# -# In general, ``non_blocking=True`` will provide good throughput, regardless of whether the original tensor is or -# isn't in pinned memory. -# If the tensor is already in pinned memory, the transfer can be accelerated, but sending it to -# pin memory manually from python main thread is a blocking operation on the host, and hence will annihilate much of -# the benefit of using ``non_blocking=True`` (as CUDA does the `pin_memory` transfer anyway). -# -# One might now legitimately ask what use there is for the :meth:`~torch.Tensor.pin_memory` method. -# In the following section, we will explore further how this can be used to accelerate the data transfer even more. -# -# Additional considerations -# ------------------------- -# -# .. _pinned_memory_considerations: -# -# PyTorch notoriously provides a :class:`~torch.utils.data.DataLoader` class whose constructor accepts a -# ``pin_memory`` argument. -# Considering our previous discussion on ``pin_memory``, you might wonder how the ``DataLoader`` manages to -# accelerate data transfers if memory pinning is inherently blocking. -# -# The key lies in the DataLoader's use of a separate thread to handle the transfer of data from pageable to pinned -# memory, thus preventing any blockage in the main thread. -# -# To illustrate this, we will use the TensorDict primitive from the homonymous library. -# When invoking :meth:`~tensordict.TensorDict.to`, the default behavior is to send tensors to the device asynchronously, -# followed by a single call to ``torch.device.synchronize()`` afterwards. -# -# Additionally, ``TensorDict.to()`` includes a ``non_blocking_pin`` option which initiates multiple threads to execute -# ``pin_memory()`` before proceeding with to ``to(device)``. -# This approach can further accelerate data transfers, as demonstrated in the following example. 
-# -# - -from tensordict import TensorDict -import torch -from torch.utils.benchmark import Timer -import matplotlib.pyplot as plt - -# Create the dataset -td = TensorDict({str(i): torch.randn(1_000_000) for i in range(1000)}) - -# Runtimes -copy_blocking = timer("td.to('cuda:0', non_blocking=False)") -copy_non_blocking = timer("td.to('cuda:0')") -copy_pin_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=0)") -copy_pin_multithread_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=4)") - -# Rations -r1 = copy_non_blocking / copy_blocking -r2 = copy_pin_nb / copy_blocking -r3 = copy_pin_multithread_nb / copy_blocking - -# Figure -fig, ax = plt.subplots() - -xlabels = [0, 1, 2, 3] -bar_labels = [ - "Blocking copy (1x)", - f"Non-blocking copy ({r1:4.2f}x)", - f"Blocking pin, non-blocking copy ({r2:4.2f}x)", - f"Non-blocking pin, non-blocking copy ({r3:4.2f}x)", -] -values = [copy_blocking, copy_non_blocking, copy_pin_nb, copy_pin_multithread_nb] -colors = ["tab:blue", "tab:red", "tab:orange", "tab:green"] - -ax.bar(xlabels, values, label=bar_labels, color=colors) - -ax.set_ylabel("Runtime (ms)") -ax.set_title("Device casting runtime") -ax.set_xticks([]) -ax.legend() - -plt.show() - -###################################################################### -# In this example, we are transferring many large tensors from the CPU to the GPU. -# This scenario is ideal for utilizing multithreaded ``pin_memory()``, which can significantly enhance performance. -# However, if the tensors are small, the overhead associated with multithreading may outweigh the benefits. -# Similarly, if there are only a few tensors, the advantages of pinning tensors on separate threads become limited. -# -# As an additional note, while it might seem advantageous to create permanent buffers in pinned memory to shuttle -# tensors from pageable memory before transferring them to the GPU, this strategy does not necessarily expedite -# computation. The inherent bottleneck caused by copying data into pinned memory remains a limiting factor. -# -# Moreover, transferring data that resides on disk (whether in shared memory or files) to the GPU typically requires an -# intermediate step of copying the data into pinned memory (located in RAM). -# Utilizing non_blocking for large data transfers in this context can significantly increase RAM consumption, -# potentially leading to adverse effects. -# -# In practice, there is no one-size-fits-all solution. -# The effectiveness of using multithreaded ``pin_memory`` combined with ``non_blocking`` transfers depends on a -# variety of factors, including the specific system, operating system, hardware, and the nature of the tasks -# being executed. -# Here is a list of factors to check when trying to speed-up data transfers between CPU and GPU, or comparing -# throughput's across scenarios: -# -# - **Number of available cores** -# -# How many CPU cores are available? Is the system shared with other users or processes that might compete for -# resources? -# -# - **Core utilization** -# -# Are the CPU cores heavily utilized by other processes? Does the application perform other CPU-intensive tasks -# concurrently with data transfers? -# -# - **Memory utilization** -# -# How much pageable and page-locked memory is currently being used? Is there sufficient free memory to allocate -# additional pinned memory without affecting system performance? Remember that nothing comes for free, for instance -# ``pin_memory`` will consume RAM and may impact other tasks. 
-# -# - **CUDA Device Capabilities** -# -# Does the GPU support multiple DMA engines for concurrent data transfers? What are the specific capabilities and -# limitations of the CUDA device being used? -# -# - **Number of tensors to be sent** -# -# How many tensors are transferred in a typical operation? -# -# - **Size of the tensors to be sent** -# -# What is the size of the tensors being transferred? A few large tensors or many small tensors may not benefit from -# the same transfer program. -# -# - **System Architecture** -# -# How is the system's architecture influencing data transfer speeds (for example, bus speeds, network latency)? -# -# Additionally, allocating a large number of tensors or sizable tensors in pinned memory can monopolize a substantial -# portion of RAM. -# This reduces the available memory for other critical operations, such as paging, which can negatively impact the -# overall performance of an algorithm. -# -# Conclusion -# ---------- -# -# .. _pinned_memory_conclusion: -# -# Throughout this tutorial, we have explored several critical factors that influence transfer speeds and memory -# management when sending tensors from the host to the device. We've learned that using ``non_blocking=True`` generally -# accelerates data transfers, and that :meth:`~torch.Tensor.pin_memory` can also enhance performance if implemented -# correctly. However, these techniques require careful design and calibration to be effective. -# -# Remember that profiling your code and keeping an eye on the memory consumption are essential to optimize resource -# usage and achieve the best possible performance. -# -# Additional resources -# -------------------- -# -# .. _pinned_memory_resources: -# -# If you are dealing with issues with memory copies when using CUDA devices or want to learn more about -# what was discussed in this tutorial, check the following references: -# -# - `CUDA toolkit memory management doc `_; -# - `CUDA pin-memory note `_; -# - `How to Optimize Data Transfers in CUDA C/C++ `_; -# - `tensordict doc `_ and `repo `_. -# diff --git a/intermediate_source/pipelining_tutorial.rst b/intermediate_source/pipelining_tutorial.rst deleted file mode 100644 index 0c6fc79846..0000000000 --- a/intermediate_source/pipelining_tutorial.rst +++ /dev/null @@ -1,236 +0,0 @@ -Introduction to Distributed Pipeline Parallelism -================================================ -**Authors**: `Howard Huang `_ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -This tutorial uses a gpt-style transformer model to demonstrate implementing distributed -pipeline parallelism with `torch.distributed.pipelining `__ -APIs. - -.. grid:: 2 - - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - * How to use ``torch.distributed.pipelining`` APIs - * How to apply pipeline parallelism to a transformer model - * How to utilize different schedules on a set of microbatches - - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - * Familiarity with `basic distributed training `__ in PyTorch - -Setup ------ - -With ``torch.distributed.pipelining`` we will be partitioning the execution of a model and scheduling computation on micro-batches. We will be using a simplified version -of a transformer decoder model. The model architecture is for educational purposes and has multiple transformer decoder layers as we want to demonstrate how to split the model into different -chunks. 
First, let us define the model: - -.. code:: python - - import torch - import torch.nn as nn - from dataclasses import dataclass - - @dataclass - class ModelArgs: - dim: int = 512 - n_layers: int = 8 - n_heads: int = 8 - vocab_size: int = 10000 - - class Transformer(nn.Module): - def __init__(self, model_args: ModelArgs): - super().__init__() - - self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) - - # Using a ModuleDict lets us delete layers witout affecting names, - # ensuring checkpoints will correctly save and load. - self.layers = torch.nn.ModuleDict() - for layer_id in range(model_args.n_layers): - self.layers[str(layer_id)] = nn.TransformerDecoderLayer(model_args.dim, model_args.n_heads) - - self.norm = nn.LayerNorm(model_args.dim) - self.output = nn.Linear(model_args.dim, model_args.vocab_size) - - def forward(self, tokens: torch.Tensor): - # Handling layers being 'None' at runtime enables easy pipeline splitting - h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens - - for layer in self.layers.values(): - h = layer(h, h) - - h = self.norm(h) if self.norm else h - output = self.output(h).float() if self.output else h - return output - -Then, we need to import the necessary libraries in our script and initialize the distributed training process. In this case, we are defining some global variables to use -later in the script: - -.. code:: python - - import os - import torch.distributed as dist - from torch.distributed.pipelining import pipeline, SplitPoint, PipelineStage, ScheduleGPipe - - global rank, device, pp_group, stage_index, num_stages - def init_distributed(): - global rank, device, pp_group, stage_index, num_stages - rank = int(os.environ["LOCAL_RANK"]) - world_size = int(os.environ["WORLD_SIZE"]) - device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu") - dist.init_process_group() - - # This group can be a sub-group in the N-D parallel case - pp_group = dist.new_group() - stage_index = rank - num_stages = world_size - -The ``rank``, ``world_size``, and ``init_process_group()`` code should seem familiar to you as those are commonly used in -all distributed programs. The globals specific to pipeline parallelism include ``pp_group`` which is the process -group that will be used for send/recv communications, ``stage_index`` which, in this example, is a single rank -per stage so the index is equivalent to the rank, and ``num_stages`` which is equivalent to world_size. - -The ``num_stages`` is used to set the number of stages that will be used in the pipeline parallelism schedule. For example, -for ``num_stages=4``, a microbatch will need to go through 4 forwards and 4 backwards before it is completed. The ``stage_index`` -is necessary for the framework to know how to communicate between stages. For example, for the first stage (``stage_index=0``), it will -use data from the dataloader and does not need to receive data from any previous peers to perform its computation. - - -Step 1: Partition the Transformer Model ---------------------------------------- - -There are two different ways of partitioning the model: - -First is the manual mode in which we can manually create two instances of the model by deleting portions of -attributes of the model. In this example for a 2 stage (2 ranks) the model is cut in half. - -.. 
code:: python - - def manual_model_split(model, example_input_microbatch, model_args) -> PipelineStage: - if stage_index == 0: - # prepare the first stage model - for i in range(4, 8): - del model.layers[str(i)] - model.norm = None - model.output = None - stage_input_microbatch = example_input_microbatch - - elif stage_index == 1: - # prepare the second stage model - for i in range(4): - del model.layers[str(i)] - model.tok_embeddings = None - stage_input_microbatch = torch.randn(example_input_microbatch.shape[0], example_input_microbatch.shape[1], model_args.dim) - - stage = PipelineStage( - model, - stage_index, - num_stages, - device, - input_args=stage_input_microbatch, - ) - return stage - -As we can see, the first stage does not have the layer norm or the output layer, and it only includes the first four transformer blocks. -The second stage does not have the input embedding layers, but includes the output layers and the final four transformer blocks. The function -then returns the ``PipelineStage`` for the current rank. - -The second method is the tracer-based mode which automatically splits the model based on a ``split_spec`` argument. Using the pipeline specification, we can instruct -``torch.distributed.pipelining`` where to split the model. In the following code block, -we are splitting before the 4th transformer decoder layer, mirroring the manual split described above. Similarly, -we can retrieve a ``PipelineStage`` by calling ``build_stage`` after this splitting is done. - -.. code:: python - def tracer_model_split(model, example_input_microbatch) -> PipelineStage: - pipe = pipeline( - module=model, - mb_args=(example_input_microbatch,), - split_spec={ - "layers.4": SplitPoint.BEGINNING, - } - ) - stage = pipe.build_stage(stage_index, device, pp_group) - return stage - - -Step 2: Define The Main Execution ---------------------------------- - -In the main function we will create a particular pipeline schedule that the stages should follow. ``torch.distributed.pipelining`` -supports multiple schedules, including single-stage-per-rank schedules ``GPipe`` and ``1F1B``, -as well as multiple-stage-per-rank schedules such as ``Interleaved1F1B`` and ``LoopedBFS``. - -.. code:: python - - if __name__ == "__main__": - init_distributed() - num_microbatches = 4 - model_args = ModelArgs() - model = Transformer(model_args) - - # Dummy data - x = torch.ones(32, 500, dtype=torch.long) - y = torch.randint(0, model_args.vocab_size, (32, 500), dtype=torch.long) - example_input_microbatch = x.chunk(num_microbatches)[0] - - # Option 1: Manual model splitting - stage = manual_model_split(model, example_input_microbatch, model_args) - - # Option 2: Tracer model splitting - # stage = tracer_model_split(model, example_input_microbatch) - - x = x.to(device) - y = y.to(device) - - def tokenwise_loss_fn(outputs, targets): - loss_fn = nn.CrossEntropyLoss() - outputs = outputs.view(-1, model_args.vocab_size) - targets = targets.view(-1) - return loss_fn(outputs, targets) - - schedule = ScheduleGPipe(stage, n_microbatches=num_microbatches, loss_fn=tokenwise_loss_fn) - - if rank == 0: - schedule.step(x) - elif rank == 1: - losses = [] - output = schedule.step(target=y, losses=losses) - dist.destroy_process_group() - -In the example above, we are using the manual method to split the model, but the code can be uncommented to also try the -tracer-based model splitting function.
In our schedule, we need to pass in the number of microbatches and -the loss function used to evaluate the targets. - -The ``.step()`` function processes the entire minibatch and automatically splits it into microbatches based -on the ``n_microbatches`` passed previously. The microbatches are then operated on according to the schedule class. -In the example above, we are using GPipe, which follows a simple all-forwards and then all-backwards schedule. The output -returned from rank 1 will be the same as if the model were on a single GPU and run with the entire batch. Similarly, -we can pass in a ``losses`` container to store the corresponding losses for each microbatch. - -Step 3: Launch the Distributed Processes ----------------------------------------- - -Finally, we are ready to run the script. We will use ``torchrun`` to create a single host, 2-process job. -Our script is already written so that rank 0 performs the required logic for pipeline stage 0, and rank 1 -performs the logic for pipeline stage 1. - -``torchrun --nnodes 1 --nproc_per_node 2 pipelining_tutorial.py`` - -Conclusion ---------- - -In this tutorial, we have learned how to implement distributed pipeline parallelism using PyTorch's ``torch.distributed.pipelining`` APIs. -We explored setting up the environment, defining a transformer model, and partitioning it for distributed training. -We discussed two methods of model partitioning, manual and tracer-based, and demonstrated how to schedule computations on -micro-batches across different stages. Finally, we covered the execution of the pipeline schedule and the launch of distributed -processes using ``torchrun``. - -For a production-ready usage of pipeline parallelism, as well as composition with other distributed techniques, see also the -`TorchTitan end to end example of 3D parallelism `__. diff --git a/intermediate_source/process_group_cpp_extension_tutorial.rst b/intermediate_source/process_group_cpp_extension_tutorial.rst deleted file mode 100644 index 47379bf881..0000000000 --- a/intermediate_source/process_group_cpp_extension_tutorial.rst +++ /dev/null @@ -1,307 +0,0 @@ -Customize Process Group Backends Using Cpp Extensions -===================================================== - -**Author**: `Howard Huang `__, `Feng Tian `__, `Shen Li `__, `Min Si `__ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ - `PyTorch Collective Communication Package `__ - `PyTorch Cpp Extension `__ - `Writing Distributed Applications with PyTorch `__ - -This tutorial demonstrates how to implement a custom ``Backend`` and plug that into the -`PyTorch distributed package `__ using -`cpp extensions `__. This is helpful when you need a specialized software -stack for your hardware, or when you would like to experiment with new -collective communication algorithms. - - -Basics ------- - -PyTorch collective communications power several widely adopted distributed -training features, including -`DistributedDataParallel `__, -`ZeroRedundancyOptimizer `__, and -`FullyShardedDataParallel `__. -In order to make the same collective communication API work with -different communication backends, the distributed package abstracts collective -communication operations into a -`Backend `__ -class. Different backends can -then be implemented as subclasses of ``Backend`` using preferred -third-party libraries.
PyTorch distributed comes with three default backends, -``ProcessGroupNCCL``, ``ProcessGroupGloo``, and ``ProcessGroupMPI``. However, -beyond these three backends, there are also other communication libraries -(e.g., `UCC `__, -`OneCCL `__), different types of hardware -(e.g., `TPU `__, -`Trainum `__), and emerging -communication algorithms (e.g., -`Herring `__, -`Reduction Server `__). -Therefore, the distributed package exposes extension APIs to allow customizing -collective communication backends. - - -The 4 steps below show how to implement a dummy ``Backend`` backend -and use that in Python application code. Please note that this tutorial focuses -on demonstrating the extension APIs, instead of developing a functioning -communication backend. Hence, the ``dummy`` backend just covers a subset of the -APIs (``all_reduce`` and ``all_gather``), and simply sets the values of tensors -to 0. - - -Step 1: Implement a Subclass of ``Backend`` ------------------------------------------------- - -This first step is to implement a ``Backend`` subclass that overrides -target collective communication APIs and runs the custom communication algorithm. -The extension also needs to implement a ``Work`` subclass, which -serves as a future of communication results and allows asynchronous execution in -application code. If the extension uses third-party libraries, it can -include the headers and call into the library APIs from the ``BackendDummy`` -subclass. The two code snippets below present the implementation of ``dummy.h`` and -``dummy.cpp``. See the `dummy collectives `__ -repository for the full implementation. - -.. code-block:: cpp - - // file name: dummy.hpp - #include - - #include - #include - #include - #include - #include - - #include - - namespace c10d { - - class BackendDummy : public Backend { - public: - BackendDummy(int rank, int size); - - c10::intrusive_ptr allgather( - std::vector>& outputTensors, - std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override; - - c10::intrusive_ptr allreduce( - std::vector& tensors, - const AllreduceOptions& opts = AllreduceOptions()) override; - - // The collective communication APIs without a custom implementation - // will error out if invoked by application code. - }; - - class WorkDummy : public Work { - public: - WorkDummy( - OpType opType, - c10::intrusive_ptr future) // future of the output - : Work( - -1, // rank, only used by recvAnySource, irrelevant in this demo - opType), - future_(std::move(future)) {} - bool isCompleted() override; - bool isSuccess() const override; - bool wait(std::chrono::milliseconds timeout = kUnsetTimeout) override; - virtual c10::intrusive_ptr getFuture() override; - - private: - c10::intrusive_ptr future_; - }; - } // namespace c10d - - -.. 
code-block:: cpp - - // file name: dummy.cpp - #include "dummy.hpp" - - namespace c10d { - - // This is a dummy allgather that sets all output tensors to zero - // Modify the implementation to conduct real communication asynchronously - c10::intrusive_ptr BackendDummy::allgather( - std::vector>& outputTensors, - std::vector& inputTensors, - const AllgatherOptions& /* unused */) { - for (auto& outputTensorVec : outputTensors) { - for (auto& outputTensor : outputTensorVec) { - outputTensor.zero_(); - } - } - - auto future = c10::make_intrusive( - c10::ListType::create(c10::ListType::create(c10::TensorType::get()))); - future->markCompleted(c10::IValue(outputTensors)); - return c10::make_intrusive(OpType::ALLGATHER, std::move(future)); - } - - // This is a dummy allreduce that sets all output tensors to zero - // Modify the implementation to conduct real communication asynchronously - c10::intrusive_ptr BackendDummy::allreduce( - std::vector& tensors, - const AllreduceOptions& opts) { - for (auto& tensor : tensors) { - tensor.zero_(); - } - - auto future = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get())); - future->markCompleted(c10::IValue(tensors)); - return c10::make_intrusive(OpType::ALLGATHER, std::move(future)); - } - } // namespace c10d - -Step 2: Expose The Extension Python APIs ----------------------------------------- - -The backend constructors are called -`from Python side `__, -so the extension also needs to expose the constructor APIs to Python. This can -be done by adding the following methods. In this example, ``store`` and -``timeout`` are ignored by the ``BackendDummy`` instantiation method, as -those are not used in this dummy implementation. However, real-world extensions -should consider using the ``store`` to perform rendezvous and supporting the -``timeout`` argument. - -.. code-block:: cpp - - // file name: dummy.hpp - class BackendDummy : public Backend { - ... - - ... - - static c10::intrusive_ptr createBackendDummy( - const c10::intrusive_ptr<::c10d::Store>& store, - int rank, - int size, - const std::chrono::duration& timeout); - - static void BackendDummyConstructor() __attribute__((constructor)) { - py::object module = py::module::import("torch.distributed"); - py::object register_backend = - module.attr("Backend").attr("register_backend"); - // torch.distributed.Backend.register_backend will add `dummy` as a - // new valid backend. - register_backend("dummy", py::cpp_function(createBackendDummy)); - } - } - -.. code-block:: cpp - - // file name: dummy.cpp - c10::intrusive_ptr BackendDummy::createBackendDummy( - const c10::intrusive_ptr<::c10d::Store>& /* unused */, - int rank, - int size, - const std::chrono::duration& /* unused */) { - return c10::make_intrusive(rank, size); - } - - PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("createBackendDummy", &BackendDummy::createBackendDummy); - } - - -Step 3: Build The Custom Extension ----------------------------------- - -Now, the extension source code files are ready. We can then use -`cpp extensions `__ -to build it. To do that, create a ``setup.py`` file that prepares the paths and -commands. Then call ``python setup.py develop`` to install the extension. - -If the extension depends on third-party libraries, you can also specify -``libraries_dirs`` and ``libraries`` to the cpp extension APIs. See the -`torch ucc `__ -project as a real-world example. - -.. 
code-block:: python - - # file name: setup.py - import os - import sys - import torch - from setuptools import setup - from torch.utils import cpp_extension - - sources = ["src/dummy.cpp"] - include_dirs = [f"{os.path.dirname(os.path.abspath(__file__))}/include/"] - - if torch.cuda.is_available(): - module = cpp_extension.CUDAExtension( - name = "dummy_collectives", - sources = sources, - include_dirs = include_dirs, - ) - else: - module = cpp_extension.CppExtension( - name = "dummy_collectives", - sources = sources, - include_dirs = include_dirs, - ) - - setup( - name = "Dummy-Collectives", - version = "0.0.1", - ext_modules = [module], - cmdclass={'build_ext': cpp_extension.BuildExtension} - ) - -Step 4: Use The Extension in Application ----------------------------------------- - -After installation, you can conveniently use the ``dummy`` backend when calling -`init_process_group `__ -as if it is an builtin backend. - -We can specify dispatching based on backend by changing the ``backend`` argument of ``init_process_group``. We -can dispatch collective with CPU tensor to ``gloo`` backend and dispatch collective with CUDA tensor to ``dummy`` backend by -specifying ``cpu:gloo,cuda:dummy`` as the backend argument. - -To send all tensors to ``dummy`` backend, we can simply specify ``dummy`` as the backend argument. - -.. code-block:: python - - import os - - import torch - # importing dummy_collectives makes torch.distributed recognize `dummy` - # as a valid backend. - import dummy_collectives - - import torch.distributed as dist - - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - - # Alternatively: - # dist.init_process_group("dummy", rank=0, world_size=1) - dist.init_process_group("cpu:gloo,cuda:dummy", rank=0, world_size=1) - - # this goes through gloo - x = torch.ones(6) - dist.all_reduce(x) - print(f"cpu allreduce: {x}") - - # this goes through dummy - if torch.cuda.is_available(): - y = x.cuda() - dist.all_reduce(y) - print(f"cuda allreduce: {y}") - - try: - dist.broadcast(y, 0) - except RuntimeError: - print("got RuntimeError when calling broadcast") diff --git a/intermediate_source/pruning_tutorial.py b/intermediate_source/pruning_tutorial.py deleted file mode 100644 index 346200502d..0000000000 --- a/intermediate_source/pruning_tutorial.py +++ /dev/null @@ -1,403 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Pruning Tutorial -===================================== -**Author**: `Michela Paganini `_ - -State-of-the-art deep learning techniques rely on over-parametrized models -that are hard to deploy. On the contrary, biological neural networks are -known to use efficient sparse connectivity. Identifying optimal -techniques to compress models by reducing the number of parameters in them is -important in order to reduce memory, battery, and hardware consumption without -sacrificing accuracy. This in turn allows you to deploy lightweight models on device, and guarantee -privacy with private on-device computation. On the research front, pruning is -used to investigate the differences in learning dynamics between -over-parametrized and under-parametrized networks, to study the role of lucky -sparse subnetworks and initializations -("`lottery tickets `_") as a destructive -neural architecture search technique, and more. - -In this tutorial, you will learn how to use ``torch.nn.utils.prune`` to -sparsify your neural networks, and how to extend it to implement your -own custom pruning technique. 
- -Requirements ------------- -``"torch>=1.4.0a0+8e8a5e0"`` - -""" -import torch -from torch import nn -import torch.nn.utils.prune as prune -import torch.nn.functional as F - -###################################################################### -# Create a model -# -------------- -# -# In this tutorial, we use the `LeNet -# `_ architecture from -# LeCun et al., 1998. - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -class LeNet(nn.Module): - def __init__(self): - super(LeNet, self).__init__() - # 1 input image channel, 6 output channels, 5x5 square conv kernel - self.conv1 = nn.Conv2d(1, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5x5 image dimension - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - x = x.view(-1, int(x.nelement() / x.shape[0])) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -model = LeNet().to(device=device) - - -###################################################################### -# Inspect a Module -# ---------------- -# -# Let's inspect the (unpruned) ``conv1`` layer in our LeNet model. It will contain two -# parameters ``weight`` and ``bias``, and no buffers, for now. -module = model.conv1 -print(list(module.named_parameters())) - -###################################################################### -print(list(module.named_buffers())) - -###################################################################### -# Pruning a Module -# ---------------- -# -# To prune a module (in this example, the ``conv1`` layer of our LeNet -# architecture), first select a pruning technique among those available in -# ``torch.nn.utils.prune`` (or -# `implement <#extending-torch-nn-utils-pruning-with-custom-pruning-functions>`_ -# your own by subclassing -# ``BasePruningMethod``). Then, specify the module and the name of the parameter to -# prune within that module. Finally, using the adequate keyword arguments -# required by the selected pruning technique, specify the pruning parameters. -# -# In this example, we will prune at random 30% of the connections in -# the parameter named ``weight`` in the ``conv1`` layer. -# The module is passed as the first argument to the function; ``name`` -# identifies the parameter within that module using its string identifier; and -# ``amount`` indicates either the percentage of connections to prune (if it -# is a float between 0. and 1.), or the absolute number of connections to -# prune (if it is a non-negative integer). -prune.random_unstructured(module, name="weight", amount=0.3) - -###################################################################### -# Pruning acts by removing ``weight`` from the parameters and replacing it with -# a new parameter called ``weight_orig`` (i.e. appending ``"_orig"`` to the -# initial parameter ``name``). ``weight_orig`` stores the unpruned version of -# the tensor. The ``bias`` was not pruned, so it will remain intact. -print(list(module.named_parameters())) - -###################################################################### -# The pruning mask generated by the pruning technique selected above is saved -# as a module buffer named ``weight_mask`` (i.e. appending ``"_mask"`` to the -# initial parameter ``name``). 
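# As a quick sanity check (a hedged aside, not part of the original tutorial):
# since we passed ``amount=0.3`` above, roughly 30% of the entries in
# ``weight_mask`` should be zero.
mask = dict(module.named_buffers())["weight_mask"]
print(f"fraction of pruned entries in conv1.weight: {(mask == 0).float().mean().item():.2f}")  # ~0.30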
-print(list(module.named_buffers())) - -###################################################################### -# For the forward pass to work without modification, the ``weight`` attribute -# needs to exist. The pruning techniques implemented in -# ``torch.nn.utils.prune`` compute the pruned version of the weight (by -# combining the mask with the original parameter) and store them in the -# attribute ``weight``. Note, this is no longer a parameter of the ``module``, -# it is now simply an attribute. -print(module.weight) - -###################################################################### -# Finally, pruning is applied prior to each forward pass using PyTorch's -# ``forward_pre_hooks``. Specifically, when the ``module`` is pruned, as we -# have done here, it will acquire a ``forward_pre_hook`` for each parameter -# associated with it that gets pruned. In this case, since we have so far -# only pruned the original parameter named ``weight``, only one hook will be -# present. -print(module._forward_pre_hooks) - -###################################################################### -# For completeness, we can now prune the ``bias`` too, to see how the -# parameters, buffers, hooks, and attributes of the ``module`` change. -# Just for the sake of trying out another pruning technique, here we prune the -# 3 smallest entries in the bias by L1 norm, as implemented in the -# ``l1_unstructured`` pruning function. -prune.l1_unstructured(module, name="bias", amount=3) - -###################################################################### -# We now expect the named parameters to include both ``weight_orig`` (from -# before) and ``bias_orig``. The buffers will include ``weight_mask`` and -# ``bias_mask``. The pruned versions of the two tensors will exist as -# module attributes, and the module will now have two ``forward_pre_hooks``. -print(list(module.named_parameters())) - -###################################################################### -print(list(module.named_buffers())) - -###################################################################### -print(module.bias) - -###################################################################### -print(module._forward_pre_hooks) - -###################################################################### -# Iterative Pruning -# ----------------- -# -# The same parameter in a module can be pruned multiple times, with the -# effect of the various pruning calls being equal to the combination of the -# various masks applied in series. -# The combination of a new mask with the old mask is handled by the -# ``PruningContainer``'s ``compute_mask`` method. -# -# Say, for example, that we now want to further prune ``module.weight``, this -# time using structured pruning along the 0th axis of the tensor (the 0th axis -# corresponds to the output channels of the convolutional layer and has -# dimensionality 6 for ``conv1``), based on the channels' L2 norm. This can be -# achieved using the ``ln_structured`` function, with ``n=2`` and ``dim=0``. -prune.ln_structured(module, name="weight", amount=0.5, n=2, dim=0) - -# As we can verify, this will zero out all the connections corresponding to -# 50% (3 out of 6) of the channels, while preserving the action of the -# previous mask. -print(module.weight) - -############################################################################ -# The corresponding hook will now be of type -# ``torch.nn.utils.prune.PruningContainer``, and will store the history of -# pruning applied to the ``weight`` parameter. 
-for hook in module._forward_pre_hooks.values(): - if hook._tensor_name == "weight": # select out the correct hook - break - -print(list(hook)) # pruning history in the container - -###################################################################### -# Serializing a pruned model -# -------------------------- -# All relevant tensors, including the mask buffers and the original parameters -# used to compute the pruned tensors are stored in the model's ``state_dict`` -# and can therefore be easily serialized and saved, if needed. -print(model.state_dict().keys()) - - -###################################################################### -# Remove pruning re-parametrization -# --------------------------------- -# -# To make the pruning permanent, remove the re-parametrization in terms -# of ``weight_orig`` and ``weight_mask``, and remove the ``forward_pre_hook``, -# we can use the ``remove`` functionality from ``torch.nn.utils.prune``. -# Note that this doesn't undo the pruning, as if it never happened. It simply -# makes it permanent, instead, by reassigning the parameter ``weight`` to the -# model parameters, in its pruned version. - -###################################################################### -# Prior to removing the re-parametrization: -print(list(module.named_parameters())) -###################################################################### -print(list(module.named_buffers())) -###################################################################### -print(module.weight) - -###################################################################### -# After removing the re-parametrization: -prune.remove(module, 'weight') -print(list(module.named_parameters())) -###################################################################### -print(list(module.named_buffers())) - -###################################################################### -# Pruning multiple parameters in a model -# -------------------------------------- -# -# By specifying the desired pruning technique and parameters, we can easily -# prune multiple tensors in a network, perhaps according to their type, as we -# will see in this example. - -new_model = LeNet() -for name, module in new_model.named_modules(): - # prune 20% of connections in all 2D-conv layers - if isinstance(module, torch.nn.Conv2d): - prune.l1_unstructured(module, name='weight', amount=0.2) - # prune 40% of connections in all linear layers - elif isinstance(module, torch.nn.Linear): - prune.l1_unstructured(module, name='weight', amount=0.4) - -print(dict(new_model.named_buffers()).keys()) # to verify that all masks exist - -###################################################################### -# Global pruning -# -------------- -# -# So far, we only looked at what is usually referred to as "local" pruning, -# i.e. the practice of pruning tensors in a model one by one, by -# comparing the statistics (weight magnitude, activation, gradient, etc.) of -# each entry exclusively to the other entries in that tensor. However, a -# common and perhaps more powerful technique is to prune the model all at -# once, by removing (for example) the lowest 20% of connections across the -# whole model, instead of removing the lowest 20% of connections in each -# layer. This is likely to result in different pruning percentages per layer. -# Let's see how to do that using ``global_unstructured`` from -# ``torch.nn.utils.prune``. 
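# (A brief, hedged aside on the API: ``global_unstructured`` expects an
# iterable of ``(module, parameter_name)`` pairs, together with the pruning
# method class to use and its keyword arguments. The snippet below applies it
# to the weights of all five layers of a fresh ``LeNet``.)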
- -model = LeNet() - -parameters_to_prune = ( - (model.conv1, 'weight'), - (model.conv2, 'weight'), - (model.fc1, 'weight'), - (model.fc2, 'weight'), - (model.fc3, 'weight'), -) - -prune.global_unstructured( - parameters_to_prune, - pruning_method=prune.L1Unstructured, - amount=0.2, -) - -###################################################################### -# Now we can check the sparsity induced in every pruned parameter, which will -# not be equal to 20% in each layer. However, the global sparsity will be -# (approximately) 20%. -print( - "Sparsity in conv1.weight: {:.2f}%".format( - 100. * float(torch.sum(model.conv1.weight == 0)) - / float(model.conv1.weight.nelement()) - ) -) -print( - "Sparsity in conv2.weight: {:.2f}%".format( - 100. * float(torch.sum(model.conv2.weight == 0)) - / float(model.conv2.weight.nelement()) - ) -) -print( - "Sparsity in fc1.weight: {:.2f}%".format( - 100. * float(torch.sum(model.fc1.weight == 0)) - / float(model.fc1.weight.nelement()) - ) -) -print( - "Sparsity in fc2.weight: {:.2f}%".format( - 100. * float(torch.sum(model.fc2.weight == 0)) - / float(model.fc2.weight.nelement()) - ) -) -print( - "Sparsity in fc3.weight: {:.2f}%".format( - 100. * float(torch.sum(model.fc3.weight == 0)) - / float(model.fc3.weight.nelement()) - ) -) -print( - "Global sparsity: {:.2f}%".format( - 100. * float( - torch.sum(model.conv1.weight == 0) - + torch.sum(model.conv2.weight == 0) - + torch.sum(model.fc1.weight == 0) - + torch.sum(model.fc2.weight == 0) - + torch.sum(model.fc3.weight == 0) - ) - / float( - model.conv1.weight.nelement() - + model.conv2.weight.nelement() - + model.fc1.weight.nelement() - + model.fc2.weight.nelement() - + model.fc3.weight.nelement() - ) - ) -) - - -###################################################################### -# Extending ``torch.nn.utils.prune`` with custom pruning functions -# ------------------------------------------------------------------ -# To implement your own pruning function, you can extend the -# ``nn.utils.prune`` module by subclassing the ``BasePruningMethod`` -# base class, the same way all other pruning methods do. The base class -# implements the following methods for you: ``__call__``, ``apply_mask``, -# ``apply``, ``prune``, and ``remove``. Beyond some special cases, you shouldn't -# have to reimplement these methods for your new pruning technique. -# You will, however, have to implement ``__init__`` (the constructor), -# and ``compute_mask`` (the instructions on how to compute the mask -# for the given tensor according to the logic of your pruning -# technique). In addition, you will have to specify which type of -# pruning this technique implements (supported options are ``global``, -# ``structured``, and ``unstructured``). This is needed to determine -# how to combine masks in the case in which pruning is applied -# iteratively. In other words, when pruning a prepruned parameter, -# the current pruning technique is expected to act on the unpruned -# portion of the parameter. Specifying the ``PRUNING_TYPE`` will -# enable the ``PruningContainer`` (which handles the iterative -# application of pruning masks) to correctly identify the slice of the -# parameter to prune. -# -# Let's assume, for example, that you want to implement a pruning -# technique that prunes every other entry in a tensor (or -- if the -# tensor has previously been pruned -- in the remaining unpruned -# portion of the tensor). 
This will be of ``PRUNING_TYPE='unstructured'`` -# because it acts on individual connections in a layer and not on entire -# units/channels (``'structured'``), or across different parameters -# (``'global'``). - -class FooBarPruningMethod(prune.BasePruningMethod): - """Prune every other entry in a tensor - """ - PRUNING_TYPE = 'unstructured' - - def compute_mask(self, t, default_mask): - mask = default_mask.clone() - mask.view(-1)[::2] = 0 - return mask - -###################################################################### -# Now, to apply this to a parameter in an ``nn.Module``, you should -# also provide a simple function that instantiates the method and -# applies it. -def foobar_unstructured(module, name): - """Prunes tensor corresponding to parameter called `name` in `module` - by removing every other entry in the tensors. - Modifies module in place (and also return the modified module) - by: - 1) adding a named buffer called `name+'_mask'` corresponding to the - binary mask applied to the parameter `name` by the pruning method. - The parameter `name` is replaced by its pruned version, while the - original (unpruned) parameter is stored in a new parameter named - `name+'_orig'`. - - Args: - module (nn.Module): module containing the tensor to prune - name (string): parameter name within `module` on which pruning - will act. - - Returns: - module (nn.Module): modified (i.e. pruned) version of the input - module - - Examples: - >>> m = nn.Linear(3, 4) - >>> foobar_unstructured(m, name='bias') - """ - FooBarPruningMethod.apply(module, name) - return module - -###################################################################### -# Let's try it out! -model = LeNet() -foobar_unstructured(model.fc3, name='bias') - -print(model.fc3.bias_mask) diff --git a/intermediate_source/quantized_transfer_learning_tutorial.rst b/intermediate_source/quantized_transfer_learning_tutorial.rst deleted file mode 100644 index 9ba5e92d19..0000000000 --- a/intermediate_source/quantized_transfer_learning_tutorial.rst +++ /dev/null @@ -1,516 +0,0 @@ -(beta) Quantized Transfer Learning for Computer Vision Tutorial -======================================================================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. - This will allow you to experiment with the information presented below. - -**Author**: `Zafar Takhirov `_ - -**Reviewed by**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Jessica Lin `_ - -This tutorial builds on the original `PyTorch Transfer Learning `_ -tutorial, written by `Sasank Chilamkurthy `_. - -Transfer learning refers to techniques that make use of a pretrained model for -application on a different data-set. -There are two main ways the transfer learning is used: - -1. **ConvNet as a fixed feature extractor**: Here, you `“freeze” `_ - the weights of all the parameters in the network except that of the final - several layers (aka “the head”, usually fully connected layers). - These last layers are replaced with new ones initialized with random - weights and only these layers are trained. -2. **Finetuning the ConvNet**: Instead of random initializaion, the model is - initialized using a pretrained network, after which the training proceeds as - usual but with a different dataset. - Usually the head (or part of it) is also replaced in the network in - case there is a different number of outputs. - It is common in this method to set the learning rate to a smaller number. 
- This is done because the network is already trained, and only minor changes - are required to "finetune" it to a new dataset. - -You can also combine the above two methods: -First you can freeze the feature extractor, and train the head. After -that, you can unfreeze the feature extractor (or part of it), set the -learning rate to something smaller, and continue training. - -In this part you will use the first method – extracting the features -using a quantized model. - - -Part 0. Prerequisites ---------------------- - -Before diving into the transfer learning, let us review the "prerequisites", -such as installations and data loading/visualizations. - -.. code:: python - - # Imports - import copy - import matplotlib.pyplot as plt - import numpy as np - import os - import time - - plt.ion() - -Installing the Nightly Build -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Because you will be using the beta parts of the PyTorch, it is -recommended to install the latest version of ``torch`` and -``torchvision``. You can find the most recent instructions on local -installation `here `_. -For example, to install without GPU support: - -.. code:: shell - - pip install numpy - pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # For CUDA support use https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html - - -Load Data -~~~~~~~~~ - -.. note :: This section is identical to the original transfer learning tutorial. -We will use ``torchvision`` and ``torch.utils.data`` packages to load -the data. - -The problem you are going to solve today is classifying **ants** and -**bees** from images. The dataset contains about 120 training images -each for ants and bees. There are 75 validation images for each class. -This is considered a very small dataset to generalize on. However, since -we are using transfer learning, we should be able to generalize -reasonably well. - -*This dataset is a very small subset of imagenet.* - -.. note :: Download the data from `here `_ - and extract it to the ``data`` directory. - -.. code:: python - - import torch - from torchvision import transforms, datasets - - # Data augmentation and normalization for training - # Just normalization for validation - data_transforms = { - 'train': transforms.Compose([ - transforms.Resize(224), - transforms.RandomCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]), - 'val': transforms.Compose([ - transforms.Resize(224), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]), - } - - data_dir = 'data/hymenoptera_data' - image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), - data_transforms[x]) - for x in ['train', 'val']} - dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16, - shuffle=True, num_workers=8) - for x in ['train', 'val']} - dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']} - class_names = image_datasets['train'].classes - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - -Visualize a few images -~~~~~~~~~~~~~~~~~~~~~~ - -Let’s visualize a few training images so as to understand the data -augmentations. - -.. 
code:: python - - import torchvision - - def imshow(inp, title=None, ax=None, figsize=(5, 5)): - """Imshow for Tensor.""" - inp = inp.numpy().transpose((1, 2, 0)) - mean = np.array([0.485, 0.456, 0.406]) - std = np.array([0.229, 0.224, 0.225]) - inp = std * inp + mean - inp = np.clip(inp, 0, 1) - if ax is None: - fig, ax = plt.subplots(1, figsize=figsize) - ax.imshow(inp) - ax.set_xticks([]) - ax.set_yticks([]) - if title is not None: - ax.set_title(title) - - # Get a batch of training data - inputs, classes = next(iter(dataloaders['train'])) - - # Make a grid from batch - out = torchvision.utils.make_grid(inputs, nrow=4) - - fig, ax = plt.subplots(1, figsize=(10, 10)) - imshow(out, title=[class_names[x] for x in classes], ax=ax) - - -Support Function for Model Training -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Below is a generic function for model training. -This function also - -- Schedules the learning rate -- Saves the best model - -.. code:: python - - def train_model(model, criterion, optimizer, scheduler, num_epochs=25, device='cpu'): - """ - Support function for model training. - - Args: - model: Model to be trained - criterion: Optimization criterion (loss) - optimizer: Optimizer to use for training - scheduler: Instance of ``torch.optim.lr_scheduler`` - num_epochs: Number of epochs - device: Device to run the training on. Must be 'cpu' or 'cuda' - """ - since = time.time() - - best_model_wts = copy.deepcopy(model.state_dict()) - best_acc = 0.0 - - for epoch in range(num_epochs): - print('Epoch {}/{}'.format(epoch, num_epochs - 1)) - print('-' * 10) - - # Each epoch has a training and validation phase - for phase in ['train', 'val']: - if phase == 'train': - model.train() # Set model to training mode - else: - model.eval() # Set model to evaluate mode - - running_loss = 0.0 - running_corrects = 0 - - # Iterate over data. - for inputs, labels in dataloaders[phase]: - inputs = inputs.to(device) - labels = labels.to(device) - - # zero the parameter gradients - optimizer.zero_grad() - - # forward - # track history if only in train - with torch.set_grad_enabled(phase == 'train'): - outputs = model(inputs) - _, preds = torch.max(outputs, 1) - loss = criterion(outputs, labels) - - # backward + optimize only if in training phase - if phase == 'train': - loss.backward() - optimizer.step() - - # statistics - running_loss += loss.item() * inputs.size(0) - running_corrects += torch.sum(preds == labels.data) - if phase == 'train': - scheduler.step() - - epoch_loss = running_loss / dataset_sizes[phase] - epoch_acc = running_corrects.double() / dataset_sizes[phase] - - print('{} Loss: {:.4f} Acc: {:.4f}'.format( - phase, epoch_loss, epoch_acc)) - - # deep copy the model - if phase == 'val' and epoch_acc > best_acc: - best_acc = epoch_acc - best_model_wts = copy.deepcopy(model.state_dict()) - - print() - - time_elapsed = time.time() - since - print('Training complete in {:.0f}m {:.0f}s'.format( - time_elapsed // 60, time_elapsed % 60)) - print('Best val Acc: {:4f}'.format(best_acc)) - - # load best model weights - model.load_state_dict(best_model_wts) - return model - - -Support Function for Visualizing the Model Predictions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Generic function to display predictions for a few images - -.. 
code:: python - - def visualize_model(model, rows=3, cols=3): - was_training = model.training - model.eval() - current_row = current_col = 0 - fig, ax = plt.subplots(rows, cols, figsize=(cols*2, rows*2)) - - with torch.no_grad(): - for idx, (imgs, lbls) in enumerate(dataloaders['val']): - imgs = imgs.cpu() - lbls = lbls.cpu() - - outputs = model(imgs) - _, preds = torch.max(outputs, 1) - - for jdx in range(imgs.size()[0]): - imshow(imgs.data[jdx], ax=ax[current_row, current_col]) - ax[current_row, current_col].axis('off') - ax[current_row, current_col].set_title('predicted: {}'.format(class_names[preds[jdx]])) - - current_col += 1 - if current_col >= cols: - current_row += 1 - current_col = 0 - if current_row >= rows: - model.train(mode=was_training) - return - model.train(mode=was_training) - - -Part 1. Training a Custom Classifier based on a Quantized Feature Extractor ---------------------------------------------------------------------------- - -In this section you will use a “frozen” quantized feature extractor, and -train a custom classifier head on top of it. Unlike floating point -models, you don’t need to set requires_grad=False for the quantized -model, as it has no trainable parameters. Please, refer to the -`documentation `_ for -more details. - -Load a pretrained model: for this exercise you will be using -`ResNet-18 `_. - -.. code:: python - - import torchvision.models.quantization as models - - # You will need the number of filters in the `fc` for future use. - # Here the size of each output sample is set to 2. - # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)). - model_fe = models.resnet18(pretrained=True, progress=True, quantize=True) - num_ftrs = model_fe.fc.in_features - - -At this point you need to modify the pretrained model. The model -has the quantize/dequantize blocks in the beginning and the end. However, -because you will only use the feature extractor, the dequantization layer has -to move right before the linear layer (the head). The easiest way to do that -is to wrap the model in the ``nn.Sequential`` module. - -The first step is to isolate the feature extractor in the ResNet -model. Although in this example you are tasked to use all layers except -``fc`` as the feature extractor, in reality, you can take as many parts -as you need. This would be useful in case you would like to replace some -of the convolutional layers as well. - - -.. note:: When separating the feature extractor from the rest of a quantized - model, you have to manually place the quantizer/dequantized in the - beginning and the end of the parts you want to keep quantized. - -The function below creates a model with a custom head. - -.. code:: python - - from torch import nn - - def create_combined_model(model_fe): - # Step 1. Isolate the feature extractor. - model_fe_features = nn.Sequential( - model_fe.quant, # Quantize the input - model_fe.conv1, - model_fe.bn1, - model_fe.relu, - model_fe.maxpool, - model_fe.layer1, - model_fe.layer2, - model_fe.layer3, - model_fe.layer4, - model_fe.avgpool, - model_fe.dequant, # Dequantize the output - ) - - # Step 2. Create a new "head" - new_head = nn.Sequential( - nn.Dropout(p=0.5), - nn.Linear(num_ftrs, 2), - ) - - # Step 3. Combine, and don't forget the quant stubs. - new_model = nn.Sequential( - model_fe_features, - nn.Flatten(1), - new_head, - ) - return new_model - -.. warning:: Currently the quantized models can only be run on CPU. - However, it is possible to send the non-quantized parts of the model to a GPU. - -.. 
code:: python - - import torch.optim as optim - new_model = create_combined_model(model_fe) - new_model = new_model.to('cpu') - - criterion = nn.CrossEntropyLoss() - - # Note that we are only training the head. - optimizer_ft = optim.SGD(new_model.parameters(), lr=0.01, momentum=0.9) - - # Decay LR by a factor of 0.1 every 7 epochs - exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1) - - -Train and evaluate -~~~~~~~~~~~~~~~~~~ - -This step takes around 15-25 min on CPU. Because the quantized model can -only run on the CPU, you cannot run the training on GPU. - -.. code:: python - - new_model = train_model(new_model, criterion, optimizer_ft, exp_lr_scheduler, - num_epochs=25, device='cpu') - - visualize_model(new_model) - plt.tight_layout() - - -Part 2. Finetuning the Quantizable Model ----------------------------------------- - -In this part, we fine tune the feature extractor used for transfer -learning, and quantize the feature extractor. Note that in both part 1 -and 2, the feature extractor is quantized. The difference is that in -part 1, we use a pretrained quantized model. In this part, we create a -quantized feature extractor after fine tuning on the data-set of -interest, so this is a way to get better accuracy with transfer learning -while having the benefits of quantization. Note that in our specific -example, the training set is really small (120 images) so the benefits -of fine tuning the entire model is not apparent. However, the procedure -shown here will improve accuracy for transfer learning with larger -datasets. - -The pretrained feature extractor must be quantizable. -To make sure it is quantizable, perform the following steps: - - 1. Fuse ``(Conv, BN, ReLU)``, ``(Conv, BN)``, and ``(Conv, ReLU)`` using - ``torch.quantization.fuse_modules``. - 2. Connect the feature extractor with a custom head. - This requires dequantizing the output of the feature extractor. - 3. Insert fake-quantization modules at appropriate locations - in the feature extractor to mimic quantization during training. - -For step (1), we use models from ``torchvision/models/quantization``, which -have a member method ``fuse_model``. This function fuses all the ``conv``, -``bn``, and ``relu`` modules. For custom models, this would require calling -the ``torch.quantization.fuse_modules`` API with the list of modules to fuse -manually. - -Step (2) is performed by the ``create_combined_model`` function -used in the previous section. - -Step (3) is achieved by using ``torch.quantization.prepare_qat``, which -inserts fake-quantization modules. - - -As step (4), you can start "finetuning" the model, and after that convert -it to a fully quantized version (Step 5). - -To convert the fine tuned model into a quantized model you can call the -``torch.quantization.convert`` function (in our case only -the feature extractor is quantized). - -.. note:: Because of the random initialization your results might differ from - the results shown in this tutorial. - -.. 
code:: python - - # notice `quantize=False` - model = models.resnet18(pretrained=True, progress=True, quantize=False) - num_ftrs = model.fc.in_features - - # Step 1 - model.train() - model.fuse_model() - # Step 2 - model_ft = create_combined_model(model) - model_ft[0].qconfig = torch.quantization.default_qat_qconfig # Use default QAT configuration - # Step 3 - model_ft = torch.quantization.prepare_qat(model_ft, inplace=True) - - -Finetuning the model -~~~~~~~~~~~~~~~~~~~~ - -In the current tutorial the whole model is fine tuned. In -general, this will lead to higher accuracy. However, due to the small -training set used here, we end up overfitting to the training set. - - -Step 4. Fine tune the model - -.. code:: python - - for param in model_ft.parameters(): - param.requires_grad = True - - model_ft.to(device) # We can fine-tune on GPU if available - - criterion = nn.CrossEntropyLoss() - - # Note that we are training everything, so the learning rate is lower - # Notice the smaller learning rate - optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-3, momentum=0.9, weight_decay=0.1) - - # Decay LR by a factor of 0.3 every several epochs - exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.3) - - model_ft_tuned = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, - num_epochs=25, device=device) - -Step 5. Convert to quantized model - -.. code:: python - - from torch.quantization import convert - model_ft_tuned.cpu() - - model_quantized_and_trained = convert(model_ft_tuned, inplace=False) - - -Lets see how the quantized model performs on a few images - -.. code:: python - - visualize_model(model_quantized_and_trained) - - plt.ioff() - plt.tight_layout() - plt.show() diff --git a/intermediate_source/realtime_rpi.rst b/intermediate_source/realtime_rpi.rst deleted file mode 100644 index bb1a576a2c..0000000000 --- a/intermediate_source/realtime_rpi.rst +++ /dev/null @@ -1,345 +0,0 @@ -Real Time Inference on Raspberry Pi 4 (30 fps!) -================================================= -**Author**: `Tristan Rice `_ - -PyTorch has out of the box support for Raspberry Pi 4. This tutorial will guide -you on how to setup a Raspberry Pi 4 for running PyTorch and run a MobileNet v2 -classification model in real time (30 fps+) on the CPU. - -This was all tested with Raspberry Pi 4 Model B 4GB but should work with the 2GB -variant as well as on the 3B with reduced performance. - -.. image:: https://user-images.githubusercontent.com/909104/153093710-bc736b6f-69d9-4a50-a3e8-9f2b2c9e04fd.gif - -Prerequisites -~~~~~~~~~~~~~~~~ - -To follow this tutorial you'll need a Raspberry Pi 4, a camera for it and all -the other standard accessories. - -* `Raspberry Pi 4 Model B 2GB+ `_ -* `Raspberry Pi Camera Module `_ -* Heat sinks and Fan (optional but recommended) -* 5V 3A USB-C Power Supply -* SD card (at least 8gb) -* SD card read/writer - - -Raspberry Pi 4 Setup -~~~~~~~~~~~~~~~~~~~~~~~ - -PyTorch only provides pip packages for Arm 64bit (aarch64) so you'll need to install a 64 bit version of the OS on your Raspberry Pi - -You can download the latest arm64 Raspberry Pi OS from https://downloads.raspberrypi.org/raspios_arm64/images/ and install it via rpi-imager. - -**32-bit Raspberry Pi OS will not work.** - -.. image:: https://user-images.githubusercontent.com/909104/152866212-36ce29b1-aba6-4924-8ae6-0a283f1fca14.gif - -Installation will take at least a few minutes depending on your internet speed and sdcard speed. Once it's done it should look like: - -.. 
image:: https://user-images.githubusercontent.com/909104/152867425-c005cff0-5f3f-47f1-922d-e0bbb541cd25.png - -Time to put your sdcard in your Raspberry Pi, connect the camera and boot it up. - -.. image:: https://user-images.githubusercontent.com/909104/152869862-c239c980-b089-4bd5-84eb-0a1e5cf22df2.png - - -Once that boots and you complete the initial setup you'll need to edit the ``/boot/config.txt`` file to enable the camera. - -.. code:: toml - - # This enables the extended features such as the camera. - start_x=1 - - # This needs to be at least 128M for the camera processing, if it's bigger you can just leave it as is. - gpu_mem=128 - - # You need to commment/remove the existing camera_auto_detect line since this causes issues with OpenCV/V4L2 capture. - #camera_auto_detect=1 - -And then reboot. After you reboot the video4linux2 device ``/dev/video0`` should exist. - -Installing PyTorch and OpenCV -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PyTorch and all the other libraries we need have ARM 64-bit/aarch64 variants so you can just install them via pip and have it work like any other Linux system. - -.. code:: shell - - $ pip install torch torchvision torchaudio - $ pip install opencv-python - $ pip install numpy --upgrade - -.. image:: https://user-images.githubusercontent.com/909104/152874260-95a7a8bd-0f9b-438a-9c0b-5b67729e233f.png - - -We can now check that everything installed correctly: - -.. code:: shell - - $ python -c "import torch; print(torch.__version__)" - -.. image:: https://user-images.githubusercontent.com/909104/152874271-d7057c2d-80fd-4761-aed4-df6c8b7aa99f.png - - -Video Capture -~~~~~~~~~~~~~~ - -For video capture we're going to be using OpenCV to stream the video frames -instead of the more common ``picamera``. `picamera` isn't available on 64-bit -Raspberry Pi OS and it's much slower than OpenCV. OpenCV directly accesses the -``/dev/video0`` device to grab frames. - -The model we're using (MobileNetV2) takes in image sizes of ``224x224`` so we -can request that directly from OpenCV at 36fps. We're targeting 30fps for the -model but we request a slightly higher framerate than that so there's always -enough frames. - -.. code:: python - - import cv2 - from PIL import Image - - cap = cv2.VideoCapture(0) - cap.set(cv2.CAP_PROP_FRAME_WIDTH, 224) - cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 224) - cap.set(cv2.CAP_PROP_FPS, 36) - -OpenCV returns a ``numpy`` array in BGR so we need to read and do a bit of -shuffling to get it into the expected RGB format. - -.. code:: python - - ret, image = cap.read() - # convert opencv output from BGR to RGB - image = image[:, :, [2, 1, 0]] - -This data reading and processing takes about ``3.5 ms``. - -Image Preprocessing -~~~~~~~~~~~~~~~~~~~~ - -We need to take the frames and transform them into the format the model expects. This is the same processing as you would do on any machine with the standard torchvision transforms. - -.. code:: python - - from torchvision import transforms - - preprocess = transforms.Compose([ - # convert the frame to a CHW torch tensor for training - transforms.ToTensor(), - # normalize the colors to the range that mobilenet_v2/3 expect - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - input_tensor = preprocess(image) - # The model can handle multiple images simultaneously so we need to add an - # empty dimension for the batch. 
- # [3, 224, 224] -> [1, 3, 224, 224] - input_batch = input_tensor.unsqueeze(0) - -Model Choices -~~~~~~~~~~~~~~~ - -There's a number of models you can choose from to use with different performance -characteristics. Not all models provide a ``qnnpack`` pretrained variant so for -testing purposes you should chose one that does but if you train and quantize -your own model you can use any of them. - -We're using ``mobilenet_v2`` for this tutorial since it has good performance and -accuracy. - -Raspberry Pi 4 Benchmark Results: - -+--------------------+------+-----------------------+-----------------------+--------------------+ -| Model | FPS | Total Time (ms/frame) | Model Time (ms/frame) | qnnpack Pretrained | -+====================+======+=======================+=======================+====================+ -| mobilenet_v2 | 33.7 | 29.7 | 26.4 | True | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| mobilenet_v3_large | 29.3 | 34.1 | 30.7 | True | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| resnet18 | 9.2 | 109.0 | 100.3 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| resnet50 | 4.3 | 233.9 | 225.2 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| resnext101_32x8d | 1.1 | 892.5 | 885.3 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| inception_v3 | 4.9 | 204.1 | 195.5 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| googlenet | 7.4 | 135.3 | 132.0 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| shufflenet_v2_x0_5 | 46.7 | 21.4 | 18.2 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| shufflenet_v2_x1_0 | 24.4 | 41.0 | 37.7 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| shufflenet_v2_x1_5 | 16.8 | 59.6 | 56.3 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ -| shufflenet_v2_x2_0 | 11.6 | 86.3 | 82.7 | False | -+--------------------+------+-----------------------+-----------------------+--------------------+ - -MobileNetV2: Quantization and JIT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For optimal performance we want a model that's quantized and fused. Quantized -means that it does the computation using int8 which is much more performant than -the standard float32 math. Fused means that consecutive operations have been -fused together into a more performant version where possible. Commonly things -like activations (``ReLU``) can be merged into the layer before (``Conv2d``) -during inference. - -The aarch64 version of pytorch requires using the ``qnnpack`` engine. - -.. code:: python - - import torch - torch.backends.quantized.engine = 'qnnpack' - -For this example we'll use a prequantized and fused version of MobileNetV2 that's provided out of the box by torchvision. - -.. code:: python - - from torchvision import models - net = models.quantization.mobilenet_v2(pretrained=True, quantize=True) - -We then want to jit the model to reduce Python overhead and fuse any ops. Jit gives us ~30fps instead of ~20fps without it. - -.. 
code:: python - - net = torch.jit.script(net) - -Putting It Together -~~~~~~~~~~~~~~~~~~~~~~~~~ - -We can now put all the pieces together and run it: - -.. code:: python - - import time - - import torch - import numpy as np - from torchvision import models, transforms - - import cv2 - from PIL import Image - - torch.backends.quantized.engine = 'qnnpack' - - cap = cv2.VideoCapture(0, cv2.CAP_V4L2) - cap.set(cv2.CAP_PROP_FRAME_WIDTH, 224) - cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 224) - cap.set(cv2.CAP_PROP_FPS, 36) - - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - net = models.quantization.mobilenet_v2(pretrained=True, quantize=True) - # jit model to take it from ~20fps to ~30fps - net = torch.jit.script(net) - - started = time.time() - last_logged = time.time() - frame_count = 0 - - with torch.no_grad(): - while True: - # read frame - ret, image = cap.read() - if not ret: - raise RuntimeError("failed to read frame") - - # convert opencv output from BGR to RGB - image = image[:, :, [2, 1, 0]] - permuted = image - - # preprocess - input_tensor = preprocess(image) - - # create a mini-batch as expected by the model - input_batch = input_tensor.unsqueeze(0) - - # run model - output = net(input_batch) - # do something with output ... - - # log model performance - frame_count += 1 - now = time.time() - if now - last_logged > 1: - print(f"{frame_count / (now-last_logged)} fps") - last_logged = now - frame_count = 0 - -Running it shows that we're hovering at ~30 fps. - -.. image:: https://user-images.githubusercontent.com/909104/152892609-7d115705-3ec9-4f8d-beed-a51711503a32.png - -This is with all the default settings in Raspberry Pi OS. If you disabled the UI -and all the other background services that are enabled by default it's more -performant and stable. - -If we check ``htop`` we see that we have almost 100% utilization. - -.. image:: https://user-images.githubusercontent.com/909104/152892630-f094b84b-19ba-48f6-8632-1b954abc59c7.png - -To verify that it's working end to end we can compute the probabilities of the -classes and -`use the ImageNet class labels `_ -to print the detections. - -.. code:: python - - top = list(enumerate(output[0].softmax(dim=0))) - top.sort(key=lambda x: x[1], reverse=True) - for idx, val in top[:10]: - print(f"{val.item()*100:.2f}% {classes[idx]}") - -``mobilenet_v3_large`` running in real time: - -.. image:: https://user-images.githubusercontent.com/909104/153093710-bc736b6f-69d9-4a50-a3e8-9f2b2c9e04fd.gif - - -Detecting an orange: - -.. image:: https://user-images.githubusercontent.com/909104/153092153-d9c08dfe-105b-408a-8e1e-295da8a78c19.jpg - - -Detecting a mug: - -.. image:: https://user-images.githubusercontent.com/909104/153092155-4b90002f-a0f3-4267-8d70-e713e7b4d5a0.jpg - - -Troubleshooting: Performance -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PyTorch by default will use all of the cores available. If you have anything -running in the background on the Raspberry Pi it may cause contention with the -model inference causing latency spikes. To alleviate this you can reduce the -number of threads which will reduce the peak latency at a small performance -penalty. - -.. code:: python - - torch.set_num_threads(2) - -For ``shufflenet_v2_x1_5`` using ``2 threads`` instead of ``4 threads`` -increases best case latency to ``72 ms`` from ``60 ms`` but eliminates the -latency spikes of ``128 ms``. 
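To see how a given thread count behaves on your own setup, a minimal timing sketch (not part of the original tutorial; it reuses the ``net`` and ``input_batch`` objects defined earlier) could look like this:

.. code:: python

    import time

    import torch

    torch.set_num_threads(2)  # try 1, 2, or 4 and compare

    with torch.no_grad():
        # warm up so the first, slower iterations don't skew the numbers
        for _ in range(5):
            net(input_batch)

        timings_ms = []
        for _ in range(50):
            start = time.time()
            net(input_batch)
            timings_ms.append((time.time() - start) * 1000)

    timings_ms.sort()
    print(f"median: {timings_ms[len(timings_ms) // 2]:.1f} ms, worst: {timings_ms[-1]:.1f} ms")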
- -Next Steps -~~~~~~~~~~~~~ - -You can create your own model or fine tune an existing one. If you fine tune on -one of the models from -`torchvision.models.quantized -`_ -most of the work to fuse and quantize has already been done for you so you can -directly deploy with good performance on a Raspberry Pi. - -See more: - -* `Quantization `_ for more information on how to quantize and fuse your model. -* `Transfer Learning Tutorial `_ - for how to use transfer learning to fine tune a pre-existing model to your dataset. diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py deleted file mode 100644 index 30216ff880..0000000000 --- a/intermediate_source/reinforcement_ppo.py +++ /dev/null @@ -1,705 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Reinforcement Learning (PPO) with TorchRL Tutorial -================================================== -**Author**: `Vincent Moens `_ - -This tutorial demonstrates how to use PyTorch and :py:mod:`torchrl` to train a parametric policy -network to solve the Inverted Pendulum task from the `OpenAI-Gym/Farama-Gymnasium -control library `__. - -.. figure:: /_static/img/invpendulum.gif - :alt: Inverted pendulum - - Inverted pendulum - -Key learnings: - -- How to create an environment in TorchRL, transform its outputs, and collect data from this environment; -- How to make your classes talk to each other using :class:`~tensordict.TensorDict`; -- The basics of building your training loop with TorchRL: - - - How to compute the advantage signal for policy gradient methods; - - How to create a stochastic policy using a probabilistic neural network; - - How to create a dynamic replay buffer and sample from it without repetition. - -We will cover six crucial components of TorchRL: - -* `environments `__ -* `transforms `__ -* `models (policy and value function) `__ -* `loss modules `__ -* `data collectors `__ -* `replay buffers `__ - -""" - -###################################################################### -# If you are running this in Google Colab, make sure you install the following dependencies: -# -# .. code-block:: bash -# -# !pip3 install torchrl -# !pip3 install gym[mujoco] -# !pip3 install tqdm -# -# Proximal Policy Optimization (PPO) is a policy-gradient algorithm where a -# batch of data is being collected and directly consumed to train the policy to maximise -# the expected return given some proximality constraints. You can think of it -# as a sophisticated version of `REINFORCE `_, -# the foundational policy-optimization algorithm. For more information, see the -# `Proximal Policy Optimization Algorithms `_ paper. -# -# PPO is usually regarded as a fast and efficient method for online, on-policy -# reinforcement algorithm. TorchRL provides a loss-module that does all the work -# for you, so that you can rely on this implementation and focus on solving your -# problem rather than re-inventing the wheel every time you want to train a policy. -# -# For completeness, here is a brief overview of what the loss computes, even though -# this is taken care of by our :class:`~torchrl.objectives.ClipPPOLoss` module—the algorithm works as follows: -# 1. we will sample a batch of data by playing the -# policy in the environment for a given number of steps. -# 2. Then, we will perform a given number of optimization steps with random sub-samples of this batch using -# a clipped version of the REINFORCE loss. -# 3. 
The clipping will put a pessimistic bound on our loss: lower return estimates will -# be favored compared to higher ones. -# The precise formula of the loss is: -# -# .. math:: -# -# L(s,a,\theta_k,\theta) = \min\left( -# \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)} A^{\pi_{\theta_k}}(s,a), \;\; -# g(\epsilon, A^{\pi_{\theta_k}}(s,a)) -# \right), -# -# There are two components in that loss: in the first part of the minimum operator, -# we simply compute an importance-weighted version of the REINFORCE loss (for example, a -# REINFORCE loss that we have corrected for the fact that the current policy -# configuration lags the one that was used for the data collection). -# The second part of that minimum operator is a similar loss where we have clipped -# the ratios when they exceeded or were below a given pair of thresholds. -# -# This loss ensures that whether the advantage is positive or negative, policy -# updates that would produce significant shifts from the previous configuration -# are being discouraged. -# -# This tutorial is structured as follows: -# -# 1. First, we will define a set of hyperparameters we will be using for training. -# -# 2. Next, we will focus on creating our environment, or simulator, using TorchRL's -# wrappers and transforms. -# -# 3. Next, we will design the policy network and the value model, -# which is indispensable to the loss function. These modules will be used -# to configure our loss module. -# -# 4. Next, we will create the replay buffer and data loader. -# -# 5. Finally, we will run our training loop and analyze the results. -# -# Throughout this tutorial, we'll be using the :mod:`tensordict` library. -# :class:`~tensordict.TensorDict` is the lingua franca of TorchRL: it helps us abstract -# what a module reads and writes and care less about the specific data -# description and more about the algorithm itself. -# - -import warnings -warnings.filterwarnings("ignore") -from torch import multiprocessing - -# sphinx_gallery_start_ignore - -# TorchRL prefers spawn method, that restricts creation of ``~torchrl.envs.ParallelEnv`` inside -# `__main__` method call, but for the easy of reading the code switch to fork -# which is also a default spawn method in Google's Colaboratory -try: - multiprocessing.set_start_method("fork") -except RuntimeError: - pass - -# sphinx_gallery_end_ignore - -from collections import defaultdict - -import matplotlib.pyplot as plt -import torch -from tensordict.nn import TensorDictModule -from tensordict.nn.distributions import NormalParamExtractor -from torch import nn -from torchrl.collectors import SyncDataCollector -from torchrl.data.replay_buffers import ReplayBuffer -from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement -from torchrl.data.replay_buffers.storages import LazyTensorStorage -from torchrl.envs import (Compose, DoubleToFloat, ObservationNorm, StepCounter, - TransformedEnv) -from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type -from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator -from torchrl.objectives import ClipPPOLoss -from torchrl.objectives.value import GAE -from tqdm import tqdm - -###################################################################### -# Define Hyperparameters -# ---------------------- -# -# We set the hyperparameters for our algorithm. Depending on the resources -# available, one may choose to execute the policy on GPU or on another -# device. 
-# The ``frame_skip`` controls how many frames a single -# action is executed for. The rest of the arguments that count frames -# must be corrected for this value (since one environment step will -# actually return ``frame_skip`` frames). -# - -is_fork = multiprocessing.get_start_method() == "fork" -device = ( - torch.device(0) - if torch.cuda.is_available() and not is_fork - else torch.device("cpu") -) -num_cells = 256 # number of cells in each layer, i.e. the output dimension -lr = 3e-4 -max_grad_norm = 1.0 - -###################################################################### -# Data collection parameters -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When collecting data, we will be able to choose how big each batch will be -# by defining a ``frames_per_batch`` parameter. We will also define how many -# frames (that is, the number of interactions with the simulator) we will allow ourselves to -# use. In general, the goal of an RL algorithm is to learn to solve the task -# as fast as it can in terms of environment interactions: the lower the ``total_frames`` -# the better. -# -frames_per_batch = 1000 -# For a complete training, bring the number of frames up to 1M -total_frames = 50_000 - -###################################################################### -# PPO parameters -# ~~~~~~~~~~~~~~ -# -# At each data collection (or batch collection) we will run the optimization -# over a certain number of *epochs*, each time consuming the entire data we just -# acquired in a nested training loop. Here, the ``sub_batch_size`` is different from the -# ``frames_per_batch`` above: recall that we are working with a "batch of data" -# coming from our collector, whose size is defined by ``frames_per_batch``, and that -# we will further split into smaller sub-batches during the inner training loop. -# The size of these sub-batches is controlled by ``sub_batch_size``. -# -sub_batch_size = 64 # cardinality of the sub-samples gathered from the current data in the inner loop -num_epochs = 10 # optimization steps per batch of data collected -clip_epsilon = ( - 0.2 # clip value for PPO loss: see the equation in the intro for more context. -) -gamma = 0.99 -lmbda = 0.95 -entropy_eps = 1e-4 - -###################################################################### -# Define an environment -# --------------------- -# -# In RL, an *environment* is usually the way we refer to a simulator or a -# control system. Various libraries provide simulation environments for reinforcement -# learning, including Gymnasium (previously OpenAI Gym), the DeepMind control suite, and -# many others. -# As a general library, TorchRL's goal is to provide an interchangeable interface -# to a large panel of RL simulators, allowing you to easily swap one environment -# with another. For example, creating a wrapped gym environment can be achieved in a few characters: -# - -base_env = GymEnv("InvertedDoublePendulum-v4", device=device) - -###################################################################### -# There are a few things to notice in this code: first, we created -# the environment by calling the ``GymEnv`` wrapper. If extra keyword arguments -# are passed, they will be transmitted to the ``gym.make`` method, hence covering -# the most common environment construction commands. -# Alternatively, one could also directly create a gym environment using ``gym.make(env_name, **kwargs)`` -# and wrap it in a `GymWrapper` class.
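######################################################################
# .. note::
#    As a hedged illustration of that alternative (not used in the rest of this
#    tutorial), wrapping an environment you have already built yourself would
#    look roughly like this:
#
#    .. code-block:: python
#
#       import gymnasium as gym
#       from torchrl.envs.libs.gym import GymWrapper
#
#       raw_env = gym.make("InvertedDoublePendulum-v4")
#       wrapped_env = GymWrapper(raw_env, device=device)
#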
-# -# Note also the ``device`` argument: for gym, this only controls the device where -# input action and observed states will be stored, but the execution will always -# be done on CPU. The reason for this is simply that gym does not support on-device -# execution, unless specified otherwise. For other libraries, we have control over -# the execution device and, as much as we can, we try to stay consistent in terms of -# storing and execution backends. -# -# Transforms -# ~~~~~~~~~~ -# -# We will append some transforms to our environments to prepare the data for -# the policy. In Gym, this is usually achieved via wrappers. TorchRL takes a different -# approach, more similar to other PyTorch domain libraries, through the use of transforms. -# To add transforms to an environment, one should simply wrap it in a :class:`~torchrl.envs.transforms.TransformedEnv` -# instance and append the sequence of transforms to it. The transformed environment will inherit -# the device and meta-data of the wrapped environment, and transform these depending on the sequence -# of transforms it contains. -# -# Normalization -# ~~~~~~~~~~~~~ -# -# The first transform to add is a normalization transform. -# As a rule of thumb, it is preferable to have data that loosely -# matches a unit Gaussian distribution: to obtain this, we will -# run a certain number of random steps in the environment and compute -# the summary statistics of these observations. -# -# We'll append two other transforms: the :class:`~torchrl.envs.transforms.DoubleToFloat` transform will -# convert double entries to single-precision numbers, ready to be read by the -# policy. The :class:`~torchrl.envs.transforms.StepCounter` transform will be used to count the steps before -# the environment is terminated. We will use this count as a supplementary measure -# of performance. -# -# As we will see later, many of TorchRL's classes rely on :class:`~tensordict.TensorDict` -# to communicate. You could think of it as a Python dictionary with some extra -# tensor features. In practice, this means that many modules we will be working -# with need to be told what key to read (``in_keys``) and what key to write -# (``out_keys``) in the ``tensordict`` they will receive. Usually, if ``out_keys`` -# is omitted, it is assumed that the ``in_keys`` entries will be updated -# in-place. For our transforms, the only entry we are interested in is referred -# to as ``"observation"`` and our transform layers will be told to modify this -# entry and this entry only: -# - -env = TransformedEnv( - base_env, - Compose( - # normalize observations - ObservationNorm(in_keys=["observation"]), - DoubleToFloat(), - StepCounter(), - ), -) - -###################################################################### -# As you may have noticed, we have created a normalization layer but we did not -# set its normalization parameters. To do this, :class:`~torchrl.envs.transforms.ObservationNorm` can -# automatically gather the summary statistics of our environment: -# -env.transform[0].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0) - -###################################################################### -# The :class:`~torchrl.envs.transforms.ObservationNorm` transform has now been populated with a -# location and a scale that will be used to normalize the data.
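######################################################################
# .. note::
#    As an optional, hedged sanity check (the exact numbers depend on the
#    random actions drawn), you can verify that observations coming out of the
#    transformed environment are now roughly standardized:
#
#    .. code-block:: python
#
#       obs = env.rollout(100)["observation"]
#       print("mean:", obs.mean(dim=0))  # expected to be roughly 0
#       print("std: ", obs.std(dim=0))   # expected to be roughly 1
#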
-# -# Let us do a little sanity check for the shape of our summary stats: -# -print("normalization constant shape:", env.transform[0].loc.shape) - -###################################################################### -# An environment is not only defined by its simulator and transforms, but also -# by a series of metadata that describe what can be expected during its -# execution. -# For efficiency purposes, TorchRL is quite stringent when it comes to -# environment specs, but you can easily check that your environment specs are -# adequate. -# In our example, the :class:`~torchrl.envs.libs.gym.GymWrapper` and -# :class:`~torchrl.envs.libs.gym.GymEnv` that inherits -# from it already take care of setting the proper specs for your environment so -# you should not have to care about this. -# -# Nevertheless, let's see a concrete example using our transformed -# environment by looking at its specs. -# There are three specs to look at: ``observation_spec`` which defines what -# is to be expected when executing an action in the environment, -# ``reward_spec`` which indicates the reward domain and finally the -# ``input_spec`` (which contains the ``action_spec``) and which represents -# everything an environment requires to execute a single step. -# -print("observation_spec:", env.observation_spec) -print("reward_spec:", env.reward_spec) -print("input_spec:", env.input_spec) -print("action_spec (as defined by input_spec):", env.action_spec) - -###################################################################### -# the :func:`check_env_specs` function runs a small rollout and compares its output against the environment -# specs. If no error is raised, we can be confident that the specs are properly defined: -# -check_env_specs(env) - -###################################################################### -# For fun, let's see what a simple random rollout looks like. You can -# call `env.rollout(n_steps)` and get an overview of what the environment inputs -# and outputs look like. Actions will automatically be drawn from the action spec -# domain, so you don't need to care about designing a random sampler. -# -# Typically, at each step, an RL environment receives an -# action as input, and outputs an observation, a reward and a done state. The -# observation may be composite, meaning that it could be composed of more than one -# tensor. This is not a problem for TorchRL, since the whole set of observations -# is automatically packed in the output :class:`~tensordict.TensorDict`. After executing a rollout -# (for example, a sequence of environment steps and random action generations) over a given -# number of steps, we will retrieve a :class:`~tensordict.TensorDict` instance with a shape -# that matches this trajectory length: -# -rollout = env.rollout(3) -print("rollout of three steps:", rollout) -print("Shape of the rollout TensorDict:", rollout.batch_size) - -###################################################################### -# Our rollout data has a shape of ``torch.Size([3])``, which matches the number of steps -# we ran it for. The ``"next"`` entry points to the data coming after the current step. -# In most cases, the ``"next"`` data at time `t` matches the data at ``t+1``, but this -# may not be the case if we are using some specific transformations (for example, multi-step). -# -# Policy -# ------ -# -# PPO utilizes a stochastic policy to handle exploration. 
This means that our -# neural network will have to output the parameters of a distribution, rather -# than a single value corresponding to the action taken. -# -# As the data is continuous, we use a Tanh-Normal distribution to respect the -# action space boundaries. TorchRL provides such distribution, and the only -# thing we need to care about is to build a neural network that outputs the -# right number of parameters for the policy to work with (a location, or mean, -# and a scale): -# -# .. math:: -# -# f_{\theta}(\text{observation}) = \mu_{\theta}(\text{observation}), \sigma^{+}_{\theta}(\text{observation}) -# -# The only extra-difficulty that is brought up here is to split our output in two -# equal parts and map the second to a strictly positive space. -# -# We design the policy in three steps: -# -# 1. Define a neural network ``D_obs`` -> ``2 * D_action``. Indeed, our ``loc`` (mu) and ``scale`` (sigma) both have dimension ``D_action``. -# -# 2. Append a :class:`~tensordict.nn.distributions.NormalParamExtractor` to extract a location and a scale (for example, splits the input in two equal parts and applies a positive transformation to the scale parameter). -# -# 3. Create a probabilistic :class:`~tensordict.nn.TensorDictModule` that can generate this distribution and sample from it. -# - -actor_net = nn.Sequential( - nn.LazyLinear(num_cells, device=device), - nn.Tanh(), - nn.LazyLinear(num_cells, device=device), - nn.Tanh(), - nn.LazyLinear(num_cells, device=device), - nn.Tanh(), - nn.LazyLinear(2 * env.action_spec.shape[-1], device=device), - NormalParamExtractor(), -) - -###################################################################### -# To enable the policy to "talk" with the environment through the ``tensordict`` -# data carrier, we wrap the ``nn.Module`` in a :class:`~tensordict.nn.TensorDictModule`. This -# class will simply ready the ``in_keys`` it is provided with and write the -# outputs in-place at the registered ``out_keys``. -# -policy_module = TensorDictModule( - actor_net, in_keys=["observation"], out_keys=["loc", "scale"] -) - -###################################################################### -# We now need to build a distribution out of the location and scale of our -# normal distribution. To do so, we instruct the -# :class:`~torchrl.modules.tensordict_module.ProbabilisticActor` -# class to build a :class:`~torchrl.modules.TanhNormal` out of the location and scale -# parameters. We also provide the minimum and maximum values of this -# distribution, which we gather from the environment specs. -# -# The name of the ``in_keys`` (and hence the name of the ``out_keys`` from -# the :class:`~tensordict.nn.TensorDictModule` above) cannot be set to any value one may -# like, as the :class:`~torchrl.modules.TanhNormal` distribution constructor will expect the -# ``loc`` and ``scale`` keyword arguments. That being said, -# :class:`~torchrl.modules.tensordict_module.ProbabilisticActor` also accepts -# ``Dict[str, str]`` typed ``in_keys`` where the key-value pair indicates -# what ``in_key`` string should be used for every keyword argument that is to be used. 
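######################################################################
# .. note::
#    As a hedged illustration of that dictionary form (the code below sticks to
#    the plain list form), one could write something like
#    ``in_keys={"loc": "loc", "scale": "scale"}``, where each pair ties a
#    distribution keyword argument to a ``tensordict`` entry; here the two names
#    coincide, so the list form is simpler. Check the
#    :class:`~torchrl.modules.tensordict_module.ProbabilisticActor` documentation
#    for the exact convention when the names differ.
#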
-# -policy_module = ProbabilisticActor( - module=policy_module, - spec=env.action_spec, - in_keys=["loc", "scale"], - distribution_class=TanhNormal, - distribution_kwargs={ - "min": env.action_spec.space.low, - "max": env.action_spec.space.high, - }, - return_log_prob=True, - # we'll need the log-prob for the numerator of the importance weights -) - -###################################################################### -# Value network -# ------------- -# -# The value network is a crucial component of the PPO algorithm, even though it -# won't be used at inference time. This module will read the observations and -# return an estimation of the discounted return for the following trajectory. -# This allows us to amortize learning by relying on some utility estimation -# that is learned on-the-fly during training. Our value network shares the same -# structure as the policy, but for simplicity we assign it its own set of -# parameters. -# -value_net = nn.Sequential( - nn.LazyLinear(num_cells, device=device), - nn.Tanh(), - nn.LazyLinear(num_cells, device=device), - nn.Tanh(), - nn.LazyLinear(num_cells, device=device), - nn.Tanh(), - nn.LazyLinear(1, device=device), -) - -value_module = ValueOperator( - module=value_net, - in_keys=["observation"], -) - -###################################################################### -# Let's try our policy and value modules. As we said earlier, the usage of -# :class:`~tensordict.nn.TensorDictModule` makes it possible to directly read the output -# of the environment to run these modules, as they know what information to read -# and where to write it: -# -print("Running policy:", policy_module(env.reset())) -print("Running value:", value_module(env.reset())) - -###################################################################### -# Data collector -# -------------- -# -# TorchRL provides a set of `DataCollector classes `__. -# Briefly, these classes execute three operations: reset an environment, -# compute an action given the latest observation, execute a step in the environment, -# and repeat the last two steps until the environment signals a stop (or reaches -# a done state). -# -# They allow you to control how many frames to collect at each iteration -# (through the ``frames_per_batch`` parameter), -# when to reset the environment (through the ``max_frames_per_traj`` argument), -# on which ``device`` the policy should be executed, etc. They are also -# designed to work efficiently with batched and multiprocessed environments. -# -# The simplest data collector is the :class:`~torchrl.collectors.collectors.SyncDataCollector`: -# it is an iterator that you can use to get batches of data of a given length, and -# that will stop once a total number of frames (``total_frames``) have been -# collected. -# Other data collectors (:class:`~torchrl.collectors.collectors.MultiSyncDataCollector` and -# :class:`~torchrl.collectors.collectors.MultiaSyncDataCollector`) will execute -# the same operations in a synchronous and an asynchronous manner, respectively, over a -# set of multiprocessed workers. -# -# As for the policy and environment before, the data collector will return -# :class:`~tensordict.TensorDict` instances with a total number of elements that will -# match ``frames_per_batch``. Using :class:`~tensordict.TensorDict` to pass data to the -# training loop allows you to write data loading pipelines -# that are completely oblivious to the actual content of the rollout.
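######################################################################
# .. note::
#    Once the collector is instantiated (just below), consuming it is plain
#    iteration; a minimal, hedged sketch:
#
#    .. code-block:: python
#
#       for batch in collector:              # each ``batch`` is a TensorDict
#           assert batch.numel() == frames_per_batch
#           break                            # the real training loop comes later
#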
-# -collector = SyncDataCollector( - env, - policy_module, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - split_trajs=False, - device=device, -) - -###################################################################### -# Replay buffer -# ------------- -# -# Replay buffers are a common building piece of off-policy RL algorithms. -# In on-policy contexts, a replay buffer is refilled every time a batch of -# data is collected, and its data is repeatedly consumed for a certain number -# of epochs. -# -# TorchRL's replay buffers are built using a common container -# :class:`~torchrl.data.ReplayBuffer` which takes as argument the components -# of the buffer: a storage, a writer, a sampler and possibly some transforms. -# Only the storage (which indicates the replay buffer capacity) is mandatory. -# We also specify a sampler without repetition to avoid sampling multiple times -# the same item in one epoch. -# Using a replay buffer for PPO is not mandatory and we could simply -# sample the sub-batches from the collected batch, but using these classes -# make it easy for us to build the inner training loop in a reproducible way. -# - -replay_buffer = ReplayBuffer( - storage=LazyTensorStorage(max_size=frames_per_batch), - sampler=SamplerWithoutReplacement(), -) - -###################################################################### -# Loss function -# ------------- -# -# The PPO loss can be directly imported from TorchRL for convenience using the -# :class:`~torchrl.objectives.ClipPPOLoss` class. This is the easiest way of utilizing PPO: -# it hides away the mathematical operations of PPO and the control flow that -# goes with it. -# -# PPO requires some "advantage estimation" to be computed. In short, an advantage -# is a value that reflects an expectancy over the return value while dealing with -# the bias / variance tradeoff. -# To compute the advantage, one just needs to (1) build the advantage module, which -# utilizes our value operator, and (2) pass each batch of data through it before each -# epoch. -# The GAE module will update the input ``tensordict`` with new ``"advantage"`` and -# ``"value_target"`` entries. -# The ``"value_target"`` is a gradient-free tensor that represents the empirical -# value that the value network should represent with the input observation. -# Both of these will be used by :class:`~torchrl.objectives.ClipPPOLoss` to -# return the policy and value losses. -# - -advantage_module = GAE( - gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True -) - -loss_module = ClipPPOLoss( - actor_network=policy_module, - critic_network=value_module, - clip_epsilon=clip_epsilon, - entropy_bonus=bool(entropy_eps), - entropy_coef=entropy_eps, - # these keys match by default but we set this for completeness - critic_coef=1.0, - loss_critic_type="smooth_l1", -) - -optim = torch.optim.Adam(loss_module.parameters(), lr) -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optim, total_frames // frames_per_batch, 0.0 -) - -###################################################################### -# Training loop -# ------------- -# We now have all the pieces needed to code our training loop. 
-# The steps include: -# -# * Collect data -# -# * Compute advantage -# -# * Loop over the collected to compute loss values -# * Back propagate -# * Optimize -# * Repeat -# -# * Repeat -# -# * Repeat -# - - -logs = defaultdict(list) -pbar = tqdm(total=total_frames) -eval_str = "" - -# We iterate over the collector until it reaches the total number of frames it was -# designed to collect: -for i, tensordict_data in enumerate(collector): - # we now have a batch of data to work with. Let's learn something from it. - for _ in range(num_epochs): - # We'll need an "advantage" signal to make PPO work. - # We re-compute it at each epoch as its value depends on the value - # network which is updated in the inner loop. - advantage_module(tensordict_data) - data_view = tensordict_data.reshape(-1) - replay_buffer.extend(data_view.cpu()) - for _ in range(frames_per_batch // sub_batch_size): - subdata = replay_buffer.sample(sub_batch_size) - loss_vals = loss_module(subdata.to(device)) - loss_value = ( - loss_vals["loss_objective"] - + loss_vals["loss_critic"] - + loss_vals["loss_entropy"] - ) - - # Optimization: backward, grad clipping and optimization step - loss_value.backward() - # this is not strictly mandatory but it's good practice to keep - # your gradient norm bounded - torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm) - optim.step() - optim.zero_grad() - - logs["reward"].append(tensordict_data["next", "reward"].mean().item()) - pbar.update(tensordict_data.numel()) - cum_reward_str = ( - f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})" - ) - logs["step_count"].append(tensordict_data["step_count"].max().item()) - stepcount_str = f"step count (max): {logs['step_count'][-1]}" - logs["lr"].append(optim.param_groups[0]["lr"]) - lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}" - if i % 10 == 0: - # We evaluate the policy once every 10 batches of data. - # Evaluation is rather simple: execute the policy without exploration - # (take the expected value of the action distribution) for a given - # number of steps (1000, which is our ``env`` horizon). - # The ``rollout`` method of the ``env`` can take a policy as argument: - # it will then execute this policy at each step. - with set_exploration_type(ExplorationType.MEAN), torch.no_grad(): - # execute a rollout with the trained policy - eval_rollout = env.rollout(1000, policy_module) - logs["eval reward"].append(eval_rollout["next", "reward"].mean().item()) - logs["eval reward (sum)"].append( - eval_rollout["next", "reward"].sum().item() - ) - logs["eval step_count"].append(eval_rollout["step_count"].max().item()) - eval_str = ( - f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} " - f"(init: {logs['eval reward (sum)'][0]: 4.4f}), " - f"eval step-count: {logs['eval step_count'][-1]}" - ) - del eval_rollout - pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str])) - - # We're also using a learning rate scheduler. Like the gradient clipping, - # this is a nice-to-have but nothing necessary for PPO to work. - scheduler.step() - -###################################################################### -# Results -# ------- -# -# Before the 1M step cap is reached, the algorithm should have reached a max -# step count of 1000 steps, which is the maximum number of steps before the -# trajectory is truncated. 
-# -plt.figure(figsize=(10, 10)) -plt.subplot(2, 2, 1) -plt.plot(logs["reward"]) -plt.title("training rewards (average)") -plt.subplot(2, 2, 2) -plt.plot(logs["step_count"]) -plt.title("Max step count (training)") -plt.subplot(2, 2, 3) -plt.plot(logs["eval reward (sum)"]) -plt.title("Return (test)") -plt.subplot(2, 2, 4) -plt.plot(logs["eval step_count"]) -plt.title("Max step count (test)") -plt.show() - -###################################################################### -# Conclusion and next steps -# ------------------------- -# -# In this tutorial, we have learned: -# -# 1. How to create and customize an environment with :py:mod:`torchrl`; -# 2. How to write a model and a loss function; -# 3. How to set up a typical training loop. -# -# If you want to experiment with this tutorial a bit more, you can apply the following modifications: -# -# * From an efficiency perspective, -# we could run several simulations in parallel to speed up data collection. -# Check :class:`~torchrl.envs.ParallelEnv` for further information. -# -# * From a logging perspective, one could add a :class:`torchrl.record.VideoRecorder` transform to -# the environment after asking for rendering to get a visual rendering of the -# inverted pendulum in action. Check :py:mod:`torchrl.record` to -# know more. -# diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py deleted file mode 100644 index 0ae3ea9a90..0000000000 --- a/intermediate_source/reinforcement_q_learning.py +++ /dev/null @@ -1,464 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Reinforcement Learning (DQN) Tutorial -===================================== -**Author**: `Adam Paszke `_ - `Mark Towers `_ - - -This tutorial shows how to use PyTorch to train a Deep Q Learning (DQN) agent -on the CartPole-v1 task from `Gymnasium `__. - -You might find it helpful to read the original `Deep Q Learning (DQN) `__ paper - -**Task** - -The agent has to decide between two actions - moving the cart left or -right - so that the pole attached to it stays upright. You can find more -information about the environment and other more challenging environments at -`Gymnasium's website `__. - -.. figure:: /_static/img/cartpole.gif - :alt: CartPole - - CartPole - -As the agent observes the current state of the environment and chooses -an action, the environment *transitions* to a new state, and also -returns a reward that indicates the consequences of the action. In this -task, rewards are +1 for every incremental timestep and the environment -terminates if the pole falls over too far or the cart moves more than 2.4 -units away from center. This means better performing scenarios will run -for longer duration, accumulating larger return. - -The CartPole task is designed so that the inputs to the agent are 4 real -values representing the environment state (position, velocity, etc.). -We take these 4 inputs without any scaling and pass them through a -small fully-connected network with 2 outputs, one for each action. -The network is trained to predict the expected value for each action, -given the input state. The action with the highest expected value is -then chosen. - - -**Packages** - - -First, let's import needed packages. Firstly, we need -`gymnasium `__ for the environment, -installed by using `pip`. This is a fork of the original OpenAI -Gym project and maintained by the same team since Gym v0.19. -If you are running this in Google Colab, run: - -.. 
code-block:: bash - - %%bash - pip3 install gymnasium[classic_control] - -We'll also use the following from PyTorch: - -- neural networks (``torch.nn``) -- optimization (``torch.optim``) -- automatic differentiation (``torch.autograd``) - -""" - -import gymnasium as gym -import math -import random -import matplotlib -import matplotlib.pyplot as plt -from collections import namedtuple, deque -from itertools import count - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -env = gym.make("CartPole-v1") - -# set up matplotlib -is_ipython = 'inline' in matplotlib.get_backend() -if is_ipython: - from IPython import display - -plt.ion() - -# if GPU is to be used -device = torch.device( - "cuda" if torch.cuda.is_available() else - "mps" if torch.backends.mps.is_available() else - "cpu" -) - - -###################################################################### -# Replay Memory -# ------------- -# -# We'll be using experience replay memory for training our DQN. It stores -# the transitions that the agent observes, allowing us to reuse this data -# later. By sampling from it randomly, the transitions that build up a -# batch are decorrelated. It has been shown that this greatly stabilizes -# and improves the DQN training procedure. -# -# For this, we're going to need two classes: -# -# - ``Transition`` - a named tuple representing a single transition in -# our environment. It essentially maps (state, action) pairs -# to their (next_state, reward) result, with the state being the -# screen difference image as described later on. -# - ``ReplayMemory`` - a cyclic buffer of bounded size that holds the -# transitions observed recently. It also implements a ``.sample()`` -# method for selecting a random batch of transitions for training. -# - -Transition = namedtuple('Transition', - ('state', 'action', 'next_state', 'reward')) - - -class ReplayMemory(object): - - def __init__(self, capacity): - self.memory = deque([], maxlen=capacity) - - def push(self, *args): - """Save a transition""" - self.memory.append(Transition(*args)) - - def sample(self, batch_size): - return random.sample(self.memory, batch_size) - - def __len__(self): - return len(self.memory) - - -###################################################################### -# Now, let's define our model. But first, let's quickly recap what a DQN is. -# -# DQN algorithm -# ------------- -# -# Our environment is deterministic, so all equations presented here are -# also formulated deterministically for the sake of simplicity. In the -# reinforcement learning literature, they would also contain expectations -# over stochastic transitions in the environment. -# -# Our aim will be to train a policy that tries to maximize the discounted, -# cumulative reward -# :math:`R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t`, where -# :math:`R_{t_0}` is also known as the *return*. The discount, -# :math:`\gamma`, should be a constant between :math:`0` and :math:`1` -# that ensures the sum converges. A lower :math:`\gamma` makes -# rewards from the uncertain far future less important for our agent -# than the ones in the near future that it can be fairly confident -# about. It also encourages agents to collect reward closer in time -# than equivalent rewards that are temporally far away in the future. 
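######################################################################
# .. note::
#    As a small worked example (the numbers are arbitrary), a reward of +1 at
#    every step with :math:`\gamma = 0.99` is worth at most
#    :math:`\sum_{t=0}^{\infty} 0.99^t = 100` in return, and a finite episode
#    gets close to that limit fairly quickly:
#
#    .. code-block:: python
#
#       gamma = 0.99
#       rewards = [1.0] * 500            # a 500-step CartPole episode
#       ret = 0.0
#       for r in reversed(rewards):      # R_t = r_t + gamma * R_{t+1}
#           ret = r + gamma * ret
#       print(ret)                       # ~99.3, close to the 1 / (1 - gamma) = 100 limit
#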
-# -# The main idea behind Q-learning is that if we had a function -# :math:`Q^*: State \times Action \rightarrow \mathbb{R}`, that could tell -# us what our return would be, if we were to take an action in a given -# state, then we could easily construct a policy that maximizes our -# rewards: -# -# .. math:: \pi^*(s) = \arg\!\max_a \ Q^*(s, a) -# -# However, we don't know everything about the world, so we don't have -# access to :math:`Q^*`. But, since neural networks are universal function -# approximators, we can simply create one and train it to resemble -# :math:`Q^*`. -# -# For our training update rule, we'll use a fact that every :math:`Q` -# function for some policy obeys the Bellman equation: -# -# .. math:: Q^{\pi}(s, a) = r + \gamma Q^{\pi}(s', \pi(s')) -# -# The difference between the two sides of the equality is known as the -# temporal difference error, :math:`\delta`: -# -# .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) -# -# To minimize this error, we will use the `Huber -# loss `__. The Huber loss acts -# like the mean squared error when the error is small, but like the mean -# absolute error when the error is large - this makes it more robust to -# outliers when the estimates of :math:`Q` are very noisy. We calculate -# this over a batch of transitions, :math:`B`, sampled from the replay -# memory: -# -# .. math:: -# -# \mathcal{L} = \frac{1}{|B|}\sum_{(s, a, s', r) \ \in \ B} \mathcal{L}(\delta) -# -# .. math:: -# -# \text{where} \quad \mathcal{L}(\delta) = \begin{cases} -# \frac{1}{2}{\delta^2} & \text{for } |\delta| \le 1, \\ -# |\delta| - \frac{1}{2} & \text{otherwise.} -# \end{cases} -# -# Q-network -# ^^^^^^^^^ -# -# Our model will be a feed forward neural network that takes in the -# difference between the current and previous screen patches. It has two -# outputs, representing :math:`Q(s, \mathrm{left})` and -# :math:`Q(s, \mathrm{right})` (where :math:`s` is the input to the -# network). In effect, the network is trying to predict the *expected return* of -# taking each action given the current input. -# - -class DQN(nn.Module): - - def __init__(self, n_observations, n_actions): - super(DQN, self).__init__() - self.layer1 = nn.Linear(n_observations, 128) - self.layer2 = nn.Linear(128, 128) - self.layer3 = nn.Linear(128, n_actions) - - # Called with either one element to determine next action, or a batch - # during optimization. Returns tensor([[left0exp,right0exp]...]). - def forward(self, x): - x = F.relu(self.layer1(x)) - x = F.relu(self.layer2(x)) - return self.layer3(x) - - -###################################################################### -# Training -# -------- -# -# Hyperparameters and utilities -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# This cell instantiates our model and its optimizer, and defines some -# utilities: -# -# - ``select_action`` - will select an action according to an epsilon -# greedy policy. Simply put, we'll sometimes use our model for choosing -# the action, and sometimes we'll just sample one uniformly. The -# probability of choosing a random action will start at ``EPS_START`` -# and will decay exponentially towards ``EPS_END``. ``EPS_DECAY`` -# controls the rate of the decay. -# - ``plot_durations`` - a helper for plotting the duration of episodes, -# along with an average over the last 100 episodes (the measure used in -# the official evaluations). The plot will be underneath the cell -# containing the main training loop, and will update after every -# episode. 
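######################################################################
# .. note::
#    To get a feel for the epsilon-greedy schedule used below, here is a small,
#    hedged sketch that evaluates the same decay formula at a few arbitrary
#    values of ``steps_done``:
#
#    .. code-block:: python
#
#       import math
#       EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 1000
#       for steps_done in (0, 1000, 3000):
#           eps = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
#           print(steps_done, round(eps, 3))   # 0.9, ~0.363, ~0.092
#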
-# - -# BATCH_SIZE is the number of transitions sampled from the replay buffer -# GAMMA is the discount factor as mentioned in the previous section -# EPS_START is the starting value of epsilon -# EPS_END is the final value of epsilon -# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay -# TAU is the update rate of the target network -# LR is the learning rate of the ``AdamW`` optimizer -BATCH_SIZE = 128 -GAMMA = 0.99 -EPS_START = 0.9 -EPS_END = 0.05 -EPS_DECAY = 1000 -TAU = 0.005 -LR = 1e-4 - -# Get number of actions from gym action space -n_actions = env.action_space.n -# Get the number of state observations -state, info = env.reset() -n_observations = len(state) - -policy_net = DQN(n_observations, n_actions).to(device) -target_net = DQN(n_observations, n_actions).to(device) -target_net.load_state_dict(policy_net.state_dict()) - -optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True) -memory = ReplayMemory(10000) - - -steps_done = 0 - - -def select_action(state): - global steps_done - sample = random.random() - eps_threshold = EPS_END + (EPS_START - EPS_END) * \ - math.exp(-1. * steps_done / EPS_DECAY) - steps_done += 1 - if sample > eps_threshold: - with torch.no_grad(): - # t.max(1) will return the largest column value of each row. - # second column on max result is index of where max element was - # found, so we pick action with the larger expected reward. - return policy_net(state).max(1).indices.view(1, 1) - else: - return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long) - - -episode_durations = [] - - -def plot_durations(show_result=False): - plt.figure(1) - durations_t = torch.tensor(episode_durations, dtype=torch.float) - if show_result: - plt.title('Result') - else: - plt.clf() - plt.title('Training...') - plt.xlabel('Episode') - plt.ylabel('Duration') - plt.plot(durations_t.numpy()) - # Take 100 episode averages and plot them too - if len(durations_t) >= 100: - means = durations_t.unfold(0, 100, 1).mean(1).view(-1) - means = torch.cat((torch.zeros(99), means)) - plt.plot(means.numpy()) - - plt.pause(0.001) # pause a bit so that plots are updated - if is_ipython: - if not show_result: - display.display(plt.gcf()) - display.clear_output(wait=True) - else: - display.display(plt.gcf()) - - -###################################################################### -# Training loop -# ^^^^^^^^^^^^^ -# -# Finally, the code for training our model. -# -# Here, you can find an ``optimize_model`` function that performs a -# single step of the optimization. It first samples a batch, concatenates -# all the tensors into a single one, computes :math:`Q(s_t, a_t)` and -# :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our -# loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal -# state. We also use a target network to compute :math:`V(s_{t+1})` for -# added stability. The target network is updated at every step with a -# `soft update `__ controlled by -# the hyperparameter ``TAU``, which was previously defined. -# - -def optimize_model(): - if len(memory) < BATCH_SIZE: - return - transitions = memory.sample(BATCH_SIZE) - # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for - # detailed explanation). This converts batch-array of Transitions - # to Transition of batch-arrays. 
- batch = Transition(*zip(*transitions)) - - # Compute a mask of non-final states and concatenate the batch elements - # (a final state would've been the one after which simulation ended) - non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, - batch.next_state)), device=device, dtype=torch.bool) - non_final_next_states = torch.cat([s for s in batch.next_state - if s is not None]) - state_batch = torch.cat(batch.state) - action_batch = torch.cat(batch.action) - reward_batch = torch.cat(batch.reward) - - # Compute Q(s_t, a) - the model computes Q(s_t), then we select the - # columns of actions taken. These are the actions which would've been taken - # for each batch state according to policy_net - state_action_values = policy_net(state_batch).gather(1, action_batch) - - # Compute V(s_{t+1}) for all next states. - # Expected values of actions for non_final_next_states are computed based - # on the "older" target_net; selecting their best reward with max(1).values - # This is merged based on the mask, such that we'll have either the expected - # state value or 0 in case the state was final. - next_state_values = torch.zeros(BATCH_SIZE, device=device) - with torch.no_grad(): - next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values - # Compute the expected Q values - expected_state_action_values = (next_state_values * GAMMA) + reward_batch - - # Compute Huber loss - criterion = nn.SmoothL1Loss() - loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1)) - - # Optimize the model - optimizer.zero_grad() - loss.backward() - # In-place gradient clipping - torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100) - optimizer.step() - - -###################################################################### -# -# Below, you can find the main training loop. At the beginning we reset -# the environment and obtain the initial ``state`` Tensor. Then, we sample -# an action, execute it, observe the next state and the reward (always -# 1), and optimize our model once. When the episode ends (our model -# fails), we restart the loop. -# -# Below, `num_episodes` is set to 600 if a GPU is available, otherwise 50 -# episodes are scheduled so training does not take too long. However, 50 -# episodes is insufficient for to observe good performance on CartPole. -# You should see the model constantly achieve 500 steps within 600 training -# episodes. Training RL agents can be a noisy process, so restarting training -# can produce better results if convergence is not observed. 
-# - -if torch.cuda.is_available() or torch.backends.mps.is_available(): - num_episodes = 600 -else: - num_episodes = 50 - -for i_episode in range(num_episodes): - # Initialize the environment and get its state - state, info = env.reset() - state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) - for t in count(): - action = select_action(state) - observation, reward, terminated, truncated, _ = env.step(action.item()) - reward = torch.tensor([reward], device=device) - done = terminated or truncated - - if terminated: - next_state = None - else: - next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0) - - # Store the transition in memory - memory.push(state, action, next_state, reward) - - # Move to the next state - state = next_state - - # Perform one step of the optimization (on the policy network) - optimize_model() - - # Soft update of the target network's weights - # θ′ ← τ θ + (1 −τ )θ′ - target_net_state_dict = target_net.state_dict() - policy_net_state_dict = policy_net.state_dict() - for key in policy_net_state_dict: - target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU) - target_net.load_state_dict(target_net_state_dict) - - if done: - episode_durations.append(t + 1) - plot_durations() - break - -print('Complete') -plot_durations(show_result=True) -plt.ioff() -plt.show() - -###################################################################### -# Here is the diagram that illustrates the overall resulting data flow. -# -# .. figure:: /_static/img/reinforcement_learning_diagram.jpg -# -# Actions are chosen either randomly or based on a policy, getting the next -# step sample from the gym environment. We record the results in the -# replay memory and also run optimization step on every iteration. -# Optimization picks a random batch from the replay memory to do training of the -# new policy. The "older" target_net is also used in optimization to compute the -# expected Q values. A soft update of its weights are performed at every step. -# diff --git a/intermediate_source/rpc_async_execution.rst b/intermediate_source/rpc_async_execution.rst deleted file mode 100644 index cf4716179e..0000000000 --- a/intermediate_source/rpc_async_execution.rst +++ /dev/null @@ -1,524 +0,0 @@ -Implementing Batch RPC Processing Using Asynchronous Executions -=============================================================== -**Author**: `Shen Li `_ - -.. note:: - |edit| View and edit this tutorial in `github `__. - -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ -- `Getting started with Distributed RPC Framework `__ -- `Implementing a Parameter Server using Distributed RPC Framework `__ -- `RPC Asynchronous Execution Decorator `__ - -This tutorial demonstrates how to build batch-processing RPC applications with -the `@rpc.functions.async_execution `__ -decorator, which helps to speed up training by reducing the number of blocked -RPC threads and consolidating CUDA operations on the callee. This shares the -same idea as `Batch Inference with TorchServe `__. - -.. note:: This tutorial requires PyTorch v1.6.0 or above. - -Basics ------- - -Previous tutorials have shown the steps to build distributed training -applications using `torch.distributed.rpc `__, -but they didn't elaborate on what happens on the callee side when processing an -RPC request. 
As of PyTorch v1.5, each RPC request will block one thread on the -callee to execute the function in that request until that function returns. -This works for many use cases, but there is one caveat. If the user function -blocks on IO, e.g., with nested RPC invocation, or signaling, e.g., waiting for -a different RPC request to unblock, the RPC thread on the callee will have to -idle waiting until the IO finishes or the signaling event occurs. As a result, -RPC callees are likely to use more threads than necessary. The cause of this -problem is that RPC treats user functions as black boxes, and knows very little -about what happens in the function. To allow user functions to yield and free -RPC threads, more hints need to be provided to the RPC system. - -Since v1.6.0, PyTorch addresses this problem by introducing two new concepts: - -* A `torch.futures.Future `__ type - that encapsulates an asynchronous execution, which also supports installing - callback functions. -* An `@rpc.functions.async_execution `__ - decorator that allows applications to tell the callee that the target function - will return a future and can pause and yield multiple times during execution. - -With these two tools, the application code can break a user function into -multiple smaller functions, chain them together as callbacks on ``Future`` -objects, and return the ``Future`` that contains the final result. On the callee -side, when getting the ``Future`` object, it installs subsequent RPC response -preparation and communication as callbacks as well, which will be triggered -when the final result is ready. In this way, the callee no longer needs to block -one thread and wait until the final return value is ready. Please refer to the -API doc of -`@rpc.functions.async_execution `__ -for simple examples. - -Besides reducing the number of idle threads on the callee, these tools also help -to make batch RPC processing easier and faster. The following two sections of -this tutorial demonstrate how to build distributed batch-updating parameter -server and batch-processing reinforcement learning applications using the -`@rpc.functions.async_execution `__ -decorator. - -Batch-Updating Parameter Server -------------------------------- - -Consider a synchronized parameter server training application with one parameter -server (PS) and multiple trainers. In this application, the PS holds the -parameters and waits for all trainers to report gradients. In every iteration, -it waits until receiving gradients from all trainers and then updates all -parameters in one shot. The code below shows the implementation of the PS class. -The ``update_and_fetch_model`` method is decorated using -``@rpc.functions.async_execution`` and will be called by trainers. Each -invocation returns a ``Future`` object that will be populated with the updated -model. Invocations launched by most trainers just accumulate gradients to the -``.grad`` field, return immediately, and yield the RPC thread on the PS. The -last arriving trainer will trigger the optimizer step and consume all previously -reported gradients. Then it sets the ``future_model`` with the updated model, -which in turn notifies all previous requests from other trainers through the -``Future`` object and sends out the updated model to all trainers. - -.. 
code:: python - - import threading - import torchvision - import torch - import torch.distributed.rpc as rpc - from torch import optim - - num_classes, batch_update_size = 30, 5 - - class BatchUpdateParameterServer(object): - def __init__(self, batch_update_size=batch_update_size): - self.model = torchvision.models.resnet50(num_classes=num_classes) - self.lock = threading.Lock() - self.future_model = torch.futures.Future() - self.batch_update_size = batch_update_size - self.curr_update_size = 0 - self.optimizer = optim.SGD(self.model.parameters(), lr=0.001, momentum=0.9) - for p in self.model.parameters(): - p.grad = torch.zeros_like(p) - - def get_model(self): - return self.model - - @staticmethod - @rpc.functions.async_execution - def update_and_fetch_model(ps_rref, grads): - # Using the RRef to retrieve the local PS instance - self = ps_rref.local_value() - with self.lock: - self.curr_update_size += 1 - # accumulate gradients into .grad field - for p, g in zip(self.model.parameters(), grads): - p.grad += g - - # Save the current future_model and return it to make sure the - # returned Future object holds the correct model even if another - # thread modifies future_model before this thread returns. - fut = self.future_model - - if self.curr_update_size >= self.batch_update_size: - # update the model - for p in self.model.parameters(): - p.grad /= self.batch_update_size - self.curr_update_size = 0 - self.optimizer.step() - self.optimizer.zero_grad() - # by settiing the result on the Future object, all previous - # requests expecting this updated model will be notified and - # the their responses will be sent accordingly. - fut.set_result(self.model) - self.future_model = torch.futures.Future() - - return fut - -For the trainers, they are all initialized using the same set of -parameters from the PS. In every iteration, each trainer first runs the forward -and the backward passes to generate gradients locally. Then, each trainer -reports its gradients to the PS using RPC, and fetches back the updated -parameters through the return value of the same RPC request. In the trainer's -implementation, whether the target function is marked with -``@rpc.functions.async_execution`` or not makes no difference. The -trainer simply calls ``update_and_fetch_model`` using ``rpc_sync`` which will -block on the trainer until the updated model is returned. - -.. code:: python - - batch_size, image_w, image_h = 20, 64, 64 - - class Trainer(object): - def __init__(self, ps_rref): - self.ps_rref, self.loss_fn = ps_rref, torch.nn.MSELoss() - self.one_hot_indices = torch.LongTensor(batch_size) \ - .random_(0, num_classes) \ - .view(batch_size, 1) - - def get_next_batch(self): - for _ in range(6): - inputs = torch.randn(batch_size, 3, image_w, image_h) - labels = torch.zeros(batch_size, num_classes) \ - .scatter_(1, self.one_hot_indices, 1) - yield inputs.cuda(), labels.cuda() - - def train(self): - name = rpc.get_worker_info().name - # get initial model parameters - m = self.ps_rref.rpc_sync().get_model().cuda() - # start training - for inputs, labels in self.get_next_batch(): - self.loss_fn(m(inputs), labels).backward() - m = rpc.rpc_sync( - self.ps_rref.owner(), - BatchUpdateParameterServer.update_and_fetch_model, - args=(self.ps_rref, [p.grad for p in m.cpu().parameters()]), - ).cuda() - -We skip the code that launches multiple processes in this tutorial and please -refer to the `examples `__ -repo for the full implementation. 
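For orientation, a minimal launcher for this example might look roughly like the sketch below. This is a hedged outline rather than the official example code: the worker names, the ``run`` helper, and the assumption that rank 0 hosts the parameter server are illustrative choices.

.. code:: python

    import os

    import torch.distributed.rpc as rpc
    import torch.multiprocessing as mp

    def run(rank, world_size):
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "29500"
        if rank == 0:
            # rank 0 hosts the parameter server; trainers drive it via RPC
            rpc.init_rpc("ps", rank=rank, world_size=world_size)
        else:
            rpc.init_rpc(f"trainer{rank}", rank=rank, world_size=world_size)
            # for example: build a Trainer around an RRef to the PS and call train()
        rpc.shutdown()  # blocks until all outstanding RPC work is done

    if __name__ == "__main__":
        world_size = batch_update_size + 1  # one PS plus the trainers
        mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)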
Note that, it is possible to implement batch -processing without the -`@rpc.functions.async_execution `__ -decorator. However, that would require either blocking more RPC threads on -the PS or use another round of RPC to fetch updated models, where the latter -would add both more code complexity and more communication overhead. - -This section uses a simple parameter sever training example to show how to -implement batch RPC applications using the -`@rpc.functions.async_execution `__ -decorator. In the next section, we re-implement the reinforcement learning -example in the previous -`Getting started with Distributed RPC Framework `__ -tutorial using batch processing, and demonstrate its impact on the training -speed. - -Batch-Processing CartPole Solver --------------------------------- - -This section uses CartPole-v1 from `OpenAI Gym `__ as -an example to show the performance impact of batch processing RPC. Please note -that since the goal is to demonstrate the usage of -`@rpc.functions.async_execution `__ -instead of building the best CartPole solver or solving most different RL -problems, we use very simple policies and reward calculation strategies and -focus on the multi-observer single-agent batch RPC implementation. We use a -similar ``Policy`` model as the previous tutorial which is shown below. Compared -to the previous tutorial, the difference is that its constructor takes an -additional ``batch`` argument which controls the ``dim`` parameter for -``F.softmax`` because with batching, the ``x`` argument in the ``forward`` -function contains states from multiple observers and hence the dimension needs -to change properly. Everything else stays intact. - -.. code:: python - - import argparse - import torch.nn as nn - import torch.nn.functional as F - - parser = argparse.ArgumentParser(description='PyTorch RPC Batch RL example') - parser.add_argument('--gamma', type=float, default=1.0, metavar='G', - help='discount factor (default: 1.0)') - parser.add_argument('--seed', type=int, default=543, metavar='N', - help='random seed (default: 543)') - parser.add_argument('--num-episode', type=int, default=10, metavar='E', - help='number of episodes (default: 10)') - args = parser.parse_args() - - torch.manual_seed(args.seed) - - class Policy(nn.Module): - def __init__(self, batch=True): - super(Policy, self).__init__() - self.affine1 = nn.Linear(4, 128) - self.dropout = nn.Dropout(p=0.6) - self.affine2 = nn.Linear(128, 2) - self.dim = 2 if batch else 1 - - def forward(self, x): - x = self.affine1(x) - x = self.dropout(x) - x = F.relu(x) - action_scores = self.affine2(x) - return F.softmax(action_scores, dim=self.dim) - - -The constructor of the ``Observer`` adjusts accordingly as well. It also takes a -``batch`` argument, which governs which ``Agent`` function it uses to select -actions. In batch mode, it calls ``select_action_batch`` function on ``Agent`` -which will be presented shortly, and this function will be decorated with -`@rpc.functions.async_execution `__. - - -.. code:: python - - import gym - import torch.distributed.rpc as rpc - - class Observer: - def __init__(self, batch=True): - self.id = rpc.get_worker_info().id - 1 - self.env = gym.make('CartPole-v1') - self.env.seed(args.seed) - self.select_action = Agent.select_action_batch if batch else Agent.select_action - -Compared to the previous tutorial -`Getting started with Distributed RPC Framework `__, -observers behave a little differently. 
Instead of exiting when the environment -is stopped, it always runs ``n_steps`` iterations in every episode. When the -environment signals the end of an episode, the observer simply resets the environment and starts over -again. With this design, the agent will receive a fixed number of states from -every observer and hence can pack them into a fixed-size tensor. In every -step, the ``Observer`` uses RPC to send its state to the ``Agent`` and fetches -the action through the return value. At the end of every episode, it returns the -rewards of all steps to ``Agent``. Note that this ``run_episode`` function will -be called by the ``Agent`` using RPC. So the ``rpc_sync`` call in this function -will be a nested RPC invocation. We could mark this function as ``@rpc.functions.async_execution`` -too to avoid blocking one thread on the ``Observer``. However, as the bottleneck -is the ``Agent`` instead of the ``Observer``, it should be OK to block one -thread on the ``Observer`` process. - - -.. code:: python - - import torch - - class Observer: - ... - - def run_episode(self, agent_rref, n_steps): - state, ep_reward = self.env.reset(), NUM_STEPS - rewards = torch.zeros(n_steps) - start_step = 0 - for step in range(n_steps): - state = torch.from_numpy(state).float().unsqueeze(0) - # send the state to the agent to get an action - action = rpc.rpc_sync( - agent_rref.owner(), - self.select_action, - args=(agent_rref, self.id, state) - ) - - # apply the action to the environment, and get the reward - state, reward, done, _ = self.env.step(action) - rewards[step] = reward - - if done or step + 1 >= n_steps: - curr_rewards = rewards[start_step:(step + 1)] - R = 0 - for i in range(curr_rewards.numel() -1, -1, -1): - R = curr_rewards[i] + args.gamma * R - curr_rewards[i] = R - state = self.env.reset() - if start_step == 0: - ep_reward = min(ep_reward, step - start_step + 1) - start_step = step + 1 - - return [rewards, ep_reward] - -The constructor of the ``Agent`` also takes a ``batch`` argument, which controls -how action probs are batched. In batch mode, the ``saved_log_probs`` contains a -list of tensors, where each tensor contains action probs from all observers in -one step. Without batching, the ``saved_log_probs`` is a dictionary where the -key is the observer id and the value is a list of action probs for that -observer. - -.. code:: python - - import threading - from torch.distributed.rpc import RRef - - class Agent: - def __init__(self, world_size, batch=True): - self.ob_rrefs = [] - self.agent_rref = RRef(self) - self.rewards = {} - self.policy = Policy(batch).cuda() - self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2) - self.running_reward = 0 - - for ob_rank in range(1, world_size): - ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank)) - self.ob_rrefs.append(rpc.remote(ob_info, Observer, args=(batch,))) - self.rewards[ob_info.id] = [] - - self.states = torch.zeros(len(self.ob_rrefs), 1, 4) - self.batch = batch - self.saved_log_probs = [] if batch else {k:[] for k in range(len(self.ob_rrefs))} - self.future_actions = torch.futures.Future() - self.lock = threading.Lock() - self.pending_states = len(self.ob_rrefs) - -The non-batching ``select_action`` simply runs the state through the policy, saves -the action prob, and returns the action to the observer right away. - -.. code:: python - - from torch.distributions import Categorical - - class Agent: - ...
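        # Non-batching path: called over RPC by one observer at a time. It runs the
        # policy on that single state and replies with the sampled action right away.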
- - @staticmethod - def select_action(agent_rref, ob_id, state): - self = agent_rref.local_value() - probs = self.policy(state.cuda()) - m = Categorical(probs) - action = m.sample() - self.saved_log_probs[ob_id].append(m.log_prob(action)) - return action.item() - -With batching, the state is stored in a 2D tensor ``self.states``, using the -observer id as the row id. Then, it chains a ``Future`` by installing a callback -function to the batch-generated ``self.future_actions`` ``Future`` object, which -will be populated with the specific row indexed using the id of that observer. -The last arriving observer runs all batched states through the policy in one -shot and set ``self.future_actions`` accordingly. When this occurs, all the -callback functions installed on ``self.future_actions`` will be triggered and -their return values will be used to populate the chained ``Future`` object, -which in turn notifies the ``Agent`` to prepare and communicate responses for -all previous RPC requests from other observers. - -.. code:: python - - class Agent: - ... - - @staticmethod - @rpc.functions.async_execution - def select_action_batch(agent_rref, ob_id, state): - self = agent_rref.local_value() - self.states[ob_id].copy_(state) - future_action = self.future_actions.then( - lambda future_actions: future_actions.wait()[ob_id].item() - ) - - with self.lock: - self.pending_states -= 1 - if self.pending_states == 0: - self.pending_states = len(self.ob_rrefs) - probs = self.policy(self.states.cuda()) - m = Categorical(probs) - actions = m.sample() - self.saved_log_probs.append(m.log_prob(actions).t()[0]) - future_actions = self.future_actions - self.future_actions = torch.futures.Future() - future_actions.set_result(actions.cpu()) - return future_action - -Now let's define how different RPC functions are stitched together. The ``Agent`` -controls the execution of every episode. It first uses ``rpc_async`` to kick off -the episode on all observers and block on the returned futures which will be -populated with observer rewards. Note that the code below uses the RRef helper -``ob_rref.rpc_async()`` to launch the ``run_episode`` function on the owner -of the ``ob_rref`` RRef with the provided arguments. -It then converts the saved action probs and returned observer rewards into -expected data format, and launch the training step. Finally, it resets all -states and returns the reward of the current episode. This function is the entry -point to run one episode. - -.. code:: python - - class Agent: - ... 
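        # Entry point for one episode: kicks off run_episode on every observer,
        # gathers their rewards, performs a single policy-gradient update, and
        # finally resets the per-episode buffers.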
- - def run_episode(self, n_steps=0): - futs = [] - for ob_rref in self.ob_rrefs: - # make async RPC to kick off an episode on all observers - futs.append(ob_rref.rpc_async().run_episode(self.agent_rref, n_steps)) - - # wait until all obervers have finished this episode - rets = torch.futures.wait_all(futs) - rewards = torch.stack([ret[0] for ret in rets]).cuda().t() - ep_rewards = sum([ret[1] for ret in rets]) / len(rets) - - # stack saved probs into one tensor - if self.batch: - probs = torch.stack(self.saved_log_probs) - else: - probs = [torch.stack(self.saved_log_probs[i]) for i in range(len(rets))] - probs = torch.stack(probs) - - policy_loss = -probs * rewards / len(rets) - policy_loss.sum().backward() - self.optimizer.step() - self.optimizer.zero_grad() - - # reset variables - self.saved_log_probs = [] if self.batch else {k:[] for k in range(len(self.ob_rrefs))} - self.states = torch.zeros(len(self.ob_rrefs), 1, 4) - - # calculate running rewards - self.running_reward = 0.5 * ep_rewards + 0.5 * self.running_reward - return ep_rewards, self.running_reward - -The rest of the code is normal processes launching and logging which are -similar to other RPC tutorials. In this tutorial, all observers passively -waiting for commands from the agent. Please refer to the -`examples `__ -repo for the full implementation. - -.. code:: python - - def run_worker(rank, world_size, n_episode, batch, print_log=True): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - if rank == 0: - # rank0 is the agent - rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size) - - agent = Agent(world_size, batch) - for i_episode in range(n_episode): - last_reward, running_reward = agent.run_episode(n_steps=NUM_STEPS) - - if print_log: - print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( - i_episode, last_reward, running_reward)) - else: - # other ranks are the observer - rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size) - # observers passively waiting for instructions from agents - rpc.shutdown() - - - def main(): - for world_size in range(2, 12): - delays = [] - for batch in [True, False]: - tik = time.time() - mp.spawn( - run_worker, - args=(world_size, args.num_episode, batch), - nprocs=world_size, - join=True - ) - tok = time.time() - delays.append(tok - tik) - - print(f"{world_size}, {delays[0]}, {delays[1]}") - - - if __name__ == '__main__': - main() - -Batch RPC helps to consolidate the action inference into less CUDA operations, -and hence reduces the amortized overhead. The above ``main`` function runs the -same code on both batch and no-batch modes using different numbers of observers, -ranging from 1 to 10. The figure below plots the execution time of different -world sizes using default argument values. The results confirmed our expectation -that batch processing helped to speed up training. - - -.. figure:: /_static/img/rpc-images/batch.png - :alt: - -Learn More ----------- - -- `Batch-Updating Parameter Server Source Code `__ -- `Batch-Processing CartPole Solver `__ -- `Distributed Autograd `__ diff --git a/intermediate_source/rpc_param_server_tutorial.rst b/intermediate_source/rpc_param_server_tutorial.rst deleted file mode 100644 index 324331646c..0000000000 --- a/intermediate_source/rpc_param_server_tutorial.rst +++ /dev/null @@ -1,386 +0,0 @@ - -Implementing a Parameter Server Using Distributed RPC Framework -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -**Author**\ : `Rohan Varma `_ - -.. 
note:: - |edit| View and edit this tutorial in `github `__. - -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ -- `RPC API documents `__ - -This tutorial walks through a simple example of implementing a parameter server using PyTorch's `Distributed RPC framework `_. The parameter server framework is a paradigm in which a set of servers store parameters, such as large embedding tables, and several trainers query the parameter servers in order to retrieve the most up to date parameters. These trainers can run a training loop locally and occasionally synchronize with the parameter server to get the latest parameters. For more reading on the parameter server approach, check out `this paper `_. - -Using the Distributed RPC Framework, we'll build an example where multiple trainers use RPC to communicate with the same parameter server and use `RRef `_ to access states on the remote parameter server instance. Each trainer will launch its dedicated backward pass in a distributed fashion through stitching of the autograd graph across multiple nodes using distributed autograd. - -**Note**\ : This tutorial covers the use of the Distributed RPC Framework, which is useful for splitting a model onto multiple machines, or for implementing a parameter-server training strategy where network trainers fetch parameters hosted on a different machine. If instead you are looking for replicating your model across many GPUs, please see the `Distributed Data Parallel tutorial `_. There is also another `RPC tutorial `_ that covers reinforcement learning and RNN use cases. - -Let's start with the familiar: importing our required modules and defining a simple ConvNet that will train on the MNIST dataset. The below network is largely adopted from the network defined in the `pytorch/examples repo `_. - -.. 
code-block:: python - - import argparse - import os - import time - from threading import Lock - - import torch - import torch.distributed.autograd as dist_autograd - import torch.distributed.rpc as rpc - import torch.multiprocessing as mp - import torch.nn as nn - import torch.nn.functional as F - from torch import optim - from torch.distributed.optim import DistributedOptimizer - from torchvision import datasets, transforms - - # --------- MNIST Network to train, from pytorch/examples ----- - - class Net(nn.Module): - def __init__(self, num_gpus=0): - super(Net, self).__init__() - print(f"Using {num_gpus} GPUs to train") - self.num_gpus = num_gpus - device = torch.device( - "cuda:0" if torch.cuda.is_available() and self.num_gpus > 0 else "cpu") - print(f"Putting first 2 convs on {str(device)}") - # Put conv layers on the first cuda device, or CPU if no cuda device - self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device) - self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device) - # Put rest of the network on the 2nd cuda device, if there is one - if "cuda" in str(device) and num_gpus > 1: - device = torch.device("cuda:1") - - print(f"Putting rest of layers on {str(device)}") - self.dropout1 = nn.Dropout2d(0.25).to(device) - self.dropout2 = nn.Dropout2d(0.5).to(device) - self.fc1 = nn.Linear(9216, 128).to(device) - self.fc2 = nn.Linear(128, 10).to(device) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.max_pool2d(x, 2) - - x = self.dropout1(x) - x = torch.flatten(x, 1) - # Move tensor to next device if necessary - next_device = next(self.fc1.parameters()).device - x = x.to(next_device) - - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output -Next, let's define some helper functions that will be useful for the rest of our script. The following uses `rpc_sync `_ and `RRef `_ in order to define a function that invokes a given method on an object living on a remote node. Below, our handle to the remote object is given by the ``rref`` argument, and we run it on its owning node: ``rref.owner()``. On the caller node, we run this command synchronously through the use of ``rpc_sync``\ , meaning that we will block until a response is received. - -.. code-block:: python - - # --------- Helper Methods -------------------- - - # On the local node, call a method with first arg as the value held by the - # RRef. Other args are passed in as arguments to the function called. - # Useful for calling instance methods. method could be any matching function, including - # class methods. - def call_method(method, rref, *args, **kwargs): - return method(rref.local_value(), *args, **kwargs) - - # Given an RRef, return the result of calling the passed in method on the value - # held by the RRef. This call is done on the remote node that owns - # the RRef and passes along the given argument. - # Example: If the value held by the RRef is of type Foo, then - # remote_method(Foo.bar, rref, arg1, arg2) is equivalent to calling - # .bar(arg1, arg2) on the remote node and getting the result - # back. - - def remote_method(method, rref, *args, **kwargs): - args = [method, rref] + list(args) - return rpc.rpc_sync(rref.owner(), call_method, args=args, kwargs=kwargs) -Now, we're ready to define our parameter server. We will subclass ``nn.Module`` and save a handle to our network defined above. We'll also save an input device which will be the device our input is transferred to before invoking the model. - -.. 
code-block:: python - - # --------- Parameter Server -------------------- - class ParameterServer(nn.Module): - def __init__(self, num_gpus=0): - super().__init__() - model = Net(num_gpus=num_gpus) - self.model = model - self.input_device = torch.device( - "cuda:0" if torch.cuda.is_available() and num_gpus > 0 else "cpu") -Next, we'll define our forward pass. Note that regardless of the device of the model output, we move the output to CPU, as the Distributed RPC Framework currently only supports sending CPU tensors over RPC. We have intentionally disabled sending CUDA tensors over RPC due to the potential for different devices (CPU/GPU) on on the caller/callee, but may support this in future releases. - -.. code-block:: python - - class ParameterServer(nn.Module): - ... - def forward(self, inp): - inp = inp.to(self.input_device) - out = self.model(inp) - # This output is forwarded over RPC, which as of 1.5.0 only accepts CPU tensors. - # Tensors must be moved in and out of GPU memory due to this. - out = out.to("cpu") - return out -Next, we'll define a few miscellaneous functions useful for training and verification purposes. The first, ``get_dist_gradients``\ , will take in a Distributed Autograd context ID and call into the ``dist_autograd.get_gradients`` API in order to retrieve gradients computed by distributed autograd. More information can be found in the `distributed autograd documentation `_. Note that we also iterate through the resulting dictionary and convert each tensor to a CPU tensor, as the framework currently only supports sending tensors over RPC. Next, ``get_param_rrefs`` will iterate through our model parameters and wrap them as a (local) `RRef `_. This method will be invoked over RPC by trainer nodes and will return a list of the parameters to be optimized. This is required as input to the `Distributed Optimizer `_\ , which requires all parameters it must optimize as a list of ``RRef``\ s. - -.. code-block:: python - - # Use dist autograd to retrieve gradients accumulated for this model. - # Primarily used for verification. - def get_dist_gradients(self, cid): - grads = dist_autograd.get_gradients(cid) - # This output is forwarded over RPC, which as of 1.5.0 only accepts CPU tensors. - # Tensors must be moved in and out of GPU memory due to this. - cpu_grads = {} - for k, v in grads.items(): - k_cpu, v_cpu = k.to("cpu"), v.to("cpu") - cpu_grads[k_cpu] = v_cpu - return cpu_grads - - # Wrap local parameters in a RRef. Needed for building the - # DistributedOptimizer which optimizes paramters remotely. - def get_param_rrefs(self): - param_rrefs = [rpc.RRef(param) for param in self.model.parameters()] - return param_rrefs -Finally, we'll create methods to initialize our parameter server. Note that there will only be one instance of a parameter server across all processes, and all trainers will talk to the same parameter server and update the same stored model. As seen in ``run_parameter_server``\ , the server itself does not take any independent actions; it waits for requests from trainers (which are yet to be defined) and responds to them by running the requested function. - -.. code-block:: python - - # The global parameter server instance. - param_server = None - # A lock to ensure we only have one parameter server. - global_lock = Lock() - - - def get_parameter_server(num_gpus=0): - """ - Returns a singleton parameter server to all trainer processes - """ - global param_server - # Ensure that we get only one handle to the ParameterServer. 
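        # The lock serializes concurrent RPCs from trainers, so only the first
        # caller constructs the ParameterServer; every later caller reuses the
        # same instance.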
- with global_lock: - if not param_server: - # construct it once - param_server = ParameterServer(num_gpus=num_gpus) - return param_server - - def run_parameter_server(rank, world_size): - # The parameter server just acts as a host for the model and responds to - # requests from trainers. - # rpc.shutdown() will wait for all workers to complete by default, which - # in this case means that the parameter server will wait for all trainers - # to complete, and then exit. - print("PS master initializing RPC") - rpc.init_rpc(name="parameter_server", rank=rank, world_size=world_size) - print("RPC initialized! Running parameter server...") - rpc.shutdown() - print("RPC shutdown on parameter server.") -Note that above, ``rpc.shutdown()`` will not immediately shut down the Parameter Server. Instead, it will wait for all workers (trainers in this case) to also call into ``rpc.shutdown()``. This gives us the guarantee that the parameter server will not go offline before all trainers (yet to be define) have completed their training process. - -Next, we'll define our ``TrainerNet`` class. This will also be a subclass of ``nn.Module``\ , and our ``__init__`` method will use the ``rpc.remote`` API to obtain an RRef, or Remote Reference, to our parameter server. Note that here we are not copying the parameter server to our local process, instead, we can think of ``self.param_server_rref`` as a distributed shared pointer to the parameter server that lives on a separate process. - -.. code-block:: python - - # --------- Trainers -------------------- - - # nn.Module corresponding to the network trained by this trainer. The - # forward() method simply invokes the network on the given parameter - # server. - class TrainerNet(nn.Module): - def __init__(self, num_gpus=0): - super().__init__() - self.num_gpus = num_gpus - self.param_server_rref = rpc.remote( - "parameter_server", get_parameter_server, args=(num_gpus,)) -Next, we'll define a method called ``get_global_param_rrefs``. To motivate the need for this method, it is worth it to read through the documentation on `DistributedOptimizer `_, specifically the API signature. The optimizer must be passed a list of ``RRef``\ s corresponding to the remote parameters to be optimized, so here we obtain the necessary ``RRef``\ s. Since the only remote worker that a given ``TrainerNet`` interacts with is the ``ParameterServer``\ , we simply invoke a ``remote_method`` on the ``ParameterServer``. We use the ``get_param_rrefs`` method which we defined in the ``ParameterServer`` class. This method will return a list of ``RRef``\ s to the parameters that need to be optimized. Note that in this case our ``TrainerNet`` does not define its own paramaters; if it did, we would need to wrap each parameter in an ``RRef`` as well and include it into our input to ``DistributedOptimizer``. - -.. code-block:: python - - class TrainerNet(nn.Module): - ... - def get_global_param_rrefs(self): - remote_params = remote_method( - ParameterServer.get_param_rrefs, - self.param_server_rref) - return remote_params -Now, we're ready to define our ``forward`` method, which will invoke (synchronous) RPC to run the forward pass of the network defined on the ``ParameterServer``. Note that we pass in ``self.param_server_rref``\ , which is a remote handle to our ``ParameterServer``\ , to our RPC call. This call will send an RPC to the node on which our ``ParameterServer`` is running, invoke the ``forward`` pass, and return the ``Tensor`` corresponding to the model's output. - -.. 
code-block:: python - - class TrainerNet(nn.Module): - ... - def forward(self, x): - model_output = remote_method( - ParameterServer.forward, self.param_server_rref, x) - return model_output -With our trainer fully defined, it's now time to write our neural network training loop that will create our network and optimizer, run some inputs through the network and compute the loss. The training loop looks a lot like that of a local training program, with some modifications due to the nature of our network being distributed across machines. - -Below, we initialize our ``TrainerNet`` and build a ``DistributedOptimizer``. Note that as mentioned above, we must pass in all of the global (across all nodes participating in distributed training) parameters that we want to be optimized. In addition, we pass in the local optimizer to be used, in this case, SGD. Note that we can configure the underlying optimizer algorithm in the same way as creating a local optimizer - all arguments for ``optimizer.SGD`` will be forwarded properly. As an example, we pass in a custom learning rate that will be used as the learning rate for all local optimizers. - -.. code-block:: python - - def run_training_loop(rank, num_gpus, train_loader, test_loader): - # Runs the typical nueral network forward + backward + optimizer step, but - # in a distributed fashion. - net = TrainerNet(num_gpus=num_gpus) - # Build DistributedOptimizer. - param_rrefs = net.get_global_param_rrefs() - opt = DistributedOptimizer(optim.SGD, param_rrefs, lr=0.03) -Next, we define our main training loop. We loop through iterables given by PyTorch's `DataLoader `_. Before writing our typical forward/backward/optimizer loop, we first wrap the logic within a `Distributed Autograd context `_. Note that this is needed to record RPCs invoked in the model's forward pass, so that an appropriate graph can be constructed which includes all participating distributed workers in the backward pass. The distributed autograd context returns a ``context_id`` which serves as an identifier for accumulating and optimizing gradients corresponding to a particular iteration. - -As opposed to calling the typical ``loss.backward()`` which would kick off the backward pass on this local worker, we call ``dist_autograd.backward()`` and pass in our context_id as well as ``loss``\ , which is the root at which we want the backward pass to begin. In addition, we pass this ``context_id`` into our optimizer call, which is required to be able to look up the corresponding gradients computed by this particular backwards pass across all nodes. - -.. code-block:: python - - def run_training_loop(rank, num_gpus, train_loader, test_loader): - ... - for i, (data, target) in enumerate(train_loader): - with dist_autograd.context() as cid: - model_output = net(data) - target = target.to(model_output.device) - loss = F.nll_loss(model_output, target) - if i % 5 == 0: - print(f"Rank {rank} training batch {i} loss {loss.item()}") - dist_autograd.backward(cid, [loss]) - # Ensure that dist autograd ran successfully and gradients were - # returned. - assert remote_method( - ParameterServer.get_dist_gradients, - net.param_server_rref, - cid) != {} - opt.step(cid) - - print("Training complete!") - print("Getting accuracy....") - get_accuracy(test_loader, net) -The following simply computes the accuracy of our model after we're done training, much like a traditional local model. 
However, note that the ``net`` we pass into this function above is an instance of ``TrainerNet`` and therefore the forward pass invokes RPC in a transparent fashion. - -.. code-block:: python - - def get_accuracy(test_loader, model): - model.eval() - correct_sum = 0 - # Use GPU to evaluate if possible - device = torch.device("cuda:0" if model.num_gpus > 0 - and torch.cuda.is_available() else "cpu") - with torch.no_grad(): - for i, (data, target) in enumerate(test_loader): - out = model(data, -1) - pred = out.argmax(dim=1, keepdim=True) - pred, target = pred.to(device), target.to(device) - correct = pred.eq(target.view_as(pred)).sum().item() - correct_sum += correct - - print(f"Accuracy {correct_sum / len(test_loader.dataset)}") -Next, similar to how we defined ``run_parameter_server`` as the main loop for our ``ParameterServer`` that is responsible for initializing RPC, let's define a similar loop for our trainers. The difference will be that our trainers must run the training loop we defined above: - -.. code-block:: python - - # Main loop for trainers. - def run_worker(rank, world_size, num_gpus, train_loader, test_loader): - print(f"Worker rank {rank} initializing RPC") - rpc.init_rpc( - name=f"trainer_{rank}", - rank=rank, - world_size=world_size) - - print(f"Worker {rank} done initializing RPC") - - run_training_loop(rank, num_gpus, train_loader, test_loader) - rpc.shutdown() -Note that similar to ``run_parameter_server``\ , ``rpc.shutdown()`` will by default wait for all workers, both trainers and ParameterServers, to call into ``rpc.shutdown()`` before this node exits. This ensures that nodes are terminated gracefully and no node goes offline while another is expecting it to be online. - -We've now completed our trainer and parameter server specific code, and all that's left is to add code to launch trainers and parameter servers. First, we must take in various arguments that apply to our parameter server and trainers. ``world_size`` corresponds to the total number of nodes that will participate in training, and is the sum of all trainers and the parameter server. We also must pass in a unique ``rank`` for each individual process, from 0 (where we will run our single parameter server) to ``world_size - 1``. ``master_addr`` and ``master_port`` are arguments that can be used to identify where the rank 0 process is running, and will be used by individual nodes to discover each other. To test this example out locally, simply pass in ``localhost`` and the same ``master_port`` to all instances spawned. Note that for demonstration purposes, this example supports only between 0-2 GPUs, although the pattern can be extended to make use of additional GPUs. - -.. code-block:: python - - if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Parameter-Server RPC based training") - parser.add_argument( - "--world_size", - type=int, - default=4, - help="""Total number of participating processes. Should be the sum of - master node and all training nodes.""") - parser.add_argument( - "--rank", - type=int, - default=None, - help="Global rank of this process. Pass in 0 for master.") - parser.add_argument( - "--num_gpus", - type=int, - default=0, - help="""Number of GPUs to use for training, Currently supports between 0 - and 2 GPUs. Note that this argument will be passed to the parameter servers.""") - parser.add_argument( - "--master_addr", - type=str, - default="localhost", - help="""Address of master, will default to localhost if not provided. 
- Master must be able to accept network traffic on the address + port.""") - parser.add_argument( - "--master_port", - type=str, - default="29500", - help="""Port that master is listening on, will default to 29500 if not - provided. Master must be able to accept network traffic on the host and port.""") - - args = parser.parse_args() - assert args.rank is not None, "must provide rank argument." - assert args.num_gpus <= 3, f"Only 0-2 GPUs currently supported (got {args.num_gpus})." - os.environ['MASTER_ADDR'] = args.master_addr - os.environ["MASTER_PORT"] = args.master_port -Now, we'll create a process corresponding to either a parameter server or trainer depending on our command line arguments. We'll create a ``ParameterServer`` if our passed in rank is 0, and a ``TrainerNet`` otherwise. Note that we're using ``torch.multiprocessing`` to launch a subprocess corresponding to the function that we want to execute, and waiting on this process's completion from the main thread with ``p.join()``. In the case of initializing our trainers, we also use PyTorch's `dataloaders `_ in order to specify train and test data loaders on the MNIST dataset. - -.. code-block:: python - - processes = [] - world_size = args.world_size - if args.rank == 0: - p = mp.Process(target=run_parameter_server, args=(0, world_size)) - p.start() - processes.append(p) - else: - # Get data to train on - train_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=32, shuffle=True,) - test_loader = torch.utils.data.DataLoader( - datasets.MNIST( - '../data', - train=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=32, - shuffle=True, - ) - # start training worker on this node - p = mp.Process( - target=run_worker, - args=( - args.rank, - world_size, args.num_gpus, - train_loader, - test_loader)) - p.start() - processes.append(p) - - for p in processes: - p.join() -To run the example locally, run the following command worker for the server and each worker you wish to spawn, in separate terminal windows: ``python rpc_parameter_server.py --world_size=WORLD_SIZE --rank=RANK``. For example, for a master node with world size of 2, the command would be ``python rpc_parameter_server.py --world_size=2 --rank=0``. The trainer can then be launched with the command ``python rpc_parameter_server.py --world_size=2 --rank=1`` in a separate window, and this will begin training with one server and a single trainer. Note that this tutorial assumes that training occurs using between 0 and 2 GPUs, and this argument can be configured by passing ``--num_gpus=N`` into the training script. - -You can pass in the command line arguments ``--master_addr=ADDRESS`` and ``--master_port=PORT`` to indicate the address and port that the master worker is listening on, for example, to test functionality where trainers and master nodes run on different machines. diff --git a/intermediate_source/rpc_tutorial.rst b/intermediate_source/rpc_tutorial.rst deleted file mode 100644 index 835e6f0649..0000000000 --- a/intermediate_source/rpc_tutorial.rst +++ /dev/null @@ -1,622 +0,0 @@ -Getting Started with Distributed RPC Framework -================================================= -**Author**: `Shen Li `_ - -.. note:: - |edit| View and edit this tutorial in `github `__. 
- -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ -- `RPC API documents `__ - -This tutorial uses two simple examples to demonstrate how to build distributed -training with the `torch.distributed.rpc `__ -package which was first introduced as an experimental feature in PyTorch v1.4. -Source code of the two examples can be found in -`PyTorch examples `__. - -Previous tutorials, -`Getting Started With Distributed Data Parallel `__ -and `Writing Distributed Applications With PyTorch `__, -described `DistributedDataParallel `__ -which supports a specific training paradigm where the model is replicated across -multiple processes and each process handles a split of the input data. -Sometimes, you might run into scenarios that require different training -paradigms. For example: - -1) In reinforcement learning, it might be relatively expensive to acquire - training data from environments while the model itself can be quite small. In - this case, it might be useful to spawn multiple observers running in parallel - and share a single agent. In this case, the agent takes care of the training - locally, but the application would still need libraries to send and receive - data between observers and the trainer. -2) Your model might be too large to fit in GPUs on a single machine, and hence - would need a library to help split the model onto multiple machines. Or you - might be implementing a `parameter server `__ - training framework, where model parameters and trainers live on different - machines. - - -The `torch.distributed.rpc `__ package -can help with the above scenarios. In case 1, `RPC `__ -and `RRef `__ allow sending data -from one worker to another while easily referencing remote data objects. In -case 2, `distributed autograd `__ -and `distributed optimizer `__ -make executing backward pass and optimizer step as if it is local training. In -the next two sections, we will demonstrate APIs of -`torch.distributed.rpc `__ using a -reinforcement learning example and a language model example. Please note, this -tutorial does not aim at building the most accurate or efficient models to -solve given problems, instead, the main goal here is to show how to use the -`torch.distributed.rpc `__ package to -build distributed training applications. - - - -Distributed Reinforcement Learning using RPC and RRef ------------------------------------------------------ - -This section describes steps to build a toy distributed reinforcement learning -model using RPC to solve CartPole-v1 from `OpenAI Gym `__. -The policy code is mostly borrowed from the existing single-thread -`example `__ -as shown below. We will skip details of the ``Policy`` design, and focus on RPC -usages. - -.. code:: python - - import torch.nn as nn - import torch.nn.functional as F - - class Policy(nn.Module): - - def __init__(self): - super(Policy, self).__init__() - self.affine1 = nn.Linear(4, 128) - self.dropout = nn.Dropout(p=0.6) - self.affine2 = nn.Linear(128, 2) - - def forward(self, x): - x = self.affine1(x) - x = self.dropout(x) - x = F.relu(x) - action_scores = self.affine2(x) - return F.softmax(action_scores, dim=1) - - -We are ready to present the observer. In this example, each observer creates its -own environment, and waits for the agent's command to run an episode. In each -episode, one observer loops at most ``n_steps`` iterations, and in each -iteration, it uses RPC to pass its environment state to the agent and gets an -action back. 
Then it applies that action to its environment, and gets the reward -and the next state from the environment. After that, the observer uses another -RPC to report the reward to the agent. Again, please note that, this is -obviously not the most efficient observer implementation. For example, one -simple optimization could be packing current state and last reward in one RPC to -reduce the communication overhead. However, the goal is to demonstrate RPC API -instead of building the best solver for CartPole. So, let's keep the logic -simple and the two steps explicit in this example. - -.. code:: python - - import argparse - import gym - import torch.distributed.rpc as rpc - - parser = argparse.ArgumentParser( - description="RPC Reinforcement Learning Example", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument('--world_size', default=2, type=int, metavar='W', - help='number of workers') - parser.add_argument('--log_interval', type=int, default=10, metavar='N', - help='interval between training status logs') - parser.add_argument('--gamma', type=float, default=0.99, metavar='G', - help='how much to value future rewards') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed for reproducibility') - args = parser.parse_args() - - class Observer: - - def __init__(self): - self.id = rpc.get_worker_info().id - self.env = gym.make('CartPole-v1') - self.env.seed(args.seed) - - def run_episode(self, agent_rref): - state, ep_reward = self.env.reset(), 0 - for _ in range(10000): - # send the state to the agent to get an action - action = agent_rref.rpc_sync().select_action(self.id, state) - - # apply the action to the environment, and get the reward - state, reward, done, _ = self.env.step(action) - - # report the reward to the agent for training purpose - agent_rref.rpc_sync().report_reward(self.id, reward) - - # finishes after the number of self.env._max_episode_steps - if done: - break - - -The code for agent is a little more complex, and we will break it into multiple -pieces. In this example, the agent serves as both the trainer and the master, -such that it sends command to multiple distributed observers to run episodes, -and it also records all actions and rewards locally which will be used during -the training phase after each episode. The code below shows ``Agent`` -constructor where most lines are initializing various components. The loop at -the end initializes observers remotely on other workers, and holds ``RRefs`` to -those observers locally. The agent will use those observer ``RRefs`` later to -send commands. Applications don't need to worry about the lifetime of ``RRefs``. -The owner of each ``RRef`` maintains a reference counting map to track its -lifetime, and guarantees the remote data object will not be deleted as long as -there is any live user of that ``RRef``. Please refer to the ``RRef`` -`design doc `__ for details. - - -.. 
code:: python - - import gym - import numpy as np - - import torch - import torch.distributed.rpc as rpc - import torch.optim as optim - from torch.distributed.rpc import RRef, rpc_async, remote - from torch.distributions import Categorical - - class Agent: - def __init__(self, world_size): - self.ob_rrefs = [] - self.agent_rref = RRef(self) - self.rewards = {} - self.saved_log_probs = {} - self.policy = Policy() - self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2) - self.eps = np.finfo(np.float32).eps.item() - self.running_reward = 0 - self.reward_threshold = gym.make('CartPole-v1').spec.reward_threshold - for ob_rank in range(1, world_size): - ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank)) - self.ob_rrefs.append(remote(ob_info, Observer)) - self.rewards[ob_info.id] = [] - self.saved_log_probs[ob_info.id] = [] - - -Next, the agent exposes two APIs to observers for selecting actions and -reporting rewards. Those functions only run locally on the agent, but will -be triggered by observers through RPC. - - -.. code:: python - - class Agent: - ... - def select_action(self, ob_id, state): - state = torch.from_numpy(state).float().unsqueeze(0) - probs = self.policy(state) - m = Categorical(probs) - action = m.sample() - self.saved_log_probs[ob_id].append(m.log_prob(action)) - return action.item() - - def report_reward(self, ob_id, reward): - self.rewards[ob_id].append(reward) - - -Let's add a ``run_episode`` function on agent which tells all observers -to execute an episode. In this function, it first creates a list to collect -futures from asynchronous RPCs, and then loop over all observer ``RRefs`` to -make asynchronous RPCs. In these RPCs, the agent also passes an ``RRef`` of -itself to the observer, so that the observer can call functions on the agent as -well. As shown above, each observer will make RPCs back to the agent, which are -nested RPCs. After each episode, the ``saved_log_probs`` and ``rewards`` will -contain the recorded action probs and rewards. - - -.. code:: python - - class Agent: - ... - def run_episode(self): - futs = [] - for ob_rref in self.ob_rrefs: - # make async RPC to kick off an episode on all observers - futs.append( - rpc_async( - ob_rref.owner(), - ob_rref.rpc_sync().run_episode, - args=(self.agent_rref,) - ) - ) - - # wait until all obervers have finished this episode - for fut in futs: - fut.wait() - - -Finally, after one episode, the agent needs to train the model, which -is implemented in the ``finish_episode`` function below. There is no RPCs in -this function and it is mostly borrowed from the single-thread -`example `__. -Hence, we skip describing its contents. - - - -.. code:: python - - class Agent: - ... 
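        # Called once per episode after every observer has reported back: it merges
        # their log probs and rewards, computes discounted returns, and takes one
        # optimizer step on the policy.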
- def finish_episode(self): - # joins probs and rewards from different observers into lists - R, probs, rewards = 0, [], [] - for ob_id in self.rewards: - probs.extend(self.saved_log_probs[ob_id]) - rewards.extend(self.rewards[ob_id]) - - # use the minimum observer reward to calculate the running reward - min_reward = min([sum(self.rewards[ob_id]) for ob_id in self.rewards]) - self.running_reward = 0.05 * min_reward + (1 - 0.05) * self.running_reward - - # clear saved probs and rewards - for ob_id in self.rewards: - self.rewards[ob_id] = [] - self.saved_log_probs[ob_id] = [] - - policy_loss, returns = [], [] - for r in rewards[::-1]: - R = r + args.gamma * R - returns.insert(0, R) - returns = torch.tensor(returns) - returns = (returns - returns.mean()) / (returns.std() + self.eps) - for log_prob, R in zip(probs, returns): - policy_loss.append(-log_prob * R) - self.optimizer.zero_grad() - policy_loss = torch.cat(policy_loss).sum() - policy_loss.backward() - self.optimizer.step() - return min_reward - - -With ``Policy``, ``Observer``, and ``Agent`` classes, we are ready to launch -multiple processes to perform the distributed training. In this example, all -processes run the same ``run_worker`` function, and they use the rank to -distinguish their role. Rank 0 is always the agent, and all other ranks are -observers. The agent serves as master by repeatedly calling ``run_episode`` and -``finish_episode`` until the running reward surpasses the reward threshold -specified by the environment. All observers passively waiting for commands -from the agent. The code is wrapped by -`rpc.init_rpc `__ and -`rpc.shutdown `__, -which initializes and terminates RPC instances respectively. More details are -available in the `API page `__. - - -.. code:: python - - import os - from itertools import count - - import torch.multiprocessing as mp - - AGENT_NAME = "agent" - OBSERVER_NAME="obs{}" - - def run_worker(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - if rank == 0: - # rank0 is the agent - rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size) - - agent = Agent(world_size) - print(f"This will run until reward threshold of {agent.reward_threshold}" - " is reached. Ctrl+C to exit.") - for i_episode in count(1): - agent.run_episode() - last_reward = agent.finish_episode() - - if i_episode % args.log_interval == 0: - print(f"Episode {i_episode}\tLast reward: {last_reward:.2f}\tAverage reward: " - f"{agent.running_reward:.2f}") - if agent.running_reward > agent.reward_threshold: - print(f"Solved! Running reward is now {agent.running_reward}!") - break - else: - # other ranks are the observer - rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size) - # observers passively waiting for instructions from the agent - - # block until all rpcs finish, and shutdown the RPC instance - rpc.shutdown() - - - mp.spawn( - run_worker, - args=(args.world_size, ), - nprocs=args.world_size, - join=True - ) - -Below are some sample outputs when training with `world_size=2`. - -:: - - This will run until reward threshold of 475.0 is reached. Ctrl+C to exit. 
- Episode 10 Last reward: 26.00 Average reward: 10.01 - Episode 20 Last reward: 16.00 Average reward: 11.27 - Episode 30 Last reward: 49.00 Average reward: 18.62 - Episode 40 Last reward: 45.00 Average reward: 26.09 - Episode 50 Last reward: 44.00 Average reward: 30.03 - Episode 60 Last reward: 111.00 Average reward: 42.23 - Episode 70 Last reward: 131.00 Average reward: 70.11 - Episode 80 Last reward: 87.00 Average reward: 76.51 - Episode 90 Last reward: 86.00 Average reward: 95.93 - Episode 100 Last reward: 13.00 Average reward: 123.93 - Episode 110 Last reward: 33.00 Average reward: 91.39 - Episode 120 Last reward: 73.00 Average reward: 76.38 - Episode 130 Last reward: 137.00 Average reward: 88.08 - Episode 140 Last reward: 89.00 Average reward: 104.96 - Episode 150 Last reward: 97.00 Average reward: 98.74 - Episode 160 Last reward: 150.00 Average reward: 100.87 - Episode 170 Last reward: 126.00 Average reward: 104.38 - Episode 180 Last reward: 500.00 Average reward: 213.74 - Episode 190 Last reward: 322.00 Average reward: 300.22 - Episode 200 Last reward: 165.00 Average reward: 272.71 - Episode 210 Last reward: 168.00 Average reward: 233.11 - Episode 220 Last reward: 184.00 Average reward: 195.02 - Episode 230 Last reward: 284.00 Average reward: 208.32 - Episode 240 Last reward: 395.00 Average reward: 247.37 - Episode 250 Last reward: 500.00 Average reward: 335.42 - Episode 260 Last reward: 500.00 Average reward: 386.30 - Episode 270 Last reward: 500.00 Average reward: 405.29 - Episode 280 Last reward: 500.00 Average reward: 443.29 - Episode 290 Last reward: 500.00 Average reward: 464.65 - Solved! Running reward is now 475.3163778435275! - - -In this example, we show how to use RPC as the communication vehicle to pass -data across workers, and how to use RRef to reference remote objects. It is true -that you could build the entire structure directly on top of ``ProcessGroup`` -``send`` and ``recv`` APIs or use other communication/RPC libraries. However, -by using `torch.distributed.rpc`, you can get the native support and -continuously optimized performance under the hood. - -Next, we will show how to combine RPC and RRef with distributed autograd and -distributed optimizer to perform distributed model parallel training. - - - -Distributed RNN using Distributed Autograd and Distributed Optimizer --------------------------------------------------------------------- - -In this section, we use an RNN model to show how to build distributed model -parallel training with the RPC API. The example RNN model is very small and -can easily fit into a single GPU, but we still divide its layers onto two -different workers to demonstrate the idea. Developer can apply the similar -techniques to distribute much larger models across multiple devices and -machines. - -The RNN model design is borrowed from the word language model in PyTorch -`example `__ -repository, which contains three main components, an embedding table, an -``LSTM`` layer, and a decoder. The code below wraps the embedding table and the -decoder into sub-modules, so that their constructors can be passed to the RPC -API. In the ``EmbeddingTable`` sub-module, we intentionally put the -``Embedding`` layer on GPU to cover the use case. In v1.4, RPC always creates -CPU tensor arguments or return values on the destination worker. If the function -takes a GPU tensor, you need to move it to the proper device explicitly. - - -.. 
code:: python - - class EmbeddingTable(nn.Module): - r""" - Encoding layers of the RNNModel - """ - def __init__(self, ntoken, ninp, dropout): - super(EmbeddingTable, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp).cuda() - self.encoder.weight.data.uniform_(-0.1, 0.1) - - def forward(self, input): - return self.drop(self.encoder(input.cuda())).cpu() - - - class Decoder(nn.Module): - def __init__(self, ntoken, nhid, dropout): - super(Decoder, self).__init__() - self.drop = nn.Dropout(dropout) - self.decoder = nn.Linear(nhid, ntoken) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-0.1, 0.1) - - def forward(self, output): - return self.decoder(self.drop(output)) - - -With the above sub-modules, we can now piece them together using RPC to -create an RNN model. In the code below ``ps`` represents a parameter server, -which hosts parameters of the embedding table and the decoder. The constructor -uses the `remote `__ -API to create an ``EmbeddingTable`` object and a ``Decoder`` object on the -parameter server, and locally creates the ``LSTM`` sub-module. During the -forward pass, the trainer uses the ``EmbeddingTable`` ``RRef`` to find the -remote sub-module and passes the input data to the ``EmbeddingTable`` using RPC -and fetches the lookup results. Then, it runs the embedding through the local -``LSTM`` layer, and finally uses another RPC to send the output to the -``Decoder`` sub-module. In general, to implement distributed model parallel -training, developers can divide the model into sub-modules, invoke RPC to create -sub-module instances remotely, and use ``RRefs`` to find them when necessary. -As you can see in the code below, it looks very similar to single-machine model -parallel training. The main difference is replacing ``Tensor.to(device)`` with -RPC functions. - - -.. code:: python - - class RNNModel(nn.Module): - def __init__(self, ps, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(RNNModel, self).__init__() - - # setup embedding table remotely - self.emb_table_rref = rpc.remote(ps, EmbeddingTable, args=(ntoken, ninp, dropout)) - # setup LSTM locally - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - # setup decoder remotely - self.decoder_rref = rpc.remote(ps, Decoder, args=(ntoken, nhid, dropout)) - - def forward(self, input, hidden): - # pass input to the remote embedding table and fetch emb tensor back - emb = _remote_method(EmbeddingTable.forward, self.emb_table_rref, input) - output, hidden = self.rnn(emb, hidden) - # pass output to the remote decoder and get the decoded output back - decoded = _remote_method(Decoder.forward, self.decoder_rref, output) - return decoded, hidden - -Before introducing the distributed optimizer, let's add a helper function to -generate a list of RRefs of model parameters, which will be consumed by the -distributed optimizer. In local training, applications could call -``Module.parameters()`` to grab references to all parameter tensors, and pass them -to the local optimizer for subsequent updates. However, the same API does not -work in distributed training scenarios as some parameters live on remote -machines. Therefore, instead of taking a list of parameter ``Tensors``, the -distributed optimizer takes a list of ``RRefs``, one ``RRef`` per model -parameter for both local and remote model parameters. The helper function is -pretty simple: it just calls ``Module.parameters()`` and creates a local ``RRef`` on -each of the parameters. - - -..
code:: python - - def _parameter_rrefs(module): - param_rrefs = [] - for param in module.parameters(): - param_rrefs.append(RRef(param)) - return param_rrefs - - -Then, as the ``RNNModel`` contains three sub-modules, we need to call -``_parameter_rrefs`` three times, and wrap that into another helper function. - - -.. code:: python - - class RNNModel(nn.Module): - ... - def parameter_rrefs(self): - remote_params = [] - # get RRefs of embedding table - remote_params.extend(_remote_method(_parameter_rrefs, self.emb_table_rref)) - # create RRefs for local parameters - remote_params.extend(_parameter_rrefs(self.rnn)) - # get RRefs of decoder - remote_params.extend(_remote_method(_parameter_rrefs, self.decoder_rref)) - return remote_params - - -Now, we are ready to implement the training loop. After initializing model -arguments, we create the ``RNNModel`` and the ``DistributedOptimizer``. The -distributed optimizer will take a list of parameter ``RRefs``, find all distinct -owner workers, and create the given local optimizer (i.e., ``SGD`` in this case, -you can use other local optimizers as well) on each of the owner worker using -the given arguments (i.e., ``lr=0.05``). - -In the training loop, it first creates a distributed autograd context, which -will help the distributed autograd engine to find gradients and involved RPC -send/recv functions. The design details of the distributed autograd engine can -be found in its `design note `__. -Then, it kicks off the forward pass as if it is a local -model, and run the distributed backward pass. For the distributed backward, you -only need to specify a list of roots, in this case, it is the loss ``Tensor``. -The distributed autograd engine will traverse the distributed graph -automatically and write gradients properly. Next, it runs the ``step`` -function on the distributed optimizer, which will reach out to all involved -local optimizers to update model parameters. Compared to local training, one -minor difference is that you don't need to run ``zero_grad()`` because each -autograd context has dedicated space to store gradients, and as we create a -context per iteration, those gradients from different iterations will not -accumulate to the same set of ``Tensors``. - - -.. code:: python - - def run_trainer(): - batch = 5 - ntoken = 10 - ninp = 2 - - nhid = 3 - nindices = 3 - nlayers = 4 - hidden = ( - torch.randn(nlayers, nindices, nhid), - torch.randn(nlayers, nindices, nhid) - ) - - model = rnn.RNNModel('ps', ntoken, ninp, nhid, nlayers) - - # setup distributed optimizer - opt = DistributedOptimizer( - optim.SGD, - model.parameter_rrefs(), - lr=0.05, - ) - - criterion = torch.nn.CrossEntropyLoss() - - def get_next_batch(): - for _ in range(5): - data = torch.LongTensor(batch, nindices) % ntoken - target = torch.LongTensor(batch, ntoken) % nindices - yield data, target - - # train for 10 iterations - for epoch in range(10): - for data, target in get_next_batch(): - # create distributed autograd context - with dist_autograd.context() as context_id: - hidden[0].detach_() - hidden[1].detach_() - output, hidden = model(data, hidden) - loss = criterion(output, target) - # run distributed backward pass - dist_autograd.backward(context_id, [loss]) - # run distributed optimizer - opt.step(context_id) - # not necessary to zero grads since they are - # accumulated into the distributed autograd context - # which is reset every iteration. 
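        # by this point the DistributedOptimizer has already applied the updates on
        # each parameter's owner: the local LSTM here, and the embedding table and
        # decoder on the parameter server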
- print("Training epoch {}".format(epoch)) - - -Finally, let's add some glue code to launch the parameter server and the trainer -processes. - - -.. code:: python - - def run_worker(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - if rank == 1: - rpc.init_rpc("trainer", rank=rank, world_size=world_size) - _run_trainer() - else: - rpc.init_rpc("ps", rank=rank, world_size=world_size) - # parameter server do nothing - pass - - # block until all rpcs finish - rpc.shutdown() - - - if __name__=="__main__": - world_size = 2 - mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True) diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py deleted file mode 100644 index 666d240ece..0000000000 --- a/intermediate_source/scaled_dot_product_attention_tutorial.py +++ /dev/null @@ -1,407 +0,0 @@ -""" -(Beta) Implementing High-Performance Transformers with Scaled Dot Product Attention (SDPA) -========================================================================================== - - -**Author:** `Driss Guessous `_ -""" - -###################################################################### -# Summary -# ~~~~~~~~ -# -# In this tutorial, we want to highlight a new ``torch.nn.functional`` function -# that can be helpful for implementing transformer architectures. The -# function is named ``torch.nn.functional.scaled_dot_product_attention``. -# For detailed description of the function, see the `PyTorch documentation `__. -# This function has already been incorporated into ``torch.nn.MultiheadAttention`` and ``torch.nn.TransformerEncoderLayer``. -# -# Overview -# ~~~~~~~~~ -# At a high level, this PyTorch function calculates the -# scaled dot product attention (SDPA) between query, key, and value according to -# the definition found in the paper `Attention is all you -# need `__. While this function can -# be written in PyTorch using existing functions, a fused implementation can provide -# large performance benefits over a naive implementation. -# -# Fused implementations -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# For CUDA tensor inputs, the function will dispatch into one of the following -# implementations: -# -# * `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness `__ -# * `Memory-Efficient Attention `__ -# * A PyTorch implementation defined in C++ -# -# .. note:: -# -# This tutorial requires PyTorch 2.0.0 or later. -# - -import torch -import torch.nn as nn -import torch.nn.functional as F -device = "cuda" if torch.cuda.is_available() else "cpu" - -# Example Usage: -query, key, value = torch.randn(2, 3, 8, device=device), torch.randn(2, 3, 8, device=device), torch.randn(2, 3, 8, device=device) -F.scaled_dot_product_attention(query, key, value) - - -###################################################################### -# Explicit Dispatcher Control -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# While the function will implicitly dispatch to one of the three -# implementations, the user can also explicitly control the dispatch via -# the use of a context manager. This context manager allows users to -# explicitly disable certain implementations. If a user wants to ensure -# the function is indeed using the fastest implementation for their -# specific inputs, the context manager can be used to sweep through -# measuring performance. 
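# Note that ``sdpa_kernel`` only restricts dispatch inside the ``with`` block;
# once the block exits, the default backend selection order is restored.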
-# - -# Lets define a helpful benchmarking function: -import torch.utils.benchmark as benchmark -def benchmark_torch_function_in_microseconds(f, *args, **kwargs): - t0 = benchmark.Timer( - stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} - ) - return t0.blocked_autorange().mean * 1e6 - -# Lets define the hyper-parameters of our input -batch_size = 32 -max_sequence_len = 1024 -num_heads = 32 -embed_dimension = 32 - -dtype = torch.float16 - -query = torch.rand(batch_size, num_heads, max_sequence_len, embed_dimension, device=device, dtype=dtype) -key = torch.rand(batch_size, num_heads, max_sequence_len, embed_dimension, device=device, dtype=dtype) -value = torch.rand(batch_size, num_heads, max_sequence_len, embed_dimension, device=device, dtype=dtype) - -print(f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds") - -# Lets explore the speed of each of the 3 implementations -from torch.nn.attention import SDPBackend, sdpa_kernel - - -with sdpa_kernel(SDPBackend.MATH): - math_time=benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value) - print(f"The math implementation runs in {math_time:.3f} microseconds") - -with sdpa_kernel(SDPBackend.FLASH_ATTENTION): - try: - flash_time=benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value) - print(f"The flash attention implementation runs in {flash_time:.3f} microseconds") - except RuntimeError: - print("FlashAttention is not supported. See warnings for reasons.") - -with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): - try: - efficient_time=benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value) - print(f"The memory efficient implementation runs in {efficient_time:.3f} microseconds") - except RuntimeError: - print("EfficientAttention is not supported. See warnings for reasons.") - - -###################################################################### -# Hardware dependence -# ~~~~~~~~~~~~~~~~~~~ -# -# Depending on what machine you ran the above cell on and what hardware is -# available, your results might be different. -# - If you don’t have a GPU and are running on CPU then with FP32 the context manager -# will have no effect and all three runs should return similar timings. -# - Depending on what compute capability your graphics card supports -# flash attention or memory efficient might have failed. - - -###################################################################### -# Causal Self Attention -# ~~~~~~~~~~~~~~~~~~~~~ -# -# Below is an example implementation of a multi-headed causal self -# attention block inspired by -# `Andrej Karpathy NanoGPT `__ repository. 
-# - -class CausalSelfAttention(nn.Module): - - def __init__(self, num_heads: int, embed_dimension: int, bias: bool=False, is_causal: bool=False, dropout:float=0.0): - super().__init__() - assert embed_dimension % num_heads == 0 - # key, query, value projections for all heads, but in a batch - self.c_attn = nn.Linear(embed_dimension, 3 * embed_dimension, bias=bias) - # output projection - self.c_proj = nn.Linear(embed_dimension, embed_dimension, bias=bias) - # regularization - self.dropout = dropout - self.resid_dropout = nn.Dropout(dropout) - self.num_heads = num_heads - self.embed_dimension = embed_dimension - # Perform causal masking - self.is_causal = is_causal - - def forward(self, x): - # calculate query, key, values for all heads in batch and move head forward to be the batch dim - query_projected = self.c_attn(x) - - batch_size = query_projected.size(0) - embed_dim = query_projected.size(2) - head_dim = embed_dim // (self.num_heads * 3) - - query, key, value = query_projected.chunk(3, -1) - query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2) - key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2) - - if self.training: - dropout = self.dropout - is_causal = self.is_causal - else: - dropout = 0.0 - is_causal = False - - y = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=dropout, is_causal=is_causal) - y = y.transpose(1, 2).view(batch_size, -1, self.num_heads * head_dim) - - y = self.resid_dropout(self.c_proj(y)) - return y - - -num_heads = 8 -heads_per_dim = 64 -embed_dimension = num_heads * heads_per_dim -dtype = torch.float16 -model = CausalSelfAttention(num_heads=num_heads, embed_dimension=embed_dimension, bias=False, is_causal=True, dropout=0.1).to("cuda").to(dtype).eval() -print(model) - - -##################################################################### -# ``NestedTensor`` and Dense tensor support -# ----------------------------------------- -# -# SDPA supports both ``NestedTensor`` and Dense tensor inputs. ``NestedTensors`` handle the case where the input is a batch of variable length sequences -# without needing to pad each sequence to the maximum length in the batch. For more information about ``NestedTensors`` see -# `torch.nested `__ and `NestedTensors Tutorial `__. 
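######################################################################
# As a small illustrative sketch (separate from the ``generate_rand_batch``
# helper defined next), a ``NestedTensor`` can be built directly from a list of
# sequences with different lengths; no padding is added for the shorter
# sequences. The sequence lengths below are arbitrary example values.

example_seq_lens = [3, 5, 7]
example_nt = torch.nested.nested_tensor(
    [torch.randn(seq_len, 8, device=device, dtype=dtype) for seq_len in example_seq_lens]
)
print(example_nt.is_nested)  # True -- each sequence keeps its own length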
-# - -import random -def generate_rand_batch( - batch_size, - max_sequence_len, - embed_dimension, - pad_percentage=None, - dtype=torch.float16, - device="cuda", -): - if not pad_percentage: - return ( - torch.randn( - batch_size, - max_sequence_len, - embed_dimension, - dtype=dtype, - device=device, - ), - None, - ) - # Random sequence lengths - seq_len_list = [ - int(max_sequence_len * (1 - random.gauss(pad_percentage, 0.01))) - for _ in range(batch_size) - ] - # Make random entry in the batch have max sequence length - seq_len_list[random.randint(0, batch_size - 1)] = max_sequence_len - return ( - torch.nested.nested_tensor( - [ - torch.randn(seq_len, embed_dimension, - dtype=dtype, device=device) - for seq_len in seq_len_list - ] - ), - seq_len_list, - ) - -random_nt, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=0.5, dtype=dtype, device=device) -random_dense, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=None, dtype=dtype, device=device) - -# Currently the fused implementations don't support ``NestedTensor`` for training -model.eval() - -with sdpa_kernel(SDPBackend.FLASH_ATTENTION): - try: - print(f"Random NT runs in {benchmark_torch_function_in_microseconds(model, random_nt):.3f} microseconds") - print(f"Random Dense runs in {benchmark_torch_function_in_microseconds(model, random_dense):.3f} microseconds") - except RuntimeError: - print("FlashAttention is not supported. See warnings for reasons.") - - -###################################################################### -# Using SDPA with ``torch.compile`` -# ================================= -# -# With the release of PyTorch 2.0, a new feature called -# ``torch.compile()`` has been introduced, which can provide -# significant performance improvements over eager mode. -# Scaled dot product attention is fully composable with ``torch.compile()``. -# To demonstrate this, let's compile the ``CausalSelfAttention`` module using -# ``torch.compile()`` and observe the resulting performance improvements. -# - -batch_size = 32 -max_sequence_len = 256 -x = torch.rand(batch_size, max_sequence_len, - embed_dimension, device=device, dtype=dtype) -print( - f"The non compiled module runs in {benchmark_torch_function_in_microseconds(model, x):.3f} microseconds") - - -compiled_model = torch.compile(model) -# Let's compile it -compiled_model(x) -print( - f"The compiled module runs in {benchmark_torch_function_in_microseconds(compiled_model, x):.3f} microseconds") - - -###################################################################### -# -# The exact execution time is dependent on machine, however the results for mine: -# The non compiled module runs in 166.616 microseconds -# The compiled module runs in 166.726 microseconds -# That is not what we were expecting. Let's dig a little deeper. -# PyTorch comes with an amazing built-in profiler that you can use to -# inspect the performance characteristics of your code. 
-# - -from torch.profiler import profile, record_function, ProfilerActivity -activities = [ProfilerActivity.CPU] -if device == 'cuda': - activities.append(ProfilerActivity.CUDA) - -with profile(activities=activities, record_shapes=False) as prof: - with record_function(" Non-Compilied Causal Attention"): - for _ in range(25): - model(x) -print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) - - -with profile(activities=activities, record_shapes=False) as prof: - with record_function("Compiled Causal Attention"): - for _ in range(25): - compiled_model(x) -print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) - -# For even more insights, you can export the trace and use ``chrome://tracing`` to view the results -# -# .. code-block:: python -# -# prof.export_chrome_trace("compiled_causal_attention_trace.json"). - - - - -###################################################################### -# The previous code snippet generates a report of the top 10 PyTorch functions -# that consumed the most GPU execution time, for both the compiled and non-compiled module. -# The analysis reveals that the majority of time spent on the GPU is concentrated -# on the same set of functions for both modules. -# The reason for this here is that ``torch.compile`` is very good at removing the -# framework overhead associated with PyTorch. If your model is launching -# large, efficient CUDA kernels, which in this case ``CausalSelfAttention`` -# is, then the overhead of PyTorch can be hidden. -# -# In reality, your module does not normally consist of a singular -# ``CausalSelfAttention`` block. When experimenting with `Andrej Karpathy NanoGPT `__ repository, compiling -# the module took the time per train step from: ``6090.49ms`` to -# ``3273.17ms``! This was done on commit: ``ae3a8d5`` of NanoGPT training on -# the Shakespeare dataset. -# - -###################################################################### -# Using SDPA with attn_bias subclasses` -# ========================================== -# -# As of PyTorch 2.3, we have added a new submodule that contains tensor subclasses. -# Designed to be used with ``torch.nn.functional.scaled_dot_product_attention``. -# The module is named ``torch.nn.attention.bias`` and contains the following two -# utilities for generating causal attention variants: -# -# - ``torch.nn.attention.bias.causal_upper_left`` -# - ``torch.nn.attention.bias.causal_lower_right`` -# -# .. note:: -# The current argument ``is_causal`` in ``torch.nn.functional.scaled_dot_product_attention`` -# is the same as using ``torch.nn.attention.bias.causal_upper_left``. 
-#
-
-from torch.nn.attention.bias import causal_lower_right, causal_upper_left
-
-batch_size = 32
-sequence_length_q = 2
-sequence_length_kv = 10
-num_heads = 16
-embed_dimension = 32
-
-dtype = torch.float16
-
-query = torch.rand(batch_size, num_heads, sequence_length_q, embed_dimension, device=device, dtype=dtype)
-key = torch.rand(batch_size, num_heads, sequence_length_kv, embed_dimension, device=device, dtype=dtype)
-value = torch.rand(batch_size, num_heads, sequence_length_kv, embed_dimension, device=device, dtype=dtype)
-
-upper_left_bias = causal_upper_left(sequence_length_q, sequence_length_kv)
-lower_right_bias = causal_lower_right(sequence_length_q, sequence_length_kv)
-
-print(type(upper_left_bias))
-print(type(lower_right_bias))
-
-assert type(upper_left_bias) == type(lower_right_bias)
-assert issubclass(type(upper_left_bias), torch.Tensor)
-
-# As you can see from the previous output, both biases are of the same type,
-# ``torch.nn.attention.bias.CausalBias``, and both subclass ``torch.Tensor``.
-
-# Let's see what these tensors look like
-print(upper_left_bias)
-print(lower_right_bias)
-
-# Upper-left bias aligns the causal attention mask to the upper-left corner of the attention scores matrix.
-# This only has an impact when the attention scores matrix is not square, which is common for decoding use cases.
-# Assuming the attention score matrix is two dimensional, ``attn_score[0][0]`` is the attention score
-# between the 0th token in the query and the 0th token in the key.
-# With upper-left bias, the 0th token in the query is aligned to the 0th token in the key.
-# With lower-right bias, the query is instead aligned so that its last token matches the last token of the key
-# (for example, ``attn_score[-1][-1]`` is always unmasked, since the last token in q sits at the same position
-# as the last token in k), even if the sequence lengths of q and k are different.
-
-# These objects are intended to be used with sdpa
-out_upper_left = F.scaled_dot_product_attention(query, key, value, upper_left_bias)
-out_lower_right = F.scaled_dot_product_attention(query, key, value, lower_right_bias)
-out_is_causal = F.scaled_dot_product_attention(query, key, value, is_causal=True)
-
-assert torch.allclose(out_upper_left, out_is_causal)
-assert not torch.allclose(out_upper_left, out_lower_right)
-
-# These attention biases should also be compatible with torch.compile
-compiled_sdpa = torch.compile(F.scaled_dot_product_attention, fullgraph=True)
-out_upper_left = compiled_sdpa(query, key, value, upper_left_bias)
-
-######################################################################
-# Conclusion
-# ==========
-#
-# In this tutorial, we have demonstrated the basic usage of
-# ``torch.nn.functional.scaled_dot_product_attention``. We have shown how
-# the ``sdpa_kernel`` context manager can be used to assert that a certain
-# implementation is used on GPU. We also built a simple
-# ``CausalSelfAttention`` module that works with ``NestedTensor`` and is
-# compatible with ``torch.compile``. In the process we have shown how the
-# profiling tools can be used to explore the performance characteristics of a
-# user-defined module.
-# diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py deleted file mode 100755 index c9e360d751..0000000000 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ /dev/null @@ -1,872 +0,0 @@ -# -*- coding: utf-8 -*- -""" -NLP From Scratch: Translation with a Sequence to Sequence Network and Attention -******************************************************************************* -**Author**: `Sean Robertson `_ - -This is the third and final tutorial on doing "NLP From Scratch", where we -write our own classes and functions to preprocess the data to do our NLP -modeling tasks. We hope after you complete this tutorial that you'll proceed to -learn how `torchtext` can handle much of this preprocessing for you in the -three tutorials immediately following this one. - -In this project we will be teaching a neural network to translate from -French to English. - -.. code-block:: sh - - [KEY: > input, = target, < output] - - > il est en train de peindre un tableau . - = he is painting a picture . - < he is painting a picture . - - > pourquoi ne pas essayer ce vin delicieux ? - = why not try that delicious wine ? - < why not try that delicious wine ? - - > elle n est pas poete mais romanciere . - = she is not a poet but a novelist . - < she not not a poet but a novelist . - - > vous etes trop maigre . - = you re too skinny . - < you re all alone . - -... to varying degrees of success. - -This is made possible by the simple but powerful idea of the `sequence -to sequence network `__, in which two -recurrent neural networks work together to transform one sequence to -another. An encoder network condenses an input sequence into a vector, -and a decoder network unfolds that vector into a new sequence. - -.. figure:: /_static/img/seq-seq-images/seq2seq.png - :alt: - -To improve upon this model we'll use an `attention -mechanism `__, which lets the decoder -learn to focus over a specific range of the input sequence. - -**Recommended Reading:** - -I assume you have at least installed PyTorch, know Python, and -understand Tensors: - -- https://pytorch.org/ For installation instructions -- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general -- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview -- :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user - - -It would also be useful to know about Sequence to Sequence networks and -how they work: - -- `Learning Phrase Representations using RNN Encoder-Decoder for - Statistical Machine Translation `__ -- `Sequence to Sequence Learning with Neural - Networks `__ -- `Neural Machine Translation by Jointly Learning to Align and - Translate `__ -- `A Neural Conversational Model `__ - -You will also find the previous tutorials on -:doc:`/intermediate/char_rnn_classification_tutorial` -and :doc:`/intermediate/char_rnn_generation_tutorial` -helpful as those concepts are very similar to the Encoder and Decoder -models, respectively. 
- -**Requirements** -""" -from __future__ import unicode_literals, print_function, division -from io import open -import unicodedata -import re -import random - -import torch -import torch.nn as nn -from torch import optim -import torch.nn.functional as F - -import numpy as np -from torch.utils.data import TensorDataset, DataLoader, RandomSampler - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -###################################################################### -# Loading data files -# ================== -# -# The data for this project is a set of many thousands of English to -# French translation pairs. -# -# `This question on Open Data Stack -# Exchange `__ -# pointed me to the open translation site https://tatoeba.org/ which has -# downloads available at https://tatoeba.org/eng/downloads - and better -# yet, someone did the extra work of splitting language pairs into -# individual text files here: https://www.manythings.org/anki/ -# -# The English to French pairs are too big to include in the repository, so -# download to ``data/eng-fra.txt`` before continuing. The file is a tab -# separated list of translation pairs: -# -# .. code-block:: sh -# -# I am cold. J'ai froid. -# -# .. note:: -# Download the data from -# `here `_ -# and extract it to the current directory. - -###################################################################### -# Similar to the character encoding used in the character-level RNN -# tutorials, we will be representing each word in a language as a one-hot -# vector, or giant vector of zeros except for a single one (at the index -# of the word). Compared to the dozens of characters that might exist in a -# language, there are many many more words, so the encoding vector is much -# larger. We will however cheat a bit and trim the data to only use a few -# thousand words per language. -# -# .. figure:: /_static/img/seq-seq-images/word-encoding.png -# :alt: -# -# - - -###################################################################### -# We'll need a unique index per word to use as the inputs and targets of -# the networks later. To keep track of all this we will use a helper class -# called ``Lang`` which has word → index (``word2index``) and index → word -# (``index2word``) dictionaries, as well as a count of each word -# ``word2count`` which will be used to replace rare words later. -# - -SOS_token = 0 -EOS_token = 1 - -class Lang: - def __init__(self, name): - self.name = name - self.word2index = {} - self.word2count = {} - self.index2word = {0: "SOS", 1: "EOS"} - self.n_words = 2 # Count SOS and EOS - - def addSentence(self, sentence): - for word in sentence.split(' '): - self.addWord(word) - - def addWord(self, word): - if word not in self.word2index: - self.word2index[word] = self.n_words - self.word2count[word] = 1 - self.index2word[self.n_words] = word - self.n_words += 1 - else: - self.word2count[word] += 1 - - -###################################################################### -# The files are all in Unicode, to simplify we will turn Unicode -# characters to ASCII, make everything lowercase, and trim most -# punctuation. 
-# - -# Turn a Unicode string to plain ASCII, thanks to -# https://stackoverflow.com/a/518232/2809427 -def unicodeToAscii(s): - return ''.join( - c for c in unicodedata.normalize('NFD', s) - if unicodedata.category(c) != 'Mn' - ) - -# Lowercase, trim, and remove non-letter characters -def normalizeString(s): - s = unicodeToAscii(s.lower().strip()) - s = re.sub(r"([.!?])", r" \1", s) - s = re.sub(r"[^a-zA-Z!?]+", r" ", s) - return s.strip() - - -###################################################################### -# To read the data file we will split the file into lines, and then split -# lines into pairs. The files are all English → Other Language, so if we -# want to translate from Other Language → English I added the ``reverse`` -# flag to reverse the pairs. -# - -def readLangs(lang1, lang2, reverse=False): - print("Reading lines...") - - # Read the file and split into lines - lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\ - read().strip().split('\n') - - # Split every line into pairs and normalize - pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] - - # Reverse pairs, make Lang instances - if reverse: - pairs = [list(reversed(p)) for p in pairs] - input_lang = Lang(lang2) - output_lang = Lang(lang1) - else: - input_lang = Lang(lang1) - output_lang = Lang(lang2) - - return input_lang, output_lang, pairs - - -###################################################################### -# Since there are a *lot* of example sentences and we want to train -# something quickly, we'll trim the data set to only relatively short and -# simple sentences. Here the maximum length is 10 words (that includes -# ending punctuation) and we're filtering to sentences that translate to -# the form "I am" or "He is" etc. (accounting for apostrophes replaced -# earlier). -# - -MAX_LENGTH = 10 - -eng_prefixes = ( - "i am ", "i m ", - "he is", "he s ", - "she is", "she s ", - "you are", "you re ", - "we are", "we re ", - "they are", "they re " -) - -def filterPair(p): - return len(p[0].split(' ')) < MAX_LENGTH and \ - len(p[1].split(' ')) < MAX_LENGTH and \ - p[1].startswith(eng_prefixes) - - -def filterPairs(pairs): - return [pair for pair in pairs if filterPair(pair)] - - -###################################################################### -# The full process for preparing the data is: -# -# - Read text file and split into lines, split lines into pairs -# - Normalize text, filter by length and content -# - Make word lists from sentences in pairs -# - -def prepareData(lang1, lang2, reverse=False): - input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) - print("Read %s sentence pairs" % len(pairs)) - pairs = filterPairs(pairs) - print("Trimmed to %s sentence pairs" % len(pairs)) - print("Counting words...") - for pair in pairs: - input_lang.addSentence(pair[0]) - output_lang.addSentence(pair[1]) - print("Counted words:") - print(input_lang.name, input_lang.n_words) - print(output_lang.name, output_lang.n_words) - return input_lang, output_lang, pairs - -input_lang, output_lang, pairs = prepareData('eng', 'fra', True) -print(random.choice(pairs)) - - -###################################################################### -# The Seq2Seq Model -# ================= -# -# A Recurrent Neural Network, or RNN, is a network that operates on a -# sequence and uses its own output as input for subsequent steps. 
-# -# A `Sequence to Sequence network `__, or -# seq2seq network, or `Encoder Decoder -# network `__, is a model -# consisting of two RNNs called the encoder and decoder. The encoder reads -# an input sequence and outputs a single vector, and the decoder reads -# that vector to produce an output sequence. -# -# .. figure:: /_static/img/seq-seq-images/seq2seq.png -# :alt: -# -# Unlike sequence prediction with a single RNN, where every input -# corresponds to an output, the seq2seq model frees us from sequence -# length and order, which makes it ideal for translation between two -# languages. -# -# Consider the sentence ``Je ne suis pas le chat noir`` → ``I am not the -# black cat``. Most of the words in the input sentence have a direct -# translation in the output sentence, but are in slightly different -# orders, e.g. ``chat noir`` and ``black cat``. Because of the ``ne/pas`` -# construction there is also one more word in the input sentence. It would -# be difficult to produce a correct translation directly from the sequence -# of input words. -# -# With a seq2seq model the encoder creates a single vector which, in the -# ideal case, encodes the "meaning" of the input sequence into a single -# vector — a single point in some N dimensional space of sentences. -# - - -###################################################################### -# The Encoder -# ----------- -# -# The encoder of a seq2seq network is a RNN that outputs some value for -# every word from the input sentence. For every input word the encoder -# outputs a vector and a hidden state, and uses the hidden state for the -# next input word. -# -# .. figure:: /_static/img/seq-seq-images/encoder-network.png -# :alt: -# -# - -class EncoderRNN(nn.Module): - def __init__(self, input_size, hidden_size, dropout_p=0.1): - super(EncoderRNN, self).__init__() - self.hidden_size = hidden_size - - self.embedding = nn.Embedding(input_size, hidden_size) - self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) - self.dropout = nn.Dropout(dropout_p) - - def forward(self, input): - embedded = self.dropout(self.embedding(input)) - output, hidden = self.gru(embedded) - return output, hidden - -###################################################################### -# The Decoder -# ----------- -# -# The decoder is another RNN that takes the encoder output vector(s) and -# outputs a sequence of words to create the translation. -# - - -###################################################################### -# Simple Decoder -# ^^^^^^^^^^^^^^ -# -# In the simplest seq2seq decoder we use only last output of the encoder. -# This last output is sometimes called the *context vector* as it encodes -# context from the entire sequence. This context vector is used as the -# initial hidden state of the decoder. -# -# At every step of decoding, the decoder is given an input token and -# hidden state. The initial input token is the start-of-string ```` -# token, and the first hidden state is the context vector (the encoder's -# last hidden state). -# -# .. 
figure:: /_static/img/seq-seq-images/decoder-network.png -# :alt: -# -# - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, output_size): - super(DecoderRNN, self).__init__() - self.embedding = nn.Embedding(output_size, hidden_size) - self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) - self.out = nn.Linear(hidden_size, output_size) - - def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): - batch_size = encoder_outputs.size(0) - decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token) - decoder_hidden = encoder_hidden - decoder_outputs = [] - - for i in range(MAX_LENGTH): - decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden) - decoder_outputs.append(decoder_output) - - if target_tensor is not None: - # Teacher forcing: Feed the target as the next input - decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing - else: - # Without teacher forcing: use its own predictions as the next input - _, topi = decoder_output.topk(1) - decoder_input = topi.squeeze(-1).detach() # detach from history as input - - decoder_outputs = torch.cat(decoder_outputs, dim=1) - decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) - return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop - - def forward_step(self, input, hidden): - output = self.embedding(input) - output = F.relu(output) - output, hidden = self.gru(output, hidden) - output = self.out(output) - return output, hidden - -###################################################################### -# I encourage you to train and observe the results of this model, but to -# save space we'll be going straight for the gold and introducing the -# Attention Mechanism. -# - - -###################################################################### -# Attention Decoder -# ^^^^^^^^^^^^^^^^^ -# -# If only the context vector is passed between the encoder and decoder, -# that single vector carries the burden of encoding the entire sentence. -# -# Attention allows the decoder network to "focus" on a different part of -# the encoder's outputs for every step of the decoder's own outputs. First -# we calculate a set of *attention weights*. These will be multiplied by -# the encoder output vectors to create a weighted combination. The result -# (called ``attn_applied`` in the code) should contain information about -# that specific part of the input sequence, and thus help the decoder -# choose the right output words. -# -# .. figure:: https://i.imgur.com/1152PYf.png -# :alt: -# -# Calculating the attention weights is done with another feed-forward -# layer ``attn``, using the decoder's input and hidden state as inputs. -# Because there are sentences of all sizes in the training data, to -# actually create and train this layer we have to choose a maximum -# sentence length (input length, for encoder outputs) that it can apply -# to. Sentences of the maximum length will use all the attention weights, -# while shorter sentences will only use the first few. -# -# .. figure:: /_static/img/seq-seq-images/attention-decoder-network.png -# :alt: -# -# -# Bahdanau attention, also known as additive attention, is a commonly used -# attention mechanism in sequence-to-sequence models, particularly in neural -# machine translation tasks. It was introduced by Bahdanau et al. in their -# paper titled `Neural Machine Translation by Jointly Learning to Align and Translate `__. 
-# This attention mechanism employs a learned alignment model to compute attention -# scores between the encoder and decoder hidden states. It utilizes a feed-forward -# neural network to calculate alignment scores. -# -# However, there are alternative attention mechanisms available, such as Luong attention, -# which computes attention scores by taking the dot product between the decoder hidden -# state and the encoder hidden states. It does not involve the non-linear transformation -# used in Bahdanau attention. -# -# In this tutorial, we will be using Bahdanau attention. However, it would be a valuable -# exercise to explore modifying the attention mechanism to use Luong attention. - -class BahdanauAttention(nn.Module): - def __init__(self, hidden_size): - super(BahdanauAttention, self).__init__() - self.Wa = nn.Linear(hidden_size, hidden_size) - self.Ua = nn.Linear(hidden_size, hidden_size) - self.Va = nn.Linear(hidden_size, 1) - - def forward(self, query, keys): - scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys))) - scores = scores.squeeze(2).unsqueeze(1) - - weights = F.softmax(scores, dim=-1) - context = torch.bmm(weights, keys) - - return context, weights - -class AttnDecoderRNN(nn.Module): - def __init__(self, hidden_size, output_size, dropout_p=0.1): - super(AttnDecoderRNN, self).__init__() - self.embedding = nn.Embedding(output_size, hidden_size) - self.attention = BahdanauAttention(hidden_size) - self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True) - self.out = nn.Linear(hidden_size, output_size) - self.dropout = nn.Dropout(dropout_p) - - def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): - batch_size = encoder_outputs.size(0) - decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token) - decoder_hidden = encoder_hidden - decoder_outputs = [] - attentions = [] - - for i in range(MAX_LENGTH): - decoder_output, decoder_hidden, attn_weights = self.forward_step( - decoder_input, decoder_hidden, encoder_outputs - ) - decoder_outputs.append(decoder_output) - attentions.append(attn_weights) - - if target_tensor is not None: - # Teacher forcing: Feed the target as the next input - decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing - else: - # Without teacher forcing: use its own predictions as the next input - _, topi = decoder_output.topk(1) - decoder_input = topi.squeeze(-1).detach() # detach from history as input - - decoder_outputs = torch.cat(decoder_outputs, dim=1) - decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) - attentions = torch.cat(attentions, dim=1) - - return decoder_outputs, decoder_hidden, attentions - - - def forward_step(self, input, hidden, encoder_outputs): - embedded = self.dropout(self.embedding(input)) - - query = hidden.permute(1, 0, 2) - context, attn_weights = self.attention(query, encoder_outputs) - input_gru = torch.cat((embedded, context), dim=2) - - output, hidden = self.gru(input_gru, hidden) - output = self.out(output) - - return output, hidden, attn_weights - - -###################################################################### -# .. note:: There are other forms of attention that work around the length -# limitation by using a relative position approach. Read about "local -# attention" in `Effective Approaches to Attention-based Neural Machine -# Translation `__. 
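######################################################################
# The text above suggests exploring Luong-style attention as an exercise. The
# class below is one possible sketch of it (it is not used anywhere else in
# this tutorial): the scores come from a plain dot product between the decoder
# query and the encoder outputs, so it can act as a drop-in replacement for
# ``BahdanauAttention`` with the same ``forward(query, keys)`` signature.

class LuongDotAttention(nn.Module):
    def forward(self, query, keys):
        # query: (batch, 1, hidden_size), keys: (batch, seq_len, hidden_size)
        scores = torch.bmm(query, keys.transpose(1, 2))  # (batch, 1, seq_len)
        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)               # (batch, 1, hidden_size)
        return context, weights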
-# -# Training -# ======== -# -# Preparing Training Data -# ----------------------- -# -# To train, for each pair we will need an input tensor (indexes of the -# words in the input sentence) and target tensor (indexes of the words in -# the target sentence). While creating these vectors we will append the -# EOS token to both sequences. -# - -def indexesFromSentence(lang, sentence): - return [lang.word2index[word] for word in sentence.split(' ')] - -def tensorFromSentence(lang, sentence): - indexes = indexesFromSentence(lang, sentence) - indexes.append(EOS_token) - return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1) - -def tensorsFromPair(pair): - input_tensor = tensorFromSentence(input_lang, pair[0]) - target_tensor = tensorFromSentence(output_lang, pair[1]) - return (input_tensor, target_tensor) - -def get_dataloader(batch_size): - input_lang, output_lang, pairs = prepareData('eng', 'fra', True) - - n = len(pairs) - input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32) - target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32) - - for idx, (inp, tgt) in enumerate(pairs): - inp_ids = indexesFromSentence(input_lang, inp) - tgt_ids = indexesFromSentence(output_lang, tgt) - inp_ids.append(EOS_token) - tgt_ids.append(EOS_token) - input_ids[idx, :len(inp_ids)] = inp_ids - target_ids[idx, :len(tgt_ids)] = tgt_ids - - train_data = TensorDataset(torch.LongTensor(input_ids).to(device), - torch.LongTensor(target_ids).to(device)) - - train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) - return input_lang, output_lang, train_dataloader - - -###################################################################### -# Training the Model -# ------------------ -# -# To train we run the input sentence through the encoder, and keep track -# of every output and the latest hidden state. Then the decoder is given -# the ```` token as its first input, and the last hidden state of the -# encoder as its first hidden state. -# -# "Teacher forcing" is the concept of using the real target outputs as -# each next input, instead of using the decoder's guess as the next input. -# Using teacher forcing causes it to converge faster but `when the trained -# network is exploited, it may exhibit -# instability `__. -# -# You can observe outputs of teacher-forced networks that read with -# coherent grammar but wander far from the correct translation - -# intuitively it has learned to represent the output grammar and can "pick -# up" the meaning once the teacher tells it the first few words, but it -# has not properly learned how to create the sentence from the translation -# in the first place. -# -# Because of the freedom PyTorch's autograd gives us, we can randomly -# choose to use teacher forcing or not with a simple if statement. Turn -# ``teacher_forcing_ratio`` up to use more of it. 
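######################################################################
# Note that the decoders defined above teacher-force whenever a
# ``target_tensor`` is supplied, and the ``train_epoch`` function below always
# supplies it. One way the random choice described above could be wired in is
# sketched here; ``teacher_forcing_ratio`` is a hypothetical knob that is not
# defined elsewhere in this tutorial.
#
# .. code-block:: python
#
#    teacher_forcing_ratio = 0.5
#    use_teacher_forcing = random.random() < teacher_forcing_ratio
#    decoder_outputs, _, _ = decoder(
#        encoder_outputs,
#        encoder_hidden,
#        target_tensor if use_teacher_forcing else None,
#    )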
-# - -def train_epoch(dataloader, encoder, decoder, encoder_optimizer, - decoder_optimizer, criterion): - - total_loss = 0 - for data in dataloader: - input_tensor, target_tensor = data - - encoder_optimizer.zero_grad() - decoder_optimizer.zero_grad() - - encoder_outputs, encoder_hidden = encoder(input_tensor) - decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor) - - loss = criterion( - decoder_outputs.view(-1, decoder_outputs.size(-1)), - target_tensor.view(-1) - ) - loss.backward() - - encoder_optimizer.step() - decoder_optimizer.step() - - total_loss += loss.item() - - return total_loss / len(dataloader) - - -###################################################################### -# This is a helper function to print time elapsed and estimated time -# remaining given the current time and progress %. -# - -import time -import math - -def asMinutes(s): - m = math.floor(s / 60) - s -= m * 60 - return '%dm %ds' % (m, s) - -def timeSince(since, percent): - now = time.time() - s = now - since - es = s / (percent) - rs = es - s - return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) - - -###################################################################### -# The whole training process looks like this: -# -# - Start a timer -# - Initialize optimizers and criterion -# - Create set of training pairs -# - Start empty losses array for plotting -# -# Then we call ``train`` many times and occasionally print the progress (% -# of examples, time so far, estimated time) and average loss. -# - -def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, - print_every=100, plot_every=100): - start = time.time() - plot_losses = [] - print_loss_total = 0 # Reset every print_every - plot_loss_total = 0 # Reset every plot_every - - encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate) - decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate) - criterion = nn.NLLLoss() - - for epoch in range(1, n_epochs + 1): - loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) - print_loss_total += loss - plot_loss_total += loss - - if epoch % print_every == 0: - print_loss_avg = print_loss_total / print_every - print_loss_total = 0 - print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs), - epoch, epoch / n_epochs * 100, print_loss_avg)) - - if epoch % plot_every == 0: - plot_loss_avg = plot_loss_total / plot_every - plot_losses.append(plot_loss_avg) - plot_loss_total = 0 - - showPlot(plot_losses) - -###################################################################### -# Plotting results -# ---------------- -# -# Plotting is done with matplotlib, using the array of loss values -# ``plot_losses`` saved while training. -# - -import matplotlib.pyplot as plt -plt.switch_backend('agg') -import matplotlib.ticker as ticker -import numpy as np - -def showPlot(points): - plt.figure() - fig, ax = plt.subplots() - # this locator puts ticks at regular intervals - loc = ticker.MultipleLocator(base=0.2) - ax.yaxis.set_major_locator(loc) - plt.plot(points) - - -###################################################################### -# Evaluation -# ========== -# -# Evaluation is mostly the same as training, but there are no targets so -# we simply feed the decoder's predictions back to itself for each step. -# Every time it predicts a word we add it to the output string, and if it -# predicts the EOS token we stop there. We also store the decoder's -# attention outputs for display later. 
-# - -def evaluate(encoder, decoder, sentence, input_lang, output_lang): - with torch.no_grad(): - input_tensor = tensorFromSentence(input_lang, sentence) - - encoder_outputs, encoder_hidden = encoder(input_tensor) - decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden) - - _, topi = decoder_outputs.topk(1) - decoded_ids = topi.squeeze() - - decoded_words = [] - for idx in decoded_ids: - if idx.item() == EOS_token: - decoded_words.append('') - break - decoded_words.append(output_lang.index2word[idx.item()]) - return decoded_words, decoder_attn - - -###################################################################### -# We can evaluate random sentences from the training set and print out the -# input, target, and output to make some subjective quality judgements: -# - -def evaluateRandomly(encoder, decoder, n=10): - for i in range(n): - pair = random.choice(pairs) - print('>', pair[0]) - print('=', pair[1]) - output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang) - output_sentence = ' '.join(output_words) - print('<', output_sentence) - print('') - - -###################################################################### -# Training and Evaluating -# ======================= -# -# With all these helper functions in place (it looks like extra work, but -# it makes it easier to run multiple experiments) we can actually -# initialize a network and start training. -# -# Remember that the input sentences were heavily filtered. For this small -# dataset we can use relatively small networks of 256 hidden nodes and a -# single GRU layer. After about 40 minutes on a MacBook CPU we'll get some -# reasonable results. -# -# .. note:: -# If you run this notebook you can train, interrupt the kernel, -# evaluate, and continue training later. Comment out the lines where the -# encoder and decoder are initialized and run ``trainIters`` again. -# - -hidden_size = 128 -batch_size = 32 - -input_lang, output_lang, train_dataloader = get_dataloader(batch_size) - -encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device) -decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device) - -train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5) - -###################################################################### -# -# Set dropout layers to ``eval`` mode -encoder.eval() -decoder.eval() -evaluateRandomly(encoder, decoder) - - -###################################################################### -# Visualizing Attention -# --------------------- -# -# A useful property of the attention mechanism is its highly interpretable -# outputs. Because it is used to weight specific encoder outputs of the -# input sequence, we can imagine looking where the network is focused most -# at each time step. -# -# You could simply run ``plt.matshow(attentions)`` to see attention output -# displayed as a matrix. 
For a better viewing experience we will do the -# extra work of adding axes and labels: -# - -def showAttention(input_sentence, output_words, attentions): - fig = plt.figure() - ax = fig.add_subplot(111) - cax = ax.matshow(attentions.cpu().numpy(), cmap='bone') - fig.colorbar(cax) - - # Set up axes - ax.set_xticklabels([''] + input_sentence.split(' ') + - [''], rotation=90) - ax.set_yticklabels([''] + output_words) - - # Show label at every tick - ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) - ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) - - plt.show() - - -def evaluateAndShowAttention(input_sentence): - output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang) - print('input =', input_sentence) - print('output =', ' '.join(output_words)) - showAttention(input_sentence, output_words, attentions[0, :len(output_words), :]) - - -evaluateAndShowAttention('il n est pas aussi grand que son pere') - -evaluateAndShowAttention('je suis trop fatigue pour conduire') - -evaluateAndShowAttention('je suis desole si c est une question idiote') - -evaluateAndShowAttention('je suis reellement fiere de vous') - - -###################################################################### -# Exercises -# ========= -# -# - Try with a different dataset -# -# - Another language pair -# - Human → Machine (e.g. IOT commands) -# - Chat → Response -# - Question → Answer -# -# - Replace the embeddings with pretrained word embeddings such as ``word2vec`` or -# ``GloVe`` -# - Try with more layers, more hidden units, and more sentences. Compare -# the training time and results. -# - If you use a translation file where pairs have two of the same phrase -# (``I am test \t I am test``), you can use this as an autoencoder. Try -# this: -# -# - Train as an autoencoder -# - Save only the Encoder network -# - Train a new Decoder for translation from there -# diff --git a/intermediate_source/spatial_transformer_tutorial.py b/intermediate_source/spatial_transformer_tutorial.py deleted file mode 100644 index 99efe41b39..0000000000 --- a/intermediate_source/spatial_transformer_tutorial.py +++ /dev/null @@ -1,257 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Spatial Transformer Networks Tutorial -===================================== -**Author**: `Ghassen HAMROUNI `_ - -.. figure:: /_static/img/stn/FSeq.png - -In this tutorial, you will learn how to augment your network using -a visual attention mechanism called spatial transformer -networks. You can read more about the spatial transformer -networks in the `DeepMind paper `__ - -Spatial transformer networks are a generalization of differentiable -attention to any spatial transformation. Spatial transformer networks -(STN for short) allow a neural network to learn how to perform spatial -transformations on the input image in order to enhance the geometric -invariance of the model. -For example, it can crop a region of interest, scale and correct -the orientation of an image. It can be a useful mechanism because CNNs -are not invariant to rotation and scale and more general affine -transformations. - -One of the best things about STN is the ability to simply plug it into -any existing CNN with very little modification. 
-""" -# License: BSD -# Author: Ghassen Hamrouni - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torchvision -from torchvision import datasets, transforms -import matplotlib.pyplot as plt -import numpy as np - -plt.ion() # interactive mode - -###################################################################### -# Loading the data -# ---------------- -# -# In this post we experiment with the classic MNIST dataset. Using a -# standard convolutional network augmented with a spatial transformer -# network. - -from six.moves import urllib -opener = urllib.request.build_opener() -opener.addheaders = [('User-agent', 'Mozilla/5.0')] -urllib.request.install_opener(opener) - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -# Training dataset -train_loader = torch.utils.data.DataLoader( - datasets.MNIST(root='.', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), batch_size=64, shuffle=True, num_workers=4) -# Test dataset -test_loader = torch.utils.data.DataLoader( - datasets.MNIST(root='.', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), batch_size=64, shuffle=True, num_workers=4) - -###################################################################### -# Depicting spatial transformer networks -# -------------------------------------- -# -# Spatial transformer networks boils down to three main components : -# -# - The localization network is a regular CNN which regresses the -# transformation parameters. The transformation is never learned -# explicitly from this dataset, instead the network learns automatically -# the spatial transformations that enhances the global accuracy. -# - The grid generator generates a grid of coordinates in the input -# image corresponding to each pixel from the output image. -# - The sampler uses the parameters of the transformation and applies -# it to the input image. -# -# .. figure:: /_static/img/stn/stn-arch.png -# -# .. note:: -# We need the latest version of PyTorch that contains -# affine_grid and grid_sample modules. 
-# - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - # Spatial transformer localization-network - self.localization = nn.Sequential( - nn.Conv2d(1, 8, kernel_size=7), - nn.MaxPool2d(2, stride=2), - nn.ReLU(True), - nn.Conv2d(8, 10, kernel_size=5), - nn.MaxPool2d(2, stride=2), - nn.ReLU(True) - ) - - # Regressor for the 3 * 2 affine matrix - self.fc_loc = nn.Sequential( - nn.Linear(10 * 3 * 3, 32), - nn.ReLU(True), - nn.Linear(32, 3 * 2) - ) - - # Initialize the weights/bias with identity transformation - self.fc_loc[2].weight.data.zero_() - self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float)) - - # Spatial transformer network forward function - def stn(self, x): - xs = self.localization(x) - xs = xs.view(-1, 10 * 3 * 3) - theta = self.fc_loc(xs) - theta = theta.view(-1, 2, 3) - - grid = F.affine_grid(theta, x.size()) - x = F.grid_sample(x, grid) - - return x - - def forward(self, x): - # transform the input - x = self.stn(x) - - # Perform the usual forward pass - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - - -model = Net().to(device) - -###################################################################### -# Training the model -# ------------------ -# -# Now, let's use the SGD algorithm to train the model. The network is -# learning the classification task in a supervised way. In the same time -# the model is learning STN automatically in an end-to-end fashion. - - -optimizer = optim.SGD(model.parameters(), lr=0.01) - - -def train(epoch): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(device), target.to(device) - - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % 500 == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) -# -# A simple test procedure to measure the STN performances on MNIST. -# - - -def test(): - with torch.no_grad(): - model.eval() - test_loss = 0 - correct = 0 - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n' - .format(test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) - -###################################################################### -# Visualizing the STN results -# --------------------------- -# -# Now, we will inspect the results of our learned visual attention -# mechanism. -# -# We define a small helper function in order to visualize the -# transformations while training. 
- - -def convert_image_np(inp): - """Convert a Tensor to numpy image.""" - inp = inp.numpy().transpose((1, 2, 0)) - mean = np.array([0.485, 0.456, 0.406]) - std = np.array([0.229, 0.224, 0.225]) - inp = std * inp + mean - inp = np.clip(inp, 0, 1) - return inp - -# We want to visualize the output of the spatial transformers layer -# after the training, we visualize a batch of input images and -# the corresponding transformed batch using STN. - - -def visualize_stn(): - with torch.no_grad(): - # Get a batch of training data - data = next(iter(test_loader))[0].to(device) - - input_tensor = data.cpu() - transformed_input_tensor = model.stn(data).cpu() - - in_grid = convert_image_np( - torchvision.utils.make_grid(input_tensor)) - - out_grid = convert_image_np( - torchvision.utils.make_grid(transformed_input_tensor)) - - # Plot the results side-by-side - f, axarr = plt.subplots(1, 2) - axarr[0].imshow(in_grid) - axarr[0].set_title('Dataset Images') - - axarr[1].imshow(out_grid) - axarr[1].set_title('Transformed Images') - -for epoch in range(1, 20 + 1): - train(epoch) - test() - -# Visualize the STN transformation on some input batch -visualize_stn() - -plt.ioff() -plt.show() diff --git a/intermediate_source/speech_recognition_pipeline_tutorial.rst b/intermediate_source/speech_recognition_pipeline_tutorial.rst deleted file mode 100644 index 4ec497b3bd..0000000000 --- a/intermediate_source/speech_recognition_pipeline_tutorial.rst +++ /dev/null @@ -1,10 +0,0 @@ -Speech Recognition with Wav2Vec2 -================================ - -This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/speech_recognition_pipeline_tutorial.html - -It will redirect in 3 seconds. - -.. raw:: html - - diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py deleted file mode 100644 index 00bdcfbf07..0000000000 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ /dev/null @@ -1,501 +0,0 @@ -""" -PyTorch Profiler With TensorBoard -==================================== -This tutorial demonstrates how to use TensorBoard plugin with PyTorch Profiler -to detect performance bottlenecks of the model. - -Introduction ------------- -PyTorch 1.8 includes an updated profiler API capable of -recording the CPU side operations as well as the CUDA kernel launches on the GPU side. -The profiler can visualize this information -in TensorBoard Plugin and provide analysis of the performance bottlenecks. - -In this tutorial, we will use a simple Resnet model to demonstrate how to -use TensorBoard plugin to analyze model performance. - -Setup ------ -To install ``torch`` and ``torchvision`` use the following command: - -.. code-block:: - - pip install torch torchvision - - -""" - - -###################################################################### -# Steps -# ----- -# -# 1. Prepare the data and model -# 2. Use profiler to record execution events -# 3. Run the profiler -# 4. Use TensorBoard to view results and analyze model performance -# 5. Improve performance with the help of profiler -# 6. Analyze performance with other advanced features -# 7. Additional Practices: Profiling PyTorch on AMD GPUs -# -# 1. 
Prepare the data and model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# First, import all necessary libraries: -# - -import torch -import torch.nn -import torch.optim -import torch.profiler -import torch.utils.data -import torchvision.datasets -import torchvision.models -import torchvision.transforms as T - -###################################################################### -# Then prepare the input data. For this tutorial, we use the CIFAR10 dataset. -# Transform it to the desired format and use ``DataLoader`` to load each batch. - -transform = T.Compose( - [T.Resize(224), - T.ToTensor(), - T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) -train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) -train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True) - -###################################################################### -# Next, create Resnet model, loss function, and optimizer objects. -# To run on GPU, move model and loss to GPU device. - -device = torch.device("cuda:0") -model = torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device) -criterion = torch.nn.CrossEntropyLoss().cuda(device) -optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) -model.train() - - -###################################################################### -# Define the training step for each batch of input data. - -def train(data): - inputs, labels = data[0].to(device=device), data[1].to(device=device) - outputs = model(inputs) - loss = criterion(outputs, labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - -###################################################################### -# 2. Use profiler to record execution events -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# The profiler is enabled through the context manager and accepts several parameters, -# some of the most useful are: -# -# - ``schedule`` - callable that takes step (int) as a single parameter -# and returns the profiler action to perform at each step. -# -# In this example with ``wait=1, warmup=1, active=3, repeat=1``, -# profiler will skip the first step/iteration, -# start warming up on the second, -# record the following three iterations, -# after which the trace will become available and on_trace_ready (when set) is called. -# In total, the cycle repeats once. Each cycle is called a "span" in TensorBoard plugin. -# -# During ``wait`` steps, the profiler is disabled. -# During ``warmup`` steps, the profiler starts tracing but the results are discarded. -# This is for reducing the profiling overhead. -# The overhead at the beginning of profiling is high and easy to bring skew to the profiling result. -# During ``active`` steps, the profiler works and records events. -# - ``on_trace_ready`` - callable that is called at the end of each cycle; -# In this example we use ``torch.profiler.tensorboard_trace_handler`` to generate result files for TensorBoard. -# After profiling, result files will be saved into the ``./log/resnet18`` directory. -# Specify this directory as a ``logdir`` parameter to analyze profile in TensorBoard. -# - ``record_shapes`` - whether to record shapes of the operator inputs. -# - ``profile_memory`` - Track tensor memory allocation/deallocation. Note, for old version of pytorch with version -# before 1.10, if you suffer long profiling time, please disable it or upgrade to new version. 
-# - ``with_stack`` - Record source information (file and line number) for the ops. -# If the TensorBoard is launched in VS Code (`reference `_), -# clicking a stack frame will navigate to the specific code line. - -with torch.profiler.profile( - schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), - on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'), - record_shapes=True, - profile_memory=True, - with_stack=True -) as prof: - for step, batch_data in enumerate(train_loader): - prof.step() # Need to call this at each step to notify profiler of steps' boundary. - if step >= 1 + 1 + 3: - break - train(batch_data) - -###################################################################### -# Alternatively, the following non-context manager start/stop is supported as well. -prof = torch.profiler.profile( - schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), - on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'), - record_shapes=True, - with_stack=True) -prof.start() -for step, batch_data in enumerate(train_loader): - prof.step() - if step >= 1 + 1 + 3: - break - train(batch_data) -prof.stop() - -###################################################################### -# 3. Run the profiler -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Run the above code. The profiling result will be saved under ``./log/resnet18`` directory. - - -###################################################################### -# 4. Use TensorBoard to view results and analyze model performance -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. note:: -# TensorBoard Plugin support has been deprecated, so some of these functions may not -# work as previously. Please take a look at the replacement, `HTA `_. -# -# Install PyTorch Profiler TensorBoard Plugin. -# -# .. code-block:: -# -# pip install torch_tb_profiler -# - -###################################################################### -# Launch the TensorBoard. -# -# .. code-block:: -# -# tensorboard --logdir=./log -# - -###################################################################### -# Open the TensorBoard profile URL in Google Chrome browser or Microsoft Edge browser (**Safari is not supported**). -# -# .. code-block:: -# -# http://localhost:6006/#pytorch_profiler -# - -###################################################################### -# You could see Profiler plugin page as shown below. -# -# - Overview -# .. image:: ../../_static/img/profiler_overview1.png -# :scale: 25 % -# -# The overview shows a high-level summary of model performance. -# -# The "GPU Summary" panel shows the GPU configuration, GPU usage and Tensor Cores usage. -# In this example, the GPU Utilization is low. -# The details of these metrics are `here `_. -# -# The "Step Time Breakdown" shows distribution of time spent in each step over different categories of execution. -# In this example, you can see the ``DataLoader`` overhead is significant. -# -# The bottom "Performance Recommendation" uses the profiling data -# to automatically highlight likely bottlenecks, -# and gives you actionable optimization suggestions. -# -# You can change the view page in left "Views" dropdown list. -# -# .. image:: ../../_static/img/profiler_views_list.png -# :alt: -# -# -# - Operator view -# The operator view displays the performance of every PyTorch operator -# that is executed either on the host or device. -# -# .. 
image:: ../../_static/img/profiler_operator_view.png -# :scale: 25 % -# The "Self" duration does not include its child operators’ time. -# The "Total" duration includes its child operators’ time. -# -# - View call stack -# Click the ``View Callstack`` of an operator, the operators with same name but different call stacks will be shown. -# Then click a ``View Callstack`` in this sub-table, the call stack frames will be shown. -# -# .. image:: ../../_static/img/profiler_callstack.png -# :scale: 25 % -# -# If the TensorBoard is launched inside VS Code -# (`Launch Guide `_), -# clicking a call stack frame will navigate to the specific code line. -# -# .. image:: ../../_static/img/profiler_vscode.png -# :scale: 25 % -# -# -# - Kernel view -# The GPU kernel view shows all kernels’ time spent on GPU. -# -# .. image:: ../../_static/img/profiler_kernel_view.png -# :scale: 25 % -# Tensor Cores Used: -# Whether this kernel uses Tensor Cores. -# -# Mean Blocks per SM: -# Blocks per SM = Blocks of this kernel / SM number of this GPU. -# If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized. -# "Mean Blocks per SM" is weighted average of all runs of this kernel name, using each run’s duration as weight. -# -# Mean Est. Achieved Occupancy: -# Est. Achieved Occupancy is defined in this column’s tooltip. -# For most cases such as memory bandwidth bounded kernels, the higher the better. -# "Mean Est. Achieved Occupancy" is weighted average of all runs of this kernel name, -# using each run’s duration as weight. -# -# - Trace view -# The trace view shows timeline of profiled operators and GPU kernels. -# You can select it to see details as below. -# -# .. image:: ../../_static/img/profiler_trace_view1.png -# :scale: 25 % -# -# You can move the graph and zoom in/out with the help of right side toolbar. -# And keyboard can also be used to zoom and move around inside the timeline. -# The ‘w’ and ‘s’ keys zoom in centered around the mouse, -# and the ‘a’ and ‘d’ keys move the timeline left and right. -# You can hit these keys multiple times until you see a readable representation. -# -# If a backward operator's "Incoming Flow" field is with value "forward correspond to backward", -# you can click the text to get its launching forward operator. -# -# .. image:: ../../_static/img/profiler_trace_view_fwd_bwd.png -# :scale: 25 % -# -# In this example, we can see the event prefixed with ``enumerate(DataLoader)`` costs a lot of time. -# And during most of this period, the GPU is idle. -# Because this function is loading data and transforming data on host side, -# during which the GPU resource is wasted. - - -###################################################################### -# 5. Improve performance with the help of profiler -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# At the bottom of "Overview" page, the suggestion in "Performance Recommendation" hints the bottleneck is ``DataLoader``. -# The PyTorch ``DataLoader`` uses single process by default. -# User could enable multi-process data loading by setting the parameter ``num_workers``. -# `Here `_ is more details. -# -# In this example, we follow the "Performance Recommendation" and set ``num_workers`` as below, -# pass a different name such as ``./log/resnet18_4workers`` to ``tensorboard_trace_handler``, and run it again. -# -# .. 
code-block:: -# -# train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4) -# - -###################################################################### -# Then let’s choose the recently profiled run in left "Runs" dropdown list. -# -# .. image:: ../../_static/img/profiler_overview2.png -# :scale: 25 % -# -# From the above view, we can find the step time is reduced to about 76ms comparing with previous run's 132ms, -# and the time reduction of ``DataLoader`` mainly contributes. -# -# .. image:: ../../_static/img/profiler_trace_view2.png -# :scale: 25 % -# -# From the above view, we can see that the runtime of ``enumerate(DataLoader)`` is reduced, -# and the GPU utilization is increased. - -###################################################################### -# 6. Analyze performance with other advanced features -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# - Memory view -# To profile memory, ``profile_memory`` must be set to ``True`` in arguments of ``torch.profiler.profile``. -# -# You can try it by using existing example on Azure -# -# .. code-block:: -# -# pip install azure-storage-blob -# tensorboard --logdir=https://torchtbprofiler.blob.core.windows.net/torchtbprofiler/demo/memory_demo_1_10 -# -# The profiler records all memory allocation/release events and allocator's internal state during profiling. -# The memory view consists of three components as shown in the following. -# -# .. image:: ../../_static/img/profiler_memory_view.png -# :scale: 25 % -# -# The components are memory curve graph, memory events table and memory statistics table, from top to bottom, respectively. -# -# The memory type could be selected in "Device" selection box. -# For example, "GPU0" means the following table only shows each operator's memory usage on GPU 0, not including CPU or other GPUs. -# -# The memory curve shows the trends of memory consumption. The "Allocated" curve shows the total memory that is actually -# in use, e.g., tensors. In PyTorch, caching mechanism is employed in CUDA allocator and some other allocators. The -# "Reserved" curve shows the total memory that is reserved by the allocator. You can left click and drag on the graph -# to select events in the desired range: -# -# .. image:: ../../_static/img/profiler_memory_curve_selecting.png -# :scale: 25 % -# -# After selection, the three components will be updated for the restricted time range, so that you can gain more -# information about it. By repeating this process, you can zoom into a very fine-grained detail. Right click on the graph -# will reset the graph to the initial state. -# -# .. image:: ../../_static/img/profiler_memory_curve_single.png -# :scale: 25 % -# -# In the memory events table, the allocation and release events are paired into one entry. The "operator" column shows -# the immediate ATen operator that is causing the allocation. Notice that in PyTorch, ATen operators commonly use -# ``aten::empty`` to allocate memory. For example, ``aten::ones`` is implemented as ``aten::empty`` followed by an -# ``aten::fill_``. Solely display the operator name as ``aten::empty`` is of little help. It will be shown as -# ``aten::ones (aten::empty)`` in this special case. The "Allocation Time", "Release Time" and "Duration" -# columns' data might be missing if the event occurs outside of the time range. 
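-#
-# If the TensorBoard plugin is unavailable (recall the deprecation note in **Step 4**),
-# the memory statistics recorded by the profiler can also be summarized directly in
-# Python. The following is a minimal sketch; it assumes the ``prof`` object from
-# **Step 2** was created with ``profile_memory=True``:
-#
-# .. code-block:: python
-#
-#    # Print the top 10 operators sorted by self CUDA memory usage.
-#    print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))
-#
-#    # For host-side allocations, sort by self CPU memory usage instead.
-#    print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))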
-# -# In the memory statistics table, the "Size Increase" column sums up all allocation size and minus all the memory -# release size, that is, the net increase of memory usage after this operator. The "Self Size Increase" column is -# similar to "Size Increase", but it does not count children operators' allocation. With regards to ATen operators' -# implementation detail, some operators might call other operators, so memory allocations can happen at any level of the -# call stack. That says, "Self Size Increase" only count the memory usage increase at current level of call stack. -# Finally, the "Allocation Size" column sums up all allocation without considering the memory release. -# -# - Distributed view -# The plugin now supports distributed view on profiling DDP with NCCL/GLOO as backend. -# -# You can try it by using existing example on Azure: -# -# .. code-block:: -# -# pip install azure-storage-blob -# tensorboard --logdir=https://torchtbprofiler.blob.core.windows.net/torchtbprofiler/demo/distributed_bert -# -# .. image:: ../../_static/img/profiler_distributed_view.png -# :scale: 25 % -# -# The "Computation/Communication Overview" shows computation/communication ratio and their overlapping degree. -# From this view, User can figure out load balance issue among workers. -# For example, if the computation + overlapping time of one worker is much larger than others, -# there may be a problem of load balance or this worker may be a straggler. -# -# The "Synchronizing/Communication Overview" shows the efficiency of communication. -# "Data Transfer Time" is the time for actual data exchanging. -# "Synchronizing Time" is the time for waiting and synchronizing with other workers. -# -# If one worker’s "Synchronizing Time" is much shorter than that of other workers’, -# this worker may be a straggler which may have more computation workload than other workers’. -# -# The "Communication Operations Stats" summarizes the detailed statistics of all communication ops in each worker. - -###################################################################### -# 7. Additional Practices: Profiling PyTorch on AMD GPUs -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# -# The AMD ROCm Platform is an open-source software stack designed for GPU computation, consisting of drivers, development tools, and APIs. -# We can run the above mentioned steps on AMD GPUs. In this section, we will use Docker to install the ROCm base development image -# before installing PyTorch. - - -###################################################################### -# For the purpose of example, let's create a directory called ``profiler_tutorial``, and save the code in **Step 1** as ``test_cifar10.py`` in this directory. -# -# .. code-block:: -# -# mkdir ~/profiler_tutorial -# cd profiler_tutorial -# vi test_cifar10.py - - -###################################################################### -# At the time of this writing, the Stable(``2.1.1``) Linux version of PyTorch on ROCm Platform is `ROCm 5.6 `_. -# -# -# - Obtain a base Docker image with the correct user-space ROCm version installed from `Docker Hub `_. -# -# It is ``rocm/dev-ubuntu-20.04:5.6``. -# -# - Start the ROCm base Docker container: -# -# -# .. 
code-block:: -# -# docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size 8G -v ~/profiler_tutorial:/profiler_tutorial rocm/dev-ubuntu-20.04:5.6 -# -# -# - Inside the container, install any dependencies needed for installing the wheels package. -# -# .. code-block:: -# -# sudo apt update -# sudo apt install libjpeg-dev python3-dev -y -# pip3 install wheel setuptools -# sudo apt install python-is-python3 -# -# -# - Install the wheels: -# -# .. code-block:: -# -# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6 -# -# -# - Install the ``torch_tb_profiler``, and then, run the Python file ``test_cifar10.py``: -# -# .. code-block:: -# -# pip install torch_tb_profiler -# cd /profiler_tutorial -# python test_cifar10.py -# -# -# Now, we have all the data needed to view in TensorBoard: -# -# .. code-block:: -# -# tensorboard --logdir=./log -# -# Choose different views as described in **Step 4**. For example, below is the **Operator** View: -# -# .. image:: ../../_static/img/profiler_rocm_tensorboard_operartor_view.png -# :scale: 25 % - - -###################################################################### -# At the time this section is written, **Trace** view does not work and it displays nothing. You can work around by typing ``chrome://tracing`` in your Chrome Browser. -# -# -# - Copy the ``trace.json`` file under ``~/profiler_tutorial/log/resnet18`` directory to the Windows. -# You may need to copy the file by using ``scp`` if the file is located in a remote location. -# -# - Click **Load** button to load the trace JSON file from the ``chrome://tracing`` page in the browser. -# -# .. image:: ../../_static/img/profiler_rocm_chrome_trace_view.png -# :scale: 25 % - - -###################################################################### -# As mentioned previously, you can move the graph and zoom in and out. -# You can also use keyboard to zoom and move around inside the timeline. -# The ``w`` and ``s`` keys zoom in centered around the mouse, -# and the ``a`` and ``d`` keys move the timeline left and right. -# You can hit these keys multiple times until you see a readable representation. - - - -###################################################################### -# Learn More -# ---------- -# -# Take a look at the following documents to continue your learning, -# and feel free to open an issue `here `_. -# -# - `PyTorch TensorBoard Profiler Github `_ -# - `torch.profiler API `_ -# - `HTA `_ diff --git a/intermediate_source/tensorboard_tutorial.rst b/intermediate_source/tensorboard_tutorial.rst deleted file mode 100644 index d62a12ba0e..0000000000 --- a/intermediate_source/tensorboard_tutorial.rst +++ /dev/null @@ -1,404 +0,0 @@ -Visualizing Models, Data, and Training with TensorBoard -======================================================= - -In the `60 Minute Blitz `_, -we show you how to load in data, -feed it through a model we define as a subclass of ``nn.Module``, -train this model on training data, and test it on test data. -To see what's happening, we print out some statistics as the model -is training to get a sense for whether training is progressing. -However, we can do much better than that: PyTorch integrates with -TensorBoard, a tool designed for visualizing the results of neural -network training runs. 
This tutorial illustrates some of its -functionality, using the -`Fashion-MNIST dataset `__ -which can be read into PyTorch using `torchvision.datasets`. - -In this tutorial, we'll learn how to: - - 1. Read in data and with appropriate transforms (nearly identical to the prior tutorial). - 2. Set up TensorBoard. - 3. Write to TensorBoard. - 4. Inspect a model architecture using TensorBoard. - 5. Use TensorBoard to create interactive versions of the visualizations we created in last tutorial, with less code - -Specifically, on point #5, we'll see: - - * A couple of ways to inspect our training data - * How to track our model's performance as it trains - * How to assess our model's performance once it is trained. - -We'll begin with similar boilerplate code as in the `CIFAR-10 tutorial `__: - -.. code:: python - - # imports - import matplotlib.pyplot as plt - import numpy as np - - import torch - import torchvision - import torchvision.transforms as transforms - - import torch.nn as nn - import torch.nn.functional as F - import torch.optim as optim - - # transforms - transform = transforms.Compose( - [transforms.ToTensor(), - transforms.Normalize((0.5,), (0.5,))]) - - # datasets - trainset = torchvision.datasets.FashionMNIST('./data', - download=True, - train=True, - transform=transform) - testset = torchvision.datasets.FashionMNIST('./data', - download=True, - train=False, - transform=transform) - - # dataloaders - trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, - shuffle=True, num_workers=2) - - - testloader = torch.utils.data.DataLoader(testset, batch_size=4, - shuffle=False, num_workers=2) - - # constant for classes - classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', - 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot') - - # helper function to show an image - # (used in the `plot_classes_preds` function below) - def matplotlib_imshow(img, one_channel=False): - if one_channel: - img = img.mean(dim=0) - img = img / 2 + 0.5 # unnormalize - npimg = img.numpy() - if one_channel: - plt.imshow(npimg, cmap="Greys") - else: - plt.imshow(np.transpose(npimg, (1, 2, 0))) - -We'll define a similar model architecture from that tutorial, making only -minor modifications to account for the fact that the images are now -one channel instead of three and 28x28 instead of 32x32: - -.. code:: python - - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 4 * 4, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 4 * 4) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - - net = Net() - -We'll define the same ``optimizer`` and ``criterion`` from before: - -.. code:: python - - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - -1. TensorBoard setup -~~~~~~~~~~~~~~~~~~~~~ - -Now we'll set up TensorBoard, importing ``tensorboard`` from ``torch.utils`` and defining a -``SummaryWriter``, our key object for writing information to TensorBoard. - -.. 
code:: python - - from torch.utils.tensorboard import SummaryWriter - - # default `log_dir` is "runs" - we'll be more specific here - writer = SummaryWriter('runs/fashion_mnist_experiment_1') - -Note that this line alone creates a ``runs/fashion_mnist_experiment_1`` -folder. - -2. Writing to TensorBoard -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Now let's write an image to our TensorBoard - specifically, a grid - -using `make_grid `__. - -.. code:: python - - # get some random training images - dataiter = iter(trainloader) - images, labels = next(dataiter) - - # create grid of images - img_grid = torchvision.utils.make_grid(images) - - # show images - matplotlib_imshow(img_grid, one_channel=True) - - # write to tensorboard - writer.add_image('four_fashion_mnist_images', img_grid) - -Now running - -:: - - tensorboard --logdir=runs - -from the command line and then navigating to `http://localhost:6006 `_ -should show the following. - -.. image:: ../../_static/img/tensorboard_first_view.png - -Now you know how to use TensorBoard! This example, however, could be -done in a Jupyter Notebook - where TensorBoard really excels is in -creating interactive visualizations. We'll cover one of those next, -and several more by the end of the tutorial. - -3. Inspect the model using TensorBoard -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One of TensorBoard's strengths is its ability to visualize complex model -structures. Let's visualize the model we built. - -.. code:: python - - writer.add_graph(net, images) - writer.close() - -Now upon refreshing TensorBoard you should see a "Graphs" tab that -looks like this: - -.. image:: ../../_static/img/tensorboard_model_viz.png - -Go ahead and double click on "Net" to see it expand, seeing a -detailed view of the individual operations that make up the model. - -TensorBoard has a very handy feature for visualizing high dimensional -data such as image data in a lower dimensional space; we'll cover this -next. - -4. Adding a "Projector" to TensorBoard -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We can visualize the lower dimensional representation of higher -dimensional data via the `add_embedding `__ method - -.. code:: python - - # helper function - def select_n_random(data, labels, n=100): - ''' - Selects n random datapoints and their corresponding labels from a dataset - ''' - assert len(data) == len(labels) - - perm = torch.randperm(len(data)) - return data[perm][:n], labels[perm][:n] - - # select random images and their target indices - images, labels = select_n_random(trainset.data, trainset.targets) - - # get the class labels for each image - class_labels = [classes[lab] for lab in labels] - - # log embeddings - features = images.view(-1, 28 * 28) - writer.add_embedding(features, - metadata=class_labels, - label_img=images.unsqueeze(1)) - writer.close() - -Now in the "Projector" tab of TensorBoard, you can see these 100 -images - each of which is 784 dimensional - projected down into three -dimensional space. Furthermore, this is interactive: you can click -and drag to rotate the three dimensional projection. Finally, a couple -of tips to make the visualization easier to see: select "color: label" -on the top left, as well as enabling "night mode", which will make the -images easier to see since their background is white: - -.. image:: ../../_static/img/tensorboard_projector.png - -Now we've thoroughly inspected our data, let's show how TensorBoard -can make tracking model training and evaluation clearer, starting with -training. - -5. 
Tracking model training with TensorBoard -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In the previous example, we simply *printed* the model's running loss -every 2000 iterations. Now, we'll instead log the running loss to -TensorBoard, along with a view into the predictions the model is -making via the ``plot_classes_preds`` function. - -.. code:: python - - # helper functions - - def images_to_probs(net, images): - ''' - Generates predictions and corresponding probabilities from a trained - network and a list of images - ''' - output = net(images) - # convert output probabilities to predicted class - _, preds_tensor = torch.max(output, 1) - preds = np.squeeze(preds_tensor.numpy()) - return preds, [F.softmax(el, dim=0)[i].item() for i, el in zip(preds, output)] - - - def plot_classes_preds(net, images, labels): - ''' - Generates matplotlib Figure using a trained network, along with images - and labels from a batch, that shows the network's top prediction along - with its probability, alongside the actual label, coloring this - information based on whether the prediction was correct or not. - Uses the "images_to_probs" function. - ''' - preds, probs = images_to_probs(net, images) - # plot the images in the batch, along with predicted and true labels - fig = plt.figure(figsize=(12, 48)) - for idx in np.arange(4): - ax = fig.add_subplot(1, 4, idx+1, xticks=[], yticks=[]) - matplotlib_imshow(images[idx], one_channel=True) - ax.set_title("{0}, {1:.1f}%\n(label: {2})".format( - classes[preds[idx]], - probs[idx] * 100.0, - classes[labels[idx]]), - color=("green" if preds[idx]==labels[idx].item() else "red")) - return fig - -Finally, let's train the model using the same model training code from -the prior tutorial, but writing results to TensorBoard every 1000 -batches instead of printing to console; this is done using the -`add_scalar `__ -function. - -In addition, as we train, we'll generate an image showing the model's -predictions vs. the actual results on the four images included in that -batch. - -.. code:: python - - running_loss = 0.0 - for epoch in range(1): # loop over the dataset multiple times - - for i, data in enumerate(trainloader, 0): - - # get the inputs; data is a list of [inputs, labels] - inputs, labels = data - - # zero the parameter gradients - optimizer.zero_grad() - - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - - running_loss += loss.item() - if i % 1000 == 999: # every 1000 mini-batches... - - # ...log the running loss - writer.add_scalar('training loss', - running_loss / 1000, - epoch * len(trainloader) + i) - - # ...log a Matplotlib Figure showing the model's predictions on a - # random mini-batch - writer.add_figure('predictions vs. actuals', - plot_classes_preds(net, inputs, labels), - global_step=epoch * len(trainloader) + i) - running_loss = 0.0 - print('Finished Training') - -You can now look at the scalars tab to see the running loss plotted -over the 15,000 iterations of training: - -.. image:: ../../_static/img/tensorboard_scalar_runs.png - -In addition, we can look at the predictions the model made on -arbitrary batches throughout learning. See the "Images" tab and scroll -down under the "predictions vs. 
actuals" visualization to see this; -this shows us that, for example, after just 3000 training iterations, -the model was already able to distinguish between visually distinct -classes such as shirts, sneakers, and coats, though it isn't as -confident as it becomes later on in training: - -.. image:: ../../_static/img/tensorboard_images.png - -In the prior tutorial, we looked at per-class accuracy once the model -had been trained; here, we'll use TensorBoard to plot precision-recall -curves (good explanation -`here `__) -for each class. - -6. Assessing trained models with TensorBoard -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - # 1. gets the probability predictions in a test_size x num_classes Tensor - # 2. gets the preds in a test_size Tensor - # takes ~10 seconds to run - class_probs = [] - class_label = [] - with torch.no_grad(): - for data in testloader: - images, labels = data - output = net(images) - class_probs_batch = [F.softmax(el, dim=0) for el in output] - - class_probs.append(class_probs_batch) - class_label.append(labels) - - test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) - test_label = torch.cat(class_label) - - # helper function - def add_pr_curve_tensorboard(class_index, test_probs, test_label, global_step=0): - ''' - Takes in a "class_index" from 0 to 9 and plots the corresponding - precision-recall curve - ''' - tensorboard_truth = test_label == class_index - tensorboard_probs = test_probs[:, class_index] - - writer.add_pr_curve(classes[class_index], - tensorboard_truth, - tensorboard_probs, - global_step=global_step) - writer.close() - - # plot all the pr curves - for i in range(len(classes)): - add_pr_curve_tensorboard(i, test_probs, test_label) - -You will now see a "PR Curves" tab that contains the precision-recall -curves for each class. Go ahead and poke around; you'll see that on -some classes the model has nearly 100% "area under the curve", -whereas on others this area is lower: - -.. image:: ../../_static/img/tensorboard_pr_curves.png - -And that's an intro to TensorBoard and PyTorch's integration with it. -Of course, you could do everything TensorBoard does in your Jupyter -Notebook, but with TensorBoard, you gets visuals that are interactive -by default. diff --git a/intermediate_source/text_to_speech_with_torchaudio.rst b/intermediate_source/text_to_speech_with_torchaudio.rst deleted file mode 100644 index bbb6d7f272..0000000000 --- a/intermediate_source/text_to_speech_with_torchaudio.rst +++ /dev/null @@ -1,10 +0,0 @@ -Text-to-speech with Tacotron2 -============================= - -This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html - -It will redirect in 3 seconds. - -.. raw:: html - - diff --git a/intermediate_source/tiatoolbox_tutorial.rst b/intermediate_source/tiatoolbox_tutorial.rst deleted file mode 100644 index de9b303133..0000000000 --- a/intermediate_source/tiatoolbox_tutorial.rst +++ /dev/null @@ -1,994 +0,0 @@ -Whole Slide Image Classification Using PyTorch and TIAToolbox -============================================================= - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. This will allow you to experiment with the information presented below. - - -Introduction ------------- - -In this tutorial, we will show how to classify Whole Slide Images (WSIs) -using PyTorch deep learning models with help from TIAToolbox. 
A WSI -is an image of a sample of human tissue taken through a surgery or biopsy and -scanned using specialized scanners. They are used by pathologists and -computational pathology researchers to `study diseases such as cancer at the microscopic -level `__ in -order to understand for example tumor growth and help improve treatment -for patients. - -What makes WSIs challenging to process is their enormous size. For -example, a typical slide image has in the order of `100,000x100,000 -pixels `__ where each pixel can -correspond to about 0.25x0.25 microns on the slide. This introduces -challenges in loading and processing such images, not to mention -hundreds or even thousands of WSIs in a single study (larger studies -produce better results)! - -Conventional image processing pipelines are not suitable for WSI -processing so we need better tools. This is where -`TIAToolbox `__ can -help as it brings a set of useful tools to import and process tissue -slides in a fast and computationally efficient manner. Typically, WSIs -are saved in a pyramid structure with multiple copies of the same image -at various magnification levels optimized for visualization. The level 0 -(or the bottom level) of the pyramid contains the image at the highest -magnification or zoom level, whereas the higher levels in the pyramid -have a lower resolution copy of the base image. The pyramid structure is -sketched below. - -|WSI pyramid stack| *WSI pyramid stack -(*\ `source `__\ *)* - -TIAToolbox allows us to automate common downstream analysis tasks such -as `tissue -classification `__. In this -tutorial we show how you can: 1. Load WSI images using -TIAToolbox; and 2. Use different PyTorch models to classify slides at -the patch-level. In this tutorial, we will provide an example of using -TorchVision ``ResNet18`` model and custom -`HistoEncoder` `__ model. - -Let’s get started! - -.. |WSI pyramid stack| image:: ../_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp - - -Setting up the environment --------------------------- - -To run the examples provided in this tutorial, the following packages -are required as prerequisites. - -1. OpenJpeg -2. OpenSlide -3. Pixman -4. TIAToolbox -5. HistoEncoder (for a custom model example) - -Please run the following command in your terminal to install these -packages: - - -`apt-get -y -qq install libopenjp2-7-dev libopenjp2-tools openslide-tools libpixman-1-dev` -`pip install -q 'tiatoolbox<1.5' histoencoder && echo "Installation is done."` - - -Alternatively, you can run ``brew install openjpeg openslide`` to -install the prerequisite packages on MacOS instead of ``apt-get``. -Further information on installation can be `found -here `__. - - - -Importing related libraries -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code-block:: python - - - """Import modules required to run the Jupyter notebook.""" - from __future__ import annotations - - # Configure logging - import logging - import warnings - if logging.getLogger().hasHandlers(): - logging.getLogger().handlers.clear() - warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") - - # Downloading data and files - import shutil - from pathlib import Path - from zipfile import ZipFile - - # Data processing and visualization - import matplotlib as mpl - import matplotlib.pyplot as plt - import numpy as np - import pandas as pd - from matplotlib import cm - import PIL - import contextlib - import io - from sklearn.metrics import accuracy_score, confusion_matrix - - # TIAToolbox for WSI loading and processing - from tiatoolbox import logger - from tiatoolbox.models.architecture import vanilla - from tiatoolbox.models.engine.patch_predictor import ( - IOPatchPredictorConfig, - PatchPredictor, - ) - from tiatoolbox.utils.misc import download_data, grab_files_from_dir - from tiatoolbox.utils.visualization import overlay_prediction_mask - from tiatoolbox.wsicore.wsireader import WSIReader - - # Torch-related - import torch - from torchvision import transforms - - # Configure plotting - mpl.rcParams["figure.dpi"] = 160 # for high resolution figure in notebook - mpl.rcParams["figure.facecolor"] = "white" # To make sure text is visible in dark mode - - # If you are not using GPU, change ON_GPU to False - ON_GPU = True - - # Function to suppress console output for overly verbose code blocks - def suppress_console_output(): - return contextlib.redirect_stderr(io.StringIO()) - - - -Clean-up before a run -~~~~~~~~~~~~~~~~~~~~~ - -To ensure proper clean-up (for example in abnormal termination), all -files downloaded or created in this run are saved in a single directory -``global_save_dir``, which we set equal to “./tmp/”. To simplify -maintenance, the name of the directory occurs only at this one place, so -that it can easily be changed, if desired. - - - -.. code-block:: python - - - warnings.filterwarnings("ignore") - global_save_dir = Path("./tmp/") - - - def rmdir(dir_path: str | Path) -> None: - """Helper function to delete directory.""" - if Path(dir_path).is_dir(): - shutil.rmtree(dir_path) - logger.info("Removing directory %s", dir_path) - - - rmdir(global_save_dir) # remove directory if it exists from previous runs - global_save_dir.mkdir() - logger.info("Creating new directory %s", global_save_dir) - - - -Downloading the data -~~~~~~~~~~~~~~~~~~~~ - -For our sample data, we will use one whole-slide image, and patches from -the validation subset of `Kather -100k `__ dataset. - - - -.. code-block:: python - - - wsi_path = global_save_dir / "sample_wsi.svs" - patches_path = global_save_dir / "kather100k-validation-sample.zip" - weights_path = global_save_dir / "resnet18-kather100k.pth" - - logger.info("Download has started. 
Please wait...") - - # Downloading and unzip a sample whole-slide image - download_data( - "https://tiatoolbox.dcs.warwick.ac.uk/sample_wsis/TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.svs", - wsi_path, - ) - - # Download and unzip a sample of the validation set used to train the Kather 100K dataset - download_data( - "https://tiatoolbox.dcs.warwick.ac.uk/datasets/kather100k-validation-sample.zip", - patches_path, - ) - with ZipFile(patches_path, "r") as zipfile: - zipfile.extractall(path=global_save_dir) - - # Download pretrained model weights for WSI classification using ResNet18 architecture - download_data( - "https://tiatoolbox.dcs.warwick.ac.uk/models/pc/resnet18-kather100k.pth", - weights_path, - ) - - logger.info("Download is complete.") - - - -Reading the data ----------------- - -We create a list of patches and a list of corresponding labels. For -example, the first label in ``label_list`` will indicate the class of -the first image patch in ``patch_list``. - - - -.. code-block:: python - - - # Read the patch data and create a list of patches and a list of corresponding labels - dataset_path = global_save_dir / "kather100k-validation-sample" - - # Set the path to the dataset - image_ext = ".tif" # file extension of each image - - # Obtain the mapping between the label ID and the class name - label_dict = { - "BACK": 0, # Background (empty glass region) - "NORM": 1, # Normal colon mucosa - "DEB": 2, # Debris - "TUM": 3, # Colorectal adenocarcinoma epithelium - "ADI": 4, # Adipose - "MUC": 5, # Mucus - "MUS": 6, # Smooth muscle - "STR": 7, # Cancer-associated stroma - "LYM": 8, # Lymphocytes - } - - class_names = list(label_dict.keys()) - class_labels = list(label_dict.values()) - - # Generate a list of patches and generate the label from the filename - patch_list = [] - label_list = [] - for class_name, label in label_dict.items(): - dataset_class_path = dataset_path / class_name - patch_list_single_class = grab_files_from_dir( - dataset_class_path, - file_types="*" + image_ext, - ) - patch_list.extend(patch_list_single_class) - label_list.extend([label] * len(patch_list_single_class)) - - # Show some dataset statistics - plt.bar(class_names, [label_list.count(label) for label in class_labels]) - plt.xlabel("Patch types") - plt.ylabel("Number of patches") - - # Count the number of examples per class - for class_name, label in label_dict.items(): - logger.info( - "Class ID: %d -- Class Name: %s -- Number of images: %d", - label, - class_name, - label_list.count(label), - ) - - # Overall dataset statistics - logger.info("Total number of patches: %d", (len(patch_list))) - - - - - -.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png - :class: sphx-glr-single-img - - -.. rst-class:: sphx-glr-script-out - - .. 
code-block:: none - - |2023-11-14|13:15:59.299| [INFO] Class ID: 0 -- Class Name: BACK -- Number of images: 211 - |2023-11-14|13:15:59.299| [INFO] Class ID: 1 -- Class Name: NORM -- Number of images: 176 - |2023-11-14|13:15:59.299| [INFO] Class ID: 2 -- Class Name: DEB -- Number of images: 230 - |2023-11-14|13:15:59.299| [INFO] Class ID: 3 -- Class Name: TUM -- Number of images: 286 - |2023-11-14|13:15:59.299| [INFO] Class ID: 4 -- Class Name: ADI -- Number of images: 208 - |2023-11-14|13:15:59.299| [INFO] Class ID: 5 -- Class Name: MUC -- Number of images: 178 - |2023-11-14|13:15:59.299| [INFO] Class ID: 6 -- Class Name: MUS -- Number of images: 270 - |2023-11-14|13:15:59.299| [INFO] Class ID: 7 -- Class Name: STR -- Number of images: 209 - |2023-11-14|13:15:59.299| [INFO] Class ID: 8 -- Class Name: LYM -- Number of images: 232 - |2023-11-14|13:15:59.299| [INFO] Total number of patches: 2000 - - - -As you can see for this patch dataset, we have 9 classes/labels with IDs -0-8 and associated class names. describing the dominant tissue type in -the patch: - -- BACK ⟶ Background (empty glass region) -- LYM ⟶ Lymphocytes -- NORM ⟶ Normal colon mucosa -- DEB ⟶ Debris -- MUS ⟶ Smooth muscle -- STR ⟶ Cancer-associated stroma -- ADI ⟶ Adipose -- MUC ⟶ Mucus -- TUM ⟶ Colorectal adenocarcinoma epithelium - - - -Classify image patches ----------------------- - -We demonstrate how to obtain a prediction for each patch within a -digital slide first with the ``patch`` mode and then with a large slide -using ``wsi`` mode. - - -Define ``PatchPredictor`` model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The PatchPredictor class runs a CNN-based classifier written in PyTorch. - -- ``model`` can be any trained PyTorch model with the constraint that - it should follow the - ``tiatoolbox.models.abc.ModelABC`` `(docs)` `__ - class structure. For more information on this matter, please refer to - `our example notebook on advanced model - techniques `__. - In order to load a custom model, you need to write a small - preprocessing function, as in ``preproc_func(img)``, which makes sure - the input tensors are in the right format for the loaded network. -- Alternatively, you can pass ``pretrained_model`` as a string - argument. This specifies the CNN model that performs the prediction, - and it must be one of the models listed - `here `__. - The command will look like this: - ``predictor = PatchPredictor(pretrained_model='resnet18-kather100k', pretrained_weights=weights_path, batch_size=32)``. -- ``pretrained_weights``: When using a ``pretrained_model``, the - corresponding pretrained weights will also be downloaded by default. - You can override the default with your own set of weights via the - ``pretrained_weight`` argument. -- ``batch_size``: Number of images fed into the model each time. Higher - values for this parameter require a larger (GPU) memory capacity. - - - -.. 
code-block:: python - - - # Importing a pretrained PyTorch model from TIAToolbox - predictor = PatchPredictor(pretrained_model='resnet18-kather100k', batch_size=32) - - # Users can load any PyTorch model architecture instead using the following script - model = vanilla.CNNModel(backbone="resnet18", num_classes=9) # Importing model from torchvision.models.resnet18 - model.load_state_dict(torch.load(weights_path, map_location="cpu", weights_only=True), strict=True) - def preproc_func(img): - img = PIL.Image.fromarray(img) - img = transforms.ToTensor()(img) - return img.permute(1, 2, 0) - model.preproc_func = preproc_func - predictor = PatchPredictor(model=model, batch_size=32) - - - -Predict patch labels -~~~~~~~~~~~~~~~~~~~~ - -We create a predictor object and then call the ``predict`` method using -the ``patch`` mode. We then compute the classification accuracy and -confusion matrix. - - - -.. code-block:: python - - - with suppress_console_output(): - output = predictor.predict(imgs=patch_list, mode="patch", on_gpu=ON_GPU) - - acc = accuracy_score(label_list, output["predictions"]) - logger.info("Classification accuracy: %f", acc) - - # Creating and visualizing the confusion matrix for patch classification results - conf = confusion_matrix(label_list, output["predictions"], normalize="true") - df_cm = pd.DataFrame(conf, index=class_names, columns=class_names) - df_cm - - - - - - -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - |2023-11-14|13:16:03.215| [INFO] Classification accuracy: 0.993000 - - -.. raw:: html - -
-    Confusion matrix (rows: true class, columns: predicted class):
-
-    ====  ========  ========  ========  ========  ========  ========  ========  ========  =======
-          BACK      NORM      DEB       TUM       ADI       MUC       MUS       STR       LYM
-    ====  ========  ========  ========  ========  ========  ========  ========  ========  =======
-    BACK  1.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000
-    NORM  0.000000  0.988636  0.000000  0.011364  0.000000  0.000000  0.000000  0.000000  0.00000
-    DEB   0.000000  0.000000  0.991304  0.000000  0.000000  0.000000  0.000000  0.008696  0.00000
-    TUM   0.000000  0.000000  0.000000  0.996503  0.000000  0.003497  0.000000  0.000000  0.00000
-    ADI   0.004808  0.000000  0.000000  0.000000  0.990385  0.000000  0.004808  0.000000  0.00000
-    MUC   0.000000  0.000000  0.000000  0.000000  0.000000  0.988764  0.000000  0.011236  0.00000
-    MUS   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.996296  0.003704  0.00000
-    STR   0.000000  0.000000  0.004785  0.000000  0.000000  0.004785  0.004785  0.985646  0.00000
-    LYM   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.004310  0.99569
-    ====  ========  ========  ========  ========  ========  ========  ========  ========  =======
-
- - -Predict patch labels for a whole slide -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We now introduce ``IOPatchPredictorConfig``, a class that specifies the -configuration of image reading and prediction writing for the model -prediction engine. This is required to inform the classifier which level -of the WSI pyramid the classifier should read, process data and generate -output. - -Parameters of ``IOPatchPredictorConfig`` are defined as: - -- ``input_resolutions``: A list, in the form of a dictionary, - specifying the resolution of each input. List elements must be in the - same order as in the target ``model.forward()``. If your model - accepts only one input, you just need to put one dictionary - specifying ``'units'`` and ``'resolution'``. Note that TIAToolbox - supports a model with more than one input. For more information on - units and resolution, please see `TIAToolbox - documentation `__. -- ``patch_input_shape``: Shape of the largest input in (height, width) - format. -- ``stride_shape``: The size of a stride (steps) between two - consecutive patches, used in the patch extraction process. If the - user sets ``stride_shape`` equal to ``patch_input_shape``, patches - will be extracted and processed without any overlap. - - - -.. code-block:: python - - - wsi_ioconfig = IOPatchPredictorConfig( - input_resolutions=[{"units": "mpp", "resolution": 0.5}], - patch_input_shape=[224, 224], - stride_shape=[224, 224], - ) - - - -The ``predict`` method applies the CNN on the input patches and get the -results. Here are the arguments and their descriptions: - -- ``mode``: Type of input to be processed. Choose from ``patch``, - ``tile`` or ``wsi`` according to your application. -- ``imgs``: List of inputs, which should be a list of paths to the - input tiles or WSIs. -- ``return_probabilities``: Set to **True** to get per class - probabilities alongside predicted labels of input patches. If you - wish to merge the predictions to generate prediction maps for - ``tile`` or ``wsi`` modes, you can set ``return_probabilities=True``. -- ``ioconfig``: set the IO configuration information using the - ``IOPatchPredictorConfig`` class. -- ``resolution`` and ``unit`` (not shown below): These arguments - specify the level or micron-per-pixel resolution of the WSI levels - from which we plan to extract patches and can be used instead of - ``ioconfig``. Here we specify the WSI level as ``'baseline'``, - which is equivalent to level 0. In general, this is the level of - greatest resolution. In this particular case, the image has only one - level. More information can be found in the - `documentation `__. -- ``masks``: A list of paths corresponding to the masks of WSIs in the - ``imgs`` list. These masks specify the regions in the original WSIs - from which we want to extract patches. If the mask of a particular - WSI is specified as ``None``, then the labels for all patches of that - WSI (even background regions) would be predicted. This could cause - unnecessary computation. -- ``merge_predictions``: You can set this parameter to ``True`` if it’s - required to generate a 2D map of patch classification results. - However, for large WSIs this will require large available memory. An - alternative (default) solution is to set ``merge_predictions=False``, - and then generate the 2D prediction maps using the - ``merge_predictions`` function as you will see later on. 
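-
-As a concrete illustration of the ``resolution`` and ``units`` arguments
-mentioned above, the call below is a minimal sketch (not executed in this
-tutorial) of how the same prediction could be requested without an explicit
-``IOPatchPredictorConfig``, assuming ``predict`` accepts the patch geometry and
-reading resolution directly; the save directory name is hypothetical:
-
-.. code-block:: python
-
-    # Hypothetical alternative to passing ``wsi_ioconfig`` explicitly.
-    wsi_output_alt = predictor.predict(
-        imgs=[wsi_path],
-        mode="wsi",
-        patch_input_shape=[224, 224],
-        stride_shape=[224, 224],
-        resolution=0.5,
-        units="mpp",
-        return_probabilities=True,
-        save_dir=global_save_dir / "wsi_predictions_alt",
-        on_gpu=ON_GPU,
-    )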
- -Since we are using a large WSI the patch extraction and prediction -processes may take some time (make sure to set the ``ON_GPU=True`` if -you have access to Cuda enabled GPU and PyTorch+Cuda). - - - -.. code-block:: python - - - with suppress_console_output(): - wsi_output = predictor.predict( - imgs=[wsi_path], - masks=None, - mode="wsi", - merge_predictions=False, - ioconfig=wsi_ioconfig, - return_probabilities=True, - save_dir=global_save_dir / "wsi_predictions", - on_gpu=ON_GPU, - ) - - - - -We see how the prediction model works on our whole-slide images by -visualizing the ``wsi_output``. We first need to merge patch prediction -outputs and then visualize them as an overlay on the original image. As -before, the ``merge_predictions`` method is used to merge the patch -predictions. Here we set the parameters -``resolution=1.25, units='power'`` to generate the prediction map at -1.25x magnification. If you would like to have higher/lower resolution -(bigger/smaller) prediction maps, you need to change these parameters -accordingly. When the predictions are merged, use the -``overlay_patch_prediction`` function to overlay the prediction map on -the WSI thumbnail, which should be extracted at the resolution used for -prediction merging. - - -.. code-block:: python - - - overview_resolution = ( - 4 # the resolution in which we desire to merge and visualize the patch predictions - ) - # the unit of the `resolution` parameter. Can be "power", "level", "mpp", or "baseline" - overview_unit = "mpp" - wsi = WSIReader.open(wsi_path) - wsi_overview = wsi.slide_thumbnail(resolution=overview_resolution, units=overview_unit) - plt.figure(), plt.imshow(wsi_overview) - plt.axis("off") - - - - - -.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png - :class: sphx-glr-single-img - - - -Overlaying the prediction map on this image as below gives: - - - -.. code-block:: python - - - # Visualization of whole-slide image patch-level prediction - # first set up a label to color mapping - label_color_dict = {} - label_color_dict[0] = ("empty", (0, 0, 0)) - colors = cm.get_cmap("Set1").colors - for class_name, label in label_dict.items(): - label_color_dict[label + 1] = (class_name, 255 * np.array(colors[label])) - - pred_map = predictor.merge_predictions( - wsi_path, - wsi_output[0], - resolution=overview_resolution, - units=overview_unit, - ) - overlay = overlay_prediction_mask( - wsi_overview, - pred_map, - alpha=0.5, - label_info=label_color_dict, - return_ax=True, - ) - plt.show() - - - - - -.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png - :class: sphx-glr-single-img - - - -Feature extraction with a pathology-specific model --------------------------------------------------- - -In this section, we will show how to extract features from a pretrained -PyTorch model that exists outside TIAToolbox, using the WSI inference -engines provided by TIAToolbox. To illustrate this we will use -HistoEncoder, a computational-pathology specific model that has been -trained in a self-supervised fashion to extract features from histology -images. The model has been made available here: - -‘HistoEncoder: Foundation models for digital pathology’ -(https://github.com/jopo666/HistoEncoder) by Pohjonen, Joona and team at -the University of Helsinki. 
- -We will plot a umap reduction into 3D (RGB) of the feature map to -visualize how the features capture the differences between some of the -above mentioned tissue types. - - - -.. code-block:: python - - - # Import some extra modules - import histoencoder.functional as F - import torch.nn as nn - - from tiatoolbox.models.engine.semantic_segmentor import DeepFeatureExtractor, IOSegmentorConfig - from tiatoolbox.models.models_abc import ModelABC - import umap - - - -TIAToolbox defines a ModelABC which is a class inheriting PyTorch -`nn.Module `__ -and specifies how a model should look in order to be used in the -TIAToolbox inference engines. The histoencoder model doesn’t follow this -structure, so we need to wrap it in a class whose output and methods are -those that the TIAToolbox engine expects. - - - -.. code-block:: python - - - class HistoEncWrapper(ModelABC): - """Wrapper for HistoEnc model that conforms to tiatoolbox ModelABC interface.""" - - def __init__(self: HistoEncWrapper, encoder) -> None: - super().__init__() - self.feat_extract = encoder - - def forward(self: HistoEncWrapper, imgs: torch.Tensor) -> torch.Tensor: - """Pass input data through the model. - - Args: - imgs (torch.Tensor): - Model input. - - """ - out = F.extract_features(self.feat_extract, imgs, num_blocks=2, avg_pool=True) - return out - - @staticmethod - def infer_batch( - model: nn.Module, - batch_data: torch.Tensor, - *, - on_gpu: bool, - ) -> list[np.ndarray]: - """Run inference on an input batch. - - Contains logic for forward operation as well as i/o aggregation. - - Args: - model (nn.Module): - PyTorch defined model. - batch_data (torch.Tensor): - A batch of data generated by - `torch.utils.data.DataLoader`. - on_gpu (bool): - Whether to run inference on a GPU. - - """ - img_patches_device = batch_data.to('cuda') if on_gpu else batch_data - model.eval() - # Do not compute the gradient (not training) - with torch.inference_mode(): - output = model(img_patches_device) - return [output.cpu().numpy()] - - - - -Now that we have our wrapper, we will create our feature extraction -model and instantiate a -`DeepFeatureExtractor `__ -to allow us to use this model over a WSI. We will use the same WSI as -above, but this time we will extract features from the patches of the -WSI using the HistoEncoder model, rather than predicting some label for -each patch. - - - -.. code-block:: python - - - # create the model - encoder = F.create_encoder("prostate_medium") - model = HistoEncWrapper(encoder) - - # set the pre-processing function - norm=transforms.Normalize(mean=[0.662, 0.446, 0.605],std=[0.169, 0.190, 0.155]) - trans = [ - transforms.ToTensor(), - norm, - ] - model.preproc_func = transforms.Compose(trans) - - wsi_ioconfig = IOSegmentorConfig( - input_resolutions=[{"units": "mpp", "resolution": 0.5}], - patch_input_shape=[224, 224], - output_resolutions=[{"units": "mpp", "resolution": 0.5}], - patch_output_shape=[224, 224], - stride_shape=[224, 224], - ) - - - -When we create the ``DeepFeatureExtractor``, we will pass the -``auto_generate_mask=True`` argument. This will automatically create a -mask of the tissue region using otsu thresholding, so that the extractor -processes only those patches containing tissue. - - - -.. 
code-block:: python - - - # create the feature extractor and run it on the WSI - extractor = DeepFeatureExtractor(model=model, auto_generate_mask=True, batch_size=32, num_loader_workers=4, num_postproc_workers=4) - with suppress_console_output(): - out = extractor.predict(imgs=[wsi_path], mode="wsi", ioconfig=wsi_ioconfig, save_dir=global_save_dir / "wsi_features",) - - - - -These features could be used to train a downstream model, but here in -order to get some intuition for what the features represent, we will use -a UMAP reduction to visualize the features in RGB space. The points -labeled in a similar color should have similar features, so we can check -if the features naturally separate out into the different tissue regions -when we overlay the UMAP reduction on the WSI thumbnail. We will plot it -along with the patch-level prediction map from above to see how the -features compare to the patch-level predictions in the following cells. - - - -.. code-block:: python - - - # First we define a function to calculate the umap reduction - def umap_reducer(x, dims=3, nns=10): - """UMAP reduction of the input data.""" - reducer = umap.UMAP(n_neighbors=nns, n_components=dims, metric="manhattan", spread=0.5, random_state=2) - reduced = reducer.fit_transform(x) - reduced -= reduced.min(axis=0) - reduced /= reduced.max(axis=0) - return reduced - - # load the features output by our feature extractor - pos = np.load(global_save_dir / "wsi_features" / "0.position.npy") - feats = np.load(global_save_dir / "wsi_features" / "0.features.0.npy") - pos = pos / 8 # as we extracted at 0.5mpp, and we are overlaying on a thumbnail at 4mpp - - # reduce the features into 3 dimensional (rgb) space - reduced = umap_reducer(feats) - - # plot the prediction map the classifier again - overlay = overlay_prediction_mask( - wsi_overview, - pred_map, - alpha=0.5, - label_info=label_color_dict, - return_ax=True, - ) - - # plot the feature map reduction - plt.figure() - plt.imshow(wsi_overview) - plt.scatter(pos[:,0], pos[:,1], c=reduced, s=1, alpha=0.5) - plt.axis("off") - plt.title("UMAP reduction of HistoEnc features") - plt.show() - - - - - -.. rst-class:: sphx-glr-horizontal - - - * - - .. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png - :class: sphx-glr-multi-img - - * - - .. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png - :alt: UMAP reduction of HistoEnc features - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png - :class: sphx-glr-multi-img - - - - -We see that the prediction map from our patch-level predictor, and the -feature map from our self-supervised feature encoder, capture similar -information about the tissue types in the WSI. This is a good sanity -check that our models are working as expected. It also shows that the -features extracted by the HistoEncoder model are capturing the -differences between the tissue types, and so that they are encoding -histologically relevant information. - - -Where to Go From Here ---------------------- - -In this notebook, we show how we can use the ``PatchPredictor`` and -``DeepFeatureExtractor`` classes and their ``predict`` method to predict -the label, or extract features, for patches of big tiles and WSIs. 
We -introduce ``merge_predictions`` and ``overlay_prediction_mask`` helper -functions that merge the patch prediction outputs and visualize the -resulting prediction map as an overlay on the input image/WSI. - -All the processes take place within TIAToolbox and we can easily put the -pieces together, following our example code. Please make sure to set -inputs and options correctly. We encourage you to further investigate -the effect on the prediction output of changing ``predict`` function -parameters. We have demonstrated how to use your own pretrained model or -one provided by the research community for a specific task in the -TIAToolbox framework to do inference on large WSIs even if the model -structure is not defined in the TIAToolbox model class. - -You can learn more through the following resources: - -- `Advanced model handling with PyTorch and - TIAToolbox `__ -- `Creating slide graphs for WSI with a custom PyTorch graph neural - network `__ - diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py deleted file mode 100644 index 67b055d9ff..0000000000 --- a/intermediate_source/torch_compile_tutorial.py +++ /dev/null @@ -1,606 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Introduction to ``torch.compile`` -================================= -**Author:** William Wen -""" - -###################################################################### -# ``torch.compile`` is the latest method to speed up your PyTorch code! -# ``torch.compile`` makes PyTorch code run faster by -# JIT-compiling PyTorch code into optimized kernels, -# all while requiring minimal code changes. -# -# In this tutorial, we cover basic ``torch.compile`` usage, -# and demonstrate the advantages of ``torch.compile`` over -# previous PyTorch compiler solutions, such as -# `TorchScript `__ and -# `FX Tracing `__. -# -# **Contents** -# -# .. contents:: -# :local: -# -# **Required pip Dependencies** -# -# - ``torch >= 2.0`` -# - ``torchvision`` -# - ``numpy`` -# - ``scipy`` -# - ``tabulate`` - -###################################################################### -# NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in -# order to reproduce the speedup numbers shown below and documented elsewhere. - -import torch -import warnings - -gpu_ok = False -if torch.cuda.is_available(): - device_cap = torch.cuda.get_device_capability() - if device_cap in ((7, 0), (8, 0), (9, 0)): - gpu_ok = True - -if not gpu_ok: - warnings.warn( - "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " - "than expected." - ) - -###################################################################### -# Basic Usage -# ------------ -# -# ``torch.compile`` is included in the latest PyTorch. -# Running TorchInductor on GPU requires Triton, which is included with the PyTorch 2.0 nightly -# binary. If Triton is still missing, try installing ``torchtriton`` via pip -# (``pip install torchtriton --extra-index-url "https://download.pytorch.org/whl/nightly/cu117"`` -# for CUDA 11.7). -# -# Arbitrary Python functions can be optimized by passing the callable to -# ``torch.compile``. We can then call the returned optimized -# function in place of the original function. - -def foo(x, y): - a = torch.sin(x) - b = torch.cos(y) - return a + b -opt_foo1 = torch.compile(foo) -print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10))) - -###################################################################### -# Alternatively, we can decorate the function. 
-t1 = torch.randn(10, 10) -t2 = torch.randn(10, 10) - -@torch.compile -def opt_foo2(x, y): - a = torch.sin(x) - b = torch.cos(y) - return a + b -print(opt_foo2(t1, t2)) - -###################################################################### -# We can also optimize ``torch.nn.Module`` instances. - -t = torch.randn(10, 100) - -class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x): - return torch.nn.functional.relu(self.lin(x)) - -mod = MyModule() -opt_mod = torch.compile(mod) -print(opt_mod(t)) - -###################################################################### -# torch.compile and Nested Calls -# ------------------------------ -# Nested function calls within the decorated function will also be compiled. - -def nested_function(x): - return torch.sin(x) - -@torch.compile -def outer_function(x, y): - a = nested_function(x) - b = torch.cos(y) - return a + b - -print(outer_function(t1, t2)) - -###################################################################### -# In the same fashion, when compiling a module, all sub-modules and methods -# within it that are not in a skip list are also compiled. - -class OuterModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.inner_module = MyModule() - self.outer_lin = torch.nn.Linear(10, 2) - - def forward(self, x): - x = self.inner_module(x) - return torch.nn.functional.relu(self.outer_lin(x)) - -outer_mod = OuterModule() -opt_outer_mod = torch.compile(outer_mod) -print(opt_outer_mod(t)) - -###################################################################### -# We can also disable some functions from being compiled by using -# ``torch.compiler.disable``. Suppose you want to disable the tracing on just -# the ``complex_function`` function, but want to continue the tracing back in -# ``complex_conjugate``. In this case, you can use the -# ``torch.compiler.disable(recursive=False)`` option. Otherwise, the default is -# ``recursive=True``. - -def complex_conjugate(z): - return torch.conj(z) - -@torch.compiler.disable(recursive=False) -def complex_function(real, imag): - # Assume this function causes problems during compilation - z = torch.complex(real, imag) - return complex_conjugate(z) - -def outer_function(): - real = torch.tensor([2, 3], dtype=torch.float32) - imag = torch.tensor([4, 5], dtype=torch.float32) - z = complex_function(real, imag) - return torch.abs(z) - -# Try to compile the outer_function -try: - opt_outer_function = torch.compile(outer_function) - print(opt_outer_function()) -except Exception as e: - print("Compilation of outer_function failed:", e) - -###################################################################### -# Best Practices and Recommendations -# ---------------------------------- -# -# Behavior of ``torch.compile`` with Nested Modules and Function Calls -# -# When you use ``torch.compile``, the compiler will try to recursively compile -# every function call inside the target function or module that is not in a -# skip list (such as built-ins and some functions in the ``torch.*`` namespace). -# -# **Best Practices:** -# -# 1. **Top-Level Compilation:** One approach is to compile at the highest level -# possible (i.e., when the top-level module is initialized/called) and -# selectively disable compilation when encountering excessive graph breaks or -# errors. If there are still many compile issues, compile individual -# subcomponents instead. -# -# 2.
**Modular Testing:** Test individual functions and modules with ``torch.compile`` -# before integrating them into larger models to isolate potential issues. -# -# 3. **Disable Compilation Selectively:** If certain functions or sub-modules -# cannot be handled by `torch.compile`, use the `torch.compiler.disable` context -# managers to recursively exclude them from compilation. -# -# 4. **Compile Leaf Functions First:** In complex models with multiple nested -# functions and modules, start by compiling the leaf functions or modules first. -# For more information see `TorchDynamo APIs for fine-grained tracing `__. - -###################################################################### -# Demonstrating Speedups -# ----------------------- -# -# Let's now demonstrate that using ``torch.compile`` can speed -# up real models. We will compare standard eager mode and -# ``torch.compile`` by evaluating and training a ``torchvision`` model on random data. -# -# Before we start, we need to define some utility functions. - -# Returns the result of running `fn()` and the time it took for `fn()` to run, -# in seconds. We use CUDA events and synchronization for the most accurate -# measurements. -def timed(fn): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - result = fn() - end.record() - torch.cuda.synchronize() - return result, start.elapsed_time(end) / 1000 - -# Generates random input and targets data for the model, where `b` is -# batch size. -def generate_data(b): - return ( - torch.randn(b, 3, 128, 128).to(torch.float32).cuda(), - torch.randint(1000, (b,)).cuda(), - ) - -N_ITERS = 10 - -from torchvision.models import densenet121 -def init_model(): - return densenet121().to(torch.float32).cuda() - -###################################################################### -# First, let's compare inference. -# -# Note that in the call to ``torch.compile``, we have the additional -# ``mode`` argument, which we will discuss below. - -model = init_model() - -# Reset since we are using a different mode. -import torch._dynamo -torch._dynamo.reset() - -model_opt = torch.compile(model, mode="reduce-overhead") - -inp = generate_data(16)[0] -with torch.no_grad(): - print("eager:", timed(lambda: model(inp))[1]) - print("compile:", timed(lambda: model_opt(inp))[1]) - -###################################################################### -# Notice that ``torch.compile`` takes a lot longer to complete -# compared to eager. This is because ``torch.compile`` compiles -# the model into optimized kernels as it executes. In our example, the -# structure of the model doesn't change, and so recompilation is not -# needed. So if we run our optimized model several more times, we should -# see a significant improvement compared to eager. 
- -eager_times = [] -for i in range(N_ITERS): - inp = generate_data(16)[0] - with torch.no_grad(): - _, eager_time = timed(lambda: model(inp)) - eager_times.append(eager_time) - print(f"eager eval time {i}: {eager_time}") - -print("~" * 10) - -compile_times = [] -for i in range(N_ITERS): - inp = generate_data(16)[0] - with torch.no_grad(): - _, compile_time = timed(lambda: model_opt(inp)) - compile_times.append(compile_time) - print(f"compile eval time {i}: {compile_time}") -print("~" * 10) - -import numpy as np -eager_med = np.median(eager_times) -compile_med = np.median(compile_times) -speedup = eager_med / compile_med -assert(speedup > 1) -print(f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") -print("~" * 10) - -###################################################################### -# And indeed, we can see that running our model with ``torch.compile`` -# results in a significant speedup. Speedup mainly comes from reducing Python overhead and -# GPU read/writes, and so the observed speedup may vary depending on factors such as model -# architecture and batch size. For example, if a model's architecture is simple -# and the amount of data is large, then the bottleneck would be -# GPU compute and the observed speedup may be less significant. -# -# You may also see different speedup results depending on the chosen ``mode`` -# argument. The ``"reduce-overhead"`` mode uses CUDA graphs to further reduce -# the overhead of Python. For your own models, -# you may need to experiment with different modes to maximize speedup. You can -# read more about modes `here `__. -# -# You might also notice that the second time we run our model with ``torch.compile`` is significantly -# slower than the other runs, although it is much faster than the first run. This is because the ``"reduce-overhead"`` -# mode runs a few warm-up iterations for CUDA graphs. -# -# For general PyTorch benchmarking, you can try using ``torch.utils.benchmark`` instead of the ``timed`` -# function we defined above (see the sketch after the training comparison below). We wrote our own timing -# function in this tutorial to show ``torch.compile``'s compilation latency. -# -# Now, let's consider comparing training. - -model = init_model() -opt = torch.optim.Adam(model.parameters()) - -def train(mod, data): - opt.zero_grad(True) - pred = mod(data[0]) - loss = torch.nn.CrossEntropyLoss()(pred, data[1]) - loss.backward() - opt.step() - -eager_times = [] -for i in range(N_ITERS): - inp = generate_data(16) - _, eager_time = timed(lambda: train(model, inp)) - eager_times.append(eager_time) - print(f"eager train time {i}: {eager_time}") -print("~" * 10) - -model = init_model() -opt = torch.optim.Adam(model.parameters()) -train_opt = torch.compile(train, mode="reduce-overhead") - -compile_times = [] -for i in range(N_ITERS): - inp = generate_data(16) - _, compile_time = timed(lambda: train_opt(model, inp)) - compile_times.append(compile_time) - print(f"compile train time {i}: {compile_time}") -print("~" * 10) - -eager_med = np.median(eager_times) -compile_med = np.median(compile_times) -speedup = eager_med / compile_med -assert(speedup > 1) -print(f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") -print("~" * 10) - -###################################################################### -# Again, we can see that ``torch.compile`` takes longer in the first -# iteration, as it must compile the model, but in subsequent iterations, we see -# significant speedups compared to eager.
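######################################################################
# As mentioned earlier, ``torch.utils.benchmark`` can stand in for the
# hand-rolled ``timed`` helper once compilation latency is no longer what we
# want to measure. The cell below is a minimal sketch, not part of the
# original benchmark: it assumes the ``init_model`` and ``generate_data``
# helpers defined above and a CUDA device, and its absolute numbers are
# illustrative only.

import torch.utils.benchmark as benchmark

bench_model = init_model()
bench_model_opt = torch.compile(bench_model, mode="reduce-overhead")
bench_inp = generate_data(16)[0]

with torch.no_grad():
    # Run the compiled model a few times first so that compilation and the
    # CUDA graph warm-up runs are excluded from the measurement.
    for _ in range(3):
        bench_model_opt(bench_inp)
    eager_timer = benchmark.Timer(
        stmt="m(x)", globals={"m": bench_model, "x": bench_inp}
    )
    compile_timer = benchmark.Timer(
        stmt="m(x)", globals={"m": bench_model_opt, "x": bench_inp}
    )
    # blocked_autorange() chooses the number of runs automatically and
    # handles CUDA synchronization for us.
    print("eager median (s):", eager_timer.blocked_autorange().median)
    print("compile median (s):", compile_timer.blocked_autorange().median)

######################################################################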
-# -# We remark that the speedup numbers presented in this tutorial are for -# demonstration purposes only. Official speedup values can be seen at the -# `TorchInductor performance dashboard `__. - -###################################################################### -# Comparison to TorchScript and FX Tracing -# ----------------------------------------- -# -# We have seen that ``torch.compile`` can speed up PyTorch code. -# Why else should we use ``torch.compile`` over existing PyTorch -# compiler solutions, such as TorchScript or FX Tracing? Primarily, the -# advantage of ``torch.compile`` lies in its ability to handle -# arbitrary Python code with minimal changes to existing code. -# -# One case that ``torch.compile`` can handle that other compiler -# solutions struggle with is data-dependent control flow (the -# ``if x.sum() < 0:`` line below). - -def f1(x, y): - if x.sum() < 0: - return -y - return y - -# Test that `fn1` and `fn2` return the same result, given -# the same arguments `args`. Typically, `fn1` will be an eager function -# while `fn2` will be a compiled function (torch.compile, TorchScript, or FX graph). -def test_fns(fn1, fn2, args): - out1 = fn1(*args) - out2 = fn2(*args) - return torch.allclose(out1, out2) - -inp1 = torch.randn(5, 5) -inp2 = torch.randn(5, 5) - -###################################################################### -# TorchScript tracing ``f1`` results in -# silently incorrect results, since only the actual control flow path -# is traced. - -traced_f1 = torch.jit.trace(f1, (inp1, inp2)) -print("traced 1, 1:", test_fns(f1, traced_f1, (inp1, inp2))) -print("traced 1, 2:", test_fns(f1, traced_f1, (-inp1, inp2))) - -###################################################################### -# FX tracing ``f1`` results in an error due to the presence of -# data-dependent control flow. - -import traceback as tb -try: - torch.fx.symbolic_trace(f1) -except: - tb.print_exc() - -###################################################################### -# If we provide a value for ``x`` as we try to FX trace ``f1``, then -# we run into the same problem as TorchScript tracing, as the data-dependent -# control flow is removed in the traced function. - -fx_f1 = torch.fx.symbolic_trace(f1, concrete_args={"x": inp1}) -print("fx 1, 1:", test_fns(f1, fx_f1, (inp1, inp2))) -print("fx 1, 2:", test_fns(f1, fx_f1, (-inp1, inp2))) - -###################################################################### -# Now we can see that ``torch.compile`` correctly handles -# data-dependent control flow. - -# Reset since we are using a different mode. -torch._dynamo.reset() - -compile_f1 = torch.compile(f1) -print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2))) -print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2))) -print("~" * 10) - -###################################################################### -# TorchScript scripting can handle data-dependent control flow, but this -# solution comes with its own set of problems. Namely, TorchScript scripting -# can require major code changes and will raise errors when unsupported Python -# is used. -# -# In the example below, we forget TorchScript type annotations and we receive -# a TorchScript error because the input type for argument ``y``, an ``int``, -# does not match with the default argument type, ``torch.Tensor``. 
- -def f2(x, y): - return x + y - -inp1 = torch.randn(5, 5) -inp2 = 3 - -script_f2 = torch.jit.script(f2) -try: - script_f2(inp1, inp2) -except: - tb.print_exc() - -###################################################################### -# However, ``torch.compile`` is easily able to handle ``f2``. - -compile_f2 = torch.compile(f2) -print("compile 2:", test_fns(f2, compile_f2, (inp1, inp2))) -print("~" * 10) - -###################################################################### -# Another case that ``torch.compile`` handles well compared to -# previous compilers solutions is the usage of non-PyTorch functions. - -import scipy -def f3(x): - x = x * 2 - x = scipy.fft.dct(x.numpy()) - x = torch.from_numpy(x) - x = x * 2 - return x - -###################################################################### -# TorchScript tracing treats results from non-PyTorch function calls -# as constants, and so our results can be silently wrong. - -inp1 = torch.randn(5, 5) -inp2 = torch.randn(5, 5) -traced_f3 = torch.jit.trace(f3, (inp1,)) -print("traced 3:", test_fns(f3, traced_f3, (inp2,))) - -###################################################################### -# TorchScript scripting and FX tracing disallow non-PyTorch function calls. - -try: - torch.jit.script(f3) -except: - tb.print_exc() - -try: - torch.fx.symbolic_trace(f3) -except: - tb.print_exc() - -###################################################################### -# In comparison, ``torch.compile`` is easily able to handle -# the non-PyTorch function call. - -compile_f3 = torch.compile(f3) -print("compile 3:", test_fns(f3, compile_f3, (inp2,))) - -###################################################################### -# TorchDynamo and FX Graphs -# -------------------------- -# -# One important component of ``torch.compile`` is TorchDynamo. -# TorchDynamo is responsible for JIT compiling arbitrary Python code into -# `FX graphs `__, which can -# then be further optimized. TorchDynamo extracts FX graphs by analyzing Python bytecode -# during runtime and detecting calls to PyTorch operations. -# -# Normally, TorchInductor, another component of ``torch.compile``, -# further compiles the FX graphs into optimized kernels, -# but TorchDynamo allows for different backends to be used. In order to inspect -# the FX graphs that TorchDynamo outputs, let us create a custom backend that -# outputs the FX graph and simply returns the graph's unoptimized forward method. - -from typing import List -def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): - print("custom backend called with FX graph:") - gm.graph.print_tabular() - return gm.forward - -# Reset since we are using a different backend. -torch._dynamo.reset() - -opt_model = torch.compile(init_model(), backend=custom_backend) -opt_model(generate_data(16)[0]) - -###################################################################### -# Using our custom backend, we can now see how TorchDynamo is able to handle -# data-dependent control flow. Consider the function below, where the line -# ``if b.sum() < 0`` is the source of data-dependent control flow. 
- -def bar(a, b): - x = a / (torch.abs(a) + 1) - if b.sum() < 0: - b = b * -1 - return x * b - -opt_bar = torch.compile(bar, backend=custom_backend) -inp1 = torch.randn(10) -inp2 = torch.randn(10) -opt_bar(inp1, inp2) -opt_bar(inp1, -inp2) - -###################################################################### -# The output reveals that TorchDynamo extracted 3 different FX graphs -# corresponding to the following code (order may differ from the output above): -# -# 1. ``x = a / (torch.abs(a) + 1)`` -# 2. ``b = b * -1; return x * b`` -# 3. ``return x * b`` -# -# When TorchDynamo encounters unsupported Python features, such as data-dependent -# control flow, it breaks the computation graph, lets the default Python -# interpreter handle the unsupported code, then resumes capturing the graph. -# -# Let's investigate by example how TorchDynamo would step through ``bar``. -# If ``b.sum() < 0``, then TorchDynamo would run graph 1, let -# Python determine the result of the conditional, then run -# graph 2. On the other hand, if ``not b.sum() < 0``, then TorchDynamo -# would run graph 1, let Python determine the result of the conditional, then -# run graph 3. -# -# This highlights a major difference between TorchDynamo and previous PyTorch -# compiler solutions. When encountering unsupported Python features, -# previous solutions either raise an error or silently fail. -# TorchDynamo, on the other hand, will break the computation graph. -# -# We can see where TorchDynamo breaks the graph by using ``torch._dynamo.explain``: - -# Reset since we are using a different backend. -torch._dynamo.reset() -explain_output = torch._dynamo.explain(bar)(torch.randn(10), torch.randn(10)) -print(explain_output) - -###################################################################### -# In order to maximize speedup, graph breaks should be limited. -# We can force TorchDynamo to raise an error upon the first graph -# break encountered by using ``fullgraph=True``: - -opt_bar = torch.compile(bar, fullgraph=True) -try: - opt_bar(torch.randn(10), torch.randn(10)) -except: - tb.print_exc() - -###################################################################### -# And below, we demonstrate that TorchDynamo does not break the graph on -# the model we used above for demonstrating speedups. - -opt_model = torch.compile(init_model(), fullgraph=True) -print(opt_model(generate_data(16)[0])) - -###################################################################### -# We can use ``torch.export`` (from PyTorch 2.1+) to extract a single, exportable -# FX graph from the input PyTorch program. The exported graph is intended to be -# run on different (i.e. Python-less) environments. One important restriction -# is that ``torch.export`` does not support graph breaks. Please check -# `this tutorial `__ -# for more details on ``torch.export``. - -###################################################################### -# Conclusion -# ------------ -# -# In this tutorial, we introduced ``torch.compile`` by covering -# basic usage, demonstrating speedups over eager mode, comparing to previous -# PyTorch compiler solutions, and briefly investigating TorchDynamo and its interactions -# with FX graphs. We hope that you will give ``torch.compile`` a try!
diff --git a/intermediate_source/torch_export_nightly_tutorial.rst b/intermediate_source/torch_export_nightly_tutorial.rst deleted file mode 100644 index 78c710a344..0000000000 --- a/intermediate_source/torch_export_nightly_tutorial.rst +++ /dev/null @@ -1,858 +0,0 @@ -torch.export Nightly Tutorial -============================= -**Author:** William Wen, Zhengxu Chen, Angela Yi - - -.. warning:: - - ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility - breaking changes. - -.. note:: - Outputs (e.g. from print statements) are only samples. - -:func:`torch.export` is the PyTorch 2.X way to export PyTorch models into -standardized model representations, intended -to be run on different (i.e. Python-less) environments. - -In this tutorial, you will learn how to use :func:`torch.export` to extract -``ExportedProgram``'s (i.e. single-graph representations) from PyTorch programs. -We also detail some considerations/modifications that you may need -to make in order to make your model compatible with ``torch.export``. - -**Contents** - -.. contents:: - :local: - -Basic Usage ------------ - -``torch.export`` extracts single-graph representations from PyTorch programs -by tracing the target function, given example inputs. -``torch.export.export()`` is the main entry point for ``torch.export``. - -In this tutorial, ``torch.export`` and ``torch.export.export()`` are practically synonymous, -though ``torch.export`` generally refers to the PyTorch 2.X export process, and ``torch.export.export()`` -generally refers to the actual function call. - -The signature of ``torch.export.export()`` is: - -.. code-block:: python - - export( - f: Callable, - args: Tuple[Any, ...], - kwargs: Optional[Dict[str, Any]] = None, - *, - dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None - ) -> ExportedProgram - -``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` -and wraps it in an ``ExportedProgram``, which can be serialized or executed later with -different inputs. Note that while the output ``ExportedGraph`` is callable and can be -called in the same way as the original input callable, it is not a ``torch.nn.Module``. -We will detail the ``dynamic_shapes`` argument later in the tutorial. - -.. code-block:: python - - import torch - from torch.export import export - - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - - mod = MyModule() - exported_mod = export(mod, (torch.randn(8, 100), torch.randn(8, 100))) - print(type(exported_mod)) - print(exported_mod(torch.randn(8, 100), torch.randn(8, 100))) - -.. 
code-block:: bash - - - tensor([[0.0000, 1.2178, 0.0000, 0.4397, 0.4774, 0.0000, 0.0000, 0.0943, 0.0000, - 0.4656], - [0.8333, 0.0000, 0.5912, 0.0000, 1.4689, 0.2122, 0.1996, 0.4628, 0.0000, - 0.7495], - [0.0000, 0.0000, 0.3900, 0.0000, 0.0000, 0.0000, 0.4515, 0.0000, 0.8187, - 0.8938], - [0.5753, 0.7709, 0.0000, 0.0000, 0.0000, 0.8081, 0.0000, 0.0000, 0.8002, - 0.9441], - [0.0000, 0.0000, 0.0000, 0.0000, 0.5711, 1.0921, 0.3438, 0.3268, 0.4640, - 0.0000], - [0.0000, 0.0000, 0.0000, 0.2434, 0.7253, 0.6886, 0.0000, 0.6982, 0.5100, - 0.0000], - [0.2279, 0.0000, 1.2951, 1.1055, 0.0000, 0.0000, 0.0000, 0.2088, 0.0000, - 0.5022], - [0.0000, 0.0000, 1.1468, 0.0000, 0.5220, 1.1592, 0.9096, 0.0000, 0.4248, - 1.2142]], grad_fn=) - -Let's review some attributes of ``ExportedProgram`` that are of interest. - -The ``graph`` attribute is an `FX graph `__ -traced from the function we exported, that is, the computation graph of all PyTorch operations. -The FX graph has some important properties: - -- The operations are "ATen-level" operations. -- The graph is "functionalized", meaning that no operations are mutations. - -The ``graph_module`` attribute is the ``GraphModule`` that wraps the ``graph`` attribute -so that it can be ran as a ``torch.nn.Module``. - -.. code-block:: python - - print(exported_mod) - print(exported_mod.graph_module) - -.. code-block:: bash - - ExportedProgram: - class GraphModule(torch.nn.Module): - def forward(self, arg0_1: f32[10, 100], arg1_1: f32[10], arg2_1: f32[8, 100], arg3_1: f32[8, 100]): - # File: torch_export_nightly_tutorial.py:69, code: return torch.nn.functional.relu(self.lin(x + y), inplace=True) - add: f32[8, 100] = torch.ops.aten.add.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None - t: f32[100, 10] = torch.ops.aten.t.default(arg0_1); arg0_1 = None - addmm: f32[8, 10] = torch.ops.aten.addmm.default(arg1_1, add, t); arg1_1 = add = t = None - relu: f32[8, 10] = torch.ops.aten.relu.default(addmm); addmm = None - return (relu,) - - Graph signature: ExportGraphSignature(input_specs=[InputSpec(kind=, arg=TensorArgument(name='arg0_1'), target='lin.weight'), InputSpec(kind=, arg=TensorArgument(name='arg1_1'), target='lin.bias'), InputSpec(kind=, arg=TensorArgument(name='arg2_1'), target=None), InputSpec(kind=, arg=TensorArgument(name='arg3_1'), target=None)], output_specs=[OutputSpec(kind=, arg=TensorArgument(name='relu'), target=None)]) - Range constraints: {} - Equality constraints: [] - - GraphModule() - - - - def forward(self, arg0_1, arg1_1, arg2_1, arg3_1): - add = torch.ops.aten.add.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None - t = torch.ops.aten.t.default(arg0_1); arg0_1 = None - addmm = torch.ops.aten.addmm.default(arg1_1, add, t); arg1_1 = add = t = None - relu = torch.ops.aten.relu.default(addmm); addmm = None - return (relu,) - -The printed code shows that FX graph only contains ATen-level ops (such as ``torch.ops.aten``) -and that mutations were removed. For example, the mutating op ``torch.nn.functional.relu(..., inplace=True)`` -is represented in the printed code by ``torch.ops.aten.relu.default``, which does not mutate. -Future uses of input to the original mutating ``relu`` op are replaced by the additional new output -of the replacement non-mutating ``relu`` op. - -Other attributes of interest in ``ExportedProgram`` include: - -- ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. -- ``range_constraints`` and ``equality_constraints`` -- constraints, covered later - -.. 
code-block:: python - - print(exported_mod.graph_signature) - -.. code-block:: bash - - ExportGraphSignature(parameters=['lin.weight', 'lin.bias'], buffers=[], user_inputs=['arg2_1', 'arg3_1'], user_outputs=['relu'], inputs_to_parameters={'arg0_1': 'lin.weight', 'arg1_1': 'lin.bias'}, inputs_to_buffers={}, buffers_to_mutate={}, backward_signature=None, assertion_dep_token=None) - -See the ``torch.export`` `documentation `__ -for more details. - -Graph Breaks ------------- - -Although ``torch.export`` shares components with ``torch.compile``, -the key limitation of ``torch.export``, especially when compared to ``torch.compile``, is that it does not -support graph breaks. This is because handling graph breaks involves interpreting -the unsupported operation with default Python evaluation, which is incompatible -with the export use case. Therefore, in order to make your model code compatible -with ``torch.export``, you will need to modify your code to remove graph breaks. - -A graph break is necessary in cases such as: - -- data-dependent control flow - -.. code-block:: python - - def bad1(x): - if x.sum() > 0: - return torch.sin(x) - return torch.cos(x) - - import traceback as tb - try: - export(bad1, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.UserError: Dynamic control flow is not supported at the moment. Please use functorch.experimental.control_flow.cond to explicitly capture the control flow - - from user code: - File "torch_export_nightly_tutorial.py", line 126, in bad1 - if x.sum() > 0: - -- accessing tensor data with ``.data`` - -.. code-block:: python - - def bad2(x): - x.data[0, 0] = 3 - return x - - try: - export(bad2, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - RuntimeError: - Found following user inputs located at [0] are mutated. This is currently banned in the aot_export workflow. - -- calling unsupported functions (such as many built-in functions) - -.. code-block:: python - - def bad3(x): - x = x + 1 - return x + id(x) - - try: - export(bad3, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.Unsupported: call_id with args (TensorVariable(),) - - from user code: - File "torch_export_nightly_tutorial.py", line 155, in bad3 - return x + id(x) - -- unsupported Python language features (e.g. throwing exceptions, match statements) - -.. code-block:: python - - def bad4(x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - - try: - export(bad4, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.Unsupported: call_function BuiltinVariable(RuntimeError) [ConstantVariable(str)] {} - - from user code: - File "torch_export_nightly_tutorial.py", line 168, in bad4 - raise RuntimeError("bad") - -The sections below demonstrate some ways you can modify your code -in order to remove graph breaks. - -Control Flow Ops ----------------- - -``torch.export`` actually does support data-dependent control flow. -But these need to be expressed using control flow ops. For example, -we can fix the control flow example above using the ``cond`` op, like so: - -.. 
code-block:: python - - from functorch.experimental.control_flow import cond - - def bad1_fixed(x): - def true_fn(x): - return torch.sin(x) - def false_fn(x): - return torch.cos(x) - return cond(x.sum() > 0, true_fn, false_fn, [x]) - - exported_bad1_fixed = export(bad1_fixed, (torch.randn(3, 3),)) - print(exported_bad1_fixed(torch.ones(3, 3))) - print(exported_bad1_fixed(-torch.ones(3, 3))) - -.. code-block:: bash - - tensor([[0.8415, 0.8415, 0.8415], - [0.8415, 0.8415, 0.8415], - [0.8415, 0.8415, 0.8415]]) - tensor([[0.5403, 0.5403, 0.5403], - [0.5403, 0.5403, 0.5403], - [0.5403, 0.5403, 0.5403]]) - -There are limitations to ``cond`` that one should be aware of: - -- The predicate (i.e. ``x.sum() > 0``) must result in a boolean or a single-element tensor. -- The operands (i.e. ``[x]``) must be tensors. -- The branch function (i.e. ``true_fn`` and ``false_fn``) signature must match with the - operands and they must both return a single tensor with the same metadata (for example, ``dtype``, ``shape``, etc.). -- Branch functions cannot mutate input or global variables. -- Branch functions cannot access closure variables, except for ``self`` if the function is - defined in the scope of a method. - -For more details about ``cond``, check out the `documentation `__. - -.. - [NOTE] map is not documented at the moment - We can also use ``map``, which applies a function across the first dimension - of the first tensor argument. - - from functorch.experimental.control_flow import map - - def map_example(xs): - def map_fn(x, const): - def true_fn(x): - return x + const - def false_fn(x): - return x - const - return control_flow.cond(x.sum() > 0, true_fn, false_fn, [x]) - return control_flow.map(map_fn, xs, torch.tensor([2.0])) - - exported_map_example= export(map_example, (torch.randn(4, 3),)) - inp = torch.cat((torch.ones(2, 3), -torch.ones(2, 3))) - print(exported_map_example(inp)) - -Constraints/Dynamic Shapes --------------------------- - -Ops can have different specializations/behaviors for different tensor shapes, so by default, -``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective -example inputs given to the initial ``torch.export.export()`` call. -If we try to run the ``ExportedProgram`` in the example below with a tensor -with a different shape, we get an error: - -.. code-block:: python - - class MyModule2(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - - mod2 = MyModule2() - exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) - - try: - exported_mod2(torch.randn(10, 100), torch.randn(10, 100)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - RuntimeError: Input arg3_1.shape[0] is specialized at 8 - -We can relax this constraint using the ``dynamic_shapes`` argument of -``torch.export.export()``, which allows us to specify, using ``torch.export.Dim`` -(`documentation `__), -which dimensions of the input tensors are dynamic. - -For each tensor argument of the input callable, we can specify a mapping from the dimension -to a ``torch.export.Dim``. -A ``torch.export.Dim`` is essentially a named symbolic integer with optional -minimum and maximum bounds. - -Then, the format of ``torch.export.export()``'s ``dynamic_shapes`` argument is a mapping -from the input callable's tensor argument names, to dimension --> dim mappings as described above. 
-If there is no ``torch.export.Dim`` given to a tensor argument's dimension, then that dimension is -assumed to be static. - -The first argument of ``torch.export.Dim`` is the name for the symbolic integer, used for debugging. -Then we can specify an optional minimum and maximum bound (inclusive). Below, we show example usage. - -In the example below, our input -``inp1`` has an unconstrained first dimension, but the size of the second -dimension must be in the interval [4, 18]. - -.. code-block:: python - - from torch.export import Dim - - inp1 = torch.randn(10, 10, 2) - - def dynamic_shapes_example1(x): - x = x[:, 2:] - return torch.relu(x) - - inp1_dim0 = Dim("inp1_dim0") - inp1_dim1 = Dim("inp1_dim1", min=4, max=18) - dynamic_shapes1 = { - "x": {0: inp1_dim0, 1: inp1_dim1}, - } - - exported_dynamic_shapes_example1 = export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1) - - print(exported_dynamic_shapes_example1(torch.randn(5, 5, 2))) - - try: - exported_dynamic_shapes_example1(torch.randn(8, 1, 2)) - except Exception: - tb.print_exc() - - try: - exported_dynamic_shapes_example1(torch.randn(8, 20, 2)) - except Exception: - tb.print_exc() - - try: - exported_dynamic_shapes_example1(torch.randn(8, 8, 3)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - tensor([[[0.0000, 0.0828], - [0.8190, 0.0000], - [0.0037, 0.0221]], - - [[0.0000, 2.0898], - [0.0000, 0.0000], - [0.8182, 2.9165]], - - [[1.3572, 0.7422], - [0.4423, 0.0000], - [0.0000, 0.0000]], - - [[0.0000, 0.2497], - [0.0000, 0.1912], - [0.0000, 0.0000]], - - [[0.0000, 1.0522], - [0.4442, 0.0000], - [1.4188, 0.8161]]]) - - RuntimeError: Input arg0_1.shape[1] is outside of specified dynamic range [4, 18] - - RuntimeError: Input arg0_1.shape[1] is outside of specified dynamic range [4, 18] - - RuntimeError: Input arg0_1.shape[2] is specialized at 2 - -Note that if our example inputs to ``torch.export`` do not satisfy the constraints -given by ``dynamic_shapes``, then we get an error. - -.. code-block:: python - - inp1_dim1_bad = Dim("inp1_dim1_bad", min=11, max=18) - dynamic_shapes1_bad = { - "x": {0: inp1_dim0, 1: inp1_dim1_bad}, - } - - try: - export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1_bad) - except Exception: - tb.print_exc() - -.. code-block:: python - - torch._dynamo.exc.UserError: 10 not in range [11, 18] - -We can enforce that equalities between dimensions of different tensors -by using the same ``torch.export.Dim`` object, for example, in matrix multiplication: - -.. code-block:: python - - inp2 = torch.randn(4, 8) - inp3 = torch.randn(8, 2) - - def dynamic_shapes_example2(x, y): - return x @ y - - inp2_dim0 = Dim("inp2_dim0") - inner_dim = Dim("inner_dim") - inp3_dim1 = Dim("inp3_dim1") - - dynamic_shapes2 = { - "x": {0: inp2_dim0, 1: inner_dim}, - "y": {0: inner_dim, 1: inp3_dim1}, - } - - exported_dynamic_shapes_example2 = export(dynamic_shapes_example2, (inp2, inp3), dynamic_shapes=dynamic_shapes2) - - print(exported_dynamic_shapes_example2(torch.randn(2, 16), torch.randn(16, 4))) - - try: - exported_dynamic_shapes_example2(torch.randn(4, 8), torch.randn(4, 2)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - tensor([[ 7.5352, -4.3836, -2.8961, 4.3412], - [ 2.3891, 4.9101, -7.4326, -0.1697]]) - - RuntimeError: Input arg0_1.shape[1] is not equal to input arg1_1.shape[0] - -We can actually use ``torch.export`` to guide us as to which ``dynamic_shapes`` constraints -are necessary. 
We can do this by relaxing all constraints (recall that if we -do not provide constraints for a dimension, the default behavior is to constrain -to the exact shape value of the example input) and letting ``torch.export`` -error out. - -.. code-block:: python - - inp4 = torch.randn(8, 16) - inp5 = torch.randn(16, 32) - - def dynamic_shapes_example3(x, y): - if x.shape[0] <= 16: - return x @ y[:, :16] - return y - - dynamic_shapes3 = { - "x": {i: Dim(f"inp4_dim{i}") for i in range(inp4.dim())}, - "y": {i: Dim(f"inp5_dim{i}") for i in range(inp5.dim())}, - } - - try: - export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.UserError: Constraints violated (inp4_dim0, inp5_dim0, inp5_dim1)! For more information, run with TORCH_LOGS=dynamic. - - The values of inp5_dim0 = L['y'].size()[0] and inp4_dim1 = L['x'].size()[1] must always be equal. - - Not all values of inp5_dim1 = L['y'].size()[1] in the specified range satisfy the generated guard Ne(L['y'].size()[1], 16). - - Not all values of inp4_dim0 = L['x'].size()[0] in the specified range satisfy the generated guard L['x'].size()[0] <= 16. - - Not all values of inp5_dim1 = L['y'].size()[1] in the specified range satisfy the generated guard L['y'].size()[1] >= 16. - - Suggested fixes: - inp4_dim0 = Dim('inp4_dim0', max=16) - inp5_dim1 = Dim('inp5_dim1', min=17) - inp5_dim0 = inp4_dim1 - -We can see that the error message gives us suggested fixes to our -dynamic shape constraints. Let us follow those suggestions (exact -suggestions may differ slightly): - -.. code-block:: python - - def suggested_fixes(): - inp4_dim1 = Dim('shared_dim') - # suggested fixes below - inp4_dim0 = Dim('inp4_dim0', max=16) - inp5_dim1 = Dim('inp5_dim1', min=17) - inp5_dim0 = inp4_dim1 - # end of suggested fixes - return { - "x": {0: inp4_dim0, 1: inp4_dim1}, - "y": {0: inp5_dim0, 1: inp5_dim1}, - } - - dynamic_shapes3_fixed = suggested_fixes() - exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) - print(exported_dynamic_shapes_example3(torch.randn(4, 32), torch.randn(32, 64))) - -.. code-block:: python - - tensor([[ 4.1510, -4.1174, 3.4397, 1.5075, -4.3566, 4.2102, 7.2033, - 0.3611, -3.9041, 8.2987, -3.5751, -7.1508, 0.4470, 2.2460, - -0.9288, -8.1764], - [ -1.5879, -4.5107, -11.0845, -10.3962, -1.4359, 1.2877, -10.2839, - 7.3742, -0.5569, -2.0485, 3.1028, -2.4692, -1.3837, 6.8744, - -9.4191, -5.9387], - [ -3.4660, 2.8480, -2.9857, 11.7783, 0.2220, -5.5934, 1.9793, - 6.1118, 1.9817, -7.6156, 8.2070, -6.6976, -4.8177, -5.4002, - 9.3291, -7.0860], - [ -0.7406, -0.6509, 3.1847, -1.6311, 5.8144, 12.0439, 12.9141, - 8.8778, -9.5971, 4.1847, 5.8781, 0.1364, -7.3096, -4.0822, - -9.0587, 5.3681]]) - -Note that in the example above, because we constrained the value of ``x.shape[0]`` in -``dynamic_shapes_example3``, the exported program is sound even though there is a -raw ``if`` statement. - -If you want to see why ``torch.export`` generated these constraints, you can -re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``, -or use ``torch._logging.set_logs``. - -.. 
code-block:: python - - import logging - torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO) - exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) - - # reset to previous values - torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING) - -.. code-block:: bash - - [2023-10-12 11:24:01,657] [12/0] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo start tracing dynamic_shapes_example3 torch_export_nightly_tutorial.py:374 - [2023-10-12 11:24:01,658] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_env - [2023-10-12 11:24:01,663] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s0 = 8 for L['x'].size()[0] [2, 16] - [2023-10-12 11:24:01,665] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s1 = 16 for L['x'].size()[1] [2, 9223372036854775806] - [2023-10-12 11:24:01,677] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s2 = 16 for L['y'].size()[0] [2, 9223372036854775806] - [2023-10-12 11:24:01,680] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s3 = 32 for L['y'].size()[1] [17, 9223372036854775806] - [2023-10-12 11:24:01,734] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] eval Eq(s1, s2) [guard added] at torch_export_nightly_tutorial.py:376 in dynamic_shapes_example3 (_meta_registrations.py:1891 in meta_mm) - [2023-10-12 11:24:01,738] [12/0] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo done tracing dynamic_shapes_example3 (RETURN_VALUE) - [2023-10-12 11:24:01,743] [12/0] torch._dynamo.output_graph: [INFO] Step 2: calling compiler function dynamo_normalization_capturing_compiler - [2023-10-12 11:24:01,743] [12/0] torch._dynamo.output_graph: [INFO] Step 2: done compiler function dynamo_normalization_capturing_compiler - [2023-10-12 11:24:01,747] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] produce_guards - [2023-10-12 11:24:01,839] torch._dynamo.eval_frame: [INFO] Summary of dimension constraints: - [2023-10-12 11:24:01,839] torch._dynamo.eval_frame: [INFO] Suggested fixes: - [2023-10-12 11:24:01,839] torch._dynamo.eval_frame: [INFO] - [2023-10-12 11:24:01,847] torch.fx.experimental.symbolic_shapes: [INFO] create_env - -We can view an ``ExportedProgram``'s constraints using the ``range_constraints`` and -``equality_constraints`` attributes. The logging above reveals what the symbols ``s0, s1, ...`` -represent. - -.. code-block:: python - - print(exported_dynamic_shapes_example3.range_constraints) - print(exported_dynamic_shapes_example3.equality_constraints) - -.. code-block:: bash - - {s0: RangeConstraint(min_val=2, max_val=16), s1: RangeConstraint(min_val=2, max_val=9223372036854775806), s2: RangeConstraint(min_val=2, max_val=9223372036854775806), s3: RangeConstraint(min_val=17, max_val=9223372036854775806)} - [(InputDim(input_name='arg0_1', dim=1), InputDim(input_name='arg1_1', dim=0))] - -Custom Ops ----------- - -``torch.export`` can export PyTorch programs with custom operators. - -Currently, the steps to register a custom op for use by ``torch.export`` are: - -- Define the custom op using ``torch.library`` (`reference `__) - as with any other custom op - -.. 
code-block:: python - - from torch.library import Library, impl - - m = Library("my_custom_library", "DEF") - - m.define("custom_op(Tensor input) -> Tensor") - - @impl(m, "custom_op", "CompositeExplicitAutograd") - def custom_op(x): - print("custom_op called!") - return torch.relu(x) - -- Define a ``"Meta"`` implementation of the custom op that returns an empty - tensor with the same shape as the expected output - -.. code-block:: python - - @impl(m, "custom_op", "Meta") - def custom_op_meta(x): - return torch.empty_like(x) - -- Call the custom op from the code you want to export using ``torch.ops`` - -.. code-block:: python - - def custom_op_example(x): - x = torch.sin(x) - x = torch.ops.my_custom_library.custom_op(x) - x = torch.cos(x) - return x - -- Export the code as before - -.. code-block:: python - - exported_custom_op_example = export(custom_op_example, (torch.randn(3, 3),)) - exported_custom_op_example.graph_module.print_readable() - print(exported_custom_op_example(torch.randn(3, 3))) - -.. code-block:: bash - - custom_op called! - tensor([[0.5947, 0.8062, 0.6231], - [1.0000, 1.0000, 0.6615], - [0.5412, 1.0000, 1.0000]]) - -Note in the above outputs that the custom op is included in the exported graph. -And when we call the exported graph as a function, the original custom op is called, -as evidenced by the ``print`` call. - -If you have a custom operator implemented in C++, please refer to -`this document `__ -to make it compatible with ``torch.export``. - -Decompositions --------------- - -The graph produced by ``torch.export`` by default returns a graph containing -only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000 -operators, all of which are functional, that is, they do not -mutate or alias inputs. You can find a list of all ATen operators -`here `__ -and you can inspect if an operator is functional by checking -``op._schema.is_mutable``, for example: - -.. code-block:: python - - print(torch.ops.aten.add.Tensor._schema.is_mutable) - print(torch.ops.aten.add_.Tensor._schema.is_mutable) - -.. code-block:: bash - - False - True - -By default, the environment in which you want to run the exported graph -should support all ~2000 of these operators. -However, you can use the following API on the exported program -if your specific environment is only able to support a subset of -the ~2000 operators. - -.. code-block:: python - - def run_decompositions( - self: ExportedProgram, - decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] - ) -> ExportedProgram - -``run_decompositions`` takes in a decomposition table, which is a mapping of -operators to a function specifying how to reduce, or decompose, that operator -into an equivalent sequence of other ATen operators. - -The default decomposition table for ``run_decompositions`` is the -`Core ATen decomposition table `__ -which will decompose the all ATen operators to the -`Core ATen Operator Set `__ -which consists of only ~180 operators. - -.. code-block:: python - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - - ep = export(M(), (torch.randn(2, 3),)) - print(ep.graph) - - core_ir_ep = ep.run_decompositions() - print(core_ir_ep.graph) - -.. 
code-block:: bash - - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %t : [num_users=1] = call_function[target=torch.ops.aten.t.default](args = (%arg0_1,), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %t), kwargs = {}) - return (addmm,) - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [1, 0]), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %permute), kwargs = {}) - return (addmm,) - -Notice that after running ``run_decompositions`` the -``torch.ops.aten.t.default`` operator, which is not part of the Core ATen -Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part -of the Core ATen Opset. - -Most ATen operators already have decompositions, which are located -`here `__. -If you would like to use some of these existing decomposition functions, -you can pass in a list of operators you would like to decompose to the -`get_decompositions `__ -function, which will return a decomposition table using existing -decomposition implementations. - -.. code-block:: python - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - - ep = export(M(), (torch.randn(2, 3),)) - print(ep.graph) - - from torch._decomp import get_decompositions - decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) - core_ir_ep = ep.run_decompositions(decomp_table) - print(core_ir_ep.graph) - -.. code-block:: bash - - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %t : [num_users=1] = call_function[target=torch.ops.aten.t.default](args = (%arg0_1,), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %t), kwargs = {}) - return (addmm,) - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [1, 0]), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %permute), kwargs = {}) - return (addmm,) - -If there is no existing decomposition function for an ATen operator that you would -like to decompose, feel free to send a pull request into PyTorch -implementing the decomposition! - -ExportDB --------- - -``torch.export`` will only ever export a single computation graph from a PyTorch program. Because of this requirement, -there will be Python or PyTorch features that are not compatible with ``torch.export``, which will require users to -rewrite parts of their model code. We have seen examples of this earlier in the tutorial -- for example, rewriting -if-statements using ``cond``. - -`ExportDB `__ is the standard reference that documents -supported and unsupported Python/PyTorch features for ``torch.export``. 
It is essentially a list of program samples, each -of which represents the usage of one particular Python/PyTorch feature and its interaction with ``torch.export``. -Examples are also tagged by category so that they can be more easily searched. - -For example, let's use ExportDB to get a better understanding of how the predicate works in the ``cond`` operator. -We can look at the example called ``cond_predicate``, which has a ``torch.cond`` tag. The example code looks like: - -.. code-block:: python - - def cond_predicate(x): - """ - The conditional statement (aka predicate) passed to ``cond()`` must be one of the following: - - torch.Tensor with a single element - - boolean expression - NOTE: If the `pred` is tested on a dim with batch size < 2, it will be specialized. - """ - pred = x.dim() > 2 and x.shape[2] > 10 - return cond(pred, lambda x: x.cos(), lambda y: y.sin(), [x]) - -More generally, ExportDB can be used as a reference when one of the following occurs: - -1. Before attempting ``torch.export``, you know ahead of time that your model uses some tricky Python/PyTorch features - and you want to know if ``torch.export`` covers that feature. -2. When attempting ``torch.export``, there is a failure and it's unclear how to work around it. - -ExportDB is not exhaustive, but is intended to cover all use cases found in typical PyTorch code. Feel free to reach -out if there is an important Python/PyTorch feature that should be added to ExportDB or supported by ``torch.export``. - -Conclusion ---------- - -We introduced ``torch.export``, the new PyTorch 2.X way to export single computation -graphs from PyTorch programs. In particular, we demonstrated several code modifications -and considerations (control flow ops, constraints, etc.) that need to be made in order to export a graph. diff --git a/intermediate_source/torch_export_tutorial.py b/intermediate_source/torch_export_tutorial.py deleted file mode 100644 index dc5e226f86..0000000000 --- a/intermediate_source/torch_export_tutorial.py +++ /dev/null @@ -1,768 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -torch.export Tutorial -=================================================== -**Author:** William Wen, Zhengxu Chen, Angela Yi -""" - -###################################################################### -# -# .. warning:: -# -# ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility -# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.3. -# -# :func:`torch.export` is the PyTorch 2.X way to export PyTorch models into -# standardized model representations, intended -# to be run on different (i.e. Python-less) environments. The official -# documentation can be found `here `__. -# -# In this tutorial, you will learn how to use :func:`torch.export` to extract -# ``ExportedProgram``'s (i.e. single-graph representations) from PyTorch programs. -# We also detail some considerations/modifications that you may need -# to make in order to make your model compatible with ``torch.export``. -# -# **Contents** -# -# .. contents:: -# :local: - -###################################################################### -# Basic Usage -# ----------- -# -# ``torch.export`` extracts single-graph representations from PyTorch programs -# by tracing the target function, given example inputs. -# ``torch.export.export()`` is the main entry point for ``torch.export``.
-# -# In this tutorial, ``torch.export`` and ``torch.export.export()`` are practically synonymous, -# though ``torch.export`` generally refers to the PyTorch 2.X export process, and ``torch.export.export()`` -# generally refers to the actual function call. -# -# The signature of ``torch.export.export()`` is: -# -# .. code-block:: python -# -# export( -# f: Callable, -# args: Tuple[Any, ...], -# kwargs: Optional[Dict[str, Any]] = None, -# *, -# dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None -# ) -> ExportedProgram -# -# ``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` -# and wraps it in an ``ExportedProgram``, which can be serialized or executed later with -# different inputs. Note that while the output ``ExportedGraph`` is callable and can be -# called in the same way as the original input callable, it is not a ``torch.nn.Module``. -# We will detail the ``dynamic_shapes`` argument later in the tutorial. - -import torch -from torch.export import export - -class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - -mod = MyModule() -exported_mod = export(mod, (torch.randn(8, 100), torch.randn(8, 100))) -print(type(exported_mod)) -print(exported_mod.module()(torch.randn(8, 100), torch.randn(8, 100))) - - -###################################################################### -# Let's review some attributes of ``ExportedProgram`` that are of interest. -# -# The ``graph`` attribute is an `FX graph `__ -# traced from the function we exported, that is, the computation graph of all PyTorch operations. -# The FX graph has some important properties: -# -# - The operations are "ATen-level" operations. -# - The graph is "functionalized", meaning that no operations are mutations. -# -# The ``graph_module`` attribute is the ``GraphModule`` that wraps the ``graph`` attribute -# so that it can be ran as a ``torch.nn.Module``. - -print(exported_mod) -print(exported_mod.graph_module) - -###################################################################### -# The printed code shows that FX graph only contains ATen-level ops (such as ``torch.ops.aten``) -# and that mutations were removed. For example, the mutating op ``torch.nn.functional.relu(..., inplace=True)`` -# is represented in the printed code by ``torch.ops.aten.relu.default``, which does not mutate. -# Future uses of input to the original mutating ``relu`` op are replaced by the additional new output -# of the replacement non-mutating ``relu`` op. -# -# Other attributes of interest in ``ExportedProgram`` include: -# -# - ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. -# - ``range_constraints`` -- constraints, covered later - -print(exported_mod.graph_signature) - -###################################################################### -# See the ``torch.export`` `documentation `__ -# for more details. - -###################################################################### -# Graph Breaks -# ------------ -# -# Although ``torch.export`` shares components with ``torch.compile``, -# the key limitation of ``torch.export``, especially when compared to -# ``torch.compile``, is that it does not support graph breaks. This is because -# handling graph breaks involves interpreting the unsupported operation with -# default Python evaluation, which is incompatible with the export use case. 
-# Therefore, in order to make your model code compatible with ``torch.export``, -# you will need to modify your code to remove graph breaks. -# -# A graph break is necessary in cases such as: -# -# - data-dependent control flow - -class Bad1(torch.nn.Module): - def forward(self, x): - if x.sum() > 0: - return torch.sin(x) - return torch.cos(x) - -import traceback as tb -try: - export(Bad1(), (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# - accessing tensor data with ``.data`` - -class Bad2(torch.nn.Module): - def forward(self, x): - x.data[0, 0] = 3 - return x - -try: - export(Bad2(), (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# - calling unsupported functions (such as many built-in functions) - -class Bad3(torch.nn.Module): - def forward(self, x): - x = x + 1 - return x + id(x) - -try: - export(Bad3(), (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# - unsupported Python language features (e.g. throwing exceptions, match statements) - -class Bad4(torch.nn.Module): - def forward(self, x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - -try: - export(Bad4(), (torch.randn(3, 3),)) -except Exception: - tb.print_exc() - -###################################################################### -# Non-Strict Export -# ----------------- -# -# To trace the program, ``torch.export`` uses TorchDynamo, a byte code analysis -# engine, to symbolically analyze the Python code and build a graph based on the -# results. This analysis allows ``torch.export`` to provide stronger guarantees -# about safety, but not all Python code is supported, causing these graph -# breaks. -# -# To address this issue, in PyTorch 2.3, we introduced a new mode of -# exporting called non-strict mode, where we trace through the program using the -# Python interpreter executing it exactly as it would in eager mode, allowing us -# to skip over unsupported Python features. This is done through adding a -# ``strict=False`` flag. -# -# Looking at some of the previous examples which resulted in graph breaks: -# -# - Accessing tensor data with ``.data`` now works correctly - -class Bad2(torch.nn.Module): - def forward(self, x): - x.data[0, 0] = 3 - return x - -bad2_nonstrict = export(Bad2(), (torch.randn(3, 3),), strict=False) -print(bad2_nonstrict.module()(torch.ones(3, 3))) - -###################################################################### -# - Calling unsupported functions (such as many built-in functions) traces -# through, but in this case, ``id(x)`` gets specialized as a constant integer in -# the graph. This is because ``id(x)`` is not a tensor operation, so the -# operation is not recorded in the graph. - -class Bad3(torch.nn.Module): - def forward(self, x): - x = x + 1 - return x + id(x) - -bad3_nonstrict = export(Bad3(), (torch.randn(3, 3),), strict=False) -print(bad3_nonstrict) -print(bad3_nonstrict.module()(torch.ones(3, 3))) - -###################################################################### -# - Unsupported Python language features (such as throwing exceptions, match -# statements) now also get traced through. 
- -class Bad4(torch.nn.Module): - def forward(self, x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - -bad4_nonstrict = export(Bad4(), (torch.randn(3, 3),), strict=False) -print(bad4_nonstrict.module()(torch.ones(3, 3))) - - -###################################################################### -# However, there are still some features that require rewrites to the original -# module: - -###################################################################### -# Control Flow Ops -# ---------------- -# -# ``torch.export`` actually does support data-dependent control flow. -# But these need to be expressed using control flow ops. For example, -# we can fix the control flow example above using the ``cond`` op, like so: - -from functorch.experimental.control_flow import cond - -class Bad1Fixed(torch.nn.Module): - def forward(self, x): - def true_fn(x): - return torch.sin(x) - def false_fn(x): - return torch.cos(x) - return cond(x.sum() > 0, true_fn, false_fn, [x]) - -exported_bad1_fixed = export(Bad1Fixed(), (torch.randn(3, 3),)) -print(exported_bad1_fixed.module()(torch.ones(3, 3))) -print(exported_bad1_fixed.module()(-torch.ones(3, 3))) - -###################################################################### -# There are limitations to ``cond`` that one should be aware of: -# -# - The predicate (i.e. ``x.sum() > 0``) must result in a boolean or a single-element tensor. -# - The operands (i.e. ``[x]``) must be tensors. -# - The branch function (i.e. ``true_fn`` and ``false_fn``) signature must match with the -# operands and they must both return a single tensor with the same metadata (for example, ``dtype``, ``shape``, etc.). -# - Branch functions cannot mutate input or global variables. -# - Branch functions cannot access closure variables, except for ``self`` if the function is -# defined in the scope of a method. -# -# For more details about ``cond``, check out the `cond documentation `__. - -###################################################################### -# .. -# [NOTE] map is not documented at the moment -# We can also use ``map``, which applies a function across the first dimension -# of the first tensor argument. -# -# from functorch.experimental.control_flow import map -# -# def map_example(xs): -# def map_fn(x, const): -# def true_fn(x): -# return x + const -# def false_fn(x): -# return x - const -# return control_flow.cond(x.sum() > 0, true_fn, false_fn, [x]) -# return control_flow.map(map_fn, xs, torch.tensor([2.0])) -# -# exported_map_example= export(map_example, (torch.randn(4, 3),)) -# inp = torch.cat((torch.ones(2, 3), -torch.ones(2, 3))) -# print(exported_map_example(inp)) - -###################################################################### -# Constraints/Dynamic Shapes -# -------------------------- -# -# Ops can have different specializations/behaviors for different tensor shapes, so by default, -# ``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective -# example inputs given to the initial ``torch.export.export()`` call. 
-# If we try to run the ``ExportedProgram`` in the example below with a tensor -# with a different shape, we get an error: - -class MyModule2(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - -mod2 = MyModule2() -exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) - -try: - exported_mod2.module()(torch.randn(10, 100), torch.randn(10, 100)) -except Exception: - tb.print_exc() - -###################################################################### -# We can relax this constraint using the ``dynamic_shapes`` argument of -# ``torch.export.export()``, which allows us to specify, using ``torch.export.Dim`` -# (`documentation `__), -# which dimensions of the input tensors are dynamic. -# -# For each tensor argument of the input callable, we can specify a mapping from the dimension -# to a ``torch.export.Dim``. -# A ``torch.export.Dim`` is essentially a named symbolic integer with optional -# minimum and maximum bounds. -# -# Then, the format of ``torch.export.export()``'s ``dynamic_shapes`` argument is a mapping -# from the input callable's tensor argument names, to dimension --> dim mappings as described above. -# If there is no ``torch.export.Dim`` given to a tensor argument's dimension, then that dimension is -# assumed to be static. -# -# The first argument of ``torch.export.Dim`` is the name for the symbolic integer, used for debugging. -# Then we can specify an optional minimum and maximum bound (inclusive). Below, we show a usage example. -# -# In the example below, our input -# ``inp1`` has an unconstrained first dimension, but the size of the second -# dimension must be in the interval [4, 18]. - -from torch.export import Dim - -inp1 = torch.randn(10, 10, 2) - -class DynamicShapesExample1(torch.nn.Module): - def forward(self, x): - x = x[:, 2:] - return torch.relu(x) - -inp1_dim0 = Dim("inp1_dim0") -inp1_dim1 = Dim("inp1_dim1", min=4, max=18) -dynamic_shapes1 = { - "x": {0: inp1_dim0, 1: inp1_dim1}, -} - -exported_dynamic_shapes_example1 = export(DynamicShapesExample1(), (inp1,), dynamic_shapes=dynamic_shapes1) - -print(exported_dynamic_shapes_example1.module()(torch.randn(5, 5, 2))) - -try: - exported_dynamic_shapes_example1.module()(torch.randn(8, 1, 2)) -except Exception: - tb.print_exc() - -try: - exported_dynamic_shapes_example1.module()(torch.randn(8, 20, 2)) -except Exception: - tb.print_exc() - -try: - exported_dynamic_shapes_example1.module()(torch.randn(8, 8, 3)) -except Exception: - tb.print_exc() - -###################################################################### -# Note that if our example inputs to ``torch.export`` do not satisfy the constraints -# given by ``dynamic_shapes``, then we get an error. 
- -inp1_dim1_bad = Dim("inp1_dim1_bad", min=11, max=18) -dynamic_shapes1_bad = { - "x": {0: inp1_dim0, 1: inp1_dim1_bad}, -} - -try: - export(DynamicShapesExample1(), (inp1,), dynamic_shapes=dynamic_shapes1_bad) -except Exception: - tb.print_exc() - -###################################################################### -# We can enforce that equalities between dimensions of different tensors -# by using the same ``torch.export.Dim`` object, for example, in matrix multiplication: - -inp2 = torch.randn(4, 8) -inp3 = torch.randn(8, 2) - -class DynamicShapesExample2(torch.nn.Module): - def forward(self, x, y): - return x @ y - -inp2_dim0 = Dim("inp2_dim0") -inner_dim = Dim("inner_dim") -inp3_dim1 = Dim("inp3_dim1") - -dynamic_shapes2 = { - "x": {0: inp2_dim0, 1: inner_dim}, - "y": {0: inner_dim, 1: inp3_dim1}, -} - -exported_dynamic_shapes_example2 = export(DynamicShapesExample2(), (inp2, inp3), dynamic_shapes=dynamic_shapes2) - -print(exported_dynamic_shapes_example2.module()(torch.randn(2, 16), torch.randn(16, 4))) - -try: - exported_dynamic_shapes_example2.module()(torch.randn(4, 8), torch.randn(4, 2)) -except Exception: - tb.print_exc() - -###################################################################### -# We can also describe one dimension in terms of other. There are some -# restrictions to how detailed we can specify one dimension in terms of another, -# but generally, those in the form of ``A * Dim + B`` should work. - -class DerivedDimExample1(torch.nn.Module): - def forward(self, x, y): - return x + y[1:] - -foo = DerivedDimExample1() - -x, y = torch.randn(5), torch.randn(6) -dimx = torch.export.Dim("dimx", min=3, max=6) -dimy = dimx + 1 -derived_dynamic_shapes1 = ({0: dimx}, {0: dimy}) - -derived_dim_example1 = export(foo, (x, y), dynamic_shapes=derived_dynamic_shapes1) - -print(derived_dim_example1.module()(torch.randn(4), torch.randn(5))) - -try: - derived_dim_example1.module()(torch.randn(4), torch.randn(6)) -except Exception: - tb.print_exc() - - -class DerivedDimExample2(torch.nn.Module): - def forward(self, z, y): - return z[1:] + y[1::3] - -foo = DerivedDimExample2() - -z, y = torch.randn(4), torch.randn(10) -dx = torch.export.Dim("dx", min=3, max=6) -dz = dx + 1 -dy = dx * 3 + 1 -derived_dynamic_shapes2 = ({0: dz}, {0: dy}) - -derived_dim_example2 = export(foo, (z, y), dynamic_shapes=derived_dynamic_shapes2) -print(derived_dim_example2.module()(torch.randn(7), torch.randn(19))) - -###################################################################### -# We can actually use ``torch.export`` to guide us as to which ``dynamic_shapes`` constraints -# are necessary. We can do this by relaxing all constraints (recall that if we -# do not provide constraints for a dimension, the default behavior is to constrain -# to the exact shape value of the example input) and letting ``torch.export`` -# error out. - -inp4 = torch.randn(8, 16) -inp5 = torch.randn(16, 32) - -class DynamicShapesExample3(torch.nn.Module): - def forward(self, x, y): - if x.shape[0] <= 16: - return x @ y[:, :16] - return y - -dynamic_shapes3 = { - "x": {i: Dim(f"inp4_dim{i}") for i in range(inp4.dim())}, - "y": {i: Dim(f"inp5_dim{i}") for i in range(inp5.dim())}, -} - -try: - export(DynamicShapesExample3(), (inp4, inp5), dynamic_shapes=dynamic_shapes3) -except Exception: - tb.print_exc() - -###################################################################### -# We can see that the error message gives us suggested fixes to our -# dynamic shape constraints. 
Let us follow those suggestions (exact
-# suggestions may differ slightly):
-
-def suggested_fixes():
-    inp4_dim1 = Dim('shared_dim')
-    # suggested fixes below
-    inp4_dim0 = Dim('inp4_dim0', max=16)
-    inp5_dim1 = Dim('inp5_dim1', min=17)
-    inp5_dim0 = inp4_dim1
-    # end of suggested fixes
-    return {
-        "x": {0: inp4_dim0, 1: inp4_dim1},
-        "y": {0: inp5_dim0, 1: inp5_dim1},
-    }
-
-dynamic_shapes3_fixed = suggested_fixes()
-exported_dynamic_shapes_example3 = export(DynamicShapesExample3(), (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed)
-print(exported_dynamic_shapes_example3.module()(torch.randn(4, 32), torch.randn(32, 64)))
-
-######################################################################
-# Note that in the example above, because we constrained the value of ``x.shape[0]`` in
-# ``dynamic_shapes_example3``, the exported program is sound even though there is a
-# raw ``if`` statement.
-#
-# If you want to see why ``torch.export`` generated these constraints, you can
-# re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``,
-# or use ``torch._logging.set_logs``.
-
-import logging
-torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO)
-exported_dynamic_shapes_example3 = export(DynamicShapesExample3(), (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed)
-
-# reset to previous values
-torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING)
-
-######################################################################
-# We can view an ``ExportedProgram``'s symbolic shape ranges using the
-# ``range_constraints`` field.
-
-print(exported_dynamic_shapes_example3.range_constraints)
-
-######################################################################
-# Custom Ops
-# ----------
-#
-# ``torch.export`` can export PyTorch programs with custom operators.
-#
-# Currently, the steps to register a custom op for use by ``torch.export`` are:
-#
-# - Define the custom op using ``torch.library`` (`reference `__)
-#   as with any other custom op
-
-@torch.library.custom_op("my_custom_library::custom_op", mutates_args={})
-def custom_op(input: torch.Tensor) -> torch.Tensor:
-    print("custom_op called!")
-    return torch.relu(input)
-
-######################################################################
-# - Define a ``"Meta"`` implementation of the custom op that returns an empty
-#   tensor with the same shape as the expected output
-
-@custom_op.register_fake
-def custom_op_meta(x):
-    return torch.empty_like(x)
-
-######################################################################
-# - Call the custom op from the code you want to export using ``torch.ops``
-
-class CustomOpExample(torch.nn.Module):
-    def forward(self, x):
-        x = torch.sin(x)
-        x = torch.ops.my_custom_library.custom_op(x)
-        x = torch.cos(x)
-        return x
-
-######################################################################
-# - Export the code as before
-
-exported_custom_op_example = export(CustomOpExample(), (torch.randn(3, 3),))
-exported_custom_op_example.graph_module.print_readable()
-print(exported_custom_op_example.module()(torch.randn(3, 3)))
-
-######################################################################
-# Note in the above outputs that the custom op is included in the exported graph,
-# and that when we call the exported graph as a function, the original custom op is called,
-# as evidenced by the ``print`` call.
-#
-# If you have a custom operator implemented in C++, please refer to
-# `this document `__
-# to make it compatible with ``torch.export``.
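-
-######################################################################
-# As a quick sanity check, we can also scan the exported FX graph for the
-# custom op node to confirm programmatically that it was captured. This is a
-# minimal sketch rather than a required step, and the exact string form of the
-# node target may vary across versions:

-custom_op_nodes = [
-    node for node in exported_custom_op_example.graph.nodes
-    if node.op == "call_function" and "custom_op" in str(node.target)
-]
-print(f"Found {len(custom_op_nodes)} custom op call(s) in the exported graph")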
- -###################################################################### -# Decompositions -# -------------- -# -# The graph produced by ``torch.export`` by default returns a graph containing -# only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000 -# operators, all of which are functional, that is, they do not -# mutate or alias inputs. You can find a list of all ATen operators -# `here `__ -# and you can inspect if an operator is functional by checking -# ``op._schema.is_mutable``, for example: - -print(torch.ops.aten.add.Tensor._schema.is_mutable) -print(torch.ops.aten.add_.Tensor._schema.is_mutable) - -###################################################################### -# By default, the environment in which you want to run the exported graph -# should support all ~2000 of these operators. -# However, you can use the following API on the exported program -# if your specific environment is only able to support a subset of -# the ~2000 operators. -# -# .. code-block:: python -# -# def run_decompositions( -# self: ExportedProgram, -# decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] -# ) -> ExportedProgram -# -# ``run_decompositions`` takes in a decomposition table, which is a mapping of -# operators to a function specifying how to reduce, or decompose, that operator -# into an equivalent sequence of other ATen operators. -# -# The default decomposition table for ``run_decompositions`` is the -# `Core ATen decomposition table `__ -# which will decompose the all ATen operators to the -# `Core ATen Operator Set `__ -# which consists of only ~180 operators. - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - -ep = export(M(), (torch.randn(2, 3),)) -print(ep.graph) - -core_ir_ep = ep.run_decompositions() -print(core_ir_ep.graph) - -###################################################################### -# Notice that after running ``run_decompositions`` the -# ``torch.ops.aten.t.default`` operator, which is not part of the Core ATen -# Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part -# of the Core ATen Opset. -# -# Most ATen operators already have decompositions, which are located -# `here `__. -# If you would like to use some of these existing decomposition functions, -# you can pass in a list of operators you would like to decompose to the -# `get_decompositions `__ -# function, which will return a decomposition table using existing -# decomposition implementations. - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - -ep = export(M(), (torch.randn(2, 3),)) -print(ep.graph) - -from torch._decomp import get_decompositions -decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) -core_ir_ep = ep.run_decompositions(decomp_table) -print(core_ir_ep.graph) - -###################################################################### -# If there is no existing decomposition function for an ATen operator that you would -# like to decompose, feel free to send a pull request into PyTorch -# implementing the decomposition! - -###################################################################### -# ExportDB -# -------- -# -# ``torch.export`` will only ever export a single computation graph from a PyTorch program. 
Because of this requirement,
-# there will be Python or PyTorch features that are not compatible with ``torch.export``, which will require users to
-# rewrite parts of their model code. We have seen examples of this earlier in the tutorial -- for example, rewriting
-# if-statements using ``cond``.
-#
-# `ExportDB `__ is the standard reference that documents
-# supported and unsupported Python/PyTorch features for ``torch.export``. It is essentially a list of program samples, each
-# of which represents the usage of one particular Python/PyTorch feature and its interaction with ``torch.export``.
-# Examples are also tagged by category so that they can be more easily searched.
-#
-# For example, let's use ExportDB to get a better understanding of how the predicate works in the ``cond`` operator.
-# We can look at the example called ``cond_predicate``, which has a ``torch.cond`` tag. The example code looks like:
-
-def cond_predicate(x):
-    """
-    The conditional statement (aka predicate) passed to ``cond()`` must be one of the following:
-      - ``torch.Tensor`` with a single element
-      - boolean expression
-    NOTE: If the `pred` is tested on a dim with batch size < 2, it will be specialized.
-    """
-    pred = x.dim() > 2 and x.shape[2] > 10
-    return cond(pred, lambda x: x.cos(), lambda y: y.sin(), [x])
-
-######################################################################
-# More generally, ExportDB can be used as a reference when one of the following occurs:
-#
-# 1. Before attempting ``torch.export``, you know ahead of time that your model uses some tricky Python/PyTorch features
-#    and you want to know if ``torch.export`` covers that feature.
-# 2. When attempting ``torch.export``, there is a failure and it's unclear how to work around it.
-#
-# ExportDB is not exhaustive, but is intended to cover all use cases found in typical PyTorch code. Feel free to reach
-# out if there is an important Python/PyTorch feature that should be added to ExportDB or supported by ``torch.export``.
-
-######################################################################
-# Running the Exported Program
-# ----------------------------
-#
-# As ``torch.export`` is only a graph capturing mechanism, calling the artifact
-# produced by ``torch.export`` eagerly will be equivalent to running the eager
-# module. To optimize the execution of the Exported Program, we can pass this
-# exported artifact to backends such as Inductor through ``torch.compile``,
-# `AOTInductor `__,
-# or `TensorRT `__.
-
-class M(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.linear = torch.nn.Linear(3, 3)
-
-    def forward(self, x):
-        x = self.linear(x)
-        return x
-
-inp = torch.randn(2, 3, device="cuda")
-m = M().to(device="cuda")
-ep = torch.export.export(m, (inp,))
-
-# Run it eagerly
-res = ep.module()(inp)
-print(res)
-
-# Run it with torch.compile
-res = torch.compile(ep.module(), backend="inductor")(inp)
-print(res)
-
-######################################################################
-# .. code-block:: python
-#
-#    import torch._export
-#    import torch._inductor
-#
-#    # Note: these APIs are subject to change
-#    # Compile the exported program to a .so using ``AOTInductor``
-#    with torch.no_grad():
-#        so_path = torch._inductor.aot_compile(ep.module(), [inp])
-#
-#    # Load and run the .so file in Python.
-# # To load and run it in a C++ environment, see: -# # https://pytorch.org/docs/main/torch.compiler_aot_inductor.html -# res = torch._export.aot_load(so_path, device="cuda")(inp) - -###################################################################### -# Conclusion -# ---------- -# -# We introduced ``torch.export``, the new PyTorch 2.X way to export single computation -# graphs from PyTorch programs. In particular, we demonstrate several code modifications -# and considerations (control flow ops, constraints, etc.) that need to be made in order to export a graph. diff --git a/intermediate_source/torchrec_tutorial.rst b/intermediate_source/torchrec_tutorial.rst deleted file mode 100644 index 6a450b1659..0000000000 --- a/intermediate_source/torchrec_tutorial.rst +++ /dev/null @@ -1,244 +0,0 @@ -Introduction to TorchRec -======================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `__. - This will allow you to experiment with the information presented below. - -Follow along with the video below or on `youtube `__. - -.. raw:: html - -
- -
-
-When building recommendation systems, we frequently want to represent
-entities like products or pages with embeddings. For example, see Meta
-AI’s `Deep learning recommendation
-model `__, or DLRM. As the number of
-entities grows, the size of the embedding tables can exceed a single
-GPU’s memory. A common practice is to shard the embedding table across
-devices, a type of model parallelism. To that end, TorchRec introduces
-its primary API
-called |DistributedModelParallel|_,
-or DMP. Like PyTorch’s DistributedDataParallel, DMP wraps a model to
-enable distributed training.
-
-Installation
-------------
-
-Requirements: python >= 3.7
-
-We highly recommend CUDA when using TorchRec (if using CUDA: cuda >= 11.0).
-
-
-.. code:: shell
-
-   # install pytorch with cudatoolkit 11.3
-   conda install pytorch cudatoolkit=11.3 -c pytorch-nightly -y
-   # install TorchRec
-   pip3 install torchrec-nightly
-
-
-Overview
---------
-
-This tutorial will cover three pieces of TorchRec: the ``nn.Module`` |EmbeddingBagCollection|_, the |DistributedModelParallel|_ API, and
-the data structure |KeyedJaggedTensor|_.
-
-
-Distributed Setup
-~~~~~~~~~~~~~~~~~
-
-We set up our environment with torch.distributed. For more info on
-distributed training, see this
-`tutorial `__.
-
-Here, we use one rank (the colab process) corresponding to our single colab
-GPU.
-
-.. code:: python
-
-   import os
-   import torch
-   import torchrec
-   import torch.distributed as dist
-
-   os.environ["RANK"] = "0"
-   os.environ["WORLD_SIZE"] = "1"
-   os.environ["MASTER_ADDR"] = "localhost"
-   os.environ["MASTER_PORT"] = "29500"
-
-   # Note - you will need a V100 or A100 to run this tutorial as is!
-   # If using an older GPU (such as the colab free K80),
-   # you will need to compile fbgemm with the appropriate CUDA architecture
-   # or run with the "gloo" backend on CPUs
-   dist.init_process_group(backend="nccl")
-
-
-From EmbeddingBag to EmbeddingBagCollection
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PyTorch represents embeddings through |torch.nn.Embedding|_ and |torch.nn.EmbeddingBag|_.
-EmbeddingBag is a pooled version of Embedding.
-
-TorchRec extends these modules by creating collections of embeddings. We
-will use |EmbeddingBagCollection|_ to represent a group of EmbeddingBags.
-
-Here, we create an EmbeddingBagCollection (EBC) with two embedding bags.
-Each table, ``product_table`` and ``user_table``, holds 4096 embeddings
-of dimension 64. Note how we initially allocate the EBC
-on device “meta”. This tells the EBC not to allocate memory yet.
-
-.. code:: python
-
-   ebc = torchrec.EmbeddingBagCollection(
-       device="meta",
-       tables=[
-           torchrec.EmbeddingBagConfig(
-               name="product_table",
-               embedding_dim=64,
-               num_embeddings=4096,
-               feature_names=["product"],
-               pooling=torchrec.PoolingType.SUM,
-           ),
-           torchrec.EmbeddingBagConfig(
-               name="user_table",
-               embedding_dim=64,
-               num_embeddings=4096,
-               feature_names=["user"],
-               pooling=torchrec.PoolingType.SUM,
-           )
-       ]
-   )
-
-
-DistributedModelParallel
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-Now, we’re ready to wrap our model with |DistributedModelParallel|_ (DMP). Instantiating DMP will:
-
-1. Decide how to shard the model. DMP will collect the available
-   ‘sharders’ and come up with a ‘plan’ of the optimal way to shard the
-   embedding table(s) (i.e., the EmbeddingBagCollection).
-2. Actually shard the model. This includes allocating memory for each
-   embedding table on the appropriate device(s).
-
-In this toy example, since we have two EmbeddingTables and one GPU,
-TorchRec will place both on the single GPU.
-
-.. 
code:: python - - model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda")) - print(model) - print(model.plan) - - -Query vanilla nn.EmbeddingBag with input and offsets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We query |nn.Embedding|_ and |nn.EmbeddingBag|_ -with ``input`` and ``offsets``. Input is a 1-D tensor containing the -lookup values. Offsets is a 1-D tensor where the sequence is a -cumulative sum of the number of values to pool per example. - -Let’s look at an example, recreating the product EmbeddingBag above: - -:: - - |------------| - | product ID | - |------------| - | [101, 202] | - | [] | - | [303] | - |------------| - -.. code:: python - - product_eb = torch.nn.EmbeddingBag(4096, 64) - product_eb(input=torch.tensor([101, 202, 303]), offsets=torch.tensor([0, 2, 2])) - - -Representing minibatches with KeyedJaggedTensor -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We need an efficient representation of multiple examples of an arbitrary -number of entity IDs per feature per example. In order to enable this -“jagged” representation, we use the TorchRec datastructure -|KeyedJaggedTensor|_ (KJT). - -Let’s take a look at how to lookup a collection of two embedding -bags, “product” and “user”. Assume the minibatch is made up of three -examples for three users. The first of which has two product IDs, the -second with none, and the third with one product ID. - -:: - - |------------|------------| - | product ID | user ID | - |------------|------------| - | [101, 202] | [404] | - | [] | [505] | - | [303] | [606] | - |------------|------------| - -The query should be: - -.. code:: python - - mb = torchrec.KeyedJaggedTensor( - keys = ["product", "user"], - values = torch.tensor([101, 202, 303, 404, 505, 606]).cuda(), - lengths = torch.tensor([2, 0, 1, 1, 1, 1], dtype=torch.int64).cuda(), - ) - - print(mb.to(torch.device("cpu"))) - - -Note that the KJT batch size is -``batch_size = len(lengths)//len(keys)``. In the above example, -batch_size is 3. - - - -Putting it all together, querying our distributed model with a KJT minibatch -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, we can query our model using our minibatch of products and -users. - -The resulting lookup will contain a KeyedTensor, where each key (or -feature) contains a 2D tensor of size 3x64 (batch_size x embedding_dim). - -.. code:: python - - pooled_embeddings = model(mb) - print(pooled_embeddings) - - -More resources --------------- - -For more information, please see our -`dlrm `__ -example, which includes multinode training on the criteo terabyte -dataset, using Meta’s `DLRM `__. - - -.. |DistributedModelParallel| replace:: ``DistributedModelParallel`` -.. _DistributedModelParallel: https://pytorch.org/torchrec/torchrec.distributed.html#torchrec.distributed.model_parallel.DistributedModelParallel -.. |EmbeddingBagCollection| replace:: ``EmbeddingBagCollection`` -.. _EmbeddingBagCollection: https://pytorch.org/torchrec/torchrec.modules.html#torchrec.modules.embedding_modules.EmbeddingBagCollection -.. |KeyedJaggedTensor| replace:: ``KeyedJaggedTensor`` -.. _KeyedJaggedTensor: https://pytorch.org/torchrec/torchrec.sparse.html#torchrec.sparse.jagged_tensor.JaggedTensor -.. |torch.nn.Embedding| replace:: ``torch.nn.Embedding`` -.. _torch.nn.Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html -.. |torch.nn.EmbeddingBag| replace:: ``torch.nn.EmbeddingBag`` -.. 
_torch.nn.EmbeddingBag: https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html -.. |nn.Embedding| replace:: ``nn.Embedding`` -.. _nn.Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html -.. |nn.EmbeddingBag| replace:: ``nn.EmbeddingBag`` -.. _nn.EmbeddingBag: https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html diff --git a/intermediate_source/torchserve_with_ipex.rst b/intermediate_source/torchserve_with_ipex.rst deleted file mode 100644 index 1a11b4180f..0000000000 --- a/intermediate_source/torchserve_with_ipex.rst +++ /dev/null @@ -1,394 +0,0 @@ -Grokking PyTorch Intel CPU performance from first principles -============================================================ - -A case study on the TorchServe inference framework optimized with `Intel® Extension for PyTorch* `_. - -Authors: Min Jean Cho, Mark Saroufim - -Reviewers: Ashok Emani, Jiong Gong - -Getting a strong out-of-box performance for deep learning on CPUs can be tricky but it’s much easier if you’re aware of the main problems that affect performance, how to measure them and how to solve them. - -TL;DR - -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ -| Problem | How to measure it | Solution | -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ -| Bottlenecked GEMM execution units | - `Imbalance or Serial Spinning `_ | Avoid using logical cores by setting thread affinity to physical cores via core pinning | -| | - `Front-End Bound `_ | | -| | - `Core Bound `_ | | -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ -| Non Uniform Memory Access (NUMA) | - Local vs. remote memory access | Avoid cross-socket computation by setting thread affinity to a specific socket via core pinning | -| | - `UPI Utilization `_ | | -| | - Latency in memory accesses | | -| | - Thread migration | | -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ - -*GEMM (General Matrix Multiply)* run on fused-multiply-add (FMA) or dot-product (DP) execution units which will be bottlenecked and cause delays in thread waiting/*spinning at synchronization* barrier when *hyperthreading* is enabled - because using logical cores causes insufficient concurrency for all working threads as each logical thread *contends for the same core resources*. Instead, if we use 1 thread per physical core, we avoid this contention. 
So we generally recommend *avoiding logical cores* by setting CPU *thread affinity* to physical cores via *core pinning*.
-
-Multi-socket systems have *Non-Uniform Memory Access (NUMA)*, which is a shared memory architecture that describes the placement of main memory modules with respect to processors. But if a process is not NUMA-aware, slow *remote memory* is frequently accessed when *threads migrate* across sockets via the *Intel Ultra Path Interconnect (UPI)* during run time. We address this problem by setting CPU *thread affinity* to a specific socket via *core pinning*.
-
-With these principles in mind, proper CPU runtime configuration can significantly boost out-of-the-box performance.
-
-In this blog, we'll walk you through the important runtime configurations you should be aware of from the `CPU Performance Tuning Guide `_, explain how they work, how to profile them, and how to integrate them within a model serving framework like `TorchServe `_ via an easy-to-use `launch script `_ which we’ve `integrated `_ :superscript:`1` natively.
-
-We’ll explain all of these ideas :strong:`visually` from :strong:`first principles` with lots of :strong:`profiles` and show you how we applied our learnings to improve out-of-the-box CPU performance on TorchServe.
-
-1. The feature has to be explicitly enabled by setting *cpu_launcher_enable=true* in *config.properties*.
-
-Avoid logical cores for deep learning
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Avoiding logical cores for deep learning workloads generally improves performance. To understand this, let us take a step back to GEMM.
-
-:strong:`Optimizing GEMM optimizes deep learning`
-
-The majority of time in deep learning training or inference is spent on millions of repeated GEMM operations, which are at the core of fully connected layers. Fully connected layers have been used for decades since multi-layer perceptrons (MLP) `proved to be a universal approximator of any continuous function `_. Any MLP can be entirely represented as GEMM. And even a convolution can be represented as a GEMM by using a `Toeplitz matrix `_.
-
-Returning to the original topic, most GEMM operators benefit from avoiding hyperthreading, because the majority of time in deep learning training or inference is spent on millions of repeated GEMM operations running on fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthreading cores. With hyperthreading enabled, OpenMP threads will contend for the same GEMM execution units.
-
-.. figure:: /_static/img/torchserve-ipex-images/1_.png
-   :width: 70%
-   :align: center
-
-And if 2 logical threads run GEMM at the same time, they will be sharing the same core resources, causing Front-End Bound stalls, such that the overhead from this Front-End Bound is greater than the gain from running both logical threads at the same time.
-
-Therefore, we generally recommend avoiding logical cores for deep learning workloads to achieve good performance. The launch script by default uses physical cores only; however, users can easily experiment with logical vs. physical cores by simply toggling the ``--use_logical_core`` launch script knob.
-
-:strong:`Exercise`
-
-We'll use the following example of feeding ResNet50 a dummy tensor:
-
-.. 
code:: python - - import torch - import torchvision.models as models - import time - - model = models.resnet50(pretrained=False) - model.eval() - data = torch.rand(1, 3, 224, 224) - - # warm up - for _ in range(100): - model(data) - - start = time.time() - for _ in range(100): - model(data) - end = time.time() - print('Inference took {:.2f} ms in average'.format((end-start)/100*1000)) - -Throughout the blog, we'll use `Intel® VTune™ Profiler `_ to profile and verify optimizations. And we'll run all exercises on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. The CPU information is shown in Figure 2.1. - -Environment variable ``OMP_NUM_THREADS`` is used to set the number of threads for parallel region. We'll compare ``OMP_NUM_THREADS=2`` with (1) use of logical cores and (2) use of physical cores only. - -(1) Both OpenMP threads trying to utilize the same GEMM execution units shared by hyperthreading cores (0, 56) - -We can visualize this by running ``htop`` command on Linux as shown below. - -.. figure:: /_static/img/torchserve-ipex-images/2.png - :width: 100% - :align: center - - -.. figure:: /_static/img/torchserve-ipex-images/3.png - :width: 100% - :align: center - -We notice that the Spin Time is flagged, and Imbalance or Serial Spinning contributed to the majority of it - 4.980 seconds out of the 8.982 seconds total. The Imbalance or Serial Spinning when using logical cores is due to insufficient concurrency of working threads as each logical thread contends for the same core resources. - -The Top Hotspots section of the execution summary indicates that ``__kmp_fork_barrier`` took 4.589 seconds of CPU time - during 9.33% of the CPU execution time, threads were just spinning at this barrier due to thread synchronization. - -(2) Each OpenMP thread utilizing GEMM execution units in respective physical cores (0,1) - - -.. figure:: /_static/img/torchserve-ipex-images/4.png - :width: 80% - :align: center - - -.. figure:: /_static/img/torchserve-ipex-images/5.png - :width: 80% - :align: center - -We first note that the execution time dropped from 32 seconds to 23 seconds by avoiding logical cores. While there's still some non-negligible Imbalance or Serial Spinning, we note relative improvement from 4.980 seconds to 3.887 seconds. - -By not using logical threads (instead, using 1 thread per physical core), we avoid logical threads contending for the same core resources. The Top Hotspots section also indicates relative improvement of ``__kmp_fork_barrier`` time from 4.589 seconds to 3.530 seconds. - -Local memory access is always faster than remote memory access -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We generally recommend binding a process to a local socket such that the process does not migrate across sockets. Generally the goal of doing so is to utilize high speed cache on local memory and to avoid remote memory access which can be ~2x slower. - - -.. figure:: /_static/img/torchserve-ipex-images/6.png - :width: 80% - :align: center -Figure 1. Two-socket configuration - -Figure 1. shows a typical two-socket configuration. Notice that each socket has its own local memory. Sockets are connected to each other via Intel Ultra Path Interconnect (UPI) which allows each socket to access the local memory of another socket called remote memory. Local memory access is always faster than remote memory access. - -.. figure:: /_static/img/torchserve-ipex-images/7.png - :width: 50% - :align: center -Figure 2.1. 
CPU information - -Users can get their CPU information by running ``lscpu`` command on their Linux machine. Figure 2.1. shows an example of ``lscpu`` execution on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. Notice that there are 28 cores per socket, and 2 threads per core (i.e., hyperthreading is enabled). In other words, there are 28 logical cores in addition to 28 physical cores, giving a total of 56 cores per socket. And there are 2 sockets, giving a total of 112 cores (``Thread(s) per core`` x ``Core(s) per socket`` x ``Socket(s)``). - -.. figure:: /_static/img/torchserve-ipex-images/8.png - :width: 100% - :align: center -Figure 2.2. CPU information - -The 2 sockets are mapped to 2 NUMA nodes (NUMA node 0, NUMA node 1) respectively. Physical cores are indexed prior to logical cores. As shown in Figure 2.2., the first 28 physical cores (0-27) and the first 28 logical cores (56-83) on the first socket are on NUMA node 0. And the second 28 physical cores (28-55) and the second 28 logical cores (84-111) on the second socket are on NUMA node 1. Cores on the same socket share local memory and last level cache (LLC) which is much faster than cross-socket communication via Intel UPI. - -Now that we understand NUMA, cross-socket (UPI) traffic, local vs. remote memory access in multi-processor systems, let's profile and verify our understanding. - -:strong:`Exercise` - -We'll reuse the ResNet50 example above. - -As we did not pin threads to processor cores of a specific socket, the operating system periodically schedules threads on processor cores located in different sockets. - -.. figure:: /_static/img/torchserve-ipex-images/9.gif - :width: 100% - :align: center - -Figure 3. CPU usage of non NUMA-aware application. 1 main worker thread was launched, then it launched a physical core number (56) of threads on all cores, including logical cores. - -(Aside: If the number of threads is not set by `torch.set_num_threads `_, the default number of threads is the number of physical cores in a hyperthreading enabled system. This can be verified by `torch.get_num_threads `_. Hence we see above about half of the cores busy running the example script.) - -.. figure:: /_static/img/torchserve-ipex-images/10.png - :width: 100% - :align: center -Figure 4. Non-Uniform Memory Access Analysis graph - - -Figure 4. compares local vs. remote memory access over time. We verify usage of remote memory which could result in sub-optimal performance. - -:strong:`Set thread affinity to reduce remote memory access and cross-socket (UPI) traffic` - -Pinning threads to cores on the same socket helps maintain locality of memory access. In this example, we'll pin to the physical cores on the first NUMA node (0-27). With the launch script, users can easily experiment with NUMA nodes configuration by simply toggling the ``--node_id`` launch script knob. - -Let's visualize the CPU usage now. - -.. figure:: /_static/img/torchserve-ipex-images/11.gif - :width: 100% - :align: center -Figure 5. CPU usage of NUMA-aware application - -1 main worker thread was launched, then it launched threads on all physical cores on the first numa node. - -.. figure:: /_static/img/torchserve-ipex-images/12.png - :width: 100% - :align: center -Figure 6. Non-Uniform Memory Access Analysis graph - -As shown in Figure 6., now almost all memory accesses are local accesses. 
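-
-To make the idea concrete, the core pinning performed here via the ``--node_id`` knob can also be sketched directly in Python. This is only a minimal illustration, assuming the topology in Figure 2.2 where physical cores 0-27 belong to the first NUMA node; the launch script remains the recommended approach:
-
-.. code:: python
-
-    import os
-    import torch
-
-    # Pin this process (and the threads it spawns) to the physical cores
-    # of the first NUMA node, i.e. cores 0-27 on this machine
-    os.sched_setaffinity(0, set(range(0, 28)))
-
-    # Use one OpenMP thread per pinned physical core
-    torch.set_num_threads(28)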
- -Efficient CPU usage with core pinning for multi-worker inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When running multi-worker inference, cores are overlapped (or shared) between workers causing inefficient CPU usage. To address this problem, the launch script equally divides the number of available cores by the number of workers such that each worker is pinned to assigned cores during runtime. - -:strong:`Exercise with TorchServe` - -For this exercise, let's apply the CPU performance tuning principles and recommendations that we have discussed so far to `TorchServe apache-bench benchmarking `_. - -We'll use ResNet50 with 4 workers, concurrency 100, requests 10,000. All other parameters (e.g., batch_size, input, etc) are the same as the `default parameters `_. - -We'll compare the following three configurations: - -(1) default TorchServe setting (no core pinning) - -(2) `torch.set_num_threads `_ = ``number of physical cores / number of workers`` (no core pinning) - -(3) core pinning via the launch script (Required Torchserve>=0.6.1) - -After this exercise, we'll have verified that we prefer avoiding logical cores and prefer local memory access via core pinning with a real TorchServe use case. - -1. Default TorchServe setting (no core pinning) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The `base_handler `_ doesn't explicitly set `torch.set_num_threads `_. Hence the default number of threads is the number of physical CPU cores as described `here `_. Users can check the number of threads by `torch.get_num_threads `_ in the base_handler. Each of the 4 main worker threads launches a physical core number (56) of threads, launching a total of 56x4 = 224 threads, which is more than the total number of cores 112. Therefore cores are guaranteed to be heavily overlapped with high logical core utilization- multiple workers using multiple cores at the same time. Furthermore, because threads are not affinitized to specific CPU cores, the operating system periodically schedules threads to cores located in different sockets. - -1. CPU usage - -.. figure:: /_static/img/torchserve-ipex-images/13.png - :width: 100% - :align: center - -4 main worker threads were launched, then each launched a physical core number (56) of threads on all cores, including logical cores. - -2. Core Bound stalls - -.. figure:: /_static/img/torchserve-ipex-images/14.png - :width: 80% - :align: center - -We observe a very high Core Bound stall of 88.4%, decreasing pipeline efficiency. Core Bound stalls indicate sub-optimal use of available execution units in the CPU. For example, several GEMM instructions in a row competing for fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthreading cores could cause Core Bound stalls. And as described in the previous section, use of logical cores amplifies this problem. - - -.. figure:: /_static/img/torchserve-ipex-images/15.png - :width: 40% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images/16.png - :width: 50% - :align: center - -An empty pipeline slot not filled with micro-ops (uOps) is attributed to a stall. For example, without core pinning CPU usage may not effectively be on compute but on other operations like thread scheduling from Linux kernel. We see above that ``__sched_yield`` contributed to the majority of the Spin Time. - -3. Thread Migration - -Without core pinning, scheduler may migrate thread executing on a core to a different core. 
Thread migration can disassociate the thread from data that has already been fetched into the caches, resulting in longer data access latencies. This problem is exacerbated in NUMA systems when a thread migrates across sockets. Data that had been fetched into the high speed cache on local memory now has to be fetched from remote memory, which is much slower.
-
-.. figure:: /_static/img/torchserve-ipex-images/17.png
-   :width: 50%
-   :align: center
-
-Generally, the total number of threads should be less than or equal to the total number of threads supported by the core. In the above example, we notice a large number of threads executing on core_51 instead of the expected 2 threads (since hyperthreading is enabled in Intel(R) Xeon(R) Platinum 8180 CPUs). This indicates thread migration.
-
-.. figure:: /_static/img/torchserve-ipex-images/18.png
-   :width: 80%
-   :align: center
-
-Additionally, notice that thread (TID:97097) was executing on a large number of CPU cores, indicating CPU migration. For example, this thread was executing on cpu_81, then migrated to cpu_14, then migrated to cpu_5, and so on. Furthermore, note that this thread migrated cross socket back and forth many times, resulting in very inefficient memory access. For example, this thread executed on cpu_70 (NUMA node 0), then migrated to cpu_100 (NUMA node 1), then migrated to cpu_24 (NUMA node 0).
-
-4. Non Uniform Memory Access Analysis
-
-.. figure:: /_static/img/torchserve-ipex-images/19.png
-   :width: 100%
-   :align: center
-
-Compare local vs. remote memory access over time. We observe that about half, 51.09%, of the memory accesses were remote accesses, indicating sub-optimal NUMA configuration.
-
-2. torch.set_num_threads = ``number of physical cores / number of workers`` (no core pinning)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-For an apples-to-apples comparison with the launcher's core pinning, we'll set the number of threads to the number of cores divided by the number of workers (the launcher does this internally). Add the following code snippet to the `base_handler `_ (note the integer division, since ``torch.set_num_threads`` expects an integer):
-
-.. code:: python
-
-   torch.set_num_threads(num_physical_cores // num_workers)
-
-As before, without core pinning these threads are not affinitized to specific CPU cores, causing the operating system to periodically schedule threads on cores located in different sockets.
-
-1. CPU usage
-
-.. figure:: /_static/img/torchserve-ipex-images/20.gif
-   :width: 100%
-   :align: center
-
-4 main worker threads were launched, then each launched a ``num_physical_cores/num_workers`` number (14) of threads on all cores, including logical cores.
-
-2. Core Bound stalls
-
-.. figure:: /_static/img/torchserve-ipex-images/21.png
-   :width: 80%
-   :align: center
-
-Although the percentage of Core Bound stalls has decreased from 88.4% to 73.5%, the Core Bound is still very high.
-
-.. figure:: /_static/img/torchserve-ipex-images/22.png
-   :width: 40%
-   :align: center
-
-.. figure:: /_static/img/torchserve-ipex-images/23.png
-   :width: 50%
-   :align: center
-
-3. Thread Migration
-
-.. figure:: /_static/img/torchserve-ipex-images/24.png
-   :width: 75%
-   :align: center
-
-Similar to before, without core pinning, thread (TID:94290) was executing on a large number of CPU cores, indicating CPU migration. We notice again cross-socket thread migration, resulting in very inefficient memory access. For example, this thread executed on cpu_78 (NUMA node 0), then migrated to cpu_108 (NUMA node 1).
-
-4. Non Uniform Memory Access Analysis
-
-.. 
figure:: /_static/img/torchserve-ipex-images/25.png - :width: 100% - :align: center - -Although an improvement from the original 51.09%, still 40.45% of memory access is remote, indicating sub-optimal NUMA configuration. - -3. launcher core pinning -~~~~~~~~~~~~~~~~~~~~~~~~ -Launcher will internally equally distribute physical cores to workers, and bind them to each worker. As a reminder, launcher by default uses physical cores only. In this example, launcher will bind worker 0 to cores 0-13 (NUMA node 0), worker 1 to cores 14-27 (NUMA node 0), worker 2 to cores 28-41 (NUMA node 1), and worker 3 to cores 42-55 (NUMA node 1). Doing so ensures that cores are not overlapped among workers and avoids logical core usage. - -1. CPU usage - -.. figure:: /_static/img/torchserve-ipex-images/26.gif - :width: 100% - :align: center - -4 main worker threads were launched, then each launched a ``num_physical_cores/num_workers`` number (14) of threads affinitized to the assigned physical cores. - -2. Core Bound stalls - -.. figure:: /_static/img/torchserve-ipex-images/27.png - :width: 80% - :align: center - -Core Bound stalls has decreased significantly from the original 88.4% to 46.2% - almost a 2x improvement. - -.. figure:: /_static/img/torchserve-ipex-images/28.png - :width: 40% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images/29.png - :width: 50% - :align: center - -We verify that with core binding, most CPU time is effectively used on compute - Spin Time of 0.256s. - -3. Thread Migration - -.. figure:: /_static/img/torchserve-ipex-images/30.png - :width: 100% - :align: center - -We verify that `OMP Primary Thread #0` was bound to assigned physical cores (42-55), and did not migrate cross-socket. - -4. Non Uniform Memory Access Analysis - -.. figure:: /_static/img/torchserve-ipex-images/31.png - :width: 100% - :align: center - -Now almost all, 89.52%, memory accesses are local accesses. - -Conclusion -~~~~~~~~~~ - -In this blog, we've showcased that properly setting your CPU runtime configuration can significantly boost out-of-box CPU performance. - -We have walked through some general CPU performance tuning principles and recommendations: - -- In a hyperthreading enabled system, avoid logical cores by setting thread affinity to physical cores only via core pinning. -- In a multi-socket system with NUMA, avoid cross-socket remote memory access by setting thread affinity to a specific socket via core pinning. - -We have visually explained these ideas from first principles and have verified the performance boost with profiling. And finally, we have applied all of our learnings to TorchServe to boost out-of-box TorchServe CPU performance. - -These principles can be automatically configured via an easy to use launch script which has already been integrated into TorchServe. - -For interested readers, please check out the following documents: - -- `CPU specific optimizations `_ -- `Maximize Performance of Intel® Software Optimization for PyTorch* on CPU `_ -- `Performance Tuning Guide `_ -- `Launch Script Usage Guide `_ -- `Top-down Microarchitecture Analysis Method `_ -- `Configuring oneDNN for Benchmarking `_ -- `Intel® VTune™ Profiler `_ -- `Intel® VTune™ Profiler User Guide `_ - -And stay tuned for a follow-up posts on optimized kernels on CPU via `Intel® Extension for PyTorch* `_ and advanced launcher configurations such as memory allocator. 
- -Acknowledgement -~~~~~~~~~~~~~~~ - -We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this blog. We would also like to thank Hamid Shojanazeri (Meta), Li Ning (AWS) and Jing Xu (Intel) for helpful feedback in code review. And Suraj Subramanian (Meta) and Geeta Chauhan (Meta) for helpful feedback on the blog. diff --git a/intermediate_source/torchserve_with_ipex_2.rst b/intermediate_source/torchserve_with_ipex_2.rst deleted file mode 100644 index 64f3db6b27..0000000000 --- a/intermediate_source/torchserve_with_ipex_2.rst +++ /dev/null @@ -1,447 +0,0 @@ -Grokking PyTorch Intel CPU performance from first principles (Part 2) -===================================================================== - -Authors: `Min Jean Cho `_, `Jing Xu `_, `Mark Saroufim `_ - -In the `Grokking PyTorch Intel CPU Performance From First Principles `_ tutorial -, we have introduced how to tune CPU runtime configurations, how to profile them, and how to integrate them into `TorchServe `_ for optimized CPU performance. - -In this tutorial, we will demonstrate boosting performance with memory allocator via the `Intel® Extension for PyTorch* Launcher `_ -, and optimized kernels on CPU via `Intel® Extension for PyTorch* `_ -, and apply them to TorchServe showcasing 7.71x throughput speedup for ResNet50 and 2.20x throughput speedup for BERT. - -.. figure:: /_static/img/torchserve-ipex-images-2/1.png - :width: 100% - :align: center - -Prerequisites -------------- -Throughout this tutorial, we will use `Top-down Microarchitecture Analysis (TMA) `_ to profile and show that the Back End Bound (Memory Bound, Core Bound) is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound. We will use `toplev `_, a tool part of `pmu-tools `_ built on top of `Linux perf `_, for TMA. - -We will also use `Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) `__ to profile at finer granularity. - -Top-down Microarchitecture Analysis Method (TMA) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When tuning CPU for optimal performance, it's useful to know where the bottleneck is. Most CPU cores have on-chip Performance Monitoring Units (PMUs). PMUs are dedicated pieces of logic within a CPU core that count specific hardware events as they occur on the system. Examples of these events may be Cache Misses or Branch Mispredictions. PMUs are used for Top-down Microarchitecture Analysis (TMA) to identify the bottlenecks. TMA consists of hierarchical levels as shown: - -.. figure:: /_static/img/torchserve-ipex-images-2/2.png - :width: 100% - :align: center - -The top level, level-1, metrics collect *Retiring*, *Bad Speculation*, *Front End Bound*, *Back End Bound*. The pipeline of CPU can conceptually be simplified and divided into two: the frontend and the backend. The *frontend* is responsible for fetching the program code and decoding them into low-level hardware operations called micro-ops (uOps). The uOps are then fed to the *backend* in a process called allocation. Once allocated, the backend is responsible for executing the uOp in an available execution unit. A completion of uOp's execution is called *retirement*. In contrast, a *bad speculation* is when speculatively fetched uOps are canceled before retiring such as in the case of mispredicted branches. 
Each of these metrics can further be broken down in the subsequent levels to pinpoint the bottleneck. - -Tune for the Back End Bound -+++++++++++++++++++++++++++ -The majority of untuned deep learning workloads will be Back End Bound. Resolving Back End bound is often resolving sources of latency causing retirement to take longer than necessary. As shown above, Back End Bound has two sub-metrics – Core Bound and Memory Bound. - -Memory Bound stalls have causes related to the memory subsystem. For example, last-level cache (LLC or L3 cache) miss causing access to DRAM. Scaling deep learning models often requires significant compute. And high compute utilization requires that data is available when the execution units need it to execute the uOps. This requires prefetching the data and reusing the data in cache instead of fetching that same data multiple times from main memory which causes execution units to be starved while data is being returned. Throughout this tutorial, we wll show that a more efficient memory allocator, operator fusion, memory layout format optimization reduce overhead on Memory Bound with better cache locality. - -Core Bound stalls indicate sub-optimal use of available execution units while there are no uncompleted memory accesses. For example, several general matrix-matrix multiplication (GEMM) instructions in a row competing for fused-multiply-add (FMA) or dot-product (DP) execution units could cause Core Bound stalls. Key deep learning kernels, including the DP kernels, have been well optimized by `oneDNN library `_ (oneAPI Deep Neural Network Library), reducing overhead on Core Bound. - -Operations like GEMM, convolution, deconvolution are compute-intensive. While operations like pooling, batch normalization, activation functions like ReLU are memory-bound. - -Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ITT APIs of Intel® VTune Profiler is a useful tool to annotate a region of your workload for tracing to profile and visualize at a finer granularity of your annotation – OP/function/sub-function granularity. By annotating at the granularity of your PyTorch model's OPs, Intel® VTune Profiler's ITT enables op-level profiling. Intel® VTune Profiler's ITT has been integrated into `PyTorch Autograd Profiler `_. :superscript:`1` - -1. The feature has to be explicitly enabled by *with torch.autograd.profiler.emit_itt()*. - -TorchServe with Intel® Extension for PyTorch* ---------------------------------------------- -`Intel® Extension for PyTorch* `__ is a Python package to extend PyTorch with optimizations for extra performance boost on Intel hardware. - -Intel® Extension for PyTorch* has already been integrated into TorchServe to improve the performance out-of-box. :superscript:`2` For custom handler scripts, we recommend adding the *intel_extension_for_pytorch* package in. - -2. The feature has to be explicitly enabled by setting *ipex_enable=true* in *config.properties*. - -Throughout this section, we will show that Back End Bound is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound, which has two submetrics - Memory Bound, and Core Bound. A more efficient memory allocator, operator fusion, memory layout format optimization improve Memory Bound. 
Ideally, Memory Bound can be improved to Core Bound by optimized operators and better cache locality. And key deep learning primitives, such as convolution, matrix multiplication, dot-product, have been well optimized by Intel® Extension for PyTorch* and oneDNN library, improving Core Bound. - -Leveraging Advanced Launcher Configuration: Memory Allocator -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Memory allocator plays an important role from performance perspective. A more efficient memory usage reduces overhead on unnecessary memory allocations or destructions, and thus faster execution. For deep learning workloads in practice, especially those running on large multi-core systems or servers like TorchServe, TCMalloc, or JeMalloc can generally get better memory usage than the default PyTorch memory allocator, PTMalloc. - -TCMalloc, JeMalloc, PTMalloc -++++++++++++++++++++++++++++ -Both TCMalloc and JeMalloc use thread-local caches to reduce overhead on thread synchronization, and lock contention by using spinlocks and per-thread arenas respectively. TCMalloc and JeMalloc reduce overhead on unnecessary memory allocation and deallocation. Both allocators categorize memory allocations by sizes to reduce overhead on memory fragmentation. - -With the launcher, users can easily experiment with different memory allocators by choosing one of the three launcher knobs *--enable_tcmalloc* (TCMalloc), *--enable_jemalloc* (JeMalloc), *--use_default_allocator* (PTMalloc). - -Exercise -^^^^^^^^ -Let's profile PTMalloc vs. JeMalloc. - -We will use the launcher to designate the memory allocator, and to bind the workload to physical cores of the first socket to avoid any NUMA complication – to profile the effect of memory allocator only. - -The following example measures the average inference time of ResNet50: - -.. code:: python - - import torch - import torchvision.models as models - import time - - model = models.resnet50(pretrained=False) - model.eval() - batch_size = 32 - data = torch.rand(batch_size, 3, 224, 224) - - # warm up - for _ in range(100): - model(data) - - # measure - # Intel® VTune Profiler's ITT context manager - with torch.autograd.profiler.emit_itt(): - start = time.time() - for i in range(100): - # Intel® VTune Profiler's ITT to annotate each step - torch.profiler.itt.range_push('step_{}'.format(i)) - model(data) - torch.profiler.itt.range_pop() - end = time.time() - - print('Inference took {:.2f} ms in average'.format((end-start)/100*1000)) - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/3.png - :width: 100% - :align: center - -Level-1 TMA shows that both PTMalloc and JeMalloc are bounded by the backend. More than half of the execution time was stalled by the backend. Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/4.png - :width: 100% - :align: center - -Level-2 TMA shows that the Back End Bound was caused by Memory Bound. Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/5.png - :width: 100% - :align: center - -Most of the metrics under the Memory Bound identify which level of the memory hierarchy from the L1 cache to main memory is the bottleneck. A hotspot bounded at a given level indicates that most of the data was being retrieved from that cache or memory-level. Optimizations should focus on moving data closer to the core. Level-3 TMA shows that PTMalloc was bottlenecked by DRAM Bound. 
On the other hand, JeMalloc was bottlenecked by L1 Bound – JeMalloc moved data closer to the core, and thus faster execution. - -Let's look at Intel® VTune Profiler ITT trace. In the example script, we have annotated each *step_x* of the inference loop. - -.. figure:: /_static/img/torchserve-ipex-images-2/6.png - :width: 100% - :align: center - -Each step is traced in the timeline graph. The duration of model inference on the last step (step_99) decreased from 304.308 ms to 261.843 ms. - -Exercise with TorchServe -^^^^^^^^^^^^^^^^^^^^^^^^ -Let's profile PTMalloc vs. JeMalloc with TorchServe. - -We will use `TorchServe apache-bench benchmarking `_ with ResNet50 FP32, batch size 32, concurrency 32, requests 8960. All other parameters are the same as the `default parameters `_. - -As in the previous exercise, we will use the launcher to designate the memory allocator, and to bind the workload to physical cores of the first socket. To do so, user simply needs to add a few lines in `config.properties `__: - -PTMalloc - -.. code:: python - - cpu_launcher_enable=true - cpu_launcher_args=--node_id 0 --use_default_allocator - -JeMalloc - -.. code:: python - - cpu_launcher_enable=true - cpu_launcher_args=--node_id 0 --enable_jemalloc - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/7.png - :width: 100% - :align: center - -Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/8.png - :width: 100% - :align: center - -Let's use Intel® VTune Profiler ITT to annotate `TorchServe inference scope `_ to profile at inference-level granularity. As `TorchServe Architecture `_ consists of several sub-components, including the Java frontend for handling request/response, and the Python backend for running the actual inference on the models, it is helpful to use Intel® VTune Profiler ITT to limit the collection of trace data at inference-level. - -.. figure:: /_static/img/torchserve-ipex-images-2/9.png - :width: 100% - :align: center - -Each inference call is traced in the timeline graph. The duration of the last model inference decreased from 561.688 ms to 251.287 ms - 2.2x speedup. - -.. figure:: /_static/img/torchserve-ipex-images-2/10.png - :width: 100% - :align: center - -The timeline graph can be expanded to see op-level profiling results. The duration of *aten::conv2d* decreased from 16.401 ms to 6.392 ms - 2.6x speedup. - -In this section, we have demonstrated that JeMalloc can give better performance than the default PyTorch memory allocator, PTMalloc, with efficient thread-local caches improving Back-End-Bound. 
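For completeness, the standalone allocator comparison earlier in this section can be reproduced by running the benchmark script through the launcher with the allocator knobs introduced above. This is a sketch only: ``resnet50_benchmark.py`` is a placeholder, and newer Intel® Extension for PyTorch* releases may expose the same knobs through an ``ipexrun`` entry point instead of the module invocation shown here.

.. code:: bash

    # PTMalloc, bound to the physical cores of the first socket
    python -m intel_extension_for_pytorch.cpu.launch --node_id 0 --use_default_allocator resnet50_benchmark.py

    # JeMalloc, same core binding
    python -m intel_extension_for_pytorch.cpu.launch --node_id 0 --enable_jemalloc resnet50_benchmark.py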
- -Intel® Extension for PyTorch* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The three major `Intel® Extension for PyTorch* `__ optimization techniques, Operator, Graph, Runtime, are as shown: - -+------------------------------------------------------------------------------------------------------------------------+ -| Intel® Extension for PyTorch* Optimization Techniques | -+======================================================+=======================================+=========================+ -| Operator | Graph | Runtime | -+------------------------------------------------------+---------------------------------------+-------------------------+ -| - Vectorization and Multi-threading | - Constant folding to reduce compute | - Thread affinitization | -| - Low-precision BF16/INT8 compute | - Op fusion for better cache locality | - Memory buffer pooling | -| - Data layout optimization for better cache locality | | - GPU runtime | -| | | - Launcher | -+------------------------------------------------------+---------------------------------------+-------------------------+ - -Operator Optimization -+++++++++++++++++++++ -Optimized operators and kernels are registered through PyTorch dispatching mechanism. These operators and kernels are accelerated from native vectorization feature and matrix calculation feature of Intel hardware. During execution, Intel® Extension for PyTorch* intercepts invocation of ATen operators, and replaces the original ones with these optimized ones. Popular operators like Convolution, Linear have been optimized in Intel® Extension for PyTorch*. - -Exercise -^^^^^^^^ -Let's profile optimized operator with Intel® Extension for PyTorch*. We will compare with and without the lines in code changes. - -As in the previous exercises, we will bind the workload to physical cores of the first socket. - -.. code:: python - - import torch - - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv = torch.nn.Conv2d(16, 33, 3, stride=2) - self.relu = torch.nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.relu(x) - return x - - model = Model() - model.eval() - data = torch.rand(20, 16, 50, 100) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model) - ###################################################### - - print(model) - -The model consists of two operations—Conv2d and ReLU. By printing the model object, we get the following output. - -.. figure:: /_static/img/torchserve-ipex-images-2/11.png - :width: 60% - :align: center - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/12.png - :width: 100% - :align: center - -Notice the Back End Bound reduced from 68.9 to 38.5 – 1.8x speedup. - -Additionally, let's profile with PyTorch Profiler. - -.. figure:: /_static/img/torchserve-ipex-images-2/13.png - :width: 100% - :align: center - -Notice the CPU time reduced from 851 us to 310 us – 2.7X speedup. - -Graph Optimization -++++++++++++++++++ -It is highly recommended for users to take advantage of Intel® Extension for PyTorch* with `TorchScript `_ for further graph optimizations. To optimize performance further with TorchScript, Intel® Extension for PyTorch* supports oneDNN fusion of frequently used FP32/BF16 operator patterns, like Conv2D+ReLU, Linear+ReLU, and more to reduce operator/kernel invocation overheads, and for better cache locality. 
Some operator fusions allow to maintain temporary calculations, data type conversions, data layouts for better cache locality. As well as for INT8, Intel® Extension for PyTorch* has built-in quantization recipes to deliver good statistical accuracy for popular DL workloads including CNN, NLP and recommendation models. The quantized model is then optimized with oneDNN fusion support. - -Exercise -^^^^^^^^ -Let's profile FP32 graph optimization with TorchScript. - -As in the previous exercises, we will bind the workload to physical cores of the first socket. - -.. code:: python - - import torch - - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv = torch.nn.Conv2d(16, 33, 3, stride=2) - self.relu = torch.nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.relu(x) - return x - - model = Model() - model.eval() - data = torch.rand(20, 16, 50, 100) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model) - ###################################################### - - # torchscript - with torch.no_grad(): - model = torch.jit.trace(model, data) - model = torch.jit.freeze(model) - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/14.png - :width: 100% - :align: center - -Notice the Back End Bound reduced from 67.1 to 37.5 – 1.8x speedup. - -Additionally, let's profile with PyTorch Profiler. - -.. figure:: /_static/img/torchserve-ipex-images-2/15.png - :width: 100% - :align: center - -Notice that with Intel® Extension for PyTorch* Conv + ReLU operators are fused, and the CPU time reduced from 803 us to 248 us – 3.2X speedup. The oneDNN eltwise post-op enables fusing a primitive with an elementwise primitive. This is one of the most popular kinds of fusion: an eltwise (typically an activation function such as ReLU) with preceding convolution or inner product. Have a look at the oneDNN verbose log shown in the next section. - -Channels Last Memory Format -+++++++++++++++++++++++++++ -When invoking *ipex.optimize* on model, Intel® Extension for PyTorch* automatically converts the model to optimized memory format, channels last. Channels last is a memory format that is more friendly to Intel Architecture. Compared to PyTorch default channels first NCHW (batch, channels, height, width) memory format, channels last NHWC (batch, height, width, channels) memory format generally accelerates convolutional neural networks with better cache locality. - -One thing to note is that it is expensive to convert memory format. So it's better to convert the memory format prior to deployment once, and keep the memory format conversion minimum during deployment. As the data propagates through model's layers the channels last memory format is preserved through consecutive channels last supported layers (for example, Conv2d -> ReLU -> Conv2d) and conversions are only made in between channels last unsupported layers. See `Memory Format Propagation `_ for more details. - -Exercise -^^^^^^^^ -Let's demonstrate channels last optimization. - -.. 
code:: python - - import torch - - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv = torch.nn.Conv2d(16, 33, 3, stride=2) - self.relu = torch.nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.relu(x) - return x - - model = Model() - model.eval() - data = torch.rand(20, 16, 50, 100) - - import intel_extension_for_pytorch as ipex - ############################### code changes ############################### - ipex.disable_auto_channels_last() # omit this line for channels_last (default) - ############################################################################ - model = ipex.optimize(model) - - with torch.no_grad(): - model = torch.jit.trace(model, data) - model = torch.jit.freeze(model) - -We will use `oneDNN verbose mode `_, a tool to help collect information at oneDNN graph level such as operator fusions, kernel execution time spent on executing oneDNN primitives. For more information, refer to the `oneDNN Documentation `_. - -.. figure:: /_static/img/torchserve-ipex-images-2/16.png - :width: 15% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images-2/17.png - :width: 100% - :align: center - -Above is oneDNN verbose from channels first. We can verify that there are reorders from weight and data, then do computation, and finally reorder output back. - -.. figure:: /_static/img/torchserve-ipex-images-2/18.png - :width: 80% - :align: center - -Above is oneDNN verbose from channels last. We can verify that channels last memory format avoids unnecessary reorders. - -Performance Boost with Intel® Extension for PyTorch* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Below summarizes performance boost of TorchServe with Intel® Extension for PyTorch* for ResNet50 and BERT-base-uncased. - -.. figure:: /_static/img/torchserve-ipex-images-2/19.png - :width: 100% - :align: center - -Exercise with TorchServe -~~~~~~~~~~~~~~~~~~~~~~~~ -Let's profile Intel® Extension for PyTorch* optimizations with TorchServe. - -We will use `TorchServe apache-bench benchmarking `_ with ResNet50 FP32 TorchScript, batch size 32, concurrency 32, requests 8960. All other parameters are the same as the `default parameters `_. - -As in the previous exercise, we will use the launcher to bind the workload to physical cores of the first socket. To do so, user simply needs to add a few lines in `config.properties `__: - -.. code:: python - - cpu_launcher_enable=true - cpu_launcher_args=--node_id 0 - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/20.png - :width: 100% - :align: center - -Level-1 TMA shows that both are bounded by the backend. As discussed earlier, the majority of untuned deep learning workloads will be Back End Bound. Notice the Back End Bound reduced from 70.0 to 54.1. Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/21.png - :width: 100% - :align: center - -As discussed earlier, Back End Bound has two submetrics – Memory Bound and Core Bound. Memory Bound indicates the workload is under-optimized or under-utilized, and ideally memory-bound operations can be improved to core-bound by optimizing the OPs and improving cache locality. Level-2 TMA shows that the Back End Bound improved from Memory Bound to Core Bound. Let's go one level deeper. - -.. 
figure:: /_static/img/torchserve-ipex-images-2/22.png - :width: 100% - :align: center - -Scaling deep learning models for production on a model serving framework like TorchServe requires high compute utilization. This requires that data is available through prefetching and reusing the data in cache when the execution units need it to execute the uOps. Level-3 TMA shows that the Back End Memory Bound improved from DRAM Bound to Core Bound. - -As in the previous exercise with TorchServe, let's use Intel® VTune Profiler ITT to annotate `TorchServe inference scope `_ to profile at inference-level granularity. - -.. figure:: /_static/img/torchserve-ipex-images-2/23.png - :width: 100% - :align: center - -Each inference call is traced in the timeline graph. The duration of the last inference call decreased from 215.731 ms to 95.634 ms - 2.3x speedup. - -.. figure:: /_static/img/torchserve-ipex-images-2/24.png - :width: 100% - :align: center - -The timeline graph can be expanded to see op-level profiling results. Notice that Conv + ReLU has been fused, and the duration decreased from 6.393 ms + 1.731 ms to 3.408 ms - 2.4x speedup. - -Conclusion ------------ -In this tutorial, we have used Top-down Microarchitecture Analysis (TMA) and Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) to demonstrate that - -- Often the primary bottleneck of under-optimized or under-tuned deep learning workloads are Back End Bound, which has two submetrics, Memory Bound and Core Bound. - -- A more efficient memory allocator, operator fusion, memory layout format optimization by Intel® Extension for PyTorch* improve Memory Bound. - -- Key deep learning primitives, such as convolution, matrix multiplication, dot-product, etc have been well optimized by Intel® Extension for PyTorch* and oneDNN library, improving Core Bound. - -- Intel® Extension for PyTorch* has been integrated into TorchServe with an ease-of-use API. - -- TorchServe with Intel® Extension for PyTorch* shows 7.71x throughput speedup for ResNet50, and 2.20x throughput speedup for BERT. - -Related Readings ----------------- -`Top-down Microarchitecture Analysis Method `_ - -`Top-Down performance analysis methodology `_ - -`Accelerating PyTorch with Intel® Extension for PyTorch* `_ - -Acknowledgement ---------------- -We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this tutorial. We would also like to thank Hamid Shojanazeri (Meta) and Li Ning (AWS) for their helpful feedback in code review and the tutorial. diff --git a/intermediate_source/torchvision_tutorial.py b/intermediate_source/torchvision_tutorial.py deleted file mode 100644 index d1e4c5c5d5..0000000000 --- a/intermediate_source/torchvision_tutorial.py +++ /dev/null @@ -1,534 +0,0 @@ -# -*- coding: utf-8 -*- -""" -TorchVision Object Detection Finetuning Tutorial -==================================================== -""" - -###################################################################### -# -# For this tutorial, we will be finetuning a pre-trained `Mask -# R-CNN `_ model on the `Penn-Fudan -# Database for Pedestrian Detection and -# Segmentation `_. It contains -# 170 images with 345 instances of pedestrians, and we will use it to -# illustrate how to use the new features in torchvision in order to train -# an object detection and instance segmentation model on a custom dataset. -# -# -# .. 
note :: -# -# This tutorial works only with torchvision version >=0.16 or nightly. -# If you're using torchvision<=0.15, please follow -# `this tutorial instead `_. -# -# -# Defining the Dataset -# -------------------- -# -# The reference scripts for training object detection, instance -# segmentation and person keypoint detection allows for easily supporting -# adding new custom datasets. The dataset should inherit from the standard -# :class:`torch.utils.data.Dataset` class, and implement ``__len__`` and -# ``__getitem__``. -# -# The only specificity that we require is that the dataset ``__getitem__`` -# should return a tuple: -# -# - image: :class:`torchvision.tv_tensors.Image` of shape ``[3, H, W]``, a pure tensor, or a PIL Image of size ``(H, W)`` -# - target: a dict containing the following fields -# -# - ``boxes``, :class:`torchvision.tv_tensors.BoundingBoxes` of shape ``[N, 4]``: -# the coordinates of the ``N`` bounding boxes in ``[x0, y0, x1, y1]`` format, ranging from ``0`` -# to ``W`` and ``0`` to ``H`` -# - ``labels``, integer :class:`torch.Tensor` of shape ``[N]``: the label for each bounding box. -# ``0`` represents always the background class. -# - ``image_id``, int: an image identifier. It should be -# unique between all the images in the dataset, and is used during -# evaluation -# - ``area``, float :class:`torch.Tensor` of shape ``[N]``: the area of the bounding box. This is used -# during evaluation with the COCO metric, to separate the metric -# scores between small, medium and large boxes. -# - ``iscrowd``, uint8 :class:`torch.Tensor` of shape ``[N]``: instances with ``iscrowd=True`` will be -# ignored during evaluation. -# - (optionally) ``masks``, :class:`torchvision.tv_tensors.Mask` of shape ``[N, H, W]``: the segmentation -# masks for each one of the objects -# -# If your dataset is compliant with above requirements then it will work for both -# training and evaluation codes from the reference script. Evaluation code will use scripts from -# ``pycocotools`` which can be installed with ``pip install pycocotools``. -# -# .. note :: -# For Windows, please install ``pycocotools`` from `gautamchitnis `_ with command -# -# ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI`` -# -# One note on the ``labels``. The model considers class ``0`` as background. If your dataset does not contain the background class, -# you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can -# define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has both -# classes, your ``labels`` tensor should look like ``[1, 2]``. -# -# Additionally, if you want to use aspect ratio grouping during training -# (so that each batch only contains images with similar aspect ratios), -# then it is recommended to also implement a ``get_height_and_width`` -# method, which returns the height and the width of the image. If this -# method is not provided, we query all elements of the dataset via -# ``__getitem__`` , which loads the image in memory and is slower than if -# a custom method is provided. -# -# Writing a custom dataset for PennFudan -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Let’s write a dataset for the PennFudan dataset. First, let's download the dataset and -# extract the `zip file `_: -# -# .. 
code:: python -# -# wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P data -# cd data && unzip PennFudanPed.zip -# -# -# We have the following folder structure: -# -# :: -# -# PennFudanPed/ -# PedMasks/ -# FudanPed00001_mask.png -# FudanPed00002_mask.png -# FudanPed00003_mask.png -# FudanPed00004_mask.png -# ... -# PNGImages/ -# FudanPed00001.png -# FudanPed00002.png -# FudanPed00003.png -# FudanPed00004.png -# -# Here is one example of a pair of images and segmentation masks - -import matplotlib.pyplot as plt -from torchvision.io import read_image - - -image = read_image("data/PennFudanPed/PNGImages/FudanPed00046.png") -mask = read_image("data/PennFudanPed/PedMasks/FudanPed00046_mask.png") - -plt.figure(figsize=(16, 8)) -plt.subplot(121) -plt.title("Image") -plt.imshow(image.permute(1, 2, 0)) -plt.subplot(122) -plt.title("Mask") -plt.imshow(mask.permute(1, 2, 0)) - -###################################################################### -# So each image has a corresponding -# segmentation mask, where each color correspond to a different instance. -# Let’s write a :class:`torch.utils.data.Dataset` class for this dataset. -# In the code below, we are wrapping images, bounding boxes and masks into -# :class:`torchvision.tv_tensors.TVTensor` classes so that we will be able to apply torchvision -# built-in transformations (`new Transforms API `_) -# for the given object detection and segmentation task. -# Namely, image tensors will be wrapped by :class:`torchvision.tv_tensors.Image`, bounding boxes into -# :class:`torchvision.tv_tensors.BoundingBoxes` and masks into :class:`torchvision.tv_tensors.Mask`. -# As :class:`torchvision.tv_tensors.TVTensor` are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain -# :class:`torch.Tensor` API. For more information about torchvision ``tv_tensors`` see -# `this documentation `_. 
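######################################################################
# As a quick, self-contained illustration (the values below are hypothetical and
# not part of the dataset), a plain tensor of box coordinates can be wrapped into
# a :class:`torchvision.tv_tensors.BoundingBoxes` as follows; the dataset class
# below performs the same wrapping on real data.

import torch
from torchvision import tv_tensors

# one box in [x0, y0, x1, y1] format on a hypothetical 200 x 200 image
example_boxes = torch.tensor([[10.0, 20.0, 50.0, 80.0]])
example_boxes = tv_tensors.BoundingBoxes(
    example_boxes, format="XYXY", canvas_size=(200, 200)  # canvas_size is (H, W)
)
print(type(example_boxes), example_boxes.format, example_boxes.canvas_size)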
- -import os -import torch - -from torchvision.io import read_image -from torchvision.ops.boxes import masks_to_boxes -from torchvision import tv_tensors -from torchvision.transforms.v2 import functional as F - - -class PennFudanDataset(torch.utils.data.Dataset): - def __init__(self, root, transforms): - self.root = root - self.transforms = transforms - # load all image files, sorting them to - # ensure that they are aligned - self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages")))) - self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks")))) - - def __getitem__(self, idx): - # load images and masks - img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) - mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) - img = read_image(img_path) - mask = read_image(mask_path) - # instances are encoded as different colors - obj_ids = torch.unique(mask) - # first id is the background, so remove it - obj_ids = obj_ids[1:] - num_objs = len(obj_ids) - - # split the color-encoded mask into a set - # of binary masks - masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8) - - # get bounding box coordinates for each mask - boxes = masks_to_boxes(masks) - - # there is only one class - labels = torch.ones((num_objs,), dtype=torch.int64) - - image_id = idx - area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) - # suppose all instances are not crowd - iscrowd = torch.zeros((num_objs,), dtype=torch.int64) - - # Wrap sample and targets into torchvision tv_tensors: - img = tv_tensors.Image(img) - - target = {} - target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)) - target["masks"] = tv_tensors.Mask(masks) - target["labels"] = labels - target["image_id"] = image_id - target["area"] = area - target["iscrowd"] = iscrowd - - if self.transforms is not None: - img, target = self.transforms(img, target) - - return img, target - - def __len__(self): - return len(self.imgs) - -###################################################################### -# That’s all for the dataset. Now let’s define a model that can perform -# predictions on this dataset. -# -# Defining your model -# ------------------- -# -# In this tutorial, we will be using `Mask -# R-CNN `_, which is based on top of -# `Faster R-CNN `_. Faster R-CNN is a -# model that predicts both bounding boxes and class scores for potential -# objects in the image. -# -# .. image:: ../../_static/img/tv_tutorial/tv_image03.png -# -# Mask R-CNN adds an extra branch -# into Faster R-CNN, which also predicts segmentation masks for each -# instance. -# -# .. image:: ../../_static/img/tv_tutorial/tv_image04.png -# -# There are two common -# situations where one might want -# to modify one of the available models in TorchVision Model Zoo. The first -# is when we want to start from a pre-trained model, and just finetune the -# last layer. The other is when we want to replace the backbone of the -# model with a different one (for faster predictions, for example). -# -# Let’s go see how we would do one or another in the following sections. -# -# 1 - Finetuning from a pretrained model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Let’s suppose that you want to start from a model pre-trained on COCO -# and want to finetune it for your particular classes. 
Here is a possible -# way of doing it: - - -import torchvision -from torchvision.models.detection.faster_rcnn import FastRCNNPredictor - -# load a model pre-trained on COCO -model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT") - -# replace the classifier with a new one, that has -# num_classes which is user-defined -num_classes = 2 # 1 class (person) + background -# get number of input features for the classifier -in_features = model.roi_heads.box_predictor.cls_score.in_features -# replace the pre-trained head with a new one -model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) - -###################################################################### -# 2 - Modifying the model to add a different backbone -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -import torchvision -from torchvision.models.detection import FasterRCNN -from torchvision.models.detection.rpn import AnchorGenerator - -# load a pre-trained model for classification and return -# only the features -backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features -# ``FasterRCNN`` needs to know the number of -# output channels in a backbone. For mobilenet_v2, it's 1280 -# so we need to add it here -backbone.out_channels = 1280 - -# let's make the RPN generate 5 x 3 anchors per spatial -# location, with 5 different sizes and 3 different aspect -# ratios. We have a Tuple[Tuple[int]] because each feature -# map could potentially have different sizes and -# aspect ratios -anchor_generator = AnchorGenerator( - sizes=((32, 64, 128, 256, 512),), - aspect_ratios=((0.5, 1.0, 2.0),) -) - -# let's define what are the feature maps that we will -# use to perform the region of interest cropping, as well as -# the size of the crop after rescaling. -# if your backbone returns a Tensor, featmap_names is expected to -# be [0]. More generally, the backbone should return an -# ``OrderedDict[Tensor]``, and in ``featmap_names`` you can choose which -# feature maps to use. -roi_pooler = torchvision.ops.MultiScaleRoIAlign( - featmap_names=['0'], - output_size=7, - sampling_ratio=2 -) - -# put the pieces together inside a Faster-RCNN model -model = FasterRCNN( - backbone, - num_classes=2, - rpn_anchor_generator=anchor_generator, - box_roi_pool=roi_pooler -) - -###################################################################### -# Object detection and instance segmentation model for PennFudan Dataset -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# In our case, we want to finetune from a pre-trained model, given that -# our dataset is very small, so we will be following approach number 1. 
-# -# Here we want to also compute the instance segmentation masks, so we will -# be using Mask R-CNN: - - -import torchvision -from torchvision.models.detection.faster_rcnn import FastRCNNPredictor -from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor - - -def get_model_instance_segmentation(num_classes): - # load an instance segmentation model pre-trained on COCO - model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT") - - # get number of input features for the classifier - in_features = model.roi_heads.box_predictor.cls_score.in_features - # replace the pre-trained head with a new one - model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) - - # now get the number of input features for the mask classifier - in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels - hidden_layer = 256 - # and replace the mask predictor with a new one - model.roi_heads.mask_predictor = MaskRCNNPredictor( - in_features_mask, - hidden_layer, - num_classes - ) - - return model - - -###################################################################### -# That’s it, this will make ``model`` be ready to be trained and evaluated -# on your custom dataset. -# -# Putting everything together -# --------------------------- -# -# In ``references/detection/``, we have a number of helper functions to -# simplify training and evaluating detection models. Here, we will use -# ``references/detection/engine.py`` and ``references/detection/utils.py``. -# Just download everything under ``references/detection`` to your folder and use them here. -# On Linux if you have ``wget``, you can download them using below commands: - -os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py") -os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py") -os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py") -os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py") -os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py") - -###################################################################### -# Since v0.15.0 torchvision provides `new Transforms API `_ -# to easily write data augmentation pipelines for Object Detection and Segmentation tasks. -# -# Let’s write some helper functions for data augmentation / -# transformation: - -from torchvision.transforms import v2 as T - - -def get_transform(train): - transforms = [] - if train: - transforms.append(T.RandomHorizontalFlip(0.5)) - transforms.append(T.ToDtype(torch.float, scale=True)) - transforms.append(T.ToPureTensor()) - return T.Compose(transforms) - -###################################################################### -# Testing ``forward()`` method (Optional) -# --------------------------------------- -# -# Before iterating over the dataset, it's good to see what the model -# expects during training and inference time on sample data. 
-import utils - -model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT") -dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) -data_loader = torch.utils.data.DataLoader( - dataset, - batch_size=2, - shuffle=True, - collate_fn=utils.collate_fn -) - -# For Training -images, targets = next(iter(data_loader)) -images = list(image for image in images) -targets = [{k: v for k, v in t.items()} for t in targets] -output = model(images, targets) # Returns losses and detections -print(output) - -# For inference -model.eval() -x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] -predictions = model(x) # Returns predictions -print(predictions[0]) - - -###################################################################### -# Let’s now write the main function which performs the training and the -# validation: - - -from engine import train_one_epoch, evaluate - -# train on the GPU or on the CPU, if a GPU is not available -device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - -# our dataset has two classes only - background and person -num_classes = 2 -# use our dataset and defined transformations -dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) -dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False)) - -# split the dataset in train and test set -indices = torch.randperm(len(dataset)).tolist() -dataset = torch.utils.data.Subset(dataset, indices[:-50]) -dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:]) - -# define training and validation data loaders -data_loader = torch.utils.data.DataLoader( - dataset, - batch_size=2, - shuffle=True, - collate_fn=utils.collate_fn -) - -data_loader_test = torch.utils.data.DataLoader( - dataset_test, - batch_size=1, - shuffle=False, - collate_fn=utils.collate_fn -) - -# get the model using our helper function -model = get_model_instance_segmentation(num_classes) - -# move model to the right device -model.to(device) - -# construct an optimizer -params = [p for p in model.parameters() if p.requires_grad] -optimizer = torch.optim.SGD( - params, - lr=0.005, - momentum=0.9, - weight_decay=0.0005 -) - -# and a learning rate scheduler -lr_scheduler = torch.optim.lr_scheduler.StepLR( - optimizer, - step_size=3, - gamma=0.1 -) - -# let's train it just for 2 epochs -num_epochs = 2 - -for epoch in range(num_epochs): - # train for one epoch, printing every 10 iterations - train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) - # update the learning rate - lr_scheduler.step() - # evaluate on the test dataset - evaluate(model, data_loader_test, device=device) - -print("That's it!") - - - -###################################################################### -# So after one epoch of training, we obtain a COCO-style mAP > 50, and -# a mask mAP of 65. -# -# But what do the predictions look like? Let’s take one image in the -# dataset and verify -# -import matplotlib.pyplot as plt - -from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks - - -image = read_image("data/PennFudanPed/PNGImages/FudanPed00046.png") -eval_transform = get_transform(train=False) - -model.eval() -with torch.no_grad(): - x = eval_transform(image) - # convert RGBA -> RGB and move to device - x = x[:3, ...].to(device) - predictions = model([x, ]) - pred = predictions[0] - - -image = (255.0 * (image - image.min()) / (image.max() - image.min())).to(torch.uint8) -image = image[:3, ...] 
-pred_labels = [f"pedestrian: {score:.3f}" for label, score in zip(pred["labels"], pred["scores"])] -pred_boxes = pred["boxes"].long() -output_image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red") - -masks = (pred["masks"] > 0.7).squeeze(1) -output_image = draw_segmentation_masks(output_image, masks, alpha=0.5, colors="blue") - - -plt.figure(figsize=(12, 12)) -plt.imshow(output_image.permute(1, 2, 0)) - -###################################################################### -# The results look good! -# -# Wrapping up -# ----------- -# -# In this tutorial, you have learned how to create your own training -# pipeline for object detection models on a custom dataset. For -# that, you wrote a :class:`torch.utils.data.Dataset` class that returns the -# images and the ground truth boxes and segmentation masks. You also -# leveraged a Mask R-CNN model pre-trained on COCO train2017 in order to -# perform transfer learning on this new dataset. -# -# For a more complete example, which includes multi-machine / multi-GPU -# training, check ``references/detection/train.py``, which is present in -# the torchvision repository. -#