Remove Fairscale, move functionality to Torch FSDP #560

Open · wants to merge 17 commits into base: main
154 changes: 0 additions & 154 deletions .github/workflows/integration_tests.yml

This file was deleted.

10 changes: 2 additions & 8 deletions .github/workflows/main.yml
@@ -20,7 +20,7 @@ env:
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
BEAKER_WORKSPACE: ai2/tango-testing
BEAKER_DEFAULT_CLUSTER: ai2/tango-gpu-tests
BEAKER_DEFAULT_CLUSTER: ai2/canary
BEAKER_IMAGE: petew/tango-testing
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

@@ -93,12 +93,6 @@ jobs:
run: |
pytest -v --color=yes --doctest-modules tango/integrations/transformers tests/integrations/transformers

- name: FairScale integration
extras: dev,fairscale
requires_torch: true
run: |
pytest -v --color=yes --doctest-modules tango/integrations/fairscale tests/integrations/fairscale

- name: W&B integration
extras: dev,torch,flax,wandb
requires_torch: true
@@ -298,7 +292,7 @@ jobs:
path: /unused
token: ${{ secrets.BEAKER_TOKEN }}
workspace: ${{ env.BEAKER_WORKSPACE }}
clusters: ai2/general-cirrascale,ai2/allennlp-cirrascale,ai2/aristo-cirrascale,ai2/mosaic-cirrascale,ai2/s2-cirrascale
clusters: ai2/general-cirrascale,ai2/allennlp-cirrascale,ai2/aristo-cirrascale,ai2/mosaic-cirrascale,ai2/s2-cirrascale,ai2/mosaic-cirrascale-a100,ai2/prior-cirrascale,ai2/general-cirrascale-a100-80g-ib

release:
name: Release
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixes a bug where `FromParams` would fail to parse when an object takes a `Step` argument directly.
- Changed a name so we don't override the built-in name `set`.
- Fixed a bug that would cause O(n^2) memory consumption in dense step graphs.
- Fixed how we find learning rate schedulers in Torch 2.


## [v1.2.0](https://github.com/allenai/tango/releases/tag/v1.2.0) - 2023-02-10
2 changes: 1 addition & 1 deletion README.md
@@ -230,7 +230,7 @@ The motivation behind this library is that we can make research easier by compos
You can run the `tango` command through [pdb](https://docs.python.org/3/library/pdb.html). For example:

```bash
python -m pdb -m tango run config.jsonnet
python -m pdb -m tango run fsdp_config.jsonnet
```

### How is Tango different from [Metaflow](https://metaflow.org), [Airflow](https://airflow.apache.org), or [redun](https://github.com/insitro/redun)?
14 changes: 0 additions & 14 deletions docs/source/api/integrations/fairscale.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/api/integrations/index.rst
@@ -8,7 +8,6 @@ Integrations
:caption: Integrations

torch
fairscale
datasets
transformers
wandb
7 changes: 7 additions & 0 deletions docs/source/api/integrations/torch.rst
@@ -32,6 +32,8 @@ Model
.. autoclass:: tango.integrations.torch.Model
:members:

.. autofunction:: tango.integrations.torch.with_wrapped_modules

TrainingEngine
~~~~~~~~~~~~~~

@@ -40,6 +42,11 @@ TrainingEngine

.. autoclass:: tango.integrations.torch.TorchTrainingEngine

.. autoclass:: tango.integrations.torch.FSDPTrainingEngine

.. autoclass:: tango.integrations.torch.FSDPConfig
:members:

Optim
~~~~~

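For orientation, the pieces documented above are the ones the example configs in this PR switch to. Below is a minimal Jsonnet sketch of how they could fit together, assembled from the config changes further down in this diff; the model name, learning rate, and the surrounding layout are illustrative assumptions rather than code taken from the PR.

```jsonnet
// A sketch assembled from the example config changes in this PR; values are illustrative.
local fsdp = true;
local cpu_offloading = false;
local amp = false;
local activation_checkpointing = true;
local pretrained_model = "gpt2";   // placeholder model
local learning_rate = 1e-5;        // placeholder value

local fsdp_config = if fsdp then {
  move_params_to_cpu: cpu_offloading,
  move_grads_to_cpu: cpu_offloading,
  mixed_precision: amp,
} else null;

{
  // Wrap the transformer blocks individually so FSDP can shard them.
  model: {
    type: "torch::with_wrapped_modules",
    model: {
      type: "transformers::AutoModelForCausalLM::from_pretrained",
      pretrained_model_name_or_path: pretrained_model,
    },
    modules_to_wrap: ["transformer\\.h\\.[0-9]+"],
    fsdp_config: fsdp_config,
    activation_checkpointing: activation_checkpointing,
  },
  // Select the FSDP training engine when sharding is enabled.
  training_engine: {
    type: if fsdp then "torch::fsdp" else "torch",
    optimizer: { type: "torch::AdamW", lr: learning_rate },
  },
}
```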
1 change: 0 additions & 1 deletion docs/source/conf.py
@@ -62,7 +62,6 @@
"rich": ("https://rich.readthedocs.io/en/latest", None),
"torch": ("https://pytorch.org/docs/stable", None),
"flax": ("https://flax.readthedocs.io/en/latest", None),
"fairscale": ("https://fairscale.readthedocs.io/en/latest/", None),
"datasets": ("https://huggingface.co/docs/datasets/master/en", None),
"transformers": ("https://huggingface.co/docs/transformers/master/en", None),
"beaker": ("https://beaker-py.readthedocs.io/en/latest/", None),
2 changes: 1 addition & 1 deletion docs/source/examples/eval_p3.md
@@ -22,5 +22,5 @@ to create the same configuration for all 10 prompts:
You can run the experiment with:

```bash
tango run config.jsonnet -i eval -d /tmp/workspace
tango run fsdp_config.jsonnet -i eval -d /tmp/workspace
```
2 changes: 1 addition & 1 deletion docs/source/examples/train_lm.md
@@ -33,5 +33,5 @@ Next you'll need to create a configuration file that defines the experiment. Jus
Now we can run the experiment with:

```bash
tango run config.jsonnet -i tokenize_step.py -d /tmp/results
tango run fsdp_config.jsonnet -i tokenize_step.py -d /tmp/results
```
6 changes: 3 additions & 3 deletions docs/source/first_steps.md
@@ -237,7 +237,7 @@ Tango will warn you when you try to cache a non-deterministic step.
This time when we run the experiment we'll designate a specific directory for Tango to use:

```bash
$ tango run config.jsonnet -i components -d workspace/
$ tango run fsdp_config.jsonnet -i components -d workspace/
```
```
Starting new run live-tarpon
@@ -262,7 +262,7 @@ $ cat workspace/runs/live-tarpon/add_numbers/data.json
Now look what happens when we run this step again:

```bash
$ tango run config.jsonnet -i components -d workspace/
$ tango run fsdp_config.jsonnet -i components -d workspace/
```
```
Starting new run modest-shrimp
@@ -290,7 +290,7 @@ If we changed the inputs to the step in `config.jsonnet`:
And ran it again:

```bash
$ tango run config.jsonnet -i components -d workspace/
$ tango run fsdp_config.jsonnet -i components -d workspace/
```
```
Starting new run true-parrot
9 changes: 4 additions & 5 deletions examples/finetune/config.jsonnet
@@ -23,7 +23,7 @@ local batch_size = 2;

local activation_checkpointing = false; # use activation/gradient checkpointing (probably need this GPT-J 6B, but not gpt2)
local amp = false; # use PyTorch's native automatic mixed precision
local fsdp = false; # Use FairScale's FullyShardedDataParallel (probably need this GPT-J 6B, but not gpt2)
local fsdp = false; # Use Torch's FullyShardedDataParallel (probably need this GPT-J 6B, but not gpt2)
local cpu_offloading = false; # Can only be used with 'fsdp' - saves a lot of GPU memory by offloading params+gradients to CPU, but is very slow.

######################
@@ -38,14 +38,13 @@ assert fsdp == true || cpu_offloading == false : "cpu_offloading only available

# FullyShardedDataParallel config:
local fsdp_config = if fsdp then {
reshard_after_forward: true,
move_params_to_cpu: cpu_offloading,
move_grads_to_cpu: cpu_offloading,
mixed_precision: amp,
} else null;

local training_engine = {
type: if fsdp then "fairscale" else "torch",
type: if fsdp then "torch::fsdp" else "torch",
optimizer: {
type: "torch::AdamW",
lr: learning_rate,
@@ -95,13 +94,13 @@ local dataloader = if devices > 1 then distributed_dataloader else single_device
trained_model: {
type: "transformers::finetune",
model: {
type: "fairscale::with_wrapped_modules",
type: "torch::with_wrapped_modules",
model: {
type: "transformers::finetune::from_pretrained",
pretrained_model_name_or_path: pretrained_model,
low_cpu_mem_usage: load_with_low_cpu_mem_usage,
},
modules_to_wrap: modules_to_wrap, # tell FairScale to wrap the transformer's blocks individually
modules_to_wrap: modules_to_wrap, # tell torch to wrap the transformer's blocks individually
fsdp_config: fsdp_config,
activation_checkpointing: activation_checkpointing,
},
2 changes: 1 addition & 1 deletion examples/train_lm/README.md
@@ -6,7 +6,7 @@ This Tango example showcases how you could train or fine-tune a causal language
or [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj) from [transformers](https://github.com/huggingface/transformers) on WikiText2 or a similar dataset.
It's best that you run this experiment on a machine with a GPU and PyTorch [properly installed](https://pytorch.org/get-started/locally/#start-locally), otherwise Tango will fall back to CPU-only and it will be extremely slow.

This example also depends on [FairScale](https://fairscale.readthedocs.io/en/latest/), which allows you to leverage [`FullyShardedDataParallel`](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html) (FSDP) and [activation checkpointing](https://fairscale.readthedocs.io/en/latest/api/nn/checkpoint/checkpoint_activations.html) to fine-tune [GPT-J 6B](https://huggingface.co/EleutherAI/gpt-j-6B) or a similar-sized model. Just set the constants `fsdp` and `activation_checkpointing` in the config to `true`.
This example also uses [`FullyShardedDataParallel`](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) (FSDP) and [activation checkpointing](https://pytorch.org/docs/stable/checkpoint.html) to fine-tune [GPT-J 6B](https://huggingface.co/EleutherAI/gpt-j-6B) or a similar-sized model. Just set the constants `fsdp` and `activation_checkpointing` in the config to `true`.
Without using CPU offloading you'll need at least 4 x 40GiB A100 GPUs, or a different configuration with a comparable amount of total GPU memory.

<!-- end overview -->
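For reference, here is a minimal sketch of the switches the README excerpt above refers to, assuming the constant names from `examples/train_lm/config.jsonnet` as changed in this PR; the values shown are illustrative.

```jsonnet
// Illustrative excerpt: constant names come from examples/train_lm/config.jsonnet.
local fsdp = true;                      // shard the model with Torch FullyShardedDataParallel
local activation_checkpointing = true;  // recompute activations in the backward pass to save memory
local cpu_offloading = false;           // only valid when fsdp is true; saves GPU memory but is slow

// cpu_offloading requires fsdp, mirroring the assert in the config.
assert fsdp == true || cpu_offloading == false : "cpu_offloading only available with fsdp";

// Final object only so this snippet evaluates on its own.
{ fsdp: fsdp, activation_checkpointing: activation_checkpointing, cpu_offloading: cpu_offloading }
```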
7 changes: 3 additions & 4 deletions examples/train_lm/config.jsonnet
@@ -44,14 +44,13 @@ assert fsdp == true || cpu_offloading == false : "cpu_offloading only available

# FullyShardedDataParallel config:
local fsdp_config = if fsdp then {
reshard_after_forward: true,
move_params_to_cpu: cpu_offloading,
move_grads_to_cpu: cpu_offloading,
mixed_precision: amp,
} else null;

local training_engine = {
type: if fsdp then "fairscale" else "torch",
type: if fsdp then "torch::fsdp" else "torch",
optimizer: {
type: "torch::AdamW",
lr: learning_rate,
@@ -100,13 +99,13 @@ local dataloader = if devices > 1 then distributed_dataloader else single_device
trained_model: {
type: "torch::train",
model: {
type: "fairscale::with_wrapped_modules",
type: "torch::with_wrapped_modules",
model: {
type: "transformers::AutoModelForCausalLM::from_pretrained",
pretrained_model_name_or_path: pretrained_model,
low_cpu_mem_usage: load_with_low_cpu_mem_usage,
},
modules_to_wrap: ["transformer\\.h\\.[0-9]+"], # tell FairScale to wrap the transformer's blocks individually
modules_to_wrap: ["transformer\\.h\\.[0-9]+"], # tell torch to wrap the transformer's blocks individually
fsdp_config: fsdp_config,
activation_checkpointing: activation_checkpointing,
},
10 changes: 0 additions & 10 deletions integration_tests/README.md

This file was deleted.
