
Commit

2024-10-23 nightly release (1a57ce1)
pytorchbot committed Oct 23, 2024
1 parent a02bde7 commit de5f406
Showing 13 changed files with 176 additions and 63 deletions.
11 changes: 4 additions & 7 deletions .github/scripts/validate_binaries.sh
@@ -21,13 +21,8 @@ if [[ ${MATRIX_GPU_ARCH_TYPE} = 'rocm' ]]; then
     exit 0
 fi
 
-if [[ ${MATRIX_PYTHON_VERSION} = '3.12' ]]; then
-    echo "Temporarily disable validation for Python 3.12"
-    exit 0
-fi
-
 if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
-    export CUDA_VERSION="cu118"
+    export CUDA_VERSION="cu124"
 else
     export CUDA_VERSION="cpu"
 fi
@@ -36,8 +31,10 @@ fi
 if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
     if [[ ${MATRIX_GPU_ARCH_VERSION} = '11.8' ]]; then
         export CUDA_VERSION="cu118"
-    else
+    elif [[ ${MATRIX_GPU_ARCH_VERSION} = '12.1' ]]; then
         export CUDA_VERSION="cu121"
+    else
+        export CUDA_VERSION="cu124"
     fi
 else
     export CUDA_VERSION="cpu"
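Taken together, the two hunks above leave the version-selection logic reading roughly as follows (a sketch reconstructed from this diff, not the full script; the `MATRIX_*` variables are supplied by the CI matrix). Note also that the Python 3.12 early-exit is removed entirely, so 3.12 validation is re-enabled rather than skipped:

```bash
# Reconstructed effect of validate_binaries.sh after this change.
if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
    if [[ ${MATRIX_GPU_ARCH_VERSION} = '11.8' ]]; then
        export CUDA_VERSION="cu118"
    elif [[ ${MATRIX_GPU_ARCH_VERSION} = '12.1' ]]; then
        export CUDA_VERSION="cu121"
    else
        # Any other GPU arch version now falls through to CUDA 12.4.
        export CUDA_VERSION="cu124"
    fi
else
    export CUDA_VERSION="cpu"
fi
```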
36 changes: 15 additions & 21 deletions .github/workflows/release_build.yml
@@ -18,37 +18,32 @@ jobs:
     strategy:
       matrix:
         include:
-          - os: linux.2xlarge
-            python-version: 3.8
-            python-tag: "py38"
-            cuda-tag: "cu121"
           - os: linux.2xlarge
             python-version: 3.9
             python-tag: "py39"
-            cuda-tag: "cu121"
+            cuda-tag: "cu124"
           - os: linux.2xlarge
             python-version: '3.10'
             python-tag: "py310"
-            cuda-tag: "cu121"
+            cuda-tag: "cu124"
           - os: linux.2xlarge
             python-version: '3.11'
             python-tag: "py311"
-            cuda-tag: "cu121"
+            cuda-tag: "cu124"
           - os: linux.2xlarge
             python-version: '3.12'
             python-tag: "py312"
-            cuda-tag: "cu121"
+            cuda-tag: "cu124"
     steps:
       # Checkout the repository to the GitHub Actions runner
       - name: Check ldd --version
         run: ldd --version
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
       - name: Update pip
         run: |
           sudo yum update -y
           sudo yum -y install git python3-pip
-          sudo pip3 install --upgrade pip
       - name: Setup conda
         run: |
           wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
@@ -73,12 +68,12 @@ jobs:
       - name: Install PyTorch and CUDA
         shell: bash
         run: |
-          conda run -n build_binary pip install torch --index-url https://download.pytorch.org/whl/test/cu121
+          conda run -n build_binary pip install torch
       - name: Install fbgemm
         shell: bash
         run: |
           conda run -n build_binary pip install numpy
-          conda run -n build_binary pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/test/cu121
+          conda run -n build_binary pip install fbgemm-gpu
       - name: Install Dependencies
         shell: bash
         run: |
@@ -102,7 +97,7 @@ jobs:
           python setup.py bdist_wheel \
             --python-tag=${{ matrix.python-tag }}
       - name: Upload wheel as GHA artifact
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
           path: dist/torchrec-*.whl
@@ -112,9 +107,9 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [linux.4xlarge.nvidia.gpu]
-        python-version: [3.8, 3.9, "3.10", "3.11", "3.12"]
-        cuda-tag: ["cu121"]
+        os: [linux.g5.12xlarge.nvidia.gpu]
+        python-version: [3.9, "3.10", "3.11", "3.12"]
+        cuda-tag: ["cu124"]
     needs: build_on_cpu
     # the glibc version should match the version of the one we used to build the binary
     # for this case, it's 2.26
@@ -149,12 +144,11 @@
           sudo lshw -C display
       # Checkout the repository to the GitHub Actions runner
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
       - name: Update pip
         run: |
           sudo yum update -y
           sudo yum -y install git python3-pip
-          sudo pip3 install --upgrade pip
       - name: Setup conda
         run: |
           wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
@@ -179,19 +173,19 @@
       - name: Install PyTorch and CUDA
         shell: bash
         run: |
-          conda run -n build_binary pip install torch --index-url https://download.pytorch.org/whl/test/cu121
+          conda run -n build_binary pip install torch
       # download wheel from GHA
       - name: Install fbgemm
         shell: bash
         run: |
           conda run -n build_binary pip install numpy
-          conda run -n build_binary pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/test/cu121
+          conda run -n build_binary pip install fbgemm-gpu
       - name: Install torchmetrics
         shell: bash
         run: |
           conda run -n build_binary pip install torchmetrics==1.0.3
       - name: Download wheel
-        uses: actions/download-artifact@v2
+        uses: actions/download-artifact@v4
         with:
           name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
       - name: Display structure of downloaded files
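With the explicit `--index-url` flags dropped, both jobs now resolve `torch` and `fbgemm-gpu` from the default index. A quick way to confirm the resolved wheels actually match the matrix `cuda-tag` is a check like the following (a sketch, not part of the workflow; assumes the `build_binary` conda env created in the steps above):

```bash
# Print the torch build and its CUDA toolkit version (e.g. "2.5.0 12.4").
conda run -n build_binary python -c "import torch; print(torch.__version__, torch.version.cuda)"
# Confirm fbgemm_gpu imports cleanly against the same torch build.
conda run -n build_binary python -c "import fbgemm_gpu, torch; print(torch.cuda.is_available())"
```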
16 changes: 16 additions & 0 deletions .github/workflows/unittest_ci.yml
@@ -23,6 +23,10 @@ jobs:
             python-version: 3.9
             python-tag: "py39"
             cuda-tag: "cu121"
+          - os: linux.g5.12xlarge.nvidia.gpu
+            python-version: 3.9
+            python-tag: "py39"
+            cuda-tag: "cu124"
           - os: linux.g5.12xlarge.nvidia.gpu
             python-version: '3.10'
             python-tag: "py310"
@@ -31,6 +35,10 @@
             python-version: '3.10'
             python-tag: "py310"
             cuda-tag: "cu121"
+          - os: linux.g5.12xlarge.nvidia.gpu
+            python-version: '3.10'
+            python-tag: "py310"
+            cuda-tag: "cu124"
           - os: linux.g5.12xlarge.nvidia.gpu
             python-version: '3.11'
             python-tag: "py311"
@@ -39,6 +47,10 @@
             python-version: '3.11'
             python-tag: "py311"
             cuda-tag: "cu121"
+          - os: linux.g5.12xlarge.nvidia.gpu
+            python-version: '3.11'
+            python-tag: "py311"
+            cuda-tag: "cu124"
           - os: linux.g5.12xlarge.nvidia.gpu
             python-version: '3.12'
             python-tag: "py312"
@@ -47,6 +59,10 @@
             python-version: '3.12'
             python-tag: "py312"
             cuda-tag: "cu121"
+          - os: linux.g5.12xlarge.nvidia.gpu
+            python-version: '3.12'
+            python-tag: "py312"
+            cuda-tag: "cu124"
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       runner: ${{ matrix.os }}
2 changes: 1 addition & 1 deletion .github/workflows/validate-binaries.yml
@@ -16,7 +16,7 @@ on:
   workflow_dispatch:
     inputs:
       channel:
-        description: "Channel to use (nightly, release, test)"
+        description: "Channel to use (nightly, release, test, pypi)"
         required: true
         type: choice
         options:
9 changes: 9 additions & 0 deletions README.MD
@@ -8,6 +8,7 @@
 * [Disaggregated Multi-Tower: Topology-aware Modeling Technique for Efficient Large-Scale Recommendation](https://arxiv.org/abs/2403.00877) paper
 * [The Algorithm ML](https://github.com/twitter/the-algorithm-ml) from Twitter
 * [Training Recommendation Models with Databricks](https://docs.databricks.com/en/machine-learning/train-recommender-models.html)
+* [Toward 100TB model with Embedding Offloading Paper](https://dl.acm.org/doi/10.1145/3640457.3688037)
 
 
 ## Introduction
Expand Down Expand Up @@ -39,6 +40,10 @@ Check out the [Getting Started](https://pytorch.org/torchrec/setup-torchrec.html

1. Install pytorch. See [pytorch documentation](https://pytorch.org/get-started/locally/).
```
CUDA 12.4
pip install torch --index-url https://download.pytorch.org/whl/nightly/cu124
CUDA 12.1
pip install torch --index-url https://download.pytorch.org/whl/nightly/cu121
@@ -60,6 +65,10 @@
 
 3. Install FBGEMM.
    ```
+   CUDA 12.4
+   pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu124
+
    CUDA 12.1
    pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121
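Put together, a full nightly install for CUDA 12.4 would look like the following sketch. The `torchmetrics` pin is taken from the docs and workflow elsewhere in this commit; the `torchrec` nightly index URL is an assumption that it follows the same layout as the torch and fbgemm-gpu URLs above:

```bash
pip install torch --index-url https://download.pytorch.org/whl/nightly/cu124
pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu124
pip install torchmetrics==1.0.3
pip install torchrec --index-url https://download.pytorch.org/whl/nightly/cu124
```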
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -61,7 +61,7 @@
 how you can contribute:
 
 1. **Visit Our** `GitHub Repository <https://github.com/pytorch/torchrec>`__:
-   There yoou can find the source code, issues, and ongoing projects.
+   There you can find the source code, issues, and ongoing projects.
 
 1. **Submit Feedback or Issues**: If you encounter any bugs or have
    suggestions for improvements, please submit an issue through the
6 changes: 3 additions & 3 deletions docs/source/setup-torchrec.rst
@@ -23,7 +23,7 @@
    * - Python Version
      - 3.9, 3.10, 3.11, 3.12
    * - Compute Platform
-     - CPU, CUDA 11.8, CUDA 12.1
+     - CPU, CUDA 11.8, CUDA 12.1, CUDA 12.4
 
 Aside from those requirements, TorchRec's core dependencies are PyTorch and FBGEMM.
 If your system is compatible with both libraries generally, then it should be sufficient for TorchRec.
@@ -50,7 +50,7 @@
 
 Installation
 ------------
-Below we show installations for CUDA 12.1 as an example. For CPU or CUDA 11.8, swap ``cu121`` for ``cpu`` or ``cu118``.
+Below we show installations for CUDA 12.1 as an example. For CPU, CUDA 11.8, or CUDA 12.4, swap ``cu121`` for ``cpu``, ``cu118``, or ``cu124`` respectively.
 
 .. tab-set::
 
@@ -63,7 +63,7 @@
       pip install torchmetrics==1.0.3
       pip install torchrec --index-url https://download.pytorch.org/whl/cu121
 
-   .. tab-item:: **Stable via PyPI (Only for CUDA 12.1)**
+   .. tab-item:: **Stable via PyPI (Only for CUDA 12.4)**
 
       .. code-block:: bash
 
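As a worked example of the substitution the updated sentence describes, the stable install for CUDA 12.4 becomes:

```bash
pip install torchmetrics==1.0.3
pip install torchrec --index-url https://download.pytorch.org/whl/cu124
```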
20 changes: 10 additions & 10 deletions torchrec/distributed/comm_ops.py
@@ -2153,8 +2153,8 @@ def forward(
         if rsi.codecs is not None:
             inputs = rsi.codecs.forward.encode(inputs)
         output = inputs.new_empty((inputs.size(0) // my_size, inputs.size(1)))
-        with record_function("## reduce_scatter_base ##"):
-            req = dist._reduce_scatter_base(
+        with record_function("## reduce_scatter_tensor ##"):
+            req = dist.reduce_scatter_tensor(
                 output,
                 inputs,
                 group=pg,
@@ -2222,7 +2222,7 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
             grad_output = rsi.codecs.backward.encode(grad_output)
         grad_inputs = grad_output.new_empty(rsi.input_sizes)
         with record_function("## reduce_scatter_base_bw (all_gather) ##"):
-            req = dist._all_gather_base(
+            req = dist.all_gather_into_tensor(
                 grad_inputs,
                 grad_output.contiguous(),
                 group=ctx.pg,
@@ -2250,8 +2250,8 @@ def forward(
             input = agi.codecs.forward.encode(input)
 
         outputs = input.new_empty((input.size(0) * my_size, input.size(1)))
-        with record_function("## all_gather_base ##"):
-            req = dist._all_gather_base(
+        with record_function("## all_gather_into_tensor ##"):
+            req = dist.all_gather_into_tensor(
                 outputs,
                 input,
                 group=pg,
@@ -2319,7 +2319,7 @@ def backward(ctx, grad_outputs: Tensor) -> Tuple[None, None, Tensor]:
             grad_outputs = agi.codecs.backward.encode(grad_outputs)
         grad_input = grad_outputs.new_empty(agi.input_size)
         with record_function("## all_gather_base_bw (reduce_scatter) ##"):
-            req = dist._reduce_scatter_base(
+            req = dist.reduce_scatter_tensor(
                 grad_input,
                 grad_outputs.contiguous(),
                 group=ctx.pg,
@@ -2349,11 +2349,11 @@ def forward(
 
         output = input.new_empty(rsi.input_sizes[my_rank])
 
-        # Use dist._reduce_scatter_base when a vector reduce-scatter is not needed
+        # Use dist.reduce_scatter_tensor when a vector reduce-scatter is not needed
         # else use dist.reduce_scatter which internally supports vector reduce-scatter
         if rsi.equal_splits:
-            with record_function("## reduce_scatter_base ##"):
-                req = dist._reduce_scatter_base(
+            with record_function("## reduce_scatter_tensor ##"):
+                req = dist.reduce_scatter_tensor(
                     output,
                     input,
                     group=pg,
@@ -2434,7 +2434,7 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
 
         if rsi.equal_splits:
             with record_function("## reduce_scatter_base_bw (all_gather) ##"):
-                req = dist._all_gather_base(
+                req = dist.all_gather_into_tensor(
                     grad_input,
                     grad_output.contiguous(),
                     group=ctx.pg,
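These hunks replace PyTorch's private collectives (`dist._reduce_scatter_base`, `dist._all_gather_base`) with their public equivalents. A minimal standalone sketch of the new calls, outside TorchRec (assumes an initialized NCCL process group with one CUDA device per rank; shapes follow the convention in the code above, where reduce-scatter shrinks dim 0 by the world size and all-gather grows it back):

```python
import torch
import torch.distributed as dist

# Assumes dist.init_process_group("nccl") has already run on every rank
# and each rank has set its CUDA device.
world_size = dist.get_world_size()
inputs = torch.randn(8 * world_size, 4, device="cuda")

# reduce_scatter_tensor: every rank contributes `inputs` and receives one
# summed slice of size inputs.size(0) // world_size (dim 0 shrinks).
output = inputs.new_empty((inputs.size(0) // world_size, inputs.size(1)))
dist.reduce_scatter_tensor(output, inputs)

# all_gather_into_tensor: the inverse layout — each rank's slice is
# concatenated along dim 0 on every rank (dim 0 grows back).
gathered = output.new_empty((output.size(0) * world_size, output.size(1)))
dist.all_gather_into_tensor(gathered, output)
```

The public functions are drop-in replacements here: the argument order (output first, then input) and the `group`/`async_op` keywords match the private versions, which is why the diff only touches the call sites and the `record_function` labels.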
9 changes: 7 additions & 2 deletions torchrec/distributed/planner/enumerators.py
@@ -235,11 +235,16 @@ def populate_estimates(self, sharding_options: List[ShardingOption]) -> None:
     def _filter_sharding_types(
         self, name: str, allowed_sharding_types: List[str]
     ) -> List[str]:
+        # GRID_SHARD is only supported if specified by user in parameter constraints
         if not self._constraints or not self._constraints.get(name):
-            return allowed_sharding_types
+            return [
+                t for t in allowed_sharding_types if t != ShardingType.GRID_SHARD.value
+            ]
         constraints: ParameterConstraints = self._constraints[name]
         if not constraints.sharding_types:
-            return allowed_sharding_types
+            return [
+                t for t in allowed_sharding_types if t != ShardingType.GRID_SHARD.value
+            ]
         constrained_sharding_types: List[str] = constraints.sharding_types
 
         filtered_sharding_types = list(
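The behavior this hunk introduces: grid sharding is now opt-in. A simplified sketch of the rule (a hypothetical standalone function, not the real method, which additionally intersects the allowed types with the user's constraint list — per the test docstrings below, via a set intersection):

```python
from typing import List, Optional

GRID_SHARD = "grid_shard"  # stand-in for ShardingType.GRID_SHARD.value

def filter_sharding_types(
    allowed: List[str], constrained: Optional[List[str]]
) -> List[str]:
    if not constrained:
        # No explicit user constraints: drop the opt-in GRID_SHARD type.
        return [t for t in allowed if t != GRID_SHARD]
    # Constraints present: keep only the types the user listed, so
    # GRID_SHARD survives exactly when the user asked for it.
    return [t for t in allowed if t in set(constrained)]

assert GRID_SHARD not in filter_sharding_types(["table_wise", GRID_SHARD], None)
assert GRID_SHARD in filter_sharding_types(["table_wise", GRID_SHARD], [GRID_SHARD])
```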
22 changes: 22 additions & 0 deletions torchrec/distributed/planner/tests/test_proposers.py
@@ -111,6 +111,7 @@ def setUp(self) -> None:
         self.uniform_proposer = UniformProposer()
         self.grid_search_proposer = GridSearchProposer()
         self.dynamic_programming_proposer = DynamicProgrammingProposer()
+        self._sharding_types = [x.value for x in ShardingType]
 
     def test_greedy_two_table(self) -> None:
         tables = [
@@ -127,6 +128,17 @@ def test_greedy_two_table(self) -> None:
                 feature_names=["feature_1"],
             ),
         ]
+        """
+        GRID_SHARD is only available if specified by the user in parameter constraints. However,
+        adding parameter constraints does not work here, because of the non-deterministic nature of
+        the _filter_sharding_types (set & set) operation when constraints are present. This means
+        the greedy proposer will see a different order of sharding types on each test invocation,
+        which we cannot have a hardcoded "correct" answer for. We mock the call to _filter_sharding_types
+        to ensure the order of the sharding types list is always the same.
+        """
+        self.enumerator._filter_sharding_types = MagicMock(
+            return_value=self._sharding_types
+        )
 
         model = TestSparseNN(tables=tables, sparse_device=torch.device("meta"))
         search_space = self.enumerator.enumerate(
@@ -335,6 +347,16 @@ def test_grid_search_three_table(self) -> None:
             for i in range(1, 4)
         ]
         model = TestSparseNN(tables=tables, sparse_device=torch.device("meta"))
+        """
+        GRID_SHARD is only available if specified by the user in parameter constraints. However,
+        adding parameter constraints does not work here because of the non-deterministic nature of
+        the _filter_sharding_types (set & set) operation when constraints are present, so we mock
+        the call to _filter_sharding_types to ensure the order of the sharding types list is
+        always the same.
+        """
+        self.enumerator._filter_sharding_types = MagicMock(
+            return_value=self._sharding_types
+        )
         search_space = self.enumerator.enumerate(
             module=model,
             sharders=[
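The ordering problem the docstrings describe stems from Python's per-process string hash randomization: iterating over a set intersection of strings is stable within one interpreter run but can differ across runs. A tiny illustration with hypothetical values (outside TorchRec), alongside the stabilizing mock pattern used above:

```python
from unittest.mock import MagicMock

# With the default PYTHONHASHSEED randomization, the iteration order of a
# string-set intersection can change from one interpreter run to the next:
allowed = {"table_wise", "row_wise", "column_wise", "grid_shard"}
constrained = {"grid_shard", "table_wise", "row_wise"}
print(list(allowed & constrained))  # stable within a run, not across runs

# Pinning the return value, as the tests do, makes the order deterministic:
filter_mock = MagicMock(return_value=["table_wise", "row_wise", "grid_shard"])
assert filter_mock("any", "args") == ["table_wise", "row_wise", "grid_shard"]
```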
(Diffs for the remaining 3 of the 13 changed files were not loaded in this view.)
