-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
41 changed files
with
5,491 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
# This workflow will: | ||
# - Create a new GitHub release | ||
# - Build wheels for supported architectures | ||
# - Deploy the wheels to the GitHub release | ||
# - Release the static code to PyPI | ||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries | ||
|
||
name: Build wheels and deploy

# Start a release run only when a version tag (v*) is pushed.
# The `create` event does not support `tags`/`branches` filters, so a filter
# placed under it does not stop the workflow from also running when a branch
# or a non-version tag is created. `push` supports the `tags` filter and still
# sets GITHUB_REF to refs/tags/<tag>, which the jobs below rely on.
on:
  push:
    tags:
      - 'v*'
|
||
jobs: | ||
|
||
setup_release:
  name: Create Release
  runs-on: ubuntu-latest
  steps:
    # Strip the "refs/tags/" prefix from GITHUB_REF and expose the bare tag
    # name as the step output `branch`.
    - name: Get the tag version
      id: extract_branch
      # Outputs are written to the $GITHUB_OUTPUT file; the older
      # `::set-output` workflow command is deprecated.
      run: echo "branch=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
      shell: bash

    # Create the GitHub release named after the tag; the build_wheels job
    # later looks this release up by tag and uploads its wheels to it.
    - name: Create Release
      id: create_release
      uses: actions/create-release@v1
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
        tag_name: ${{ steps.extract_branch.outputs.branch }}
        release_name: ${{ steps.extract_branch.outputs.branch }}
|
||
# Builds one binary wheel per (python, torch, CUDA, C++11 ABI) combination and
# attaches it to the release created by the setup_release job.
build_wheels:
  name: Build Wheel
  needs: setup_release
  runs-on: ${{ matrix.os }}

  strategy:
    fail-fast: false
    matrix:
      # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
      # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
      os: [ubuntu-20.04]
      python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
      torch-version: ['1.12.1', '1.13.1', '2.0.1', '2.1.1', '2.2.0.dev20231127']
      cuda-version: ['11.8.0', '12.2.0']
      # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
      # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
      # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
      # when building without C++11 ABI and using it on nvcr images.
      cxx11_abi: ['FALSE', 'TRUE']
      exclude:
        # Pytorch <= 1.12 does not support Python 3.11
        - torch-version: '1.12.1'
          python-version: '3.11'
        # Pytorch >= 2.0 only supports Python >= 3.8
        - torch-version: '2.0.1'
          python-version: '3.7'
        - torch-version: '2.1.1'
          python-version: '3.7'
        - torch-version: '2.2.0.dev20231127'
          python-version: '3.7'
        # Pytorch <= 2.0 only supports CUDA <= 11.8
        - torch-version: '1.12.1'
          cuda-version: '12.2.0'
        - torch-version: '1.13.1'
          cuda-version: '12.2.0'
        - torch-version: '2.0.1'
          cuda-version: '12.2.0'

  steps:
    - name: Checkout
      uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}

    # Export short version strings for later steps:
    #   MATRIX_CUDA_VERSION  = major and minor concatenated (11.8.0 -> 118)
    #   MATRIX_TORCH_VERSION = major.minor                  (2.0.1  -> 2.0)
    - name: Set CUDA and PyTorch versions
      run: |
        echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
        echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV

    - name: Free up disk space
      if: ${{ runner.os == 'Linux' }}
      # https://github.com/easimon/maximize-build-space/blob/master/action.yml
      # https://github.com/easimon/maximize-build-space/tree/test-report
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL

    - name: Set up swap space
      if: runner.os == 'Linux'
      # NOTE(review): action reference restored after being mangled in this
      # copy of the file; confirm the pinned version.
      uses: pierotofy/set-swap-space@v1.0
      with:
        swap-size-gb: 10

    - name: Install CUDA ${{ matrix.cuda-version }}
      if: ${{ matrix.cuda-version != 'cpu' }}
      # NOTE(review): action reference restored after being mangled in this
      # copy of the file; confirm the pinned version.
      uses: Jimver/cuda-toolkit@v0.2.11
      id: cuda-toolkit
      with:
        cuda: ${{ matrix.cuda-version }}
        linux-local-args: '["--toolkit"]'
        # default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
        # method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
        method: 'network'
        # We need the cuda libraries (e.g. cuSparse, cuSolver) for compiling PyTorch extensions,
        # not just nvcc
        # sub-packages: '["nvcc"]'

    - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
      run: |
        pip install --upgrade pip
        # If we don't install lit before installing Pytorch, we get error for torch 2.0.1
        # ERROR: Could not find a version that satisfies the requirement setuptools>=40.8.0 (from versions: none)
        pip install lit
        # We want to figure out the CUDA version to download pytorch
        # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
        # The system CUDA version is clamped into the [minv, maxv] range listed for this torch version.
        # This code is ugly, maybe there's a better way to do this.
        export TORCH_CUDA_VERSION=$(python -c "import os; minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118}[os.environ['MATRIX_TORCH_VERSION']]; maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121}[os.environ['MATRIX_TORCH_VERSION']]; print(max(min(int(os.environ['MATRIX_CUDA_VERSION']), maxv), minv))")
        # Dev (nightly) builds live on a separate index and need --pre
        if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
          pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
        else
          pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
        fi
        nvcc --version
        python --version
        python -c "import torch; print('PyTorch:', torch.__version__)"
        python -c "import torch; print('CUDA:', torch.version.cuda)"
        python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
      shell: bash

    - name: Build wheel
      run: |
        # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
        # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
        # However this still fails so I'm using a newer version of setuptools
        pip install setuptools==68.0.0
        pip install ninja packaging wheel
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
        # Limit MAX_JOBS otherwise the github runner goes OOM
        MAX_JOBS=2 MAMBA_FORCE_BUILD="TRUE" MAMBA_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
        # Insert "+cu<cuda>torch<torch>cxx11abi<flag>" before the second "-" of the wheel
        # file name, then export the new name for the upload step.
        tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
        wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
        ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
        echo "wheel_name=${wheel_name}" >> $GITHUB_ENV

    - name: Log Built Wheels
      run: |
        ls dist

    # Strip the "refs/tags/" prefix from GITHUB_REF and expose the bare tag
    # name as the step output `branch`.
    - name: Get the tag version
      id: extract_branch
      # Outputs are written to the $GITHUB_OUTPUT file; the older
      # `::set-output` workflow command is deprecated.
      run: echo "branch=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"

    # Look up the release for this tag to obtain its upload URL.
    - name: Get Release with tag
      id: get_current_release
      uses: joutvhu/get-release@v1
      with:
        tag_name: ${{ steps.extract_branch.outputs.branch }}
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Upload Release Asset
      id: upload_release_asset
      uses: actions/upload-release-asset@v1
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
        upload_url: ${{ steps.get_current_release.outputs.upload_url }}
        asset_path: ./dist/${{env.wheel_name}}
        asset_name: ${{env.wheel_name}}
        asset_content_type: application/*
|
||
# Builds the source distribution and uploads it to PyPI once every wheel job
# has finished.
publish_package:
  name: Publish package
  needs: build_wheels
  runs-on: ubuntu-latest

  steps:
    - uses: actions/checkout@v3

    - uses: actions/setup-python@v4
      with:
        python-version: "3.10"

    # Packaging tools plus a CPU-only torch wheel, so that nothing
    # CUDA-related is downloaded in this job.
    - name: Install dependencies
      run: |
        pip install ninja packaging setuptools wheel twine
        pip install torch --index-url https://download.pytorch.org/whl/cpu

    # NOTE(review): MAMBA_SKIP_CUDA_BUILD presumably makes setup.py skip
    # compiling the CUDA extension; confirm against setup.py.
    - name: Build core package
      run: python setup.py sdist --dist-dir=dist
      env:
        MAMBA_SKIP_CUDA_BUILD: 'TRUE'

    # Token-based PyPI upload: the username is the literal "__token__" and the
    # password is the API token stored in the repository secrets.
    - name: Deploy
      run: python -m twine upload dist/*
      env:
        TWINE_USERNAME: '__token__'
        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# EleutherAI's lm-evaluation-harness, checked out as a git submodule under 3rdparty/.
[submodule "3rdparty/lm-evaluation-harness"] | ||
path = 3rdparty/lm-evaluation-harness | ||
url = https://github.com/EleutherAI/lm-evaluation-harness/ |
Submodule lm-evaluation-harness
added at
a35206
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Tri Dao, [email protected] | ||
Albert Gu, [email protected] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,141 @@ | ||
# Mamba | ||
|
||
This repository contains the code for the paper [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752). | ||
![Mamba](assets/selection.png "Selective State Space") | ||
> **Mamba: Linear-Time Sequence Modeling with Selective State Spaces**\ | ||
> Albert Gu*, Tri Dao*\ | ||
> Paper: https://arxiv.org/abs/2312.00752 | ||
The first official code release of the paper will be uploaded around noon EST, Monday Dec. 4. | ||
## Installation | ||
|
||
- `pip install causal-conv1d`: an efficient implementation of a simple causal Conv1d layer used inside the Mamba block. | ||
- `pip install mamba-ssm`: the core Mamba package. | ||
|
||
If `pip` complains about PyTorch versions, try passing `--no-build-isolation` to `pip`. | ||
|
||
Other requirements: | ||
- Linux | ||
- NVIDIA GPU | ||
- PyTorch 1.12+ | ||
- CUDA 11.6+ | ||
|
||
## Usage | ||
|
||
We expose several levels of interface with the Mamba model. | ||
|
||
### Selective SSM | ||
|
||
Mamba is based on a selective SSM layer, which is the focus of the paper (Section 3; Algorithm 2). | ||
|
||
Source: [ops/selective_scan_interface.py](mamba_ssm/ops/selective_scan_interface.py). | ||
|
||
### Mamba Block | ||
|
||
The main module of this repository is the Mamba architecture block wrapping the selective SSM. | ||
|
||
Source: [modules/mamba_simple.py](mamba_ssm/modules/mamba_simple.py). | ||
|
||
Usage: | ||
``` | ||
# Minimal example: run one Mamba block on a random batch on the GPU.
import torch  # needed for torch.randn below
from mamba_ssm import Mamba

batch, length, dim = 2, 64, 16
x = torch.randn(batch, length, dim).to("cuda")
model = Mamba(
    # This module uses roughly 3 * expand * d_model^2 parameters
    d_model=dim, # Model dimension d_model
    d_state=16,  # SSM state expansion factor
    d_conv=4,    # Local convolution width
    expand=2,    # Block expansion factor
).to("cuda")
y = model(x)
# The block maps (batch, length, dim) to an output of the same shape.
assert y.shape == x.shape
``` | ||
|
||
### Mamba Language Model | ||
|
||
Finally, we provide an example of a complete language model: a deep sequence model backbone (with repeating Mamba blocks) + language model head. | ||
|
||
Source: [models/mixer_seq_simple.py](mamba_ssm/models/mixer_seq_simple.py). | ||
|
||
This is an example of how to integrate Mamba into an end-to-end neural network. | ||
This example is used in the generation scripts below. | ||
|
||
|
||
|
||
## Pretrained Models | ||
|
||
Pretrained models are uploaded to | ||
[HuggingFace](https://huggingface.co/state-spaces): `mamba-130m`, `mamba-370m`, | ||
`mamba-790m`, `mamba-1.4b`, `mamba-2.8b`. | ||
|
||
The models will be autodownloaded by the generation script below. | ||
|
||
These models were trained on the [Pile](https://huggingface.co/datasets/EleutherAI/pile), and follow the standard model dimensions described by GPT-3 and followed by many open source models: | ||
|
||
| Parameters | Layers | Model dim. | | ||
|------------|--------|------------| | ||
| 130M | 12 | 768 | | ||
| 370M | 24 | 1024 | | ||
| 790M | 24 | 1536 | | ||
| 1.4B | 24 | 2048 | | ||
| 2.8B | 32 | 2560 | | ||
|
||
(The layer count of Mamba should be doubled, as two Mamba blocks are needed for each "layer" (MHA block + MLP block) of a Transformer.) | ||
|
||
Note: these are base models trained only for 300B tokens, without any form of downstream modification (instruction tuning, etc.). | ||
Performance is expected to be comparable to or better than other architectures trained on similar data, but not to match larger or fine-tuned models. | ||
|
||
|
||
## Evaluations | ||
|
||
To run zero-shot evaluations of models (corresponding to Table 3 of the paper), | ||
we use the | ||
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) | ||
library. | ||
|
||
1. Pull the `lm-evaluation-harness` repo by `git submodule update --init | ||
--recursive`. We use the `big-refactor` branch. | ||
2. Install `lm-evaluation-harness`: `pip install -e 3rdparty/lm-evaluation-harness` | ||
3. Run evaluation with (more documentation at the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) repo): | ||
``` | ||
python evals/lm_harness_eval.py --model mamba --model_args pretrained=state-spaces/mamba-130m --tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande --device cuda --batch_size 64 | ||
python evals/lm_harness_eval.py --model hf --model_args pretrained=EleutherAI/pythia-160m --tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande --device cuda --batch_size 64 | ||
``` | ||
|
||
Note that the result of each task might differ from reported values by 0.1-0.3 due to noise in the evaluation process. | ||
|
||
## Inference | ||
|
||
The script [benchmarks/benchmark_generation_mamba_simple.py](benchmarks/benchmark_generation_mamba_simple.py) | ||
1. autoloads a model from the HuggingFace Hub, | ||
2. generates completions of a user-specified prompt, | ||
3. benchmarks the inference speed of this generation. | ||
|
||
Other configurable options include the top-p (nucleus sampling) probability, and the softmax temperature. | ||
|
||
### Examples | ||
|
||
To test generation latency (e.g. batch size = 1) with different sampling strategies: | ||
|
||
``` | ||
python benchmarks/benchmark_generation_mamba_simple.py --model-name "state-spaces/mamba-2.8b" --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.5 | ||
python benchmarks/benchmark_generation_mamba_simple.py --model-name "EleutherAI/pythia-2.8b" --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.5 | ||
``` | ||
|
||
To test generation throughput with random prompts (e.g. large batch size): | ||
``` | ||
python benchmarks/benchmark_generation_mamba_simple.py --model-name "state-spaces/mamba-2.8b" --batch 128 | ||
python benchmarks/benchmark_generation_mamba_simple.py --model-name "EleutherAI/pythia-2.8b" --batch 128 | ||
``` | ||
|
||
## Citation | ||
|
||
If you use this codebase, or otherwise find our work valuable, please cite Mamba: | ||
``` | ||
@article{mamba, | ||
title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces}, | ||
author={Gu, Albert and Dao, Tri}, | ||
journal={arXiv preprint arXiv:2312.00752}, | ||
year={2023} | ||
} | ||
``` |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.