diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e295743c2..25d07fd56 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -142,7 +142,7 @@ jobs: export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Currently for this setting the runner goes OOM if we pass --threads 4 to nvcc - if [[ ${MATRIX_CUDA_VERSION} == "12.1" && ${MATRIX_TORCH_VERSION} == "2.1" ]]; then + if [[ ${MATRIX_CUDA_VERSION} == "121" && ${MATRIX_TORCH_VERSION} == "2.1" ]]; then export FLASH_ATTENTION_FORCE_SINGLE_THREAD="TRUE" fi # Limit MAX_JOBS otherwise the github runner goes OOM diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py index 8472f0b78..6bc96613f 100644 --- a/flash_attn/__init__.py +++ b/flash_attn/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.0.6.post1" +__version__ = "2.0.6.post2" from flash_attn.flash_attn_interface import flash_attn_func from flash_attn.flash_attn_interface import flash_attn_kvpacked_func