From 5eeef1ea30d3702086363e8f88fd7376196e5c8b Mon Sep 17 00:00:00 2001 From: Yuqing Xia Date: Mon, 12 Aug 2024 05:09:51 -0700 Subject: [PATCH 01/46] intermediate save --- compute_sm.py | 7 + csrc/flash_attn/flash_api.cpp | 185 ++++++++++++------ csrc/flash_attn/src/flash.h | 8 +- csrc/flash_attn/src/flash_bwd_kernel.h | 91 +++++---- .../src/flash_bwd_launch_template.h | 65 ++++-- .../src/flash_bwd_preprocess_kernel.h | 81 ++++---- csrc/flash_attn/src/flash_fwd_kernel.h | 160 ++++++++------- .../src/flash_fwd_launch_template.h | 43 ++-- csrc/flash_attn/src/generate_kernels.py | 28 +-- csrc/flash_attn/src/kernel_traits.h | 136 ++++++++----- 10 files changed, 489 insertions(+), 315 deletions(-) create mode 100644 compute_sm.py diff --git a/compute_sm.py b/compute_sm.py new file mode 100644 index 000000000..17a201f4d --- /dev/null +++ b/compute_sm.py @@ -0,0 +1,7 @@ +Br = 64 +Bc = 128 +QKHeaddim = 128 +VHeaddim = 256 +smem =2 *(Br * QKHeaddim * 2 + Br * VHeaddim + Bc * QKHeaddim + Bc * VHeaddim + Br * Bc * 2) +smem = smem/1024 +print(smem) \ No newline at end of file diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index a928ec1ec..9d0f8c20b 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -29,6 +29,8 @@ void set_params_fprop(Flash_fwd_params ¶ms, const size_t h_k, const size_t d, const size_t d_rounded, + const size_t vd, + const size_t vd_rounded, // device pointers const at::Tensor q, const at::Tensor k, @@ -99,6 +101,8 @@ void set_params_fprop(Flash_fwd_params ¶ms, params.seqlen_k_rounded = seqlen_k_rounded; params.d = d; params.d_rounded = d_rounded; + params.vd = vd; + params.vd_rounded = vd_rounded; // Set the different scale values. #ifdef FLASHATTENTION_DISABLE_SOFTCAP @@ -164,6 +168,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, const size_t h_k, const size_t d, const size_t d_rounded, + const size_t vd, // device pointers const at::Tensor q, const at::Tensor k, @@ -189,7 +194,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, const bool unpadded_lse) { set_params_fprop(params, - b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded, + b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded,vd, vd, q, k, v, out, cu_seqlens_q_d, cu_seqlens_k_d, @@ -292,12 +297,13 @@ inline int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs, int num_n } std::tuple set_params_splitkv(Flash_fwd_params ¶ms, const int batch_size, - const int num_heads, const int head_size, const int max_seqlen_k, const int max_seqlen_q, - const int head_size_rounded, const float p_dropout, + const int num_heads, const int head_size, const int v_head_size, const int max_seqlen_k, const int max_seqlen_q, + const int head_size_rounded, const int v_head_size_rounded,const float p_dropout, const int num_splits, cudaDeviceProp *dprops, struct c10::TensorOptions opts) { // This needs to match with run_mha_fwd_splitkv_dispatch - const int block_n = head_size <= 64 ? 256 : (head_size <= 128 ? 128 : 64); + const max_head_size = head_size > v_head_size ? head_size : v_head_size; + const int block_n = max_head_size <= 64 ? 256 : (max_head_size <= 128 ? 128 : 64); const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n; // Technically kBlockM = 64 only for the splitKV kernels, not the standard kernel. // In any case we don't expect seqlen_q to be larger than 64 for inference. 
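For reference, a minimal Python sketch (an illustrative stand-in, not part of the patch or the extension API; splitkv_block_n and splitkv_num_n_blocks are hypothetical names) of the split-KV sizing in the hunk above, where the seqlen_k block size is now keyed on the larger of the QK and V head dimensions:

def splitkv_block_n(head_size, v_head_size):
    # Mirrors set_params_splitkv / run_mha_fwd_splitkv_dispatch: pick the block size
    # along seqlen_k from the larger of the two head dimensions.
    max_head_size = max(head_size, v_head_size)
    return 256 if max_head_size <= 64 else (128 if max_head_size <= 128 else 64)

def splitkv_num_n_blocks(max_seqlen_k, block_n):
    return (max_seqlen_k + block_n - 1) // block_n

# With num_splits > 1 the partial-output accumulator is allocated (see the next hunk) as
# (num_splits, batch_size, num_heads, max_seqlen_q, v_head_size_rounded), i.e. in the V head dim.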
@@ -313,7 +319,7 @@ std::tuple set_params_splitkv(Flash_fwd_params ¶ms, } if (params.num_splits > 1) { softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); - out_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q, head_size_rounded}, opts.dtype(at::kFloat)); + out_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q, v_head_size_rounded}, opts.dtype(at::kFloat)); params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr(); params.oaccum_ptr = out_accum.data_ptr(); } @@ -381,7 +387,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); const auto sizes = q.sizes(); - + const auto v_head_size_og = v.sizes()[3]; const int batch_size = sizes[0]; int seqlen_q = sizes[1]; int num_heads = sizes[2]; @@ -390,6 +396,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const int num_heads_k = k.size(2); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } @@ -411,18 +418,24 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size num_heads = num_heads_k; } + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); - CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); at::Tensor q_padded, k_padded, v_padded; if (head_size_og % 8 != 0) { q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); - v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + // v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); } else { q_padded = q; k_padded = k; + // v_padded = v; + } + if (v_head_size_og % 8 != 0) { + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } else { v_padded = v; } @@ -432,18 +445,24 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); CHECK_DEVICE(out); TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); - CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], head_size_og); + CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], v_head_size_og); if (seqlenq_ngroups_swapped) { - out = out.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2); + } + if (head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } - if (head_size_og % 8 != 0) { out = 
torch::empty_like(q_padded); } } else { - out = torch::empty_like(q_padded); + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size = round_multiple(head_size_og, 8); const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; + const int v_head_size = round_multiple(v_head_size_og, 8); + const int v_head_size_rounded = v_head_size <= 192 ? round_multiple(v_head_size, 32) : 256; const int seqlen_q_rounded = round_multiple(seqlen_q, 128); const int seqlen_k_rounded = round_multiple(seqlen_k, 128); @@ -468,6 +487,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size seqlen_q_rounded, seqlen_k_rounded, num_heads, num_heads_k, head_size, head_size_rounded, + v_head_size, v_head_size_rounded, q_padded, k_padded, v_padded, out, /*cu_seqlens_q_d=*/nullptr, /*cu_seqlens_k_d=*/nullptr, @@ -484,8 +504,8 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size // Keep references to these tensors to extend their lifetime at::Tensor softmax_lse_accum, out_accum; std::tie(softmax_lse_accum, out_accum) = set_params_splitkv( - params, batch_size, num_heads, head_size, seqlen_k, seqlen_q, - head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts); + params, batch_size, num_heads, head_size, v_head_size, seqlen_k, seqlen_q, + head_size_rounded, v_head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts); // number of times random will be generated per thread, to offset philox counter in thc random // state @@ -516,14 +536,14 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size } at::Tensor out_padded = out; - if (head_size_og % 8 != 0) { - out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (v_head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)}); if (out_.has_value()) { out_.value().copy_(out); } } if (seqlenq_ngroups_swapped) { - out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); - out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); + out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); } @@ -592,7 +612,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s CHECK_CONTIGUOUS(cu_seqlens_k); const auto sizes = q.sizes(); - + const auto v_head_size_og = v.sizes()[3]; const int batch_size = cu_seqlens_q.numel() - 1; int num_heads = sizes[1]; const int head_size_og = sizes[2]; @@ -625,6 +645,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); if 
(window_size_left >= max_seqlen_k) { window_size_left = -1; } @@ -634,10 +655,10 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s if (!paged_KV) { const int total_k = k.size(0); CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); - CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); } else { CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size_og); - CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, head_size_og); + CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, v_head_size_og); CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); } @@ -655,31 +676,41 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s if (head_size_og % 8 != 0) { q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); - v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + // v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); } else { q_padded = q; k_padded = k; + // v_padded = v; + } + if (v_head_size_og % 8 != 0) { + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } else { v_padded = v; } - at::Tensor out; if (out_.has_value()) { out = out_.value(); TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); CHECK_DEVICE(out); TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); - CHECK_SHAPE(out, sizes[0], sizes[1], head_size_og); + CHECK_SHAPE(out, sizes[0], sizes[1], v_head_size_og); if (seqlenq_ngroups_swapped) { - out = out.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + } + if (v_head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } - if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } } else { - out = torch::empty_like(q_padded); + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size = round_multiple(head_size_og, 8); const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; + const int v_head_size = round_multiple(v_head_size_og, 8); + const int v_head_size_rounded = v_head_size <= 192 ? 
round_multiple(v_head_size, 32) : 256; const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128); const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128); @@ -709,6 +740,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s seqlen_q_rounded, seqlen_k_rounded, num_heads, num_heads_k, head_size, head_size_rounded, + v_head_size, v_head_size_rounded, q_padded, k_padded, v_padded, out, cu_seqlens_q_d, cu_seqlens_k.data_ptr(), @@ -736,8 +768,8 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s if (seqlenq_ngroups_swapped) { // Only apply split-k for decoding std::tie(softmax_lse_accum, out_accum) = - set_params_splitkv(params, batch_size, num_heads, head_size, - max_seqlen_k, max_seqlen_q, head_size_rounded, + set_params_splitkv(params, batch_size, num_heads, head_size, v_head_size, + max_seqlen_k, max_seqlen_q, head_size_rounded,v_head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts); } @@ -780,16 +812,18 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s } at::Tensor out_padded = out; - if (head_size_og % 8 != 0) { - out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (v_head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)}); if (out_.has_value()) { out_.value().copy_(out); } } if (seqlenq_ngroups_swapped) { int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_k, head_size_og}; int64_t size_after[] = {batch_size, num_heads_k * max_seqlen_q, head_size_og}; - out = out.reshape(size_before).transpose(1, 2).reshape(size_after); - out_padded = out_padded.reshape(size_before).transpose(1, 2).reshape(size_after); + int64_t o_size_before[] = {batch_size, max_seqlen_q, num_heads_k, v_head_size_og}; + int64_t o_size_after[] = {batch_size, num_heads_k * max_seqlen_q, v_head_size_og}; + out = out.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); + out_padded = out_padded.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); q_padded = q_padded.reshape(size_before).transpose(1, 2).reshape(size_after); softmax_lse = softmax_lse.reshape({num_heads * max_seqlen_q, batch_size}); } @@ -865,7 +899,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); const auto sizes = q.sizes(); - + const auto v_head_size_og = v.sizes()[3]; const int batch_size = sizes[0]; const int seqlen_q = sizes[1]; const int num_heads = sizes[2]; @@ -876,7 +910,9 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); - if (head_size > 192 && is_dropout) { + TORCH_CHECK(v_head_size_og % 8 == 0, " v head_size should be a multiple of 8"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); + if ((head_size > 192 || v_head_size_og > 192) && is_dropout) { TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); } TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); @@ -894,8 +930,8 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x 
head_si CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); - CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size); - CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); at::Tensor dq, dk, dv; @@ -922,7 +958,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); CHECK_DEVICE(dv); TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); - CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, v_head_size_og); } else { dv = torch::empty_like(v); } @@ -960,7 +996,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si at::Tensor dk_expanded, dv_expanded; if (num_heads_k != num_heads) { // MQA / GQA dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); - dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, v_head_size_og}, opts); } else { dk_expanded = dk; dv_expanded = dv; @@ -974,6 +1010,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si seqlen_q_rounded, seqlen_k_rounded, num_heads, num_heads_k, head_size, head_size_rounded, + v_head_size_og, q, k, v, out, dout_padded, dq, dk_expanded, dv_expanded, nullptr, @@ -1027,12 +1064,12 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si // For MQA/GQA we need to sum dK and dV across the groups if (num_heads_k != num_heads) { at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); - at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {3}); } if (head_size_og % 8 != 0) { dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); - dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + // dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); } return { dq, dk, dv, softmax_d }; @@ -1106,7 +1143,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_CONTIGUOUS(cu_seqlens_k); const auto sizes = q.sizes(); - + const auto v_head_size_og = v.sizes()[3]; const int total_q = sizes[0]; const int batch_size = cu_seqlens_q.numel() - 1; const int num_heads = sizes[1]; @@ -1117,7 +1154,10 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); - if (head_size > 192 && is_dropout) { + TORCH_CHECK(v_head_size_og % 8 == 0, " v head_size should be a multiple of 8"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); + + if ((head_size > 192 || 
v_head_size_og > 192) && is_dropout) { TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); } TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); @@ -1135,8 +1175,8 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(q, total_q, num_heads, head_size); CHECK_SHAPE(k, total_k, num_heads_k, head_size); - CHECK_SHAPE(v, total_k, num_heads_k, head_size); - CHECK_SHAPE(out, total_q, num_heads, head_size); + CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(out, total_q, num_heads, v_head_size_og); CHECK_SHAPE(dout, total_q, num_heads, head_size_og); CHECK_SHAPE(cu_seqlens_q, batch_size + 1); CHECK_SHAPE(cu_seqlens_k, batch_size + 1); @@ -1165,7 +1205,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); CHECK_DEVICE(dv); TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); - CHECK_SHAPE(dv, total_k, num_heads_k, head_size); + CHECK_SHAPE(dv, total_k, num_heads_k, v_head_size_og); } else { dv = torch::empty_like(v); } @@ -1209,7 +1249,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size at::Tensor dk_expanded, dv_expanded; if (num_heads_k != num_heads) { // MQA / GQA dk_expanded = torch::empty({total_k, num_heads, head_size}, opts); - dv_expanded = torch::empty({total_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({total_k, num_heads, v_head_size_og}, opts); } else { dk_expanded = dk; dv_expanded = dv; @@ -1230,6 +1270,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size seqlen_q_rounded, seqlen_k_rounded, num_heads, num_heads_k, head_size, head_size_rounded, + v_head_size_og, q, k, v, out, dout_padded, dq, dk_expanded, dv_expanded, cu_seqlens_q.data_ptr(), @@ -1282,12 +1323,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size // For MQA/GQA we need to sum dK and dV across the groups if (num_heads_k != num_heads) { at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); - at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {2}); } if (head_size_og % 8 != 0) { dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); - dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + // dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); } return { dq, dk, dv, softmax_d }; @@ -1350,7 +1391,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he } const auto sizes = q.sizes(); - + const auto v_head_size_og = v.sizes()[3]; const int batch_size = sizes[0]; int seqlen_q = sizes[1]; int num_heads = sizes[2]; @@ -1366,6 +1407,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention 
backward only supports head dimension at most 256"); // causal=true is the same as causal=false in this case if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } @@ -1387,10 +1429,10 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); if (!paged_KV) { CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); - CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, v_head_size_og); } else { CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og); - CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, head_size_og); + CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, v_head_size_og); CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); } @@ -1398,7 +1440,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he if (head_size_og % 8 != 0) { q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); kcache_padded = torch::nn::functional::pad(kcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); - vcache_padded = torch::nn::functional::pad(vcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + vcache_padded = torch::nn::functional::pad(vcache, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } else { q_padded = q; kcache_padded = kcache; @@ -1411,15 +1453,21 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); CHECK_DEVICE(out); TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); - CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og); - if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); + if (head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } } else { - out = torch::empty_like(q_padded); + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size = round_multiple(head_size_og, 8); const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; + const int v_head_size = round_multiple(v_head_size_og, 8); + const int v_head_size_rounded = v_head_size <= 192 ? 
round_multiple(v_head_size, 32) : 256; const int seqlen_q_rounded = round_multiple(seqlen_q, 128); const int seqlen_k_rounded = round_multiple(seqlen_k, 128); @@ -1438,6 +1486,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he seqlen_q_rounded, seqlen_k_rounded, num_heads, num_heads_k, head_size, head_size_rounded, + v_head_size, v_head_size_rounded, q_padded, kcache_padded, vcache_padded, out, /*cu_seqlens_q_d=*/nullptr, /*cu_seqlens_k_d=*/nullptr, @@ -1465,10 +1514,10 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension"); int seqlen_knew = k.size(1); CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og); - CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, v_head_size_og); if (head_size_og % 8 != 0) { k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); - v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } else { k_padded = k; v_padded = v; @@ -1542,7 +1591,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he at::Tensor softmax_lse_accum, out_accum; std::tie(softmax_lse_accum, out_accum) = set_params_splitkv( params, batch_size, num_heads, head_size, seqlen_k, seqlen_q, - head_size_rounded, /*dropout*/ 0.f, num_splits, dprops, opts); + head_size_rounded, v_head_size_rounded, /*dropout*/ 0.f, num_splits, dprops, opts); if (paged_KV) { params.block_table = block_table.data_ptr(); @@ -1559,18 +1608,28 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he run_mha_fwd(params, stream, /*force_split_kernel=*/k_.has_value() || cache_batch_idx_.has_value() || paged_KV); if (head_size_og % 8 != 0) { - out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); - if (out_.has_value()) { out_.value().copy_(out); } + // out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + // if (out_.has_value()) { out_.value().copy_(out); } if (k_.has_value()) { // It's expensive to copy the KV cache here for the case where head size not divisible by 8, // but we don't expect to get this case in practice. This is just so that the code works for that case. kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); - vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + // vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + } + } + if (v_head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + if (k_.has_value()) { + // It's expensive to copy the KV cache here for the case where head size not divisible by 8, + // but we don't expect to get this case in practice. This is just so that the code works for that case. 
+ // kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)})); } } if (seqlenq_ngroups_swapped) { - out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); } return {out, softmax_lse}; diff --git a/csrc/flash_attn/src/flash.h b/csrc/flash_attn/src/flash.h index 6f597fbee..75825b9ad 100644 --- a/csrc/flash_attn/src/flash.h +++ b/csrc/flash_attn/src/flash.h @@ -67,7 +67,7 @@ struct Flash_fwd_params : public Qkv_params { void * __restrict__ softmax_lseaccum_ptr; // The dimensions. - int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q; + int b, seqlen_q, seqlen_k, seqlen_knew, d, vd, seqlen_q_rounded, seqlen_k_rounded, d_rounded, vd_rounded, rotary_dim, total_q; // The scaling factors for the kernel. float scale_softmax; @@ -189,7 +189,7 @@ struct Flash_bwd_params : public Flash_fwd_params { //////////////////////////////////////////////////////////////////////////////////////////////////// -template void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream); -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); -template void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream); +template void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_bwd_kernel.h b/csrc/flash_attn/src/flash_bwd_kernel.h index 4f95bd34a..46634ceb5 100644 --- a/csrc/flash_attn/src/flash_bwd_kernel.h +++ b/csrc/flash_attn/src/flash_bwd_kernel.h @@ -91,7 +91,8 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in constexpr int kBlockM = Kernel_traits::kBlockM; constexpr int kBlockN = Kernel_traits::kBlockN; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; constexpr int MMA_N_SdP = kBlockN / decltype(typename Kernel_traits::TiledMmaSdP{}.template tile_size_mnk<1>())::value; constexpr int AtomLayoutMS = Kernel_traits::AtomLayoutMSdP; constexpr bool Double_buffer = !Kernel_traits::No_double_buffer; @@ -125,25 +126,25 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in const index_t row_offset_dpsum = (params.unpadded_lse? 
bidh * (params.total_q + 128 * params.b) + binfo.q_offset(params.seqlen_q_rounded, 1, bidb) + 128 * bidb: (bidb * params.h + bidh) * params.seqlen_q_rounded) + (m_block_max - 1) * kBlockM; Tensor gQ = make_tensor(make_gmem_ptr(reinterpret_cast(params.q_ptr) + row_offset_q), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.q_row_stride, _1{})); Tensor gK = make_tensor(make_gmem_ptr(reinterpret_cast(params.k_ptr) + row_offset_k), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.k_row_stride, _1{})); Tensor gV = make_tensor(make_gmem_ptr(reinterpret_cast(params.v_ptr) + row_offset_v), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.v_row_stride, _1{})); Tensor gdO = make_tensor(make_gmem_ptr(reinterpret_cast(params.do_ptr) + row_offset_do), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.do_row_stride, _1{})); Tensor gO = make_tensor(make_gmem_ptr(reinterpret_cast(params.o_ptr) + row_offset_o), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.o_row_stride, _1{})); Tensor gdQ = make_tensor(make_gmem_ptr(reinterpret_cast(params.dq_ptr) + row_offset_dq), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dq_row_stride, _1{})); Tensor gdQaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dq_accum_ptr) + row_offset_dq_accum), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.h * params.d_rounded, _1{})); Tensor gLSE = make_tensor(make_gmem_ptr(reinterpret_cast(params.softmax_lse_ptr) + row_offset_lse), Shape>{}, Stride<_1>{}); @@ -151,16 +152,16 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in Shape>{}, Stride<_1>{}); Tensor sQ = make_tensor(make_smem_ptr(reinterpret_cast(smem_)), - typename Kernel_traits::SmemLayoutQdO{}); - Tensor sQt = make_tensor(sQ.data(), typename Kernel_traits::SmemLayoutQdOtransposed{}); - Tensor sQtNoSwizzle = make_tensor(sQ.data(), typename Kernel_traits::SmemLayoutQdOtransposedNoSwizzle{}); + typename Kernel_traits::SmemLayoutQ{}); + Tensor sQt = make_tensor(sQ.data(), typename Kernel_traits::SmemLayoutQtransposed{}); + Tensor sQtNoSwizzle = make_tensor(sQ.data(), typename Kernel_traits::SmemLayoutQtransposedNoSwizzle{}); // Double buffer for sQ - Tensor sdO = make_tensor(sQ.data() + (Double_buffer ? 2 : 1) * size(sQ), typename Kernel_traits::SmemLayoutQdO{}); - Tensor sdOt = make_tensor(sdO.data(), typename Kernel_traits::SmemLayoutQdOtransposed{}); + Tensor sdO = make_tensor(sQ.data() + (Double_buffer ? 2 : 1) * size(sQ), typename Kernel_traits::SmemLayoutdO{}); + Tensor sdOt = make_tensor(sdO.data(), typename Kernel_traits::SmemLayoutdOtransposed{}); Tensor sdOtransposedNoSwizzle = make_tensor(sdO.data(), - typename Kernel_traits::SmemLayoutQdOtransposedNoSwizzle{}); - Tensor sK = make_tensor(sdO.data() + size(sdO), typename Kernel_traits::SmemLayoutKV{}); - Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutKV{}); + typename Kernel_traits::SmemLayoutdOtransposedNoSwizzle{}); + Tensor sK = make_tensor(sdO.data() + size(sdO), typename Kernel_traits::SmemLayoutK{}); + Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutV{}); Tensor sKt = make_tensor(sK.data(), typename Kernel_traits::SmemLayoutKtransposed{}); Tensor sKtNoSwizzle = make_tensor(sK.data(), typename Kernel_traits::SmemLayoutKtransposedNoSwizzle{}); Tensor sdS = make_tensor(!Kernel_traits::Is_V_in_regs ? 
sV.data() + size(sV) : sK.data() + size(sK), @@ -229,8 +230,8 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in Tensor tdQrdS = thr_mma_dq.partition_fragment_A(sdS); // (MMA, MMA_N, MMA_N) Tensor tdQrKt = thr_mma_dq.partition_fragment_B(sKtNoSwizzle); // (MMA, MMA_K, MMA_N) - Tensor acc_dk = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K - Tensor acc_dv = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K + Tensor acc_dk = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K + Tensor acc_dv = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K // // Copy Atom retiling @@ -289,20 +290,25 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in // Tensor cQ = make_identity_tensor(make_shape(size<0>(sQ), size<1>(sQ))); // (BLK_M,BLK_K) -> (blk_m,blk_k) - Tensor cKV = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor cK = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor cV = make_identity_tensor(make_shape(size<0>(sV), size<1>(sV))); // (BLK_N,BLK_K) -> (blk_n,blk_k) Tensor tQcQ = gmem_thr_copy_QKV.partition_D(cQ); - Tensor tKVcKV = gmem_thr_copy_QKV.partition_D(cKV); + Tensor tKcK = gmem_thr_copy_QKV.partition_D(cK); + Tensor tVcV = gmem_thr_copy_QKV.partition_D(cV); // Allocate predicate tensors for k Tensor tQpQ = make_tensor(make_shape(size<2>(tQsQ))); - Tensor tKVpKV = make_tensor(make_shape(size<2>(tKsK))); + Tensor tKpK = make_tensor(make_shape(size<2>(tKsK))); + Tensor tVpV = make_tensor(make_shape(size<2>(tVsV))); // Set predicates for k bounds if (!Is_even_K) { #pragma unroll for (int k = 0; k < size(tQpQ); ++k) { tQpQ(k) = get<1>(tQcQ(0, 0, k)) < params.d; } #pragma unroll - for (int k = 0; k < size(tKVpKV); ++k) { tKVpKV(k) = get<1>(tKVcKV(0, 0, k)) < params.d; } + for (int k = 0; k < size(tKpK); ++k) { tKpK(k) = get<1>(tKcK(0, 0, k)) < params.d; } + #pragma unroll + for (int k = 0; k < size(tVpV); ++k) { tVpV(k) = get<1>(tVcV(0, 0, k)) < params.vd; } } // Prologue @@ -333,10 +339,10 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in const index_t row_offset_dv = binfo.k_offset(params.dv_batch_stride, params.dv_row_stride, bidb) + n_block * kBlockN * params.dv_row_stride + bidh * params.dv_head_stride; Tensor gdK = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_ptr) + row_offset_dk), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dk_row_stride, _1{})); Tensor gdV = make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_ptr) + row_offset_dv), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dv_row_stride, _1{})); typename Kernel_traits::GmemTiledCopydKV gmem_tiled_copy_dKV; auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(tidx); @@ -346,17 +352,22 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in Tensor tdVrdV = make_tensor(shape(tdVgdV)); clear(tdKrdK); clear(tdVrdV); - Tensor cdKV = make_identity_tensor(make_shape(size<0>(gdK), size<1>(gdK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) - Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); - Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKgdK))); + Tensor cdK = make_identity_tensor(make_shape(size<0>(gdK), size<1>(gdK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor cdV = make_identity_tensor(make_shape(size<0>(gdV), size<1>(gdV))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor tdKcdK = 
gmem_thr_copy_dKV.partition_D(cdK); + Tensor tdVcdV = gmem_thr_copy_dKV.partition_D(cdV); + Tensor tdKpdK = make_tensor(make_shape(size<2>(tdKgdK))); + Tensor tdVpdV = make_tensor(make_shape(size<2>(tdVgdV))); + #pragma unroll + for (int k = 0; k < size(tdKpdK); ++k) { tdKpdK(k) = get<1>(tdKcdK(0, 0, k)) < params.d; } #pragma unroll - for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(0, 0, k)) < params.d; } + for (int k = 0; k < size(tdVpdV); ++k) { tdVpdV(k) = get<1>(tdVcdV(0, 0, k)) < params.vd; } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( - gmem_tiled_copy_dKV, tdKrdK, tdKgdK, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_dKV, tdKrdK, tdKgdK, tdKcdK, tdKpdK, binfo.actual_seqlen_k - n_block * kBlockN ); flash::copy( - gmem_tiled_copy_dKV, tdVrdV, tdVgdV, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_dKV, tdVrdV, tdVgdV, tdVcdV, tdVpdV, binfo.actual_seqlen_k - n_block * kBlockN ); return; } @@ -372,7 +383,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in if (Kernel_traits::Is_V_in_regs) { // Clear the smem tiles to account for predicated off loads flash::copy( - gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_QKV, tVgV, tVsV, tVcV, tVpV, binfo.actual_seqlen_k - n_block * kBlockN ); flash::cp_async_fence(); } @@ -418,11 +429,11 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in // // if (cute::thread(1, 0)) { print(tKrK); } flash::copy( - gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_QKV, tKgK, tKsK, tKcK, tKpK, binfo.actual_seqlen_k - n_block * kBlockN ); if (!Kernel_traits::Is_V_in_regs) { flash::copy( - gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_QKV, tVgV, tVsV, tVcV, tVpV, binfo.actual_seqlen_k - n_block * kBlockN ); } flash::cp_async_fence(); @@ -592,7 +603,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in } // if (cute::thread0()) { print(dS); } - Tensor acc_dq = partition_fragment_C(tiled_mma_dq, Shape, Int>{}); // MMA, MMA_N, MMA_K + Tensor acc_dq = partition_fragment_C(tiled_mma_dq, Shape, Int>{}); // MMA, MMA_N, MMA_K tdQgdQaccum.data() = tdQgdQaccum.data() + (-int(kBlockM * params.h * params.d_rounded)); if (Is_first || Seq_parallel) { clear(acc_dq); @@ -708,7 +719,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in Tensor tdQrdQ = make_tensor(shape(tdQgdQ)); cute::copy(gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ); tdQgdQ.data() = tdQgdQ.data() + (-int(kBlockM * params.dq_row_stride)); - Tensor cdQ = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor cdQ = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(cdQ); #pragma unroll for (int m = 0; m < size<1>(tdQgdQ); ++m) { @@ -733,8 +744,8 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in Tensor rdK = flash::convert_type(acc_dk); Tensor rdV = flash::convert_type(acc_dv); - Tensor sdK = make_tensor(sK.data(), typename Kernel_traits::SmemLayoutdKV{}); // (SMEM_N, SMEM_K) - Tensor sdV = make_tensor(sdK.data() + size(sdK), typename Kernel_traits::SmemLayoutdKV{}); // (SMEM_N, SMEM_K) + Tensor sdK = make_tensor(sK.data(), typename Kernel_traits::SmemLayoutdK{}); // (SMEM_N, SMEM_K) + 
Tensor sdV = make_tensor(sdK.data() + size(sdK), typename Kernel_traits::SmemLayoutdV{}); // (SMEM_N, SMEM_K) // Partition sdV and sdK to match the accumulator partitioning auto smem_tiled_copy_dKV = make_tiled_copy_C(typename Kernel_traits::SmemCopyAtomdKV{}, tiled_mma_dkv); @@ -758,10 +769,10 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in const index_t row_offset_dv = binfo.k_offset(params.dv_batch_stride, params.dv_row_stride, bidb) + n_block * kBlockN * params.dv_row_stride + bidh * params.dv_head_stride; Tensor gdK = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_ptr) + row_offset_dk), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dk_row_stride, _1{})); Tensor gdV = make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_ptr) + row_offset_dv), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dv_row_stride, _1{})); typename Kernel_traits::GmemTiledCopydKV gmem_tiled_copy_dKV; @@ -780,7 +791,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKgdK))); #pragma unroll - for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(0, 0, k)) < params.d; } + for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(0, 0, k)) < params.vd; } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( gmem_tiled_copy_dKV, tdKrdK, tdKgdK, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN diff --git a/csrc/flash_attn/src/flash_bwd_launch_template.h b/csrc/flash_attn/src/flash_bwd_launch_template.h index 727d87e93..362b07982 100644 --- a/csrc/flash_attn/src/flash_bwd_launch_template.h +++ b/csrc/flash_attn/src/flash_bwd_launch_template.h @@ -87,7 +87,7 @@ void run_flash_bwd_seqk_parallel(Flash_bwd_params ¶ms, cudaStream_t stream) // We want to specialize to is_even_MN and not just is_even_M, since in the case where N is not // a multiple of kBlockN, we'll need to apply mask in the loop. 
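For orientation, a tiny NumPy sketch of the underlying math (not of the kernel code; bwd_dk_dv_reference is a hypothetical name) showing why dK is bounds-checked in the QK head dimension (params.d) while dV is bounds-checked in the V head dimension (params.vd) in the epilogues above:

import numpy as np

def bwd_dk_dv_reference(q, do, p, ds):
    # q: (seqlen_q, qk_headdim), do: (seqlen_q, v_headdim)
    # p, ds: (seqlen_q, seqlen_k) -- attention probabilities and their gradient term
    dv = p.T @ do   # (seqlen_k, v_headdim)  -> bounded by params.vd
    dk = ds.T @ q   # (seqlen_k, qk_headdim) -> bounded by params.d
    return dk, dv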
const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_q % Kernel_traits::kBlockM == 0 && params.seqlen_k % Kernel_traits::kBlockN == 0; - const bool is_even_K = params.d == Kernel_traits::kHeadDim; + const bool is_even_K = (params.d == Kernel_traits::kQKHeadDim && params.vd == Kernel_traits::kVHeadDim);//TODO check if this is correct constexpr int smem_size_dq_dk_dv = Kernel_traits::kSmemSize1colblock; // printf("smem_size_dq_dk_dv = %d\n", smem_size_dq_dk_dv); BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] { @@ -129,8 +129,9 @@ void run_flash_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { } template -void run_mha_bwd_hdim32(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 32; +void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -140,21 +141,27 @@ void run_mha_bwd_hdim32(Flash_bwd_params ¶ms, cudaStream_t stream) { C10_CUDA_CHECK(status_); } DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB + constexpr static int Br = 128; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB + if (max_smem_per_block >= 104 * 1024) { // 104 KB if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } } else { // 96 KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } template -void run_mha_bwd_hdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 64; +void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -171,14 +178,19 @@ void run_mha_bwd_hdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { // run_flash_bwd>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
+ constexpr static int Br = 128; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); // This has a lot of register spilling // run_flash_bwd, Is_dropout>(params, stream); } else { // if (params.h == params.h_k) { // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); // } else { @@ -198,8 +210,9 @@ void run_mha_bwd_hdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { } template -void run_mha_bwd_hdim96(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 96; +void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -210,22 +223,27 @@ void run_mha_bwd_hdim96(Flash_bwd_params ¶ms, cudaStream_t stream) { } // printf("max_smem_per_block = %d\n", max_smem_per_block); DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); if (max_smem_per_block >= 116 * 1024) { if constexpr(!Is_dropout) { // 92KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { // 116 KB // This is faster for dropout since we don't have many registers to spare - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } template -void run_mha_bwd_hdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 128; +void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VKHeaddim = 256; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -236,12 +254,16 @@ void run_mha_bwd_hdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { } // printf("max_smem_per_block = %d\n", max_smem_per_block); DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); // run_flash_bwd>(params, stream); // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. 
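As a sanity check, the tile arithmetic below reproduces the compute_sm.py helper added at the top of this patch for a Br=64, Bc=128 tile with QK head dim 128 and V head dim 256, the configuration this launcher targets (plain arithmetic only, assuming 2-byte fp16/bf16 elements):

smem_bytes = 2 * (64 * 128 * 2    # Q, double-buffered
                  + 64 * 256      # dO
                  + 128 * 128     # K, dK
                  + 128 * 256     # V, dV
                  + 64 * 128 * 2) # dS, P
print(smem_bytes, smem_bytes / 1024)  # 196608 bytes, 192.0 KB -- the same value compute_sm.py prints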
// run_flash_bwd>(params, stream); if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); @@ -249,14 +271,14 @@ void run_mha_bwd_hdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { // run_flash_bwd, Is_dropout>(params, stream); } else { // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } // run_flash_bwd>(params, stream); // run_flash_bwd>(params, stream); }); } - +/* template void run_mha_bwd_hdim160(Flash_bwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 160; @@ -320,3 +342,4 @@ void run_mha_bwd_hdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { } }); } +*/ \ No newline at end of file diff --git a/csrc/flash_attn/src/flash_bwd_preprocess_kernel.h b/csrc/flash_attn/src/flash_bwd_preprocess_kernel.h index c8e307417..408652342 100644 --- a/csrc/flash_attn/src/flash_bwd_preprocess_kernel.h +++ b/csrc/flash_attn/src/flash_bwd_preprocess_kernel.h @@ -68,7 +68,8 @@ inline __device__ void compute_dot_do_o(const Params ¶ms) { const int tidx = threadIdx.x; constexpr int kBlockM = Kernel_traits::kBlockM; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; const BlockInfo binfo(params, bidb); if (m_block * kBlockM >= binfo.actual_seqlen_q) return; @@ -83,13 +84,13 @@ inline __device__ void compute_dot_do_o(const Params ¶ms) { const index_t row_offset_dpsum = (params.unpadded_lse ? 
(bidh * (params.total_q + 128 * params.b) + binfo.q_offset(params.seqlen_q_rounded, 1, bidb) + 128 * bidb): (bidb * params.h + bidh) * params.seqlen_q_rounded) + m_block * kBlockM; Tensor gdO = make_tensor(make_gmem_ptr(reinterpret_cast(params.do_ptr) + row_offset_do), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.do_row_stride, _1{})); Tensor gO = make_tensor(make_gmem_ptr(reinterpret_cast(params.o_ptr) + row_offset_o), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.o_row_stride, _1{})); Tensor gdQaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dq_accum_ptr) + row_offset_dq_accum), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.h * params.d_rounded, _1{})); Tensor dP_sum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dsoftmax_sum) + row_offset_dpsum), Shape>{}, Stride<_1>{}); @@ -105,14 +106,14 @@ inline __device__ void compute_dot_do_o(const Params ¶ms) { Tensor tdOgO = gmem_thr_copy_dO.partition_S(gO); Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum); - Tensor cdO = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor cdO = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) Tensor tdOcdO = gmem_thr_copy_dO.partition_S(cdO); // Allocate predicate tensors for k Tensor tdOpdO = make_tensor(make_shape(size<2>(tdOgdO))); // Set predicates for k bounds #pragma unroll - for (int k = 0; k < size(tdOpdO); ++k) {tdOpdO(k) = get<1>(tdOcdO(0, 0, k)) < params.d;} + for (int k = 0; k < size(tdOpdO); ++k) {tdOpdO(k) = get<1>(tdOcdO(0, 0, k)) < params.vd;} Tensor tdOrdO = make_fragment_like(tdOgdO); Tensor tdOrO = make_fragment_like(tdOgO); @@ -152,17 +153,19 @@ inline __device__ void clear_dKVaccum(const Params ¶ms) { const int tidx = threadIdx.x; constexpr int kBlockN = Kernel_traits::kBlockN; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; const BlockInfo binfo(params, bidb); if (n_block * kBlockN >= binfo.actual_seqlen_k) return; - const index_t row_offset_dkv_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + n_block * kBlockN) * params.d_rounded; + const index_t row_offset_dk_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + n_block * kBlockN) * params.d_rounded; + const index_t row_offset_dv_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + n_block * kBlockN) * params.vd_rounded; - Tensor gdKaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_accum_ptr) + row_offset_dkv_accum), - Shape, Int>{}, Stride, _1>{}); - Tensor gdVaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_accum_ptr) + row_offset_dkv_accum), - Shape, Int>{}, Stride, _1>{}); + Tensor gdKaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_accum_ptr) + row_offset_dk_accum), + Shape, Int>{}, Stride, _1>{}); + Tensor gdVaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_accum_ptr) + row_offset_dv_accum), + Shape, Int>{}, Stride, _1>{}); typename Kernel_traits::GmemTiledCopydQaccum gmem_tiled_copy_dKVaccum; auto gmem_thr_copy_dKVaccum = gmem_tiled_copy_dKVaccum.get_thread_slice(tidx); @@ -196,7 +199,7 @@ inline __device__ void convert_dQ(const Params ¶ms, const int nsplits) { const int tidx = threadIdx.x; constexpr int kBlockM = Kernel_traits::kBlockM; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; const BlockInfo binfo(params, bidb); if (m_block * kBlockM >= binfo.actual_seqlen_q) return; 
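The compute_dot_do_o preprocessing in this file reduces dO against O over the V head dimension, which is why its bounds predicate above now uses params.vd. A NumPy sketch of the quantity it produces (dot_do_o_reference is an illustrative name; the per-row dropout rescaling applied by the kernel is omitted here):

import numpy as np

def dot_do_o_reference(do, o):
    # do, o: (seqlen_q, v_headdim); returns dP_sum with one value per query row.
    return (do * o).sum(axis=-1)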
@@ -207,10 +210,10 @@ inline __device__ void convert_dQ(const Params ¶ms, const int nsplits) { + (m_block * kBlockM + (params.cu_seqlens_q == nullptr ? 0 : 128 * bidb)) * params.h * params.d_rounded + bidh * params.d_rounded; Tensor gdQ = make_tensor(make_gmem_ptr(reinterpret_cast(params.dq_ptr) + row_offset_dq), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dq_row_stride, _1{})); Tensor gdQaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dq_accum_ptr) + row_offset_dq_accum), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.h * params.d_rounded, _1{})); Tensor sdQ = make_tensor(make_smem_ptr(reinterpret_cast(smem_)), @@ -230,7 +233,7 @@ inline __device__ void convert_dQ(const Params ¶ms, const int nsplits) { Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ); Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_S(gdQaccum); - Tensor acc_dq = partition_fragment_C(tiled_mma_dq, Shape, Int>{}); // MMA, MMA_N, MMA_K + Tensor acc_dq = partition_fragment_C(tiled_mma_dq, Shape, Int>{}); // MMA, MMA_N, MMA_K CUTE_STATIC_ASSERT_V(size(acc_dq) == size(tdQgdQaccum)); Tensor tdQrdQaccum = make_fragment_like(tdQgdQaccum); @@ -251,7 +254,7 @@ inline __device__ void convert_dQ(const Params ¶ms, const int nsplits) { Tensor tdQrdQ = make_tensor(shape(tdQgdQ)); cute::copy(gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ); - Tensor cdQ = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor cdQ = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(cdQ); Tensor tdQpdQ = make_tensor(make_shape(size<2>(tdQgdQ))); #pragma unroll @@ -284,7 +287,8 @@ inline __device__ void convert_dKV(const Params ¶ms) { const int tidx = threadIdx.x; constexpr int kBlockN = Kernel_traits::kBlockN; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; const BlockInfo binfo(params, bidb); if (n_block * kBlockN >= binfo.actual_seqlen_k) return; @@ -293,21 +297,23 @@ inline __device__ void convert_dKV(const Params ¶ms) { + n_block * kBlockN * params.dk_row_stride + bidh * params.dk_head_stride; const index_t row_offset_dv = binfo.k_offset(params.dv_batch_stride, params.dv_row_stride, bidb) + n_block * kBlockN * params.dv_row_stride + bidh * params.dv_head_stride; - const index_t row_offset_dkv_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + const index_t row_offset_dk_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + n_block * kBlockN) * params.d_rounded; + const index_t row_offset_dv_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + + n_block * kBlockN) * params.vd_rounded; Tensor gdK = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_ptr) + row_offset_dk), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dk_row_stride, _1{})); Tensor gdV = make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_ptr) + row_offset_dv), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.dv_row_stride, _1{})); - Tensor gdKaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_accum_ptr) + row_offset_dkv_accum), - Shape, Int>{}, - Stride, _1>{}); - Tensor gdVaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_accum_ptr) + row_offset_dkv_accum), - Shape, Int>{}, - Stride, _1>{}); + Tensor gdKaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.dk_accum_ptr) + row_offset_dk_accum), + Shape, Int>{}, + Stride, _1>{}); + Tensor gdVaccum = 
make_tensor(make_gmem_ptr(reinterpret_cast(params.dv_accum_ptr) + row_offset_dv_accum), + Shape, Int>{}, + Stride, _1>{}); Tensor sdK = make_tensor(make_smem_ptr(reinterpret_cast(smem_)), typename Kernel_traits::SmemLayoutdKV{}); @@ -331,8 +337,8 @@ inline __device__ void convert_dKV(const Params ¶ms) { Tensor tdKgdKaccum = gmem_thr_copy_dKVaccum.partition_S(gdKaccum); Tensor tdVgdVaccum = gmem_thr_copy_dKVaccum.partition_S(gdVaccum); - Tensor acc_dk = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K - Tensor acc_dv = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K + Tensor acc_dk = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K + Tensor acc_dv = partition_fragment_C(tiled_mma_dkv, Shape, Int>{}); // MMA, MMA_N, MMA_K CUTE_STATIC_ASSERT_V(size(acc_dk) == size(tdKgdKaccum)); CUTE_STATIC_ASSERT_V(size(acc_dv) == size(tdVgdVaccum)); @@ -361,17 +367,22 @@ inline __device__ void convert_dKV(const Params ¶ms) { cute::copy(gmem_tiled_copy_dKV, tdKsdK, tdKrdK); cute::copy(gmem_tiled_copy_dKV, tdVsdV, tdVrdV); - Tensor cdKV = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) - Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); - Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKgdK))); + Tensor cdK= make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor cdV = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor tdKcdK = gmem_thr_copy_dKV.partition_D(cdK); + Tensor tdVcdV = gmem_thr_copy_dKV.partition_D(cdV); + Tensor tdKpdK = make_tensor(make_shape(size<2>(tdKgdK))); + Tensor tdVpdV = make_tensor(make_shape(size<2>(tdVgdV))); #pragma unroll - for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(0, 0, k)) < params.d; } + for (int k = 0; k < size(tdKpdK); ++k) { tdKpdK(k) = get<1>(tdKcdK(0, 0, k)) < params.d; } + #pragma unroll + for (int k = 0; k < size(tdVpdV); ++k) { tdVpdV(k) = get<1>(tdVcdV(0, 0, k)) < params.vd; } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( - gmem_tiled_copy_dKV, tdKrdK, tdKgdK, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_dKV, tdKrdK, tdKgdK, tdKcdK, tdKpdK, binfo.actual_seqlen_k - n_block * kBlockN ); flash::copy( - gmem_tiled_copy_dKV, tdVrdV, tdVgdV, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_dKV, tdVrdV, tdVgdV, tdVcdV, tdVpdV, binfo.actual_seqlen_k - n_block * kBlockN ); } diff --git a/csrc/flash_attn/src/flash_fwd_kernel.h b/csrc/flash_attn/src/flash_fwd_kernel.h index 788f3790e..655e81427 100644 --- a/csrc/flash_attn/src/flash_fwd_kernel.h +++ b/csrc/flash_attn/src/flash_fwd_kernel.h @@ -60,7 +60,8 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi constexpr int kBlockM = Kernel_traits::kBlockM; constexpr int kBlockN = Kernel_traits::kBlockN; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; constexpr int kNWarps = Kernel_traits::kNWarps; auto seed_offset = at::cuda::philox::unpack(params.philox_args); @@ -91,9 +92,9 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi if ((Is_causal || Is_local || !Is_even_MN) && n_block_max <= n_block_min) { Tensor mO = make_tensor(make_gmem_ptr(reinterpret_cast(params.o_ptr) + binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb)), - make_shape(binfo.actual_seqlen_q, params.h, 
params.d), + make_shape(binfo.actual_seqlen_q, params.h, params.vd), make_stride(params.o_row_stride, params.o_head_stride, _1{})); - Tensor gO = local_tile(mO(_, bidh, _), Shape, Int>{}, + Tensor gO = local_tile(mO(_, bidh, _), Shape, Int>{}, make_coord(m_block, 0)); // (kBlockM, kHeadDim) Tensor gLSE = get_lse_tile(params, bidb, bidh, m_block, binfo); @@ -110,7 +111,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); if (!Is_even_K) { #pragma unroll - for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d; } + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.vd; } } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( @@ -136,19 +137,19 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi + binfo.q_offset(params.q_batch_stride, params.q_row_stride, bidb)), make_shape(binfo.actual_seqlen_q, params.h, params.d), make_stride(params.q_row_stride, params.q_head_stride, _1{})); - Tensor gQ = local_tile(mQ(_, bidh, _), Shape, Int>{}, + Tensor gQ = local_tile(mQ(_, bidh, _), Shape, Int>{}, make_coord(m_block, 0)); // (kBlockM, kHeadDim) Tensor mK = make_tensor(make_gmem_ptr(reinterpret_cast(params.k_ptr) + binfo.k_offset(params.k_batch_stride, params.k_row_stride, bidb)), make_shape(binfo.actual_seqlen_k, params.h_k, params.d), make_stride(params.k_row_stride, params.k_head_stride, _1{})); - Tensor gK = local_tile(mK(_, bidh / params.h_h_k_ratio, _), Shape, Int>{}, + Tensor gK = local_tile(mK(_, bidh / params.h_h_k_ratio, _), Shape, Int>{}, make_coord(_, 0)); // (kBlockN, kHeadDim, nblocksN) Tensor mV = make_tensor(make_gmem_ptr(reinterpret_cast(params.v_ptr) + binfo.k_offset(params.v_batch_stride, params.v_row_stride, bidb)), - make_shape(binfo.actual_seqlen_k, params.h_k, params.d), + make_shape(binfo.actual_seqlen_k, params.h_k, params.vd), make_stride(params.v_row_stride, params.v_head_stride, _1{})); - Tensor gV = local_tile(mV(_, bidh / params.h_h_k_ratio, _), Shape, Int>{}, + Tensor gV = local_tile(mV(_, bidh / params.h_h_k_ratio, _), Shape, Int>{}, make_coord(_, 0)); // (kBlockN, kHeadDim, nblocksN) Tensor gP = make_tensor(make_gmem_ptr(reinterpret_cast(params.p_ptr) + row_offset_p), Shape, Int>{}, @@ -158,8 +159,8 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi typename Kernel_traits::SmemLayoutQ{}); // Careful we're using the same smem for sQ and sK | sV if Share_Q_K_smem; Tensor sK = make_tensor(sQ.data() + (Kernel_traits::Share_Q_K_smem ? 
0 : size(sQ)), - typename Kernel_traits::SmemLayoutKV{}); - Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutKV{}); + typename Kernel_traits::SmemLayoutK{}); + Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutV{}); Tensor sVt = make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{}); Tensor sVtNoSwizzle = make_tensor(sV.data().get(), typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{}); @@ -181,7 +182,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi Tensor tSgS = thr_mma.partition_C(gP); - Tensor acc_o = partition_fragment_C(tiled_mma, Shape, Int>{}); // MMA, MMA_M, MMA_K + Tensor acc_o = partition_fragment_C(tiled_mma, Shape, Int>{}); // MMA, MMA_M, MMA_K // // Copy Atom retiling @@ -211,7 +212,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi // Construct identity layout for sQ and sK Tensor cQ = make_identity_tensor(make_shape(size<0>(sQ), size<1>(sQ))); // (BLK_M,BLK_K) -> (blk_m,blk_k) - Tensor cKV = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor cK = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) // Tensor tScQ = thr_mma.partition_A(cQ); // (MMA,MMA_M,MMA_K) // if (cute::thread0()) { // print(tScQ.layout()); printf("\n"); @@ -227,18 +228,18 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi // Repeat the partitioning with identity layouts Tensor tQcQ = gmem_thr_copy_QKV.partition_S(cQ); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k) - Tensor tKVcKV = gmem_thr_copy_QKV.partition_S(cKV); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) + Tensor tKcK = gmem_thr_copy_QKV.partition_S(cK); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) // Allocate predicate tensors for k Tensor tQpQ = make_tensor(make_shape(size<2>(tQsQ))); - Tensor tKVpKV = make_tensor(make_shape(size<2>(tKsK))); + Tensor tKpK = make_tensor(make_shape(size<2>(tKsK))); // Set predicates for k bounds if (!Is_even_K) { #pragma unroll for (int k = 0; k < size(tQpQ); ++k) { tQpQ(k) = get<1>(tQcQ(0, 0, k)) < params.d; } #pragma unroll - for (int k = 0; k < size(tKVpKV); ++k) { tKVpKV(k) = get<1>(tKVcKV(0, 0, k)) < params.d; } + for (int k = 0; k < size(tKpK); ++k) { tKpK(k) = get<1>(tKcK(0, 0, k)) < params.d; } } // Prologue @@ -263,7 +264,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi int n_block = n_block_max - 1; // We don't need to clear the sK smem tiles since we'll mask out the scores anyway. - flash::copy(gmem_tiled_copy_QKV, tKgK(_, _, _, n_block), tKsK, tKVcKV, tKVpKV, + flash::copy(gmem_tiled_copy_QKV, tKgK(_, _, _, n_block), tKsK, tKcK, tKpK, binfo.actual_seqlen_k - n_block * kBlockN); cute::cp_async_fence(); // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z < 2) { print(tKgK); } @@ -284,6 +285,15 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi const float alibi_slope = !Has_alibi || params.alibi_slopes_ptr == nullptr ? 
0.0f : reinterpret_cast(params.alibi_slopes_ptr)[bidb * params.alibi_slopes_batch_stride + bidh] / params.scale_softmax; flash::Mask mask(binfo.actual_seqlen_k, binfo.actual_seqlen_q, params.window_size_left, params.window_size_right, alibi_slope); + Tensor cV = make_identity_tensor(make_shape(size<0>(sV), size<1>(sV))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor tVcV = gmem_thr_copy_QKV.partition_S(cV); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) + Tensor tVpV = make_tensor(make_shape(size<2>(tVsV))); + // Set predicates for k bounds + if (!Is_even_K) { + #pragma unroll + for (int k = 0; k < size(tVpV); ++k) { tVpV(k) = get<1>(tVcV(0, 0, k)) < params.vd; } + } + // For performance reason, we separate out two kinds of iterations: // those that need masking on S, and those that don't. // We need masking on S for the very last block when K and V has length not multiple of kBlockN. @@ -304,11 +314,11 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi // Advance gV if (masking_step > 0) { - flash::copy(gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tVcV, tVpV); } else { // Clear the smem tiles to account for predicated off loads flash::copy( - gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tVcV, tVpV, binfo.actual_seqlen_k - n_block * kBlockN ); } cute::cp_async_fence(); @@ -329,7 +339,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi flash::cp_async_wait<0>(); __syncthreads(); if (n_block > n_block_min) { - flash::copy(gmem_tiled_copy_QKV, tKgK(_, _, _, n_block - 1), tKsK, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tKgK(_, _, _, n_block - 1), tKsK, tKcK, tKpK); // This cp_async_fence needs to be in the if block, otherwise the synchronization // isn't right and we get race conditions. cute::cp_async_fence(); @@ -377,7 +387,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi clear(acc_s); flash::cp_async_wait<0>(); __syncthreads(); - flash::copy(gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tVcV, tVpV); cute::cp_async_fence(); flash::gemm( @@ -391,7 +401,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi flash::cp_async_wait<0>(); __syncthreads(); if (n_block > n_block_min) { - flash::copy(gmem_tiled_copy_QKV, tKgK(_, _, _, n_block - 1), tKsK, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tKgK(_, _, _, n_block - 1), tKsK, tKcK, tKpK); // This cp_async_fence needs to be in the if block, otherwise the synchronization // isn't right and we get race conditions. 
cute::cp_async_fence(); @@ -445,9 +455,9 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi Tensor mO = make_tensor(make_gmem_ptr(reinterpret_cast(params.o_ptr) + binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb)), - make_shape(binfo.actual_seqlen_q, params.h, params.d), + make_shape(binfo.actual_seqlen_q, params.h, params.vd), make_stride(params.o_row_stride, params.o_head_stride, _1{})); - Tensor gO = local_tile(mO(_, bidh, _), Shape, Int>{}, + Tensor gO = local_tile(mO(_, bidh, _), Shape, Int>{}, make_coord(m_block, 0)); // (kBlockM, kHeadDim) Tensor gLSE = get_lse_tile(params, bidb, bidh, m_block, binfo); @@ -461,7 +471,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi Tensor tOrO = make_tensor(shape(tOgO)); cute::copy(gmem_tiled_copy_O, tOsO, tOrO); - Tensor caccO = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor caccO = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) Tensor taccOcO = thr_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) static_assert(decltype(size<0>(taccOcO))::value == 4); // Convert to ((2, 2), MMA_M, MMA_K) then take only the row indices. @@ -482,7 +492,7 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); if (!Is_even_K) { #pragma unroll - for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d; } + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.vd; } } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( @@ -507,7 +517,8 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons constexpr int kBlockM = Kernel_traits::kBlockM; constexpr int kBlockN = Kernel_traits::kBlockN; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; constexpr int kNWarps = Kernel_traits::kNWarps; using GmemTiledCopyO = std::conditional_t< @@ -538,11 +549,11 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb) + m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride; const index_t row_offset_oaccum = (((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q - + m_block * kBlockM) * params.d_rounded; + + m_block * kBlockM) * params.vd_rounded; const index_t row_offset_lseaccum = ((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q + m_block * kBlockM; Tensor gOaccum = make_tensor(make_gmem_ptr(reinterpret_cast(Split ? params.oaccum_ptr : params.o_ptr) + (Split ? row_offset_oaccum : row_offset_o)), - Shape, Int>{}, - make_stride(Split ? kHeadDim : params.o_row_stride, _1{})); + Shape, Int>{}, + make_stride(Split ? kVHeadDim : params.o_row_stride, _1{})); Tensor gLSEaccum = make_tensor(make_gmem_ptr(reinterpret_cast(Split ? 
params.softmax_lseaccum_ptr : params.softmax_lse_ptr) + row_offset_lseaccum), Shape>{}, Stride<_1>{}); @@ -558,7 +569,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons Tensor tOpO = make_tensor(make_shape(size<2>(tOgOaccum))); if (!Is_even_K) { #pragma unroll - for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d; } + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.vd; } } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( @@ -593,20 +604,20 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons Tensor mQ = make_tensor(make_gmem_ptr(reinterpret_cast(params.q_ptr) + binfo.q_offset(params.q_batch_stride, params.q_row_stride, bidb)), make_shape(binfo.actual_seqlen_q, params.h, params.d), make_stride(params.q_row_stride, params.q_head_stride, _1{})); - Tensor gQ = local_tile(mQ(_, bidh, _), Shape, Int>{}, + Tensor gQ = local_tile(mQ(_, bidh, _), Shape, Int>{}, make_coord(m_block, 0)); // (kBlockM, kHeadDim) Tensor gK = make_tensor(make_gmem_ptr(reinterpret_cast(params.k_ptr) + row_offset_k), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.k_row_stride, _1{})); // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) { printf("k_ptr = %p, row_offset_k = %d, gK_ptr = %p\n", params.k_ptr, row_offset_k, gK.data()); } Tensor gV = make_tensor(make_gmem_ptr(reinterpret_cast(params.v_ptr) + row_offset_v), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.v_row_stride, _1{})); Tensor sQ = make_tensor(make_smem_ptr(reinterpret_cast(smem_)), typename Kernel_traits::SmemLayoutQ{}); - Tensor sK = make_tensor(sQ.data() + size(sQ), typename Kernel_traits::SmemLayoutKV{}); - Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutKV{}); + Tensor sK = make_tensor(sQ.data() + size(sQ), typename Kernel_traits::SmemLayoutK{}); + Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutV{}); Tensor sVt = make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{}); Tensor sVtNoSwizzle = make_tensor(sV.data().get(), typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{}); @@ -626,7 +637,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons Tensor tSrK = thr_mma.partition_fragment_B(sK); // (MMA,MMA_N,MMA_K) Tensor tOrVt = thr_mma.partition_fragment_B(sVtNoSwizzle); // (MMA, MMA_K,MMA_N) - Tensor acc_o = partition_fragment_C(tiled_mma, Shape, Int>{}); // MMA, MMA_M, MMA_K + Tensor acc_o = partition_fragment_C(tiled_mma, Shape, Int>{}); // MMA, MMA_M, MMA_K // // Copy Atom retiling @@ -653,22 +664,26 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons // Construct identity layout for sQ and sK Tensor cQ = make_identity_tensor(make_shape(size<0>(sQ), size<1>(sQ))); // (BLK_M,BLK_K) -> (blk_m,blk_k) - Tensor cKV = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor cK = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + Tensor cV = make_identity_tensor(make_shape(size<0>(sV), size<1>(sV))); // (BLK_N,BLK_K) -> (blk_n,blk_k) // Repeat the partitioning with identity layouts Tensor tQcQ = gmem_thr_copy_QKV.partition_S(cQ); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k) - Tensor tKVcKV = gmem_thr_copy_QKV.partition_S(cKV); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) - + Tensor tKcK = gmem_thr_copy_QKV.partition_S(cK); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) + 
Tensor tVcV = gmem_thr_copy_QKV.partition_S(cV); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) // Allocate predicate tensors for k Tensor tQpQ = make_tensor(make_shape(size<2>(tQsQ))); - Tensor tKVpKV = make_tensor(make_shape(size<2>(tKsK))); + Tensor tKpK = make_tensor(make_shape(size<2>(tKsK))); + Tensor tVpV = make_tensor(make_shape(size<2>(tVsV))); // Set predicates for k bounds if (!Is_even_K) { #pragma unroll for (int k = 0; k < size(tQpQ); ++k) { tQpQ(k) = get<1>(tQcQ(0, 0, k)) < params.d; } #pragma unroll - for (int k = 0; k < size(tKVpKV); ++k) { tKVpKV(k) = get<1>(tKVcKV(0, 0, k)) < params.d; } + for (int k = 0; k < size(tKpK); ++k) { tKpK(k) = get<1>(tKcK(0, 0, k)) < params.d; } + #pragma unroll + for (int k = 0; k < size(tVpV); ++k) { tVpV(k) = get<1>(tVcV(0, 0, k)) < params.vd; } } // Prologue @@ -684,16 +699,16 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons // We want to do this so that all threadblocks can proceed right after they finish writing the KV cache. const index_t row_offset_cossin = ((n_block_max - 1) * kBlockN + (params.leftpad_k == nullptr ? 0 : params.leftpad_k[bidb])) * (params.rotary_dim / 2); Tensor gCos = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_cos_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.rotary_dim / 2, _1{})); Tensor gSin = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_sin_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.rotary_dim / 2, _1{})); Tensor gCosCont = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_cos_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.rotary_dim / 2, _1{})); Tensor gSinCont = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_sin_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.rotary_dim / 2, _1{})); Tensor tRgCos = gmem_thr_copy_rotary.partition_S(gCos); Tensor tRgSin = gmem_thr_copy_rotary.partition_S(gSin); @@ -714,12 +729,12 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons // This maps to accessing the first 64 rows of knew_ptr. 
Tensor gKnew = make_tensor(make_gmem_ptr(reinterpret_cast(params.knew_ptr) + row_offset_knew - binfo.seqlen_k_cache * params.knew_row_stride), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.knew_row_stride, _1{})); // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) { printf("knew_ptr = %p, row_offset_knew = %d, gKnew_ptr = %p\n", params.knew_ptr, row_offset_knew, gKnew.data()); } Tensor gVnew = make_tensor(make_gmem_ptr(reinterpret_cast(params.vnew_ptr) + row_offset_vnew - binfo.seqlen_k_cache * params.vnew_row_stride), - Shape, Int>{}, + Shape, Int>{}, make_stride(params.vnew_row_stride, _1{})); Tensor tKgKnew = gmem_thr_copy_QKV.partition_S(gKnew); // (KCPY, KCPY_N, KCPY_K) Tensor tVgVnew = gmem_thr_copy_QKV.partition_S(gVnew); // (VCPY, VCPY_N, VCPY_K) @@ -729,18 +744,18 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons auto tVgV_data = tVgV.data(); for (int n_block = n_block_max - 1; n_block >= n_block_copy_min; n_block--) { flash::copy_w_min_idx( - tVgVnew, tVgV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN, binfo.seqlen_k_cache - n_block * kBlockN + tVgVnew, tVgV, tVcV, tVpV, binfo.actual_seqlen_k - n_block * kBlockN, binfo.seqlen_k_cache - n_block * kBlockN ); tVgVnew.data() = tVgVnew.data() + (-int(kBlockN * params.vnew_row_stride)); if (params.rotary_dim == 0) { flash::copy_w_min_idx( - tKgKnew, tKgK, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN, binfo.seqlen_k_cache - n_block * kBlockN + tKgKnew, tKgK, tKcK, tKpK, binfo.actual_seqlen_k - n_block * kBlockN, binfo.seqlen_k_cache - n_block * kBlockN ); } else { if (params.is_rotary_interleaved) { // Don't clear OOB_K because we're writing to global memory flash::copy_rotary_interleaved( - tKgKnew, tKgK, tRgCos, tRgSin, tKVcKV, binfo.actual_seqlen_k - n_block * kBlockN, + tKgKnew, tKgK, tRgCos, tRgSin, tKcK, binfo.actual_seqlen_k - n_block * kBlockN, binfo.seqlen_k_cache - n_block * kBlockN, params.d, params.rotary_dim ); tRgCos.data() = tRgCos.data() + (-int(kBlockN * params.rotary_dim / 2)); @@ -748,7 +763,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons } else { // Don't clear OOB_K because we're writing to global memory flash::copy_rotary_contiguous( - tKgKnew, tKgK, tRgCosCont, tRgSinCont, tKVcKV, binfo.actual_seqlen_k - n_block * kBlockN, + tKgKnew, tKgK, tRgCosCont, tRgSinCont, tKcK, binfo.actual_seqlen_k - n_block * kBlockN, binfo.seqlen_k_cache - n_block * kBlockN, params.d, params.rotary_dim ); tRgCosCont.data() = tRgCosCont.data() + (-int(kBlockN * params.rotary_dim / 2)); @@ -789,16 +804,16 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons // If not causal, all the queries get the same the cos/sin, taken at location seqlen_k_cache. // We do this by setting the row stride of gCos / gSin to 0. Tensor gCos = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_cos_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{})); Tensor gSin = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_sin_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{})); Tensor gCosCont = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_cos_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(Is_causal || Is_local ? 
params.rotary_dim / 2 : 0, _1{})); Tensor gSinCont = make_tensor(make_gmem_ptr(reinterpret_cast(params.rotary_sin_ptr) + row_offset_cossin), - Shape, Int>{}, + Shape, Int>{}, make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{})); Tensor tRgCos = gmem_thr_copy_rotary.partition_S(gCos); Tensor tRgSin = gmem_thr_copy_rotary.partition_S(gSin); @@ -864,11 +879,11 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons const int block_table_offset_next = n_block * kBlockN - block_table_idx_next * params.page_block_size; tVgV.data() = tVgV.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.v_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.v_row_stride; } - flash::copy(gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tVgV, tVsV, tVcV, tVpV); } else { // Clear the smem tiles to account for predicated off loads flash::copy( - gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN + gmem_tiled_copy_QKV, tVgV, tVsV, tVcV, tVpV, binfo.actual_seqlen_k - n_block * kBlockN ); } cute::cp_async_fence(); @@ -903,7 +918,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons const int block_table_offset_next =(n_block - 1) * kBlockN - block_table_idx_next * params.page_block_size; tKgK.data() = tKgK.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.k_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.k_row_stride; } - flash::copy(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tKgK, tKsK, tKcK, tKpK); // This cp_async_fence needs to be in the if block, otherwise the synchronization // isn't right and we get race conditions. cute::cp_async_fence(); @@ -946,7 +961,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons const int block_table_offset_next = n_block * kBlockN - block_table_idx_next * params.page_block_size; tVgV.data() = tVgV.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.v_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.v_row_stride; } - flash::copy(gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tVgV, tVsV, tVcV, tVpV); cute::cp_async_fence(); flash::gemm( @@ -970,7 +985,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons const int block_table_offset_next = (n_block - 1) * kBlockN - block_table_idx_next * params.page_block_size; tKgK.data() = tKgK.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.k_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.k_row_stride; } - flash::copy(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV); + flash::copy(gmem_tiled_copy_QKV, tKgK, tKsK, tKcK, tKpK); // This cp_async_fence needs to be in the if block, otherwise the synchronization // isn't right and we get race conditions. 
cute::cp_async_fence(); @@ -1016,14 +1031,14 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb) + m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride; const index_t row_offset_oaccum = (((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q - + m_block * kBlockM) * params.d_rounded; + + m_block * kBlockM) * params.vd_rounded; const index_t row_offset_lseaccum = (Split || !params.unpadded_lse ? ((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q : bidh * params.total_q + binfo.q_offset(params.seqlen_q, 1, bidb) ) + m_block * kBlockM; Tensor gOaccum = make_tensor(make_gmem_ptr(reinterpret_cast(Split ? params.oaccum_ptr : params.o_ptr) + (Split ? row_offset_oaccum : row_offset_o)), - Shape, Int>{}, - make_stride(Split ? kHeadDim : params.o_row_stride, _1{})); + Shape, Int>{}, + make_stride(Split ? kVHeadDim : params.o_row_stride, _1{})); Tensor gLSEaccum = make_tensor(make_gmem_ptr(reinterpret_cast(Split ? params.softmax_lseaccum_ptr : params.softmax_lse_ptr) + row_offset_lseaccum), Shape>{}, Stride<_1>{}); // if (tidx == 0) { printf("row_offset_o = %d, bidh = %d, gOaccum = %p\n", row_offset_o, bidh, gOaccum.data()); } @@ -1038,7 +1053,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons Tensor tOrOaccum = make_tensor(shape(tOgOaccum)); cute::copy(gmem_tiled_copy_Oaccum, tOsOaccum, tOrOaccum); - Tensor caccO = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor caccO = make_identity_tensor(Shape, Int>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) Tensor taccOcO = thr_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) static_assert(decltype(size<0>(taccOcO))::value == 4); // Convert to ((2, 2), MMA_M, MMA_K) then take only the row indices. 
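In the split-KV path above, each row of the per-split output accumulator is vd_rounded elements wide, so both the block offset and the per-split pointer advance scale with the value head dimension rather than the QK head dimension. A small Python sketch of those two expressions, assuming a params object with the same fields (b, h, seqlen_q, vd_rounded); the function names are illustrative only, not part of the patch.

def oaccum_offset(n_split_idx, bidb, bidh, m_block, kBlockM, params):
    # Row index into the per-split O accumulator, scaled by the V head dim (rounded).
    row = ((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q \
          + m_block * kBlockM
    return row * params.vd_rounded

def split_advance(params):
    # Pointer advance between consecutive splits when combining partial outputs.
    return params.b * params.h * params.seqlen_q * params.vd_rounded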
@@ -1059,7 +1074,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons Tensor tOpO = make_tensor(make_shape(size<2>(tOgOaccum))); if (!Is_even_K) { #pragma unroll - for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d; } + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.vd; } } // Clear_OOB_K must be false since we don't want to write zeros to gmem flash::copy( @@ -1110,7 +1125,8 @@ inline __device__ void combine_attn_seqk_parallel(const Params ¶ms) { using ElementAccum = typename Kernel_traits::ElementAccum; using index_t = typename Kernel_traits::index_t; constexpr int kMaxSplits = 1 << Log_max_splits; - constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kQKHeadDim = Kernel_traits::kQKHeadDim; + constexpr int kVHeadDim = Kernel_traits::kVHeadDim; constexpr int kNThreads = Kernel_traits::kNThreads; static_assert(kMaxSplits <= 128, "kMaxSplits must be <= 128"); @@ -1212,10 +1228,10 @@ inline __device__ void combine_attn_seqk_parallel(const Params ¶ms) { } __syncthreads(); - const index_t row_offset_oaccum = bidx * kBlockM * params.d_rounded; + const index_t row_offset_oaccum = bidx * kBlockM * params.vd_rounded; Tensor gOaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.oaccum_ptr) + row_offset_oaccum), - Shape, Int>{}, - Stride, _1>{}); + Shape, Int>{}, + Stride, _1>{}); constexpr int kBlockN = kNThreads / kBlockM; using GmemLayoutAtomOaccum = Layout, Int>, Stride, _1>>; using GmemTiledCopyOaccum = decltype( @@ -1230,13 +1246,13 @@ inline __device__ void combine_attn_seqk_parallel(const Params ¶ms) { clear(tOrO); // Predicates - Tensor cOaccum = make_identity_tensor(Shape, Int>{}); + Tensor cOaccum = make_identity_tensor(Shape, Int>{}); // Repeat the partitioning with identity layouts Tensor tOcOaccum = gmem_thr_copy_Oaccum.partition_S(cOaccum); Tensor tOpOaccum = make_tensor(make_shape(size<2>(tOgOaccum))); if (!Is_even_K) { #pragma unroll - for (int k = 0; k < size(tOpOaccum); ++k) { tOpOaccum(k) = get<1>(tOcOaccum(0, 0, k)) < params.d; } + for (int k = 0; k < size(tOpOaccum); ++k) { tOpOaccum(k) = get<1>(tOcOaccum(0, 0, k)) < params.vd; } } // Load Oaccum in then scale and accumulate to O for (int split = 0; split < params.num_splits; ++split) { @@ -1256,7 +1272,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params ¶ms) { } // if (cute::thread0()) { printf("lse_scale = %f, %f\n", sLSE[split][0], sLSE[split][1]); print(tOrOaccum); } } - tOgOaccum.data() = tOgOaccum.data() + params.b * params.h * params.seqlen_q * params.d_rounded; + tOgOaccum.data() = tOgOaccum.data() + params.b * params.h * params.seqlen_q * params.vd_rounded; } // if (cute::thread0()) { print_tensor(tOrO); } diff --git a/csrc/flash_attn/src/flash_fwd_launch_template.h b/csrc/flash_attn/src/flash_fwd_launch_template.h index 900cf4671..2b9ad7b6b 100644 --- a/csrc/flash_attn/src/flash_fwd_launch_template.h +++ b/csrc/flash_attn/src/flash_fwd_launch_template.h @@ -168,26 +168,28 @@ void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream) } template -void run_mha_fwd_hdim32(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 32; +void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); }); } 
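The launch functions in this file, like the kernel generator further down, pair each QK head dimension with a V head dimension exactly twice as large. A short Python sketch of that pairing and the launcher naming it implies (the helper names are illustrative; the mapping itself comes from the patch):

QK_HEAD_DIMS = [32, 64, 96, 128]

def paired_head_dims():
    # V head dim is fixed at 2x the QK head dim: 32->64, 64->128, 96->192, 128->256.
    return [(qk, 2 * qk) for qk in QK_HEAD_DIMS]

def launcher_name(qkdim, vdim):
    # e.g. "run_mha_fwd_qkdim128_vdim256", matching the functions defined in this file.
    return f"run_mha_fwd_qkdim{qkdim}_vdim{vdim}"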
template -void run_mha_fwd_hdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 64; +void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { if constexpr(!Is_dropout) { // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower // Using block size (64 x 256) is 27% slower for seqlen=2k // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); @@ -196,20 +198,21 @@ void run_mha_fwd_hdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { } template -void run_mha_fwd_hdim96(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 96; +void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; auto dprops = at::cuda::getCurrentDeviceProperties(); bool is_sm8x = dprops->major == 8 && dprops->minor > 0; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), if (is_sm8x) { if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); @@ -220,8 +223,9 @@ void run_mha_fwd_hdim96(Flash_fwd_params ¶ms, cudaStream_t stream) { } template -void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int Headdim = 128; +void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; auto dprops = at::cuda::getCurrentDeviceProperties(); bool is_sm8x = dprops->major == 8 && dprops->minor > 0; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { @@ -230,12 +234,12 @@ void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. 
if (is_sm8x) { if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); @@ -246,14 +250,14 @@ void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { // 1st ones are good for H100, A100 // 2nd one is good for A6000 bc we get slightly better occupancy } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); } }); } - +/* template void run_mha_fwd_hdim160(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 160; @@ -327,3 +331,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { // run_flash_fwd, Is_dropout, Is_causal>(params, stream); }); } +*/ \ No newline at end of file diff --git a/csrc/flash_attn/src/generate_kernels.py b/csrc/flash_attn/src/generate_kernels.py index 119e34956..0db6f2d6c 100644 --- a/csrc/flash_attn/src/generate_kernels.py +++ b/csrc/flash_attn/src/generate_kernels.py @@ -15,26 +15,27 @@ } SM = [80] # Sm80 kernels support up to -HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256] +# HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256] +HEAD_DIMENSIONS = [32, 64, 96, 128] IS_CAUSAL = ["false", "true"] KERNEL_IMPL_TEMPLATE_FWD = """#include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream) {{ - run_mha_fwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +void run_mha_fwd_<{DTYPE}, {QKHEAD_DIM}, {VHEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream) {{ + run_mha_fwd_qkdim{QKHEAD_DIM}_vdim{VHEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); }} """ KERNEL_IMPL_TEMPLATE_FWD_SPLIT = """#include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch<{DTYPE}, {QKHEAD_DIM}, {VHEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream); """ KERNEL_IMPL_TEMPLATE_BWD = """#include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_bwd_params ¶ms, cudaStream_t stream) {{ - run_mha_bwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +void run_mha_bwd_<{DTYPE}, {QKHEAD_DIM}, {VHEAD_DIM}, {IS_CAUSAL}>(Flash_bwd_params ¶ms, cudaStream_t stream) {{ + run_mha_bwd_qkdim{QKHEAD_DIM}_vdim{VHEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); }} """ @@ -43,34 +44,35 @@ class Kernel: sm: int dtype: str - head_dim: int + qkhead_dim: int is_causal: bool direction: str @property def template(self) -> str: + self.vhead_dim = self.qkhead_dim * 2 if self.direction == "fwd": return KERNEL_IMPL_TEMPLATE_FWD.format( - DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal ) elif self.direction == "bwd": return KERNEL_IMPL_TEMPLATE_BWD.format( 
- DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal ) else: return KERNEL_IMPL_TEMPLATE_FWD_SPLIT.format( - DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal ) @property def filename(self) -> str: - return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" + return f"flash_{self.direction}_qkdim{self.qkhead_dim}_vdim{self.vhead_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" def get_all_kernels() -> List[Kernel]: for direction in ["fwd", "fwd_split", "bwd"]: - for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): - yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction) + for dtype, qkhead_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): + yield Kernel(sm=sm, dtype=dtype, qkhead_dim=qkhead_dim, is_causal=is_causal, direction=direction) def write_kernel(kernel: Kernel, autogen_dir: Path) -> None: diff --git a/csrc/flash_attn/src/kernel_traits.h b/csrc/flash_attn/src/kernel_traits.h index 5a7b74911..a43e4ea45 100644 --- a/csrc/flash_attn/src/kernel_traits.h +++ b/csrc/flash_attn/src/kernel_traits.h @@ -12,7 +12,7 @@ using namespace cute; -template +template struct Flash_kernel_traits { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 @@ -46,8 +46,8 @@ struct Flash_kernel_traits { }; // If Share_Q_K_smem is true, that forces Is_Q_in_regs to be true -template > +template > struct Flash_fwd_kernel_traits : public Base { using Element = typename Base::Element; using ElementAccum = typename Base::ElementAccum; @@ -65,10 +65,13 @@ struct Flash_fwd_kernel_traits : public Base { static constexpr int kBlockM = kBlockM_; static constexpr int kBlockN = kBlockN_; - static constexpr int kHeadDim = kHeadDim_; - static_assert(kHeadDim % 32 == 0); - static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32; - static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32); + static constexpr int kQKHeadDim = kQKHeadDim_; + static constexpr int kVHeadDim = kVHeadDim_; + static_assert(kQKHeadDim % 32 == 0); + static_assert(kVHeadDim % 32 == 0); + static constexpr int kBlockKSmem = kQKHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kBlockKGmem = kQKHeadDim % 128 == 0 ? 128 : (kQKHeadDim % 64 == 0 ? 64 : 32); + static constexpr int kSwizzle = kBlockKSmem == 32 ? 
2 : 3; using TiledMma = TiledMMA< @@ -83,15 +86,17 @@ struct Flash_fwd_kernel_traits : public Base { Stride, _1>>{})); using SmemLayoutQ = decltype(tile_to_shape( SmemLayoutAtomQ{}, - Shape, Int>{})); + Shape, Int>{})); - using SmemLayoutKV = decltype(tile_to_shape( + using SmemLayoutK = decltype(tile_to_shape( SmemLayoutAtomQ{}, - Shape, Int>{})); - + Shape, Int>{})); + using SmemLayoutV = decltype(tile_to_shape( + SmemLayoutAtomQ{}, + Shape, Int>{})); // https://github.com/ColfaxResearch/cutlass-kernels/blob/a222587e6d59b93ba704853d3946fb686d8b8892/src/fmha/fmha_forward.cu#L434 using SmemLayoutVtransposed = decltype( - composition(SmemLayoutKV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); + composition(SmemLayoutKV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); using SmemLayoutVtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutVtransposed{})); using SmemLayoutAtomO = decltype( @@ -100,16 +105,17 @@ struct Flash_fwd_kernel_traits : public Base { Stride, _1>>{})); using SmemLayoutO = decltype(tile_to_shape( SmemLayoutAtomO{}, - Shape, Int>{})); + Shape, Int>{})); using SmemCopyAtomO = Copy_Atom; using SmemCopyAtomOaccum = Copy_Atom; static constexpr int kSmemQSize = size(SmemLayoutQ{}) * sizeof(Element); - static constexpr int kSmemKVSize = size(SmemLayoutKV{}) * 2 * sizeof(Element); - static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKVSize) : kSmemQSize + kSmemKVSize; + static constexpr int kSmemKSize = size(SmemLayoutK{}) * sizeof(Element); + static constexpr int kSmemVSize = size(SmemLayoutV{}) * sizeof(Element); + static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKSize + kSmemVSize) : kSmemQSize + kSmemKSize + kSmemVSize; static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); - static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + static_assert(kQKHeadDim % kGmemElemsPerLoad == 0, "kQKHeadDim must be a multiple of kGmemElemsPerLoad"); // Using kBlockKSmem here is 6-10% faster than kBlockKGmem for d=128 because of bank conflicts. // For example, for d=128, smem is split into 2 "pages", each page takes care of columns // 0-63 and 64-127. If we have 16 threads per row for gmem read, when we write to smem, @@ -160,7 +166,7 @@ struct Flash_fwd_kernel_traits : public Base { // Is_V_in_regs is an option to reduce smem usage, but will increase register pressue. // No_double_buffer is another option to reduce smem usage, but will slow things down. -template > @@ -181,12 +187,18 @@ struct Flash_bwd_kernel_traits : public Base { static constexpr int kBlockM = kBlockM_; static constexpr int kBlockN = kBlockN_; - static constexpr int kHeadDim = kHeadDim_; - static_assert(kHeadDim % 32 == 0); - static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32; - static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32); + static constexpr int kQKHeadDim = kQKHeadDim_; + static constexpr int kVHeadDim = kVHeadDim_; + static_assert(kQKHeadDim % 32 == 0); + static_assert(kVHeadDim % 32 == 0); + static constexpr int kBlockKSmem = kQKHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kBlockKGmem = kQKHeadDim % 128 == 0 ? 128 : (kQKHeadDim % 64 == 0 ? 64 : 32); static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3; + static constexpr int kBlockKSmem2 = kVHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kBlockKGmem2 = kVHeadDim % 128 == 0 ? 128 : (kVHeadDim % 64 == 0 ? 
64 : 32); + static constexpr int kSwizzle2 = kBlockKSmem2 == 32 ? 2 : 3; + static constexpr int AtomLayoutMSdP = AtomLayoutMSdP_; static_assert(kNWarps % AtomLayoutMSdP == 0); static_assert(kNWarps % AtomLayoutNdKV == 0); @@ -207,25 +219,39 @@ struct Flash_bwd_kernel_traits : public Base { Layout, Int, _1>>, // 2x4x1 or 4x2x1 thread group Tile, Int<16 * kNWarps / AtomLayoutMdQ>, _16>>; - using SmemLayoutAtomQdO = decltype( + using SmemLayoutAtomQ = decltype( composition(Swizzle{}, Layout>, Stride, _1>>{})); - using SmemLayoutQdO = decltype(tile_to_shape( - SmemLayoutAtomQdO{}, - make_shape(Int{}, Int{}))); + using SmemLayoutAtomdO = decltype( + composition(Swizzle{}, + Layout>, + Stride, _1>>{})); + using SmemLayoutQ = decltype(tile_to_shape( + SmemLayoutAtomQ{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdO = decltype(tile_to_shape( + SmemLayoutAtomdO{}, + make_shape(Int{}, Int{}))); - using SmemLayoutAtomKV = decltype( + using SmemLayoutAtomK = decltype( composition(Swizzle{}, Layout, Int>, Stride, _1>>{})); - using SmemLayoutKV = decltype(tile_to_shape( + using SmemLayoutAtomV = decltype( + composition(Swizzle{}, + Layout, Int>, + Stride, _1>>{})); + using SmemLayoutK = decltype(tile_to_shape( // SmemLayoutAtomQdO{}, - SmemLayoutAtomKV{}, - make_shape(Int{}, Int{}))); - + SmemLayoutAtomK{}, + make_shape(Int{}, Int{}))); + using SmemLayoutV = decltype(tile_to_shape( + // SmemLayoutAtomQdO{}, + SmemLayoutAtomV{}, + make_shape(Int{}, Int{}))); using SmemLayoutKtransposed = decltype( - composition(SmemLayoutKV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); + composition(SmemLayoutKV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); using SmemLayoutKtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutKtransposed{})); // TODO: generalize to other values of kBlockN @@ -252,17 +278,28 @@ struct Flash_bwd_kernel_traits : public Base { using SmemCopyAtomPdS = Copy_Atom; - using SmemLayoutQdOtransposed = decltype( - composition(SmemLayoutQdO{}, make_layout(Shape, Int>{}, GenRowMajor{}))); - using SmemLayoutQdOtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutQdOtransposed{})); + using SmemLayoutQtransposed = decltype( + composition(SmemLayoutQ{}, make_layout(Shape, Int>{}, GenRowMajor{}))); + using SmemLayoutdOtransposed = decltype( + composition(SmemLayoutdO{}, make_layout(Shape, Int>{}, GenRowMajor{}))); + using SmemLayoutQtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutQtransposed{})); + using SmemLayoutdOtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutdOtransposed{})); - using SmemLayoutAtomdKV = decltype( + using SmemLayoutAtomdK = decltype( composition(Swizzle{}, Layout>, Stride, _1>>{})); - using SmemLayoutdKV = decltype(tile_to_shape( - SmemLayoutAtomdKV{}, - make_shape(Int{}, Int{}))); + using SmemLayoutAtomdV = decltype( + composition(Swizzle{}, + Layout>, + Stride, _1>>{})); + using SmemLayoutdK = decltype(tile_to_shape( + SmemLayoutAtomdK{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdV = decltype(tile_to_shape( + SmemLayoutAtomdV{}, + make_shape(Int{}, Int{}))); + using SmemCopyAtomdKV = Copy_Atom; using SmemLayoutAtomdQ = decltype( @@ -271,26 +308,29 @@ struct Flash_bwd_kernel_traits : public Base { Stride, _1>>{})); using SmemLayoutdQ = decltype(tile_to_shape( SmemLayoutAtomdQ{}, - make_shape(Int{}, Int{}))); + make_shape(Int{}, Int{}))); using SmemCopyAtomdQ = Copy_Atom; // Double buffer for sQ - static constexpr int kSmemQdOSize = size(SmemLayoutQdO{}) * (No_double_buffer ? 
2 : 3) * sizeof(Element); - static constexpr int kSmemKVSize = size(SmemLayoutKV{}) * 2 * sizeof(Element); + static constexpr int kSmemQSize = size(SmemLayoutQ{}) * (No_double_buffer ? 1 : 2) * sizeof(Element); + static constexpr int kSmemdOSize = size(SmemLayoutdO{}) * sizeof(Element); + static constexpr int kSmemKSize = size(SmemLayoutK{}) * sizeof(Element); + static constexpr int kSmemVSize = size(SmemLayoutV{}) * sizeof(Element); static constexpr int kSmemdSSize = size(SmemLayoutPdS{}) * sizeof(Element); static constexpr int kSmemPSize = size(SmemLayoutPdS{}) * sizeof(Element); static constexpr int kSmemdQSize = size(SmemLayoutdQ{}) * sizeof(Element); - static constexpr int kSmemSize = kSmemQdOSize + static constexpr int kSmemSize = kSmemQSize + kSmemdOSize + (!Is_V_in_regs - ? kSmemKVSize + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize) - : std::max(kSmemKVSize, kSmemKVSize / 2 + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize))); - static constexpr int kSmemSize1colblock = kSmemQdOSize + ? kSmemKSize + kSmemVSize + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize) + : std::max(kSmemKSize + kSmemVSize, kSmemKSize + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize))); + static constexpr int kSmemSize1colblock = kSmemQSize + kSmemdOSize + (!Is_V_in_regs - ? kSmemKVSize + kSmemdSSize + kSmemPSize - : std::max(kSmemKVSize, kSmemKVSize / 2 + kSmemdSSize + kSmemPSize)); + ? kSmemKSize + kSmemVSize + kSmemdSSize + kSmemPSize + : std::max(kSmemKSize + kSmemVSize, kSmemKSize + kSmemdSSize + kSmemPSize)); static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); - static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + static_assert(kQKHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + static_assert(kVHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); // Using kBlockKSmem instead of kHeadDim here to avoid bank conflicts, but doesn't seem // to affect speed in practice. 
static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad; From 331a601974e4502126bcd78f9929f30f2abf052a Mon Sep 17 00:00:00 2001 From: Yuqing Xia Date: Tue, 13 Aug 2024 01:22:02 -0700 Subject: [PATCH 02/46] support var dim --- compute_sm.py | 12 +- csrc/flash_attn/flash_api.cpp | 51 ++++--- .../src/flash_bwd_hdim160_fp16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim160_fp16_sm80.cu | 10 -- .../src/flash_bwd_hdim192_fp16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim192_fp16_sm80.cu | 10 -- .../src/flash_bwd_hdim224_bf16_sm80.cu | 10 -- .../src/flash_bwd_hdim224_fp16_sm80.cu | 10 -- .../src/flash_bwd_hdim256_bf16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim256_bf16_sm80.cu | 10 -- .../src/flash_bwd_hdim256_fp16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim256_fp16_sm80.cu | 10 -- .../src/flash_bwd_hdim32_bf16_sm80.cu | 10 -- .../src/flash_bwd_hdim32_fp16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim32_fp16_sm80.cu | 10 -- .../src/flash_bwd_hdim64_bf16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim64_bf16_sm80.cu | 10 -- .../src/flash_bwd_hdim64_fp16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim64_fp16_sm80.cu | 10 -- .../src/flash_bwd_hdim96_bf16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim96_bf16_sm80.cu | 10 -- .../src/flash_bwd_hdim96_fp16_causal_sm80.cu | 10 -- .../src/flash_bwd_hdim96_fp16_sm80.cu | 10 -- .../src/flash_bwd_launch_template.h | 8 +- ...h_bwd_qkdim128_vdim256_bf16_causal_sm80.cu | 10 ++ .../flash_bwd_qkdim128_vdim256_bf16_sm80.cu | 10 ++ ..._bwd_qkdim128_vdim256_fp16_causal_sm80.cu} | 4 +- ...> flash_bwd_qkdim128_vdim256_fp16_sm80.cu} | 4 +- ...sh_bwd_qkdim32_vdim64_bf16_causal_sm80.cu} | 4 +- .../src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu | 10 ++ ...sh_bwd_qkdim32_vdim64_fp16_causal_sm80.cu} | 4 +- ... => flash_bwd_qkdim32_vdim64_fp16_sm80.cu} | 4 +- ...sh_bwd_qkdim64_vdim128_bf16_causal_sm80.cu | 10 ++ .../flash_bwd_qkdim64_vdim128_bf16_sm80.cu | 10 ++ ...h_bwd_qkdim64_vdim128_fp16_causal_sm80.cu} | 4 +- ...=> flash_bwd_qkdim64_vdim128_fp16_sm80.cu} | 4 +- ...sh_bwd_qkdim96_vdim192_bf16_causal_sm80.cu | 10 ++ .../flash_bwd_qkdim96_vdim192_bf16_sm80.cu | 10 ++ ...h_bwd_qkdim96_vdim192_fp16_causal_sm80.cu} | 4 +- ...=> flash_bwd_qkdim96_vdim192_fp16_sm80.cu} | 4 +- .../src/flash_fwd_hdim160_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim160_fp16_sm80.cu | 10 -- .../src/flash_fwd_hdim192_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim192_fp16_sm80.cu | 10 -- .../src/flash_fwd_hdim224_bf16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim224_bf16_sm80.cu | 10 -- .../src/flash_fwd_hdim224_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim224_fp16_sm80.cu | 10 -- .../src/flash_fwd_hdim256_bf16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim256_bf16_sm80.cu | 10 -- .../src/flash_fwd_hdim256_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim256_fp16_sm80.cu | 10 -- .../src/flash_fwd_hdim32_bf16_sm80.cu | 10 -- .../src/flash_fwd_hdim32_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim32_fp16_sm80.cu | 10 -- .../src/flash_fwd_hdim64_bf16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim64_bf16_sm80.cu | 10 -- .../src/flash_fwd_hdim64_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim64_fp16_sm80.cu | 10 -- .../src/flash_fwd_hdim96_bf16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim96_bf16_sm80.cu | 10 -- .../src/flash_fwd_hdim96_fp16_causal_sm80.cu | 10 -- .../src/flash_fwd_hdim96_fp16_sm80.cu | 10 -- csrc/flash_attn/src/flash_fwd_kernel.h | 10 +- .../src/flash_fwd_launch_template.h | 18 +-- ...h_fwd_qkdim128_vdim256_bf16_causal_sm80.cu | 10 ++ 
.../flash_fwd_qkdim128_vdim256_bf16_sm80.cu | 10 ++ ..._fwd_qkdim128_vdim256_fp16_causal_sm80.cu} | 4 +- ...> flash_fwd_qkdim128_vdim256_fp16_sm80.cu} | 4 +- ...sh_fwd_qkdim32_vdim64_bf16_causal_sm80.cu} | 4 +- .../src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu | 10 ++ ...sh_fwd_qkdim32_vdim64_fp16_causal_sm80.cu} | 4 +- ... => flash_fwd_qkdim32_vdim64_fp16_sm80.cu} | 4 +- ...sh_fwd_qkdim64_vdim128_bf16_causal_sm80.cu | 10 ++ .../flash_fwd_qkdim64_vdim128_bf16_sm80.cu | 10 ++ ...h_fwd_qkdim64_vdim128_fp16_causal_sm80.cu} | 4 +- ...=> flash_fwd_qkdim64_vdim128_fp16_sm80.cu} | 4 +- ...sh_fwd_qkdim96_vdim192_bf16_causal_sm80.cu | 10 ++ .../flash_fwd_qkdim96_vdim192_bf16_sm80.cu | 10 ++ ...h_fwd_qkdim96_vdim192_fp16_causal_sm80.cu} | 4 +- ...=> flash_fwd_qkdim96_vdim192_fp16_sm80.cu} | 4 +- ...lash_fwd_split_hdim160_fp16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim160_fp16_sm80.cu | 7 - ...lash_fwd_split_hdim192_bf16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim192_bf16_sm80.cu | 7 - ...lash_fwd_split_hdim192_fp16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim192_fp16_sm80.cu | 7 - ...lash_fwd_split_hdim224_bf16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim224_bf16_sm80.cu | 7 - ...lash_fwd_split_hdim224_fp16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim224_fp16_sm80.cu | 7 - ...lash_fwd_split_hdim256_bf16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim256_bf16_sm80.cu | 7 - ...lash_fwd_split_hdim256_fp16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim256_fp16_sm80.cu | 7 - ...flash_fwd_split_hdim32_bf16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim32_bf16_sm80.cu | 7 - ...flash_fwd_split_hdim64_bf16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim64_bf16_sm80.cu | 7 - ...flash_fwd_split_hdim96_bf16_causal_sm80.cu | 7 - .../src/flash_fwd_split_hdim96_bf16_sm80.cu | 7 - ...split_qkdim128_vdim256_bf16_causal_sm80.cu | 7 + ...sh_fwd_split_qkdim128_vdim256_bf16_sm80.cu | 7 + ...plit_qkdim128_vdim256_fp16_causal_sm80.cu} | 2 +- ...h_fwd_split_qkdim128_vdim256_fp16_sm80.cu} | 2 +- ..._split_qkdim32_vdim64_bf16_causal_sm80.cu} | 2 +- ...ash_fwd_split_qkdim32_vdim64_bf16_sm80.cu} | 2 +- ..._split_qkdim32_vdim64_fp16_causal_sm80.cu} | 2 +- ...ash_fwd_split_qkdim32_vdim64_fp16_sm80.cu} | 2 +- ...split_qkdim64_vdim128_bf16_causal_sm80.cu} | 2 +- ...ash_fwd_split_qkdim64_vdim128_bf16_sm80.cu | 7 + ...split_qkdim64_vdim128_fp16_causal_sm80.cu} | 2 +- ...sh_fwd_split_qkdim64_vdim128_fp16_sm80.cu} | 2 +- ...split_qkdim96_vdim192_bf16_causal_sm80.cu} | 2 +- ...ash_fwd_split_qkdim96_vdim192_bf16_sm80.cu | 7 + ...split_qkdim96_vdim192_fp16_causal_sm80.cu} | 2 +- ...sh_fwd_split_qkdim96_vdim192_fp16_sm80.cu} | 2 +- csrc/flash_attn/src/generate_kernels.py | 11 +- csrc/flash_attn/src/kernel_traits.h | 6 +- csrc/flash_attn/src/static_switch.h | 21 ++- setup.py | 132 +++++++----------- test_flash.py | 19 +++ 122 files changed, 355 insertions(+), 777 deletions(-) delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu delete mode 100644 
csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu rename csrc/flash_attn/src/{flash_bwd_hdim160_bf16_sm80.cu => flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_bwd_hdim128_fp16_causal_sm80.cu => flash_bwd_qkdim128_vdim256_fp16_sm80.cu} (56%) rename csrc/flash_attn/src/{flash_bwd_hdim128_fp16_sm80.cu => flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu} (56%) create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu rename csrc/flash_attn/src/{flash_bwd_hdim32_bf16_causal_sm80.cu => flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_bwd_hdim128_bf16_causal_sm80.cu => flash_bwd_qkdim32_vdim64_fp16_sm80.cu} (66%) create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu rename csrc/flash_attn/src/{flash_bwd_hdim160_bf16_causal_sm80.cu => flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_bwd_hdim128_bf16_sm80.cu => flash_bwd_qkdim64_vdim128_fp16_sm80.cu} (66%) create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu rename csrc/flash_attn/src/{flash_bwd_hdim192_bf16_causal_sm80.cu => flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_bwd_hdim192_bf16_sm80.cu => flash_bwd_qkdim96_vdim192_fp16_sm80.cu} (66%) delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim224_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim224_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu delete mode 100644 
csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_hdim160_bf16_sm80.cu => flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_fwd_hdim128_fp16_causal_sm80.cu => flash_fwd_qkdim128_vdim256_fp16_sm80.cu} (56%) rename csrc/flash_attn/src/{flash_fwd_hdim128_fp16_sm80.cu => flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu} (56%) create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_hdim32_bf16_causal_sm80.cu => flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_fwd_hdim128_bf16_causal_sm80.cu => flash_fwd_qkdim32_vdim64_fp16_sm80.cu} (66%) create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_hdim160_bf16_causal_sm80.cu => flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_fwd_hdim128_bf16_sm80.cu => flash_fwd_qkdim64_vdim128_fp16_sm80.cu} (66%) create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_hdim192_bf16_causal_sm80.cu => flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu} (66%) rename csrc/flash_attn/src/{flash_fwd_hdim192_bf16_sm80.cu => flash_fwd_qkdim96_vdim192_fp16_sm80.cu} (66%) delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu delete mode 100644 
csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu delete mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_split_hdim128_fp16_causal_sm80.cu => flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim128_fp16_sm80.cu => flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim128_bf16_sm80.cu => flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu} (81%) rename csrc/flash_attn/src/{flash_fwd_split_hdim160_bf16_causal_sm80.cu => flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu} (81%) rename csrc/flash_attn/src/{flash_fwd_split_hdim32_fp16_causal_sm80.cu => flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim32_fp16_sm80.cu => flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim160_bf16_sm80.cu => flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu} (81%) create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_split_hdim64_fp16_causal_sm80.cu => flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim64_fp16_sm80.cu => flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim128_bf16_causal_sm80.cu => flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu} (81%) create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu rename csrc/flash_attn/src/{flash_fwd_split_hdim96_fp16_causal_sm80.cu => flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu} (82%) rename csrc/flash_attn/src/{flash_fwd_split_hdim96_fp16_sm80.cu => flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu} (82%) create mode 100644 test_flash.py diff --git a/compute_sm.py b/compute_sm.py index 17a201f4d..f65cf3bc6 100644 --- a/compute_sm.py +++ b/compute_sm.py @@ -1,7 +1,9 @@ -Br = 64 -Bc = 128 +Br = 128 +Bc = 64 QKHeaddim = 128 VHeaddim = 256 -smem =2 *(Br * QKHeaddim * 2 + Br * VHeaddim + Bc * QKHeaddim + Bc * VHeaddim + Br * Bc * 2) -smem = smem/1024 -print(smem) \ No newline at end of file +bwdsmem =2 *(Br * QKHeaddim * 2 + Br * VHeaddim + Bc * QKHeaddim + Bc * VHeaddim + Br * Bc * 2) +bwdsmem = bwdsmem/1024 +fwdsmem = (Br * QKHeaddim + Bc * QKHeaddim + Bc * VHeaddim)*2 +fwdsmem = fwdsmem/1024 +print("fwdsmem:", fwdsmem) diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index 9d0f8c20b..35ac2229e 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -245,9 +245,9 @@ void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split HEADDIM_SWITCH(params.d, [&] { BOOL_SWITCH(params.is_causal, Is_causal, [&] { if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0 - run_mha_fwd_(params, stream); + run_mha_fwd_(params, stream); } else { - run_mha_fwd_splitkv_dispatch(params, stream); + run_mha_fwd_splitkv_dispatch(params, stream); } }); }); @@ -302,7 +302,7 @@ std::tuple set_params_splitkv(Flash_fwd_params 
¶ms, const int num_splits, cudaDeviceProp *dprops, struct c10::TensorOptions opts) { // This needs to match with run_mha_fwd_splitkv_dispatch - const max_head_size = head_size > v_head_size ? head_size : v_head_size; + const int max_head_size = head_size > v_head_size ? head_size : v_head_size; const int block_n = max_head_size <= 64 ? 256 : (max_head_size <= 128 ? 128 : 64); const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n; // Technically kBlockM = 64 only for the splitKV kernels, not the standard kernel. @@ -372,6 +372,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer."); auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, "FlashAttention only support fp16 and bf16 data type"); if (q_dtype == torch::kBFloat16) { @@ -387,7 +388,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); const auto sizes = q.sizes(); - const auto v_head_size_og = v.sizes()[3]; + const int v_head_size_og = v.sizes()[3]; const int batch_size = sizes[0]; int seqlen_q = sizes[1]; int num_heads = sizes[2]; @@ -449,13 +450,15 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size if (seqlenq_ngroups_swapped) { out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2); } - if (head_size_og % 8 != 0) { - out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + if (v_head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } } else { - out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); - out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); + if (v_head_size_og % 8 != 0) { + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; @@ -612,7 +615,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s CHECK_CONTIGUOUS(cu_seqlens_k); const auto sizes = q.sizes(); - const auto v_head_size_og = v.sizes()[3]; + const int v_head_size_og = v.sizes()[2]; const int batch_size = cu_seqlens_q.numel() - 1; int num_heads = sizes[1]; const int head_size_og = sizes[2]; @@ -698,12 +701,14 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); } if (v_head_size_og % 8 != 0) { - out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + out = torch::empty({total_q, num_heads, v_head_size_og}, q.options()); out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } } else { - out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); - out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + out = torch::empty({total_q, num_heads, v_head_size_og}, q.options()); + if (v_head_size_og % 8 
!= 0) { + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; @@ -835,7 +840,7 @@ void run_mha_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { FP16_SWITCH(!params.is_bf16, [&] { HEADDIM_SWITCH(params.d, [&] { BOOL_SWITCH(params.is_causal, Is_causal, [&] { - run_mha_bwd_(params, stream); + run_mha_bwd_(params, stream); }); }); }); @@ -899,7 +904,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); const auto sizes = q.sizes(); - const auto v_head_size_og = v.sizes()[3]; + const int v_head_size_og = v.sizes()[3]; const int batch_size = sizes[0]; const int seqlen_q = sizes[1]; const int num_heads = sizes[2]; @@ -922,7 +927,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const int seqlen_q_rounded = round_multiple(seqlen_q, 128); const int seqlen_k_rounded = round_multiple(seqlen_k, 128); - TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); + // TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } if (window_size_left >= seqlen_k) { window_size_left = -1; } @@ -1143,7 +1148,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_CONTIGUOUS(cu_seqlens_k); const auto sizes = q.sizes(); - const auto v_head_size_og = v.sizes()[3]; + const int v_head_size_og = v.sizes()[2]; const int total_q = sizes[0]; const int batch_size = cu_seqlens_q.numel() - 1; const int num_heads = sizes[1]; @@ -1391,7 +1396,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he } const auto sizes = q.sizes(); - const auto v_head_size_og = v.sizes()[3]; + const int v_head_size_og = vcache.sizes()[3]; const int batch_size = sizes[0]; int seqlen_q = sizes[1]; int num_heads = sizes[2]; @@ -1454,13 +1459,15 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he CHECK_DEVICE(out); TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); - if (head_size_og % 8 != 0) { - out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); + if (v_head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); } } else { - out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q_dtype,); - out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); + if (v_head_size_og % 8 != 0) { + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; @@ -1590,7 +1597,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he // Keep references to these tensors to extend their lifetime at::Tensor softmax_lse_accum, out_accum; std::tie(softmax_lse_accum, out_accum) = set_params_splitkv( - params, batch_size, num_heads, head_size, 
seqlen_k, seqlen_q, + params, batch_size, num_heads, head_size, v_head_size, seqlen_k, seqlen_q, head_size_rounded, v_head_size_rounded, /*dropout*/ 0.f, num_splits, dprops, opts); if (paged_KV) { diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu deleted file mode 100644 index a511162dc..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim160(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu deleted file mode 100644 index c9ce19acb..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim160(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu deleted file mode 100644 index 69cad5ae4..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim192(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu deleted file mode 100644 index 3d4cab58b..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim192(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu deleted file mode 100644 index b2b58e2ab..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim224(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu deleted file mode 100644 index e65cdaede..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2023, Tri Dao. 
-// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim224(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu deleted file mode 100644 index 692744597..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu deleted file mode 100644 index d718ec88b..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu deleted file mode 100644 index 551c695e0..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu deleted file mode 100644 index a58770026..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu deleted file mode 100644 index d6d403638..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. 
See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim32(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu deleted file mode 100644 index 60aa2d60b..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim32(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu deleted file mode 100644 index b06d50eaa..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim32(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu deleted file mode 100644 index 52b93be9d..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu deleted file mode 100644 index 09d9e2b75..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu deleted file mode 100644 index 5a4ea5f46..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu deleted file mode 100644 index fb115ff76..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. 
-// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu deleted file mode 100644 index 5f4c26a47..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu deleted file mode 100644 index 224213d79..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu deleted file mode 100644 index d0349014f..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu deleted file mode 100644 index 663fc8592..000000000 --- a/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_bwd_launch_template.h" - -template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_bwd_launch_template.h b/csrc/flash_attn/src/flash_bwd_launch_template.h index 362b07982..161a4c56b 100644 --- a/csrc/flash_attn/src/flash_bwd_launch_template.h +++ b/csrc/flash_attn/src/flash_bwd_launch_template.h @@ -98,7 +98,7 @@ void run_flash_bwd_seqk_parallel(Flash_bwd_params ¶ms, cudaStream_t stream) // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates. 
// If head dim > 128, set IsEvenMNConst to false to reduce number of templates // If Is_local, set Is_causal to false - auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel; + auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel;//TODO check if this is correct // auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel; if (smem_size_dq_dk_dv >= 48 * 1024) { C10_CUDA_CHECK(cudaFuncSetAttribute( @@ -243,7 +243,7 @@ void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) template void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { constexpr static int QKHeaddim = 128; - constexpr static int VKHeaddim = 256; + constexpr static int VHeaddim = 256; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -263,7 +263,7 @@ void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. // run_flash_bwd>(params, stream); if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); @@ -271,7 +271,7 @@ void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) // run_flash_bwd, Is_dropout>(params, stream); } else { // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } // run_flash_bwd>(params, stream); diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu new file mode 100644 index 000000000..010fbd630 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu new file mode 100644 index 000000000..53e334b12 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu index 1b6173725..1bbccb862 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim160(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim128_vdim256(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu similarity index 56% rename from csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu index 5d27cd97b..ba1916590 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim128(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim128_vdim256(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu similarity index 56% rename from csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu index 2d7ddf46b..621e9f679 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim128(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim32_vdim64(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu new file mode 100644 index 000000000..a87d7b453 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim32_vdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu index 1282939a0..0f8b1fec7 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim32(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim32_vdim64(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu index 13132e86d..6d2f207fc 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim128(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim32_vdim64(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..740f0baa8 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim64_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..34df4e575 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim64_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu index c18a78c76..5e9428a4f 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim160(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim64_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu index 85a5dc88e..b0912ed91 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim128(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim64_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu new file mode 100644 index 000000000..17f479dc5 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim96_vdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu new file mode 100644 index 000000000..229078332 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim96_vdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu index f492a7171..a502004d5 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim192(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim96_vdim192(params, stream); } diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu rename to csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu index 2df58daa2..ebd73992f 100644 --- a/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_bwd_launch_template.h" template<> -void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { - run_mha_bwd_hdim192(params, stream); +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim96_vdim192(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu deleted file mode 100644 index 1ef511a6b..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim160(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu deleted file mode 100644 index 96abfbd8a..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim160(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu deleted file mode 100644 index a4a7bc242..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. 
See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim192(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu deleted file mode 100644 index c30c4a14f..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim192(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim224_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim224_bf16_causal_sm80.cu deleted file mode 100644 index a12a5f4ad..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim224_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim224(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu deleted file mode 100644 index 8690bdb1a..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim224(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim224_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim224_fp16_causal_sm80.cu deleted file mode 100644 index f01dad09c..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim224_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim224(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu deleted file mode 100644 index 7ec1e16b7..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim224(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu deleted file mode 100644 index f84e978c9..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. 
-// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu deleted file mode 100644 index c52f0417b..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu deleted file mode 100644 index f96f7edc6..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu deleted file mode 100644 index 9c7c6b93d..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim256(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu deleted file mode 100644 index f377a5b8f..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim32(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu deleted file mode 100644 index 74e4d66ae..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. 
See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim32(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu deleted file mode 100644 index e85db18e3..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim32(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu deleted file mode 100644 index 9297e8bb6..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu deleted file mode 100644 index 8364b1e7e..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu deleted file mode 100644 index 1c6ed7ef0..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu deleted file mode 100644 index 3c87573ba..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim64(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu deleted file mode 100644 index 49fae856a..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. 
-// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu deleted file mode 100644 index c5af1cf63..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu deleted file mode 100644 index b0d6c9928..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu deleted file mode 100644 index c97aa33f8..000000000 --- a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim96(params, stream); -} diff --git a/csrc/flash_attn/src/flash_fwd_kernel.h b/csrc/flash_attn/src/flash_fwd_kernel.h index 655e81427..74c2833a5 100644 --- a/csrc/flash_attn/src/flash_fwd_kernel.h +++ b/csrc/flash_attn/src/flash_fwd_kernel.h @@ -449,7 +449,8 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi Tensor taccOsO = smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N) // sO has the same size as sQ, so we don't need to sync here. - if (Kernel_traits::Share_Q_K_smem) { __syncthreads(); } + __syncthreads(); + // if (Kernel_traits::Share_Q_K_smem) { __syncthreads(); } cute::copy(smem_tiled_copy_O, taccOrO, taccOsO); @@ -834,7 +835,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons int n_block = n_block_max - 1; // We don't need to clear the sK smem tiles since we'll mask out the scores anyway. 
- flash::copy(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV, + flash::copy(gmem_tiled_copy_QKV, tKgK, tKsK, tKcK, tKpK, binfo.actual_seqlen_k - n_block * kBlockN); cute::cp_async_fence(); @@ -1024,8 +1025,9 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons // sOaccum is larger than sQ, so we need to syncthreads here // TODO: allocate enough smem for sOaccum - if constexpr (Split) { __syncthreads(); } - + __syncthreads(); + // if constexpr (Split) { __syncthreads(); } + cute::copy(smem_tiled_copy_Oaccum, taccOrOaccum, taccOsOaccum); const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb) diff --git a/csrc/flash_attn/src/flash_fwd_launch_template.h b/csrc/flash_attn/src/flash_fwd_launch_template.h index 2b9ad7b6b..3d6a1ddd5 100644 --- a/csrc/flash_attn/src/flash_fwd_launch_template.h +++ b/csrc/flash_attn/src/flash_fwd_launch_template.h @@ -24,7 +24,7 @@ // Use a macro to clean up kernel definitions #define DEFINE_FLASH_FORWARD_KERNEL(kernelName, ...) \ template \ -__global__ void kernelName(KERNEL_PARAM_MODIFIER const Flash_fwd_params params) +__global__ void kernelName(KERNEL_PARAM_MODIFIER const Flash_fwd_params params) DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Return_softmax) { #if defined(ARCH_SUPPORTS_FLASH) @@ -60,7 +60,7 @@ void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM; dim3 grid(num_m_block, params.b, params.h); const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0; - const bool is_even_K = params.d == Kernel_traits::kHeadDim; + const bool is_even_K = params.d == Kernel_traits::kQKHeadDim; //TODO: Check if this is correct const bool return_softmax = params.p_ptr != nullptr; BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] { EVENK_SWITCH(is_even_K, IsEvenKConst, [&] { @@ -73,7 +73,7 @@ void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { // If return_softmax, set IsEvenMNConst to false to reduce number of templates // If head dim > 128, set IsEvenMNConst to false to reduce number of templates // If Is_local, set Is_causal to false - auto kernel = &flash_fwd_kernel; + auto kernel = &flash_fwd_kernel;// TODO: Check if this is correct // auto kernel = &flash_fwd_kernel; // printf("IsEvenMNConst = %d, IsEvenKConst = %d, Is_local = %d, Is_causal = %d, ReturnSoftmaxConst = %d, Is_dropout = %d\n", int(IsEvenMNConst), int(IsEvenKConst), int(Is_local), int(Is_causal), int(ReturnSoftmaxConst), int(Is_dropout)); // auto kernel = &flash_fwd_kernel; @@ -103,7 +103,7 @@ void run_flash_splitkv_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM; dim3 grid(num_m_block, params.num_splits > 1 ? params.num_splits : params.b, params.num_splits > 1 ? 
params.b * params.h : params.h); const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0; - const bool is_even_K = params.d == Kernel_traits::kHeadDim; + const bool is_even_K = params.d == Kernel_traits::kQKHeadDim; //TODO: Check if this is correct BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] { EVENK_SWITCH(is_even_K, IsEvenKConst, [&] { LOCAL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !Is_causal, Is_local, [&] { @@ -114,7 +114,7 @@ void run_flash_splitkv_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { // If Append_KV, then we must have seqlen_offsets, which means cu_seqlens_k != nullptr. // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates. // If Is_local, set Is_causal to false - auto kernel = &flash_fwd_splitkv_kernel; + auto kernel = &flash_fwd_splitkv_kernel; // TODO: Check if this is correct // auto kernel = &flash_fwd_splitkv_kernel; // auto kernel = &flash_fwd_splitkv_kernel; if (smem_size >= 48 * 1024) { @@ -134,7 +134,7 @@ void run_flash_splitkv_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { // We want kBlockM to be as small as possible for more parallelism. // With 128 threads we can load 512 elements at a time, so if headdim is divisible by 128, kBlockM = 4. // If headdim is divisible by 64, then we set kBlockM = 8, etc. - constexpr static int kBlockM = Kernel_traits::kHeadDim % 128 == 0 ? 4 : (Kernel_traits::kHeadDim % 64 == 0 ? 8 : 16); + constexpr static int kBlockM = Kernel_traits::kQKHeadDim % 128 == 0 ? 4 : (Kernel_traits::kQKHeadDim % 64 == 0 ? 8 : 16); // TODO: Check if this is correct dim3 grid_combine((params.b * params.h * params.seqlen_q + kBlockM - 1) / kBlockM); EVENK_SWITCH(is_even_K, IsEvenKConst, [&] { if (params.num_splits <= 2) { @@ -157,14 +157,14 @@ void run_flash_splitkv_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { } } -template +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int kBlockM = 64; // Fixed for all head dimensions // TD [2023-08-28]: nvcc segfaults for headdim 96 with block size 64 x 256, // and for headdim 192 with block size 64 x 128. // Also for headdim 160 with block size 64 x 128 after the rotary addition. - constexpr static int kBlockN = Headdim <= 64 ? 256 : (Headdim <= 128 ? 128 : 64); - run_flash_splitkv_fwd, Is_causal>(params, stream); + constexpr static int kBlockN = QKHeaddim <= 64 ? 256 : (QKHeaddim <= 128 ? 128 : 64); + run_flash_splitkv_fwd, Is_causal>(params, stream); } template diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu new file mode 100644 index 000000000..795ec67f1 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
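For readability, a minimal Python restatement of the tile-size heuristic in run_mha_fwd_splitkv_dispatch above (illustrative only, not part of the patch): kBlockM stays fixed at 64 and kBlockN is picked from the QK head dimension, mirroring the ternary expression in the hunk.

# Mirrors: kBlockN = QKHeaddim <= 64 ? 256 : (QKHeaddim <= 128 ? 128 : 64)
def splitkv_tile_sizes(qk_headdim):
    k_block_m = 64  # fixed for all head dimensions
    k_block_n = 256 if qk_headdim <= 64 else (128 if qk_headdim <= 128 else 64)
    return k_block_m, k_block_n

for qk in (32, 64, 96, 128):
    print(qk, splitkv_tile_sizes(qk))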
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu new file mode 100644 index 000000000..e1048791c --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu index 5af68ac38..582a95236 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim160(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu similarity index 56% rename from csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu index c616628c8..bfc09dc6b 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim128(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu similarity index 56% rename from csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu index 4ff6b9fbf..3f80a1fe7 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim128(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim32_vdim64(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu new file mode 100644 index 000000000..e3dba404d --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim32_vdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu index e21d0408c..5677fcef4 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim32(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim32_vdim64(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu index 9383c1024..36b511f06 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim128(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim32_vdim64(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..2869ccfc2 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim64_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..d9d444fd1 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim64_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu index d6d4371bf..2504c540b 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim160(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim64_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu index f03abda48..a5270a3ed 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim128(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim64_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu new file mode 100644 index 000000000..a307cb29e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim96_vdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu new file mode 100644 index 000000000..9f00dd249 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim96_vdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu index 077d25d09..27bb08d03 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim192(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim96_vdim192(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu similarity index 66% rename from csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu index ea5f265fe..7843c337c 100644 --- a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu @@ -5,6 +5,6 @@ #include "flash_fwd_launch_template.h" template<> -void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_hdim192(params, stream); +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim96_vdim192(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu deleted file mode 100644 index 1723c69e0..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu deleted file mode 100644 index 892d2352a..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu deleted file mode 100644 index d07ee0af2..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. 
See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu deleted file mode 100644 index 23cfa59d5..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu deleted file mode 100644 index 273a28442..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu deleted file mode 100644 index 0f588d1f4..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_causal_sm80.cu deleted file mode 100644 index ea024d9ab..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu deleted file mode 100644 index b06ae5ace..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_causal_sm80.cu deleted file mode 100644 index b217f3789..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. 
-// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu deleted file mode 100644 index 8cf2eabed..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2023, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu deleted file mode 100644 index 370fe9ca3..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu deleted file mode 100644 index 508f07f7d..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu deleted file mode 100644 index 019ded67f..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu deleted file mode 100644 index 708f5542a..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu deleted file mode 100644 index 5a205b7e7..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. 
-// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu deleted file mode 100644 index 2c576f118..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu deleted file mode 100644 index 8c7da41dd..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu deleted file mode 100644 index 93f29dea8..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu deleted file mode 100644 index 50080c47e..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu deleted file mode 100644 index ae56ddd4c..000000000 --- a/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2024, Tri Dao. -// Splitting the different head dimensions to different files to speed up compilation. -// This file is auto-generated. 
See "generate_kernels.py" - -#include "flash_fwd_launch_template.h" - -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu new file mode 100644 index 000000000..1300e01d6 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu new file mode 100644 index 000000000..754b5d256 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu index 3dd74e273..e72b90ca9 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu index addacedf4..c6dd9c923 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu similarity index 81% rename from csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu index e608e308e..2da6200cd 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, 
cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu similarity index 81% rename from csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu index 8ace7bda9..138d565e7 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu index 484a15e93..598fa570f 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu index 5474ae89d..4384ec420 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu similarity index 81% rename from csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu index 1e133ec1a..b700fb21a 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..e8dcddc4e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu index 1e2e12b8c..752f148bb 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu index 16c34ed3f..0eaf1b0e7 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu similarity index 81% rename from csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu index a959c9ceb..d9efa099e 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu new file mode 100644 index 000000000..34e9db839 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu index ed305767e..389e228a4 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu similarity index 82% rename from csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu rename to csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu index 022064656..8d9d9d6f4 100644 --- a/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu @@ -4,4 +4,4 @@ #include "flash_fwd_launch_template.h" -template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/generate_kernels.py b/csrc/flash_attn/src/generate_kernels.py index 0db6f2d6c..525854b84 100644 --- a/csrc/flash_attn/src/generate_kernels.py +++ b/csrc/flash_attn/src/generate_kernels.py @@ -50,23 +50,24 @@ class Kernel: @property def template(self) -> str: - self.vhead_dim = self.qkhead_dim * 2 + vhead_dim = self.qkhead_dim * 2 if self.direction == "fwd": return KERNEL_IMPL_TEMPLATE_FWD.format( - DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=vhead_dim, IS_CAUSAL=self.is_causal ) elif self.direction == "bwd": return KERNEL_IMPL_TEMPLATE_BWD.format( - DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=vhead_dim, IS_CAUSAL=self.is_causal ) else: return KERNEL_IMPL_TEMPLATE_FWD_SPLIT.format( - DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=vhead_dim, IS_CAUSAL=self.is_causal ) @property def filename(self) -> str: - return f"flash_{self.direction}_qkdim{self.qkhead_dim}_vdim{self.vhead_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" + vhead_dim = self.qkhead_dim * 2 + return f"flash_{self.direction}_qkdim{self.qkhead_dim}_vdim{vhead_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" def get_all_kernels() -> List[Kernel]: diff --git a/csrc/flash_attn/src/kernel_traits.h b/csrc/flash_attn/src/kernel_traits.h index a43e4ea45..c0fe049e2 100644 --- a/csrc/flash_attn/src/kernel_traits.h +++ b/csrc/flash_attn/src/kernel_traits.h @@ -96,7 +96,7 @@ struct Flash_fwd_kernel_traits : public Base { Shape, Int>{})); // 
https://github.com/ColfaxResearch/cutlass-kernels/blob/a222587e6d59b93ba704853d3946fb686d8b8892/src/fmha/fmha_forward.cu#L434 using SmemLayoutVtransposed = decltype( - composition(SmemLayoutKV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); + composition(SmemLayoutV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); using SmemLayoutVtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutVtransposed{})); using SmemLayoutAtomO = decltype( @@ -169,7 +169,7 @@ struct Flash_fwd_kernel_traits : public Base { template > + typename Base=Flash_kernel_traits > struct Flash_bwd_kernel_traits : public Base { using Element = typename Base::Element; using ElementAccum = typename Base::ElementAccum; @@ -251,7 +251,7 @@ struct Flash_bwd_kernel_traits : public Base { SmemLayoutAtomV{}, make_shape(Int{}, Int{}))); using SmemLayoutKtransposed = decltype( - composition(SmemLayoutKV{}, make_layout(Shape, Int>{}, GenRowMajor{}))); + composition(SmemLayoutK{}, make_layout(Shape, Int>{}, GenRowMajor{}))); using SmemLayoutKtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutKtransposed{})); // TODO: generalize to other values of kBlockN diff --git a/csrc/flash_attn/src/static_switch.h b/csrc/flash_attn/src/static_switch.h index a57702f6c..9a73daf1b 100644 --- a/csrc/flash_attn/src/static_switch.h +++ b/csrc/flash_attn/src/static_switch.h @@ -90,25 +90,20 @@ #define HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ if (HEADDIM <= 32) { \ - constexpr static int kHeadDim = 32; \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 64; \ return __VA_ARGS__(); \ } else if (HEADDIM <= 64) { \ - constexpr static int kHeadDim = 64; \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 128; \ return __VA_ARGS__(); \ } else if (HEADDIM <= 96) { \ - constexpr static int kHeadDim = 96; \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 192; \ return __VA_ARGS__(); \ } else if (HEADDIM <= 128) { \ - constexpr static int kHeadDim = 128; \ - return __VA_ARGS__(); \ - } else if (HEADDIM <= 160) { \ - constexpr static int kHeadDim = 160; \ - return __VA_ARGS__(); \ - } else if (HEADDIM <= 192) { \ - constexpr static int kHeadDim = 192; \ - return __VA_ARGS__(); \ - } else if (HEADDIM <= 256) { \ - constexpr static int kHeadDim = 256; \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 256; \ return __VA_ARGS__(); \ } \ }() diff --git a/setup.py b/setup.py index fd67f645b..bc759c03d 100644 --- a/setup.py +++ b/setup.py @@ -180,90 +180,54 @@ def validate_and_update_archs(archs): name="flash_attn_2_cuda", sources=[ "csrc/flash_attn/flash_api.cpp", - "csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu", - 
"csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu", - 
"csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu", + 
"csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu", ], extra_compile_args={ "cxx": ["-O3", "-std=c++17"] + generator_flag, diff --git a/test_flash.py b/test_flash.py new file mode 100644 index 000000000..b7b251b57 --- /dev/null +++ b/test_flash.py @@ -0,0 +1,19 @@ + +import torch +from flash_attn import flash_attn_func +batch = 4 +seqlen_q = 2048 +seqlen_kv = 2048 +dim_qk = 64 +dim_v = 128 +nheads_q = 20 +nheads_kv = 5 +device = torch.device('cuda') +dtype = torch.float16 + +query = torch.randn(batch, seqlen_q, nheads_q, dim_qk, device=device, dtype=dtype) +key = torch.randn(batch, seqlen_kv, nheads_kv, dim_qk, device=device, dtype=dtype) +value = torch.randn(batch, seqlen_kv, nheads_kv, dim_v, device=device, dtype=dtype) + +output = flash_attn_func(query, key, value, causal=False) +print(output[0,0,0,0]) \ No newline at end of file From 02da101d5449f144180c9272edb288768c93ef89 Mon Sep 17 00:00:00 2001 From: xiayuqing0622 Date: Tue, 13 Aug 2024 02:11:34 -0700 Subject: [PATCH 03/46] modify readme --- README.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1d0897ab5..b70746d1c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,14 @@ -# FlashAttention -This repository provides the official implementation of FlashAttention and -FlashAttention-2 from the -following papers. +# Customized FlashAttention + +This repository provides Customized FlashAttention based on the official implementation. +we have supported: +- FlashAttention-2 with QKHeadDim=32, VHeadDim=64 +- FlashAttention-2 with QKHeadDim=64, VHeadDim=128 +- FlashAttention-2 with QKHeadDim=128, VHeadDim=256 + +Feel free to tell us what else you need. We might support it soon. :) + +Currently, we do not provide prebuilt library, you need to compile from source. **FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness** Tri Dao, Daniel Y. 
Fu, Stefano Ermon, Atri Rudra, Christopher Ré From 2bce87c98509448b6fe3d5d8bdcd68b68e7f87af Mon Sep 17 00:00:00 2001 From: xiayuqing0622 Date: Thu, 15 Aug 2024 04:40:23 -0700 Subject: [PATCH 04/46] compatible --- csrc/flash_attn/flash_api.cpp | 19 +- .../src/flash_bwd_hdim128_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim128_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim128_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim128_fp16_sm80.cu | 10 + .../src/flash_bwd_hdim160_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim160_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim160_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim160_fp16_sm80.cu | 10 + .../src/flash_bwd_hdim192_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim192_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim192_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim192_fp16_sm80.cu | 10 + .../src/flash_bwd_hdim256_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim256_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim256_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim256_fp16_sm80.cu | 10 + .../src/flash_bwd_hdim32_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim32_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim32_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim32_fp16_sm80.cu | 10 + .../src/flash_bwd_hdim64_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim64_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim64_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim64_fp16_sm80.cu | 10 + .../src/flash_bwd_hdim96_bf16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim96_bf16_sm80.cu | 10 + .../src/flash_bwd_hdim96_fp16_causal_sm80.cu | 10 + .../src/flash_bwd_hdim96_fp16_sm80.cu | 10 + .../src/flash_bwd_launch_template.h | 280 +++++++++++++----- .../src/flash_fwd_hdim128_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim128_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim128_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim128_fp16_sm80.cu | 10 + .../src/flash_fwd_hdim160_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim160_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim160_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim160_fp16_sm80.cu | 10 + .../src/flash_fwd_hdim192_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim192_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim192_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim192_fp16_sm80.cu | 10 + .../src/flash_fwd_hdim256_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim256_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim256_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim256_fp16_sm80.cu | 10 + .../src/flash_fwd_hdim32_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim32_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim32_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim32_fp16_sm80.cu | 10 + .../src/flash_fwd_hdim64_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim64_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim64_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim64_fp16_sm80.cu | 10 + .../src/flash_fwd_hdim96_bf16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim96_bf16_sm80.cu | 10 + .../src/flash_fwd_hdim96_fp16_causal_sm80.cu | 10 + .../src/flash_fwd_hdim96_fp16_sm80.cu | 10 + .../src/flash_fwd_launch_template.h | 205 +++++++++---- ...lash_fwd_split_hdim128_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim128_bf16_sm80.cu | 7 + ...lash_fwd_split_hdim128_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim128_fp16_sm80.cu | 7 + ...lash_fwd_split_hdim160_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim160_bf16_sm80.cu | 7 + ...lash_fwd_split_hdim160_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim160_fp16_sm80.cu | 7 + 
...lash_fwd_split_hdim192_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim192_bf16_sm80.cu | 7 + ...lash_fwd_split_hdim192_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim192_fp16_sm80.cu | 7 + ...lash_fwd_split_hdim256_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim256_bf16_sm80.cu | 7 + ...lash_fwd_split_hdim256_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim256_fp16_sm80.cu | 7 + ...flash_fwd_split_hdim32_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim32_bf16_sm80.cu | 7 + ...flash_fwd_split_hdim32_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim32_fp16_sm80.cu | 7 + ...flash_fwd_split_hdim64_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim64_bf16_sm80.cu | 7 + ...flash_fwd_split_hdim64_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim64_fp16_sm80.cu | 7 + ...flash_fwd_split_hdim96_bf16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim96_bf16_sm80.cu | 7 + ...flash_fwd_split_hdim96_fp16_causal_sm80.cu | 7 + .../src/flash_fwd_split_hdim96_fp16_sm80.cu | 7 + csrc/flash_attn/src/generate_kernels.py | 75 +++-- csrc/flash_attn/src/static_switch.h | 71 ++++- setup.py | 85 ++++++ 90 files changed, 1318 insertions(+), 173 deletions(-) create mode 100644 csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu create mode 
100644 csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu create mode 100644 
csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index 35ac2229e..a1a50f2f1 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -242,14 +242,14 @@ void set_params_dgrad(Flash_bwd_params ¶ms, void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split_kernel=false) { FP16_SWITCH(!params.is_bf16, [&] { - HEADDIM_SWITCH(params.d, [&] { - BOOL_SWITCH(params.is_causal, Is_causal, [&] { - if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0 - run_mha_fwd_(params, stream); - } else { - run_mha_fwd_splitkv_dispatch(params, stream); - } - }); + QKHEADDIM_VHEADDIM_SWITCH(params.d, params.vd, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0 + run_mha_fwd_(params, stream); + } else { + run_mha_fwd_splitkv_dispatch(params, stream); + } + }); }); }); } @@ -838,14 +838,13 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s void run_mha_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { FP16_SWITCH(!params.is_bf16, [&] { - HEADDIM_SWITCH(params.d, [&] { + QKHEADDIM_VHEADDIM_SWITCH(params.d, params.vd, [&] { BOOL_SWITCH(params.is_causal, Is_causal, [&] { run_mha_bwd_(params, stream); }); }); }); } - std::vector mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..3597cd8fc --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu new file mode 100644 index 000000000..a2155d523 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..ee32c0aa9 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu new file mode 100644 index 000000000..968f07ac0 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu new file mode 100644 index 000000000..7ee4d45f2 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu new file mode 100644 index 000000000..e3697365b --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu new file mode 100644 index 000000000..5bdea8f5a --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu new file mode 100644 index 000000000..0194aa487 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 000000000..f55649e53 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu new file mode 100644 index 000000000..8758a0f00 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 000000000..a9cb850de --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu new file mode 100644 index 000000000..66e7029af --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 000000000..972b24972 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu new file mode 100644 index 000000000..632b15b8f --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 000000000..16edaf9ae --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu new file mode 100644 index 000000000..4aaa83cb6 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 000000000..cef067c57 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu new file mode 100644 index 000000000..6e723a55f --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 000000000..87460d632 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu new file mode 100644 index 000000000..439489c14 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 000000000..af11800ff --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu new file mode 100644 index 000000000..b2fc12156 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 000000000..b2d08eaa1 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu new file mode 100644 index 000000000..d479f07ac --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 000000000..01c74893e --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu new file mode 100644 index 000000000..4b17006a6 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 000000000..68c299d29 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu new file mode 100644 index 000000000..75d6d7822 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_launch_template.h b/csrc/flash_attn/src/flash_bwd_launch_template.h index 161a4c56b..76d49b32d 100644 --- a/csrc/flash_attn/src/flash_bwd_launch_template.h +++ b/csrc/flash_attn/src/flash_bwd_launch_template.h @@ -129,9 +129,8 @@ void run_flash_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { } template -void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 32; - constexpr static int VHeaddim = 64; +void run_mha_bwd_hdim32(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 32; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -141,27 +140,21 @@ void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { C10_CUDA_CHECK(status_); } DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 128; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB - if (max_smem_per_block >= 104 * 1024) { // 104 KB + if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } } else { // 96 KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } template -void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 64; - constexpr static int VHeaddim = 128; +void run_mha_bwd_hdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -173,46 +166,40 @@ void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) // printf("max_smem_per_block = %d\n", max_smem_per_block); DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { // Changing AtomLayoutMdQ from 2 to 4 takes the same time - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
- constexpr static int Br = 128; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); // This has a lot of register spilling - // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); } else { // if (params.h == params.h_k) { - // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); // } else { // } } }); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); // M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); } template -void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 96; - constexpr static int VHeaddim = 192; +void run_mha_bwd_hdim96(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 96; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -223,27 +210,22 @@ void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) } // printf("max_smem_per_block = %d\n", max_smem_per_block); DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 64; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); if (max_smem_per_block >= 116 * 1024) { if constexpr(!Is_dropout) { // 92KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { // 116 KB // This is faster for dropout since we don't have many registers to spare - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } template -void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 128; - constexpr static int VHeaddim = 256; +void run_mha_bwd_hdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; int device; cudaGetDevice(&device); int max_smem_per_block; @@ -254,31 +236,27 @@ void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) } // printf("max_smem_per_block = %d\n", max_smem_per_block); 
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 64; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); - // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); } else { - // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); }); } -/* + template void run_mha_bwd_hdim160(Flash_bwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 160; @@ -292,9 +270,9 @@ void run_mha_bwd_hdim160(Flash_bwd_params ¶ms, cudaStream_t stream) { } DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { if (max_smem_per_block >= 116 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } @@ -312,9 +290,9 @@ void run_mha_bwd_hdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { } DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { if (max_smem_per_block >= 136 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } @@ -332,14 +310,164 @@ void run_mha_bwd_hdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { } DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { if (max_smem_per_block >= 176 * 1024) { // H100 - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else if (max_smem_per_block >= 144 * 1024) { // A100, we don't do double buffering to save smem - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { // sm86 and sm89, max smem is 99 KB. Only works without dropout. V in regs and no double buffering. 
if constexpr (!Is_dropout) { - run_flash_bwd, false, Is_causal>(params, stream); + run_flash_bwd, false, Is_causal>(params, stream); } } }); } -*/ \ No newline at end of file + +template +void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 128; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB + if (max_smem_per_block >= 104 * 1024) { // 104 KB + if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + } else { // 96 KB + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + +template +void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + // Changing AtomLayoutMdQ from 2 to 4 takes the same time + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
+ constexpr static int Br = 128; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + + if (max_smem_per_block >= 144 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // This has a lot of register spilling + // run_flash_bwd, Is_dropout>(params, stream); + } else { + // if (params.h == params.h_k) { + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // } else { + // } + } + }); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + + // run_flash_bwd>(params, stream); +} + +template +void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + if (max_smem_per_block >= 116 * 1024) { + if constexpr(!Is_dropout) { // 92KB + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { // 116 KB + // This is faster for dropout since we don't have many registers to spare + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + +template +void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // run_flash_bwd>(params, stream); + // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). + // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. 
+ // run_flash_bwd>(params, stream); + if (max_smem_per_block >= 144 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + } else { + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_bwd>(params, stream); + + // run_flash_bwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..8085f173b --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 000000000..49e011fca --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..bcccc6b80 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu new file mode 100644 index 000000000..0779bd8f9 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu new file mode 100644 index 000000000..4be6cc5ad --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
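For reference, the smem_size expression computed by the new run_mha_bwd_qkdim*_vdim* launchers above can be evaluated offline to see roughly how much shared memory each supported (QKHeadDim, VHeadDim) pair requests for the backward pass. Below is a minimal sketch, assuming 2-byte fp16/bf16 elements and the Br/Bc tile sizes hard-coded in those launchers; the kernel traits actually selected at runtime may use different tile shapes, so treat the numbers as estimates only.

def bwd_smem_kb(qk_headdim, v_headdim, br, bc, elem_bytes=2):
    # Mirrors the smem_size expression in the launchers above: Q is double-buffered,
    # and dO, K/dK, V/dV, and the dS/P tiles all live in shared memory (counted in elements).
    elems = (br * qk_headdim * 2   # Q with double buffer
             + br * v_headdim      # dO
             + bc * qk_headdim     # K, dK
             + bc * v_headdim      # V, dV
             + br * bc * 2)        # dS, P
    return elem_bytes * elems / 1024

# (QKHeadDim, VHeadDim, Br, Bc) as hard-coded in the launchers above.
for qk, v, br, bc in [(32, 64, 128, 128), (64, 128, 128, 128),
                      (96, 192, 64, 128), (128, 256, 64, 128)]:
    print(f"qkdim={qk:3d} vdim={v:3d} Br={br} Bc={bc} -> {bwd_smem_kb(qk, v, br, bc):.0f} KB")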
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu new file mode 100644 index 000000000..121d4b22e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu new file mode 100644 index 000000000..f3f0c5f5b --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu new file mode 100644 index 000000000..44d0dab1f --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 000000000..478455719 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu new file mode 100644 index 000000000..dbc9c3a09 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 000000000..f6ad159b6 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu new file mode 100644 index 000000000..379d9587a --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 000000000..2755a2571 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 000000000..57f431d1b --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 000000000..7781859fd --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu new file mode 100644 index 000000000..274160793 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
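All of the per-head-dimension .cu files added in this series follow the same one-line explicit-instantiation pattern shown above, which is why they are script-generated. As a rough illustration of the naming scheme only (the qkdim/vdim file names follow the ones listed in this series' setup.py; the real generate_kernels.py is modified by this series but not reproduced in full here, and the helper below is hypothetical):

HEAD_DIM_PAIRS = [(32, 64), (64, 128), (96, 192), (128, 256)]
DTYPES = ["fp16", "bf16"]

def kernel_files(kind):
    # kind is e.g. "flash_fwd", "flash_fwd_split", "flash_bwd"; only the
    # flash_fwd_split qkdim/vdim names appear verbatim in this series' setup.py.
    names = []
    for qk, v in HEAD_DIM_PAIRS:
        for dtype in DTYPES:
            for causal in ("_causal", ""):
                names.append(f"{kind}_qkdim{qk}_vdim{v}_{dtype}{causal}_sm80.cu")
    return names

print(len(kernel_files("flash_fwd_split")), "split-KV forward files, e.g.",
      kernel_files("flash_fwd_split")[0])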
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 000000000..f19c7c1d3 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 000000000..0c1b8f35c --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 000000000..7f1541051 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu new file mode 100644 index 000000000..0776a30a8 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 000000000..dbfb66e71 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 000000000..f03b5a88b --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 000000000..019754bf2 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu new file mode 100644 index 000000000..c043773ec --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 000000000..9c997288b --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu new file mode 100644 index 000000000..443060de4 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 000000000..1f02afa83 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu new file mode 100644 index 000000000..7bdfa7bcb --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_launch_template.h b/csrc/flash_attn/src/flash_fwd_launch_template.h index 3d6a1ddd5..cfb4264a6 100644 --- a/csrc/flash_attn/src/flash_fwd_launch_template.h +++ b/csrc/flash_attn/src/flash_fwd_launch_template.h @@ -168,54 +168,51 @@ void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream) } template -void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 32; - constexpr static int VHeaddim = 64; +void run_mha_fwd_hdim32(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 32; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); }); } template -void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 64; - constexpr static int VHeaddim = 128; +void run_mha_fwd_hdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { if constexpr(!Is_dropout) { // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower // Using block size (64 x 256) is 27% slower for seqlen=2k // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); } }); } template -void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 96; - constexpr static int VHeaddim = 192; +void run_mha_fwd_hdim96(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 96; auto dprops = at::cuda::getCurrentDeviceProperties(); bool is_sm8x = dprops->major == 8 && dprops->minor > 0; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), if (is_sm8x) { if 
constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // These two are always slower // run_flash_fwd>(params, stream); // run_flash_fwd>(params, stream); @@ -223,9 +220,8 @@ void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) } template -void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 128; - constexpr static int VHeaddim = 256; +void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; auto dprops = at::cuda::getCurrentDeviceProperties(); bool is_sm8x = dprops->major == 8 && dprops->minor > 0; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { @@ -234,30 +230,30 @@ void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. if (is_sm8x) { if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // 1st ones are good for H100, A100 // 2nd one is good for A6000 bc we get slightly better occupancy } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); } }); } -/* + template void run_mha_fwd_hdim160(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 160; @@ -269,20 +265,20 @@ void run_mha_fwd_hdim160(Flash_fwd_params ¶ms, cudaStream_t stream) { // and 128 x 64 with 8 warps is the fastest for non-causal. 
if (is_sm8x) { if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); }); } @@ -291,15 +287,15 @@ void run_mha_fwd_hdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 192; DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { if constexpr(!Is_dropout) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); }); } @@ -321,14 +317,103 @@ void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { // For A100, we want to run with 128 x 64 (128KB smem). // For H100 we want to run with 64 x 64 (96KB smem) since then we can get 2 CTAs per SM. 
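+        // Rough arithmetic behind the check below (a sketch, assuming 2-byte elements and one
+        // Q tile of kBlockM rows plus K and V tiles of kBlockN rows each, all Headdim wide):
+        //   128 x 64 tiles: 2 * 256 * (128 + 2 * 64) = 131072 B = 128 KB per block (the A100 config);
+        //   64 x 64 tiles:  2 * 256 * (64 + 2 * 64)  =  98304 B =  96 KB per block, so two CTAs per
+        //   SM need 192 KB. We therefore pick 128 x 64 only when one block fits but two 96 KB CTAs do not.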
if (max_smem_per_block >= 2 * Headdim * (128 + 2 * 64) && max_smem_per_sm < 4 * Headdim * (64 + 2 * 64)) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); } // 64 KB - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // 96 KB - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + }); +} +template +void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + }); +} + +template +void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower + // Using block size (64 x 256) is 27% slower for seqlen=2k + // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + +template +void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm8x = dprops->major == 8 && dprops->minor > 0; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), + if (is_sm8x) { + if constexpr(!Is_causal) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // These two are always slower + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + }); +} + +template +void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm8x = dprops->major == 8 && dprops->minor > 0; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), + // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. 
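+        // With VHeaddim = 256 the V tile is twice as wide as in the plain hdim128 case, so the
+        // numbers above shift: assuming the same Q + K + V tiling at 2 bytes per element,
+        // a 128 x 32 tile needs about 2 * (128*128 + 32*128 + 32*256) = 57344 B (~56 KB) rather
+        // than 48 KB, and a 64 x 64 tile about 2 * (64*128 + 64*128 + 64*256) = 65536 B (64 KB).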
+ if (is_sm8x) { + if constexpr(!Is_causal) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // 1st ones are good for H100, A100 + // 2nd one is good for A6000 bc we get slightly better occupancy + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } }); } -*/ \ No newline at end of file diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..00bbaa081 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu new file mode 100644 index 000000000..ef2649ee6 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..e610f55da --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu new file mode 100644 index 000000000..dd0018f44 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu new file mode 100644 index 000000000..2b05a20aa --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu new file mode 100644 index 000000000..78e309cdd --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu new file mode 100644 index 000000000..0504ed5b9 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu new file mode 100644 index 000000000..f21f65c9a --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu new file mode 100644 index 000000000..b7fc5e1b6 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu new file mode 100644 index 000000000..2364925ef --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu new file mode 100644 index 000000000..049afac12 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu new file mode 100644 index 000000000..3c16d8f6e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu new file mode 100644 index 000000000..fb707522e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu new file mode 100644 index 000000000..94f299c90 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu new file mode 100644 index 000000000..0c7ed2c67 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu new file mode 100644 index 000000000..8367a6f9f --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu new file mode 100644 index 000000000..ce3ee1383 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu new file mode 100644 index 000000000..3f8a058c2 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu new file mode 100644 index 000000000..bfcb6e98a --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu new file mode 100644 index 000000000..2abfb9e72 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu new file mode 100644 index 000000000..aa61ba301 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu new file mode 100644 index 000000000..4906716d7 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu new file mode 100644 index 000000000..8d34ac42e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu new file mode 100644 index 000000000..9fc79fbd2 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu new file mode 100644 index 000000000..9002f16c2 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu new file mode 100644 index 000000000..76d0c69a8 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu new file mode 100644 index 000000000..fe1014408 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu new file mode 100644 index 000000000..611f0a4c1 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/generate_kernels.py b/csrc/flash_attn/src/generate_kernels.py index 525854b84..1b9bc9fbc 100644 --- a/csrc/flash_attn/src/generate_kernels.py +++ b/csrc/flash_attn/src/generate_kernels.py @@ -15,23 +15,42 @@ } SM = [80] # Sm80 kernels support up to -# HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256] -HEAD_DIMENSIONS = [32, 64, 96, 128] +HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256] IS_CAUSAL = ["false", "true"] KERNEL_IMPL_TEMPLATE_FWD = """#include "flash_fwd_launch_template.h" +template<> +void run_mha_fwd_<{DTYPE}, {HEAD_DIM}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream) {{ + run_mha_fwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +}} +""" + +KERNEL_IMPL_TEMPLATE_FWD_SPLIT = """#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch<{DTYPE}, {HEAD_DIM}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream); +""" + +KERNEL_IMPL_TEMPLATE_BWD = """#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_<{DTYPE}, {HEAD_DIM}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_bwd_params ¶ms, cudaStream_t stream) {{ + run_mha_bwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +}} +""" +KERNEL_IMPL_TEMPLATE_FWD_VDIM = """#include "flash_fwd_launch_template.h" + template<> void run_mha_fwd_<{DTYPE}, {QKHEAD_DIM}, {VHEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream) {{ run_mha_fwd_qkdim{QKHEAD_DIM}_vdim{VHEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); }} """ -KERNEL_IMPL_TEMPLATE_FWD_SPLIT = """#include "flash_fwd_launch_template.h" +KERNEL_IMPL_TEMPLATE_FWD_SPLIT_VDIM = """#include "flash_fwd_launch_template.h" template void run_mha_fwd_splitkv_dispatch<{DTYPE}, {QKHEAD_DIM}, {VHEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream); """ -KERNEL_IMPL_TEMPLATE_BWD = """#include "flash_bwd_launch_template.h" +KERNEL_IMPL_TEMPLATE_BWD_VDIM = """#include "flash_bwd_launch_template.h" template<> void run_mha_bwd_<{DTYPE}, {QKHEAD_DIM}, {VHEAD_DIM}, {IS_CAUSAL}>(Flash_bwd_params ¶ms, cudaStream_t stream) {{ @@ -45,35 +64,53 @@ class Kernel: sm: int dtype: str qkhead_dim: int + vhead_dim: int is_causal: bool direction: str @property def template(self) -> str: - vhead_dim = self.qkhead_dim * 2 - if self.direction == "fwd": - return KERNEL_IMPL_TEMPLATE_FWD.format( - DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=vhead_dim, IS_CAUSAL=self.is_causal - ) - elif self.direction == "bwd": - return KERNEL_IMPL_TEMPLATE_BWD.format( - DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=vhead_dim, IS_CAUSAL=self.is_causal - ) + if self.qkhead_dim == self.vhead_dim: + if self.direction == "fwd": + return KERNEL_IMPL_TEMPLATE_FWD.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.qkhead_dim, IS_CAUSAL=self.is_causal + ) + elif self.direction == "bwd": + return KERNEL_IMPL_TEMPLATE_BWD.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.qkhead_dim, IS_CAUSAL=self.is_causal + ) + else: + return KERNEL_IMPL_TEMPLATE_FWD_SPLIT.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.qkhead_dim, IS_CAUSAL=self.is_causal + ) else: - return KERNEL_IMPL_TEMPLATE_FWD_SPLIT.format( - DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=vhead_dim, IS_CAUSAL=self.is_causal - ) + if self.direction == "fwd": + return KERNEL_IMPL_TEMPLATE_FWD_VDIM.format( + 
DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal + ) + elif self.direction == "bwd": + return KERNEL_IMPL_TEMPLATE_BWD_VDIM.format( + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal + ) + else: + return KERNEL_IMPL_TEMPLATE_FWD_SPLIT_VDIM.format( + DTYPE=DTYPE_MAP[self.dtype], QKHEAD_DIM=self.qkhead_dim, VHEAD_DIM=self.vhead_dim, IS_CAUSAL=self.is_causal + ) @property def filename(self) -> str: - vhead_dim = self.qkhead_dim * 2 - return f"flash_{self.direction}_qkdim{self.qkhead_dim}_vdim{vhead_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" + if self.qkhead_dim == self.vhead_dim: + return f"flash_{self.direction}_hdim{self.qkhead_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" + else: + return f"flash_{self.direction}_qkdim{self.qkhead_dim}_vdim{self.vhead_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" def get_all_kernels() -> List[Kernel]: for direction in ["fwd", "fwd_split", "bwd"]: for dtype, qkhead_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): - yield Kernel(sm=sm, dtype=dtype, qkhead_dim=qkhead_dim, is_causal=is_causal, direction=direction) + for vhead_dim in [qkhead_dim, 2 * qkhead_dim]: + if vhead_dim <= 256: + yield Kernel(sm=sm, dtype=dtype, qkhead_dim=qkhead_dim, vhead_dim=vhead_dim,is_causal=is_causal, direction=direction) def write_kernel(kernel: Kernel, autogen_dir: Path) -> None: diff --git a/csrc/flash_attn/src/static_switch.h b/csrc/flash_attn/src/static_switch.h index 9a73daf1b..8e663d8c3 100644 --- a/csrc/flash_attn/src/static_switch.h +++ b/csrc/flash_attn/src/static_switch.h @@ -90,20 +90,75 @@ #define HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ if (HEADDIM <= 32) { \ - constexpr static int kQKHeadDim = 32; \ - constexpr static int kVHeadDim = 64; \ + constexpr static int kHeadDim = 32; \ return __VA_ARGS__(); \ } else if (HEADDIM <= 64) { \ - constexpr static int kQKHeadDim = 64; \ - constexpr static int kVHeadDim = 128; \ + constexpr static int kHeadDim = 64; \ return __VA_ARGS__(); \ } else if (HEADDIM <= 96) { \ - constexpr static int kQKHeadDim = 96; \ - constexpr static int kVHeadDim = 192; \ + constexpr static int kHeadDim = 96; \ return __VA_ARGS__(); \ } else if (HEADDIM <= 128) { \ - constexpr static int kQKHeadDim = 128; \ - constexpr static int kVHeadDim = 256; \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 160) { \ + constexpr static int kHeadDim = 160; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 192) { \ + constexpr static int kHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ return __VA_ARGS__(); \ } \ }() + +#define QKHEADDIM_VHEADDIM_SWITCH(QKHEADDIM, VHEADDIM, ...) 
\ + [&] { \ + if (QKHEADDIM <= 32 && VHEADDIM <= 32) { \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 32; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 32 && VHEADDIM <= 64) { \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 64 && VHEADDIM <= 64) { \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 64 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 96 && VHEADDIM <= 96) { \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 96; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 96 && VHEADDIM <= 192) { \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 128 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 128 && VHEADDIM <= 256) { \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 256; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 160 && VHEADDIM <= 160) { \ + constexpr static int kQKHeadDim = 160; \ + constexpr static int kVHeadDim = 160; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 192 && VHEADDIM <= 192) { \ + constexpr static int kQKHeadDim = 192; \ + constexpr static int kVHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 256 && VHEADDIM <= 256) { \ + constexpr static int kQKHeadDim = 256; \ + constexpr static int kVHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() + diff --git a/setup.py b/setup.py index bc759c03d..f80af1c32 100644 --- a/setup.py +++ b/setup.py @@ -180,6 +180,91 @@ def validate_and_update_archs(archs): name="flash_attn_2_cuda", sources=[ "csrc/flash_attn/flash_api.cpp", + "csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim32_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim32_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim64_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim64_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim96_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim96_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim128_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu", + 
"csrc/flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim256_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_hdim256_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim32_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim32_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim64_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim64_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim96_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim96_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim256_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_bwd_hdim256_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu", + 
"csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu", + "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu", "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu", "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu", From e8b4082189911a03e454ec707cb711cefb26fd5a Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Mon, 19 Aug 2024 19:32:44 +0800 Subject: [PATCH 05/46] test_head_dim --- tests/test_flash_attn_headdim.py | 769 +++++++++++++++++++++++++++++++ 1 file changed, 769 insertions(+) create mode 100644 tests/test_flash_attn_headdim.py diff --git a/tests/test_flash_attn_headdim.py b/tests/test_flash_attn_headdim.py new file mode 100644 index 000000000..19b882750 --- /dev/null +++ b/tests/test_flash_attn_headdim.py @@ -0,0 +1,769 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn import ( + flash_attn_func, + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, + flash_attn_with_kvcache, +) +from flash_attn.bert_padding import pad_input, unpad_input +from flash_attn.flash_attn_interface import _get_block_size_n +from flash_attn.layers.rotary import apply_rotary_emb + +MAX_HEADDIM_SM8x = 192 + + +is_sm75 = torch.cuda.get_device_capability("cuda") == (7, 5) +is_sm8x = torch.cuda.get_device_capability("cuda")[0] == 8 +is_sm80 = torch.cuda.get_device_capability("cuda") == (8, 0) +is_sm90 = torch.cuda.get_device_capability("cuda") == (9, 0) + + +def attn_bias_from_alibi_slopes( + slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False, key_leftpad=None +): + batch, nheads = slopes.shape + device = slopes.device + slopes = rearrange(slopes, "b h -> b h 1 1") + if causal: + return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes + else: + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + relative_pos = torch.abs(row_idx + sk - sq - col_idx) + return -slopes * relative_pos.to(dtype=slopes.dtype) + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint( + max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device + ) + elif mode == "third": + lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) + padding_mask = ( + repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + ) + return padding_mask + + +def generate_qkv( + q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: 
(batch_size, seqlen_k, nheads_k, d) + v: (batch_size, seqlen_k, nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + assert not (kvpacked and qkvpacked) + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + output_pad_fn = lambda output_unpad: pad_input( + output_unpad, indices_q, batch_size, seqlen_q + ) + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + output_pad_fn = lambda output_unpad: rearrange( + output_unpad, "(b s) h d -> b s h d", b=batch_size + ) + + if key_padding_mask is not None: + k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device + ) + max_seqlen_k = seqlen_k + + if qkvpacked: + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + else: + dqkv_pad_fn = lambda dqkv_unpad: rearrange( + dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + elif kvpacked: + kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) + kv = torch.stack([k, v], dim=2) + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) + else: + dkv_pad_fn = lambda dkv_unpad: rearrange( + dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + q_unpad.detach().requires_grad_(), + kv_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + kv.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) + else: + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k) + else: + dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) + return ( + q_unpad.detach().requires_grad_(), + k_unpad.detach().requires_grad_(), + v_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + k.detach().requires_grad_(), + v.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) + + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if 
key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: (int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. 
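+        Note: v may use a different head_dim than q/k (as in the qkdim/vdim kernels above); the
+            1/sqrt(d) scaling uses q's head_dim and the output head_dim follows v.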
+ Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores = scores / softcap + scores = scores.tanh() + scores = scores * softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) + + +def attention_kvpacked_ref( + q, + kv, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + return attention_ref( + q, + kv[:, :, 0], + kv[:, :, 1], + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + key_leftpad=key_leftpad, + ) + + +def attention_qkvpacked_ref( + qkv, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, +): + return attention_ref( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + ) + + +def generate_sparsity_mask(seqlen, sparsity=0.3): + repeats = seqlen // 16 // 2 + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([0, 1] * 
repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + nrow, ncol = seqlen // 16, seqlen // 256 + mask = torch.rand(nrow, ncol, device="cuda") < sparsity + return mask + + +def attention_blocksparse_ref(qkv, blockmask, attn_mask, dropout_p, dropout_mask): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + blockmask: (seqlen / 16, seqlen / 256) + attn_mask: (batch_size, seqlen) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen, seqlen) + Output: + output: (batch_size, seqlen, nheads, head_dim) + attention: softmax after dropout + """ + q, k, v = qkv.float().unbind(dim=2) + d = qkv.shape[-1] + seqlen = qkv.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + scores.masked_fill_(rearrange(~attn_mask, "b s -> b 1 1 s"), float("-inf")) + blockmask = repeat(blockmask, "s_16 s_256 -> (s_16 16) (s_256 256)") + blockmask = blockmask[:seqlen, :seqlen] + scores.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), float("-inf")) + attention = torch.softmax(scores, dim=-1) + attention = attention.masked_fill(rearrange(~attn_mask, "b s -> b 1 s 1"), 0.0) + attention = attention.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), 0.0) + attention_drop = attention.masked_fill(~dropout_mask, 0.0) / (1 - dropout_p) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + output.masked_fill_(rearrange(~attn_mask, "b s -> b s 1 1"), 0) + return output.to(dtype=qkv.dtype), attention.to(dtype=qkv.dtype) + + +def convert_flash_attn_S_to_softmax( + S, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + head_dim, + is_dropout, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """FlashAttention stores the S matrix in a different way. + Arguments: + S: (batch_size, nheads, seqlen_q_rounded, seqlen_k_rounded) + query_padding_mask: (batch_size, seqlen_q_rounded) + key_padding_mask: (batch_size, seqlen_k_rounded) + """ + if causal: + window_size = (window_size[0], 0) + seqlen_q_rounded, seqlen_k_rounded = S.shape[-2:] + S_converted = S + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + S.device, + ) + local_mask = F.pad( + local_mask, + (0, seqlen_k_rounded - seqlen_k, 0, seqlen_q_rounded - seqlen_q), + value=True, + ) + S_converted = S_converted.masked_fill(local_mask, 0.0) + + # Need to zero out things not in attention_mask in case S was initialized with random values + # and some of those values aren't overwritten. 
+ seqlen_q_og = ( + query_padding_mask.shape[-1] if query_padding_mask is not None else seqlen_q_rounded + ) + if query_padding_mask is not None: + query_padding_mask = F.pad(query_padding_mask, (0, seqlen_q_rounded - seqlen_q_og)) + S_converted = S_converted.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + seqlen_k_og = key_padding_mask.shape[-1] if key_padding_mask is not None else seqlen_k + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, seqlen_k_rounded - seqlen_k_og)) + S_converted = S_converted.masked_fill(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0) + S_converted = F.pad(S_converted, (0, 0, 0, seqlen_q_og - seqlen_q_rounded)) + S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k_rounded)) + return S_converted[:, :, :seqlen_q, :seqlen_k] + + +def normalize_flash_attn_S( + attn_unnorm, + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + is_dropout=False, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k, v: (batch_size, seqlen_k, nheads, head_dim) + key_padding_mask: (batch_size, seqlen_q) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + Output: + softmax_lse: (batch_size, nheads, seqlen_q) + softmax_max: (batch_size, nheads, seqlen_q) + """ + if causal: + window_size = (window_size[0], 0) + q, k, v = q.float(), k.float(), v.float() + _, seqlen_q, _, head_dim = q.shape + seqlen_k = k.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(head_dim), k) + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias.to(dtype=scores.dtype) + block_size_n = _get_block_size_n(scores.device, head_dim, is_dropout, causal) + scores_block = scores.split(block_size_n, dim=-1) + lse_block = torch.stack([torch.logsumexp(s, dim=-1) for s in scores_block], dim=-1) + lse = torch.logsumexp(lse_block, dim=-1) + # lse could be -inf (i.e. all values in scores are -inf), and we want to set those to inf + # so that when we do torch.exp(m - lse), we get 0.0 instead of NaN. + lse[lse == float("-inf")] = float("inf") + scores_max_block = torch.stack([torch.amax(s, dim=-1) for s in scores_block], dim=-1) + cummax_block = torch.cummax(scores_max_block.flip(-1), dim=-1).values.flip(-1).unbind(dim=-1) + attn_unnorm_block = attn_unnorm.split(block_size_n, dim=-1) + attn_norm = torch.cat( + [ + a * rearrange(torch.exp(m - lse), "b h s -> b h s 1") + for a, m in zip(attn_unnorm_block, cummax_block) + ], + dim=-1, + ) + if query_padding_mask is not None: + attn_norm.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + return attn_norm.to(dtype=attn_unnorm.dtype) + + +def get_dropout_fraction( + dropout_mask, + query_padding_mask=None, + key_padding_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k), bool. True means keep, False means drop. 
+ query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + """ + if causal: + window_size = (window_size[0], 0) + batch_size, nheads, seqlen_q, seqlen_k = dropout_mask.shape + dropped = ~dropout_mask + valid = torch.ones_like(dropout_mask) + if query_padding_mask is not None: + dropped.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + valid.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + if key_padding_mask is not None: + dropped.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + valid.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + dropout_mask.device, + ) + dropped.masked_fill_(local_mask, False) + valid.masked_fill_(local_mask, False) + dropped_total = dropped.sum() + return dropped.sum() / valid.sum() + + +@pytest.mark.parametrize("kvpacked", [False]) +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize("mha_type", ["mha"]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +# @pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("d,v_d", [(32, 64), (64, 128), (96, 192), (128, 256)]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +@pytest.mark.parametrize("softcap", [0.0, 50.0]) +def test_flash_attn_output( + seqlen_q, seqlen_k, d, v_d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked, softcap +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if softcap > 0.0 and dropout_p > 0.0: + pytest.skip("Softcap and dropout not supported together") + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 6 if softcap == 0.0 else 4 # softcap reference impl takes more memory + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if softcap > 0: + # Ensure the values of qk are at least within softcap range. 
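+ # (With unit-variance q and k the logits are roughly unit scale, so dividing by a softcap
+ # of ~50 would leave tanh in its near-linear region and the capping would be a numerical
+ # no-op; scaling q by softcap makes the logits comparable to softcap so the capped path is
+ # actually exercised.)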
+ q = q * softcap + assert kvpacked == False + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, v_d, device=device, dtype=dtype, requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + + out, lse, S_dmask = flash_attn_func( + q, + k, + v, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + attn = normalize_flash_attn_S( + attn_unnorm, + q, + k_rep, + v_rep, + None, + None, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, None, None, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + if ((d <= MAX_HEADDIM_SM8x and v_d <= MAX_HEADDIM_SM8x) or dropout_p == 0) or (is_sm80 or is_sm90): + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - 
dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if ((d <= MAX_HEADDIM_SM8x and v_d <= MAX_HEADDIM_SM8x) or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() From ebf0b16de59b2a8bc30ff4e88eccdce17920f4eb Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Tue, 20 Aug 2024 09:50:33 +0800 Subject: [PATCH 06/46] add test headdim --- tests/test_flash_attn_headdim.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_flash_attn_headdim.py b/tests/test_flash_attn_headdim.py index 19b882750..363de0efc 100644 --- a/tests/test_flash_attn_headdim.py +++ b/tests/test_flash_attn_headdim.py @@ -583,7 +583,12 @@ def get_dropout_fraction( # @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) # @pytest.mark.parametrize('d', [56, 80]) # @pytest.mark.parametrize("d", [64]) -@pytest.mark.parametrize("d,v_d", [(32, 64), (64, 128), (96, 192), (128, 256)]) +@pytest.mark.parametrize("d,v_d", [ + (32, 64), + # (64, 128), error + (96, 192), + # (128, 256) error + ]) @pytest.mark.parametrize( "seqlen_q,seqlen_k", [ From ab35fc2767c4038c269c512f30b746c3ff091b2e Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Tue, 20 Aug 2024 14:04:26 +0800 Subject: [PATCH 07/46] fix some config bug --- csrc/flash_attn/src/flash_bwd_launch_template.h | 14 ++++++++++---- csrc/flash_attn/src/flash_fwd_launch_template.h | 4 +++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/csrc/flash_attn/src/flash_bwd_launch_template.h b/csrc/flash_attn/src/flash_bwd_launch_template.h index 76d49b32d..404643788 100644 --- a/csrc/flash_attn/src/flash_bwd_launch_template.h +++ b/csrc/flash_attn/src/flash_bwd_launch_template.h @@ -371,13 +371,17 @@ void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) // run_flash_bwd>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
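+ // Shared-memory math for this head-dim pair (QKHeaddim=64, VHeaddim=128, 2-byte fp16/bf16
+ // elements): with Br=128, Bc=128 the formula below gives
+ //   2*(16384 + 16384 + 8192 + 16384 + 32768) = 180224 bytes (176 KB),
+ // more than the ~163 KB an A100 can grant a single block; with Br=64, Bc=128 it gives
+ //   2*(8192 + 8192 + 8192 + 16384 + 16384) = 114688 bytes (112 KB), which fits.
+ // That is presumably why Br is reduced to 64 below.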
- constexpr static int Br = 128; + constexpr static int Br = 64; constexpr static int Bc = 128; constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + Br * Bc * 2 /*dS, P*/); + // printf("smem_size = %d\n", smem_size); + // printf("max_smem_per_block = %d\n", max_smem_per_block); if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // A100 shared memory spill + // run_flash_bwd, Is_dropout, Is_causal>(params, stream); // This has a lot of register spilling // run_flash_bwd, Is_dropout>(params, stream); } else { @@ -448,7 +452,7 @@ void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) // printf("max_smem_per_block = %d\n", max_smem_per_block); DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { constexpr static int Br = 64; - constexpr static int Bc = 128; + constexpr static int Bc = 64; constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + Br * Bc * 2 /*dS, P*/); // run_flash_bwd>(params, stream); @@ -456,7 +460,9 @@ void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. // run_flash_bwd>(params, stream); if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // A100 shared memory spill + // run_flash_bwd, Is_dropout, Is_causal>(params, stream); // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); // run_flash_bwd, Is_dropout>(params, stream); diff --git a/csrc/flash_attn/src/flash_fwd_launch_template.h b/csrc/flash_attn/src/flash_fwd_launch_template.h index cfb4264a6..6f51d423e 100644 --- a/csrc/flash_attn/src/flash_fwd_launch_template.h +++ b/csrc/flash_attn/src/flash_fwd_launch_template.h @@ -410,7 +410,9 @@ void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) // 1st ones are good for H100, A100 // 2nd one is good for A6000 bc we get slightly better occupancy } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // A100 RuntimeError: CUDA error: an illegal memory access was encountered + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); From 4e94c205c209778cd342492efcec831c10454347 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Tue, 20 Aug 2024 14:04:59 +0800 Subject: [PATCH 08/46] update test headdim --- tests/test_flash_attn_headdim.py | 42 +++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/tests/test_flash_attn_headdim.py b/tests/test_flash_attn_headdim.py index 363de0efc..69bacbddb 100644 --- a/tests/test_flash_attn_headdim.py +++ b/tests/test_flash_attn_headdim.py @@ -14,7 +14,7 @@ flash_attn_with_kvcache, ) from flash_attn.bert_padding import pad_input, unpad_input -from flash_attn.flash_attn_interface import _get_block_size_n +# from flash_attn.flash_attn_interface import _get_block_size_n from flash_attn.layers.rotary import 
apply_rotary_emb MAX_HEADDIM_SM8x = 192 @@ -461,6 +461,39 @@ def convert_flash_attn_S_to_softmax( S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k_rounded)) return S_converted[:, :, :seqlen_q, :seqlen_k] +def _get_block_size_n_headdim(device, qk_head_dim, v_head_dim, is_dropout, is_causal): + # This should match the block sizes in the CUDA kernel + assert qk_head_dim <= 256 + major, minor = torch.cuda.get_device_capability(device) + is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100) + is_sm80 = major == 8 and minor == 0 + is_sm90 = major == 9 and minor == 0 + if qk_head_dim <= 32: + return 128 + if qk_head_dim <= 64: + return 128 if not is_dropout else 64 + elif qk_head_dim <= 96: + return 64 + elif qk_head_dim <= 128: + # v_head_dim + if v_head_dim==256 and is_dropout: + return 64 + if is_sm8x: + return 64 if (not is_dropout and is_causal) else 32 + else: + return 64 if not is_dropout else 32 + elif qk_head_dim <= 160: + if is_sm8x: + return 64 + else: + return 32 + elif qk_head_dim <= 192: + return 64 + elif qk_head_dim <= 224: + return 64 + elif qk_head_dim <= 256: + return 64 + def normalize_flash_attn_S( attn_unnorm, @@ -489,6 +522,7 @@ def normalize_flash_attn_S( q, k, v = q.float(), k.float(), v.float() _, seqlen_q, _, head_dim = q.shape seqlen_k = k.shape[1] + v_head_dim = v.shape[-1] scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(head_dim), k) if key_padding_mask is not None: scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) @@ -504,7 +538,7 @@ def normalize_flash_attn_S( scores.masked_fill_(local_mask, float("-inf")) if attn_bias is not None: scores = scores + attn_bias.to(dtype=scores.dtype) - block_size_n = _get_block_size_n(scores.device, head_dim, is_dropout, causal) + block_size_n = _get_block_size_n_headdim(scores.device, head_dim, v_head_dim, is_dropout, causal) scores_block = scores.split(block_size_n, dim=-1) lse_block = torch.stack([torch.logsumexp(s, dim=-1) for s in scores_block], dim=-1) lse = torch.logsumexp(lse_block, dim=-1) @@ -585,9 +619,9 @@ def get_dropout_fraction( # @pytest.mark.parametrize("d", [64]) @pytest.mark.parametrize("d,v_d", [ (32, 64), - # (64, 128), error + (64, 128), (96, 192), - # (128, 256) error + (128, 256) ]) @pytest.mark.parametrize( "seqlen_q,seqlen_k", From 89dbe521b48000ee4f3d942d7c3498c698817159 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Tue, 20 Aug 2024 15:03:52 +0800 Subject: [PATCH 09/46] update test headdim splitkv --- tests/test_flash_attn_headdim.py | 127 +++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/tests/test_flash_attn_headdim.py b/tests/test_flash_attn_headdim.py index 69bacbddb..1a4613d12 100644 --- a/tests/test_flash_attn_headdim.py +++ b/tests/test_flash_attn_headdim.py @@ -806,3 +806,130 @@ def test_flash_attn_output( assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# 
@pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d,v_d", [ + (32, 64), + (64, 128), + (96, 192), + (128, 256) + ]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [False]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (3, 1024), + (1, 339), + (64, 800), + (3, 799), + (64, 2048), + (16, 20000), + (16, 100000), + (128, 128), + (256, 256), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_splitkv( + seqlen_q, seqlen_k, swap_sq_sk, d, v_d, causal, local, alibi, deterministic, dtype +): + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 1 + nheads = 12 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, v_d, device=device, dtype=dtype, requires_grad=True) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, _ = flash_attn_func( + q, + k, + v, + 0.0, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out_ref, attn_ref = attention_ref( + q, k, v, None, None, attn_bias, 0.0, None, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
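+ # out_ref is the upcast (fp32) reference, while out_pt runs the same reference math in the
+ # input dtype with reordered ops, so |out_pt - out_ref| serves as the baseline rounding error.
+ # The kernel output is allowed 2x that baseline plus a small absolute slack; the gradient
+ # checks below use the same idea with a larger multiplier when alibi is enabled.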
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + mult = 2 if not alibi else 8 + assert (dq - dq_ref).abs().max().item() <= mult * (dq_pt - dq_ref).abs().max().item() + 2e-4 + assert (dk - dk_ref).abs().max().item() <= mult * (dk_pt - dk_ref).abs().max().item() + 2e-4 + assert (dv - dv_ref).abs().max().item() <= mult * (dv_pt - dv_ref).abs().max().item() + 2e-4 + + From d11b7ae10d9b67f184c2a1900b2282b369615bb5 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Tue, 20 Aug 2024 15:15:56 +0800 Subject: [PATCH 10/46] update ReadMe.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bee7ab43f..23487ffef 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ This repository provides Customized FlashAttention based on the official impleme we have supported: - FlashAttention-2 with QKHeadDim=32, VHeadDim=64 - FlashAttention-2 with QKHeadDim=64, VHeadDim=128 +- FlashAttention-2 with QKHeadDim=96, VHeadDim=192 - FlashAttention-2 with QKHeadDim=128, VHeadDim=256 Feel free to tell us what else you need. We might support it soon. :) From 21ca4bc3e725af317dcf2d3e4357a359af903fb5 Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 20 Aug 2024 16:41:32 +0800 Subject: [PATCH 11/46] remove unused file --- test_flash.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 test_flash.py diff --git a/test_flash.py b/test_flash.py deleted file mode 100644 index b7b251b57..000000000 --- a/test_flash.py +++ /dev/null @@ -1,19 +0,0 @@ - -import torch -from flash_attn import flash_attn_func -batch = 4 -seqlen_q = 2048 -seqlen_kv = 2048 -dim_qk = 64 -dim_v = 128 -nheads_q = 20 -nheads_kv = 5 -device = torch.device('cuda') -dtype = torch.float16 - -query = torch.randn(batch, seqlen_q, nheads_q, dim_qk, device=device, dtype=dtype) -key = torch.randn(batch, seqlen_kv, nheads_kv, dim_qk, device=device, dtype=dtype) -value = torch.randn(batch, seqlen_kv, nheads_kv, dim_v, device=device, dtype=dtype) - -output = flash_attn_func(query, key, value, causal=False) -print(output[0,0,0,0]) \ No newline at end of file From 4c3462aeb6abbafeb71c8d1063f6a62ff4cad28a Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 20 Aug 2024 16:43:14 +0800 Subject: [PATCH 12/46] revert Readme --- README.md | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 23487ffef..3e2e066cf 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,7 @@ -# Customized FlashAttention - -This repository provides Customized FlashAttention based on the official implementation. -we have supported: -- FlashAttention-2 with QKHeadDim=32, VHeadDim=64 -- FlashAttention-2 with QKHeadDim=64, VHeadDim=128 -- FlashAttention-2 with QKHeadDim=96, VHeadDim=192 -- FlashAttention-2 with QKHeadDim=128, VHeadDim=256 - -Feel free to tell us what else you need. We might support it soon. :) - -Currently, we do not provide prebuilt library, you need to compile from source. +# FlashAttention +This repository provides the official implementation of FlashAttention and +FlashAttention-2 from the +following papers. **FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness** Tri Dao, Daniel Y. 
Fu, Stefano Ermon, Atri Rudra, Christopher Ré From f63411df43040211a286eb87f40374ebb950422e Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Wed, 21 Aug 2024 15:56:37 +0800 Subject: [PATCH 13/46] create bench headdim --- benchmarks/benchmark_headdim.py | 180 ++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 benchmarks/benchmark_headdim.py diff --git a/benchmarks/benchmark_headdim.py b/benchmarks/benchmark_headdim.py new file mode 100644 index 000000000..341ae4b21 --- /dev/null +++ b/benchmarks/benchmark_headdim.py @@ -0,0 +1,180 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func + +try: + from triton.ops.flash_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + + +repeats = 30 +device = 'cuda' +dtype = torch.float16 + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +methods = (["Flash2", "Pytorch"] + + (["Triton"] if attention_triton is not None else []) + + (["xformers.c"] if xops is not None else []) + + (["xformers.f"] if xops is not None else [])) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for 
headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + f, b = time_fwd_bwd( + flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, "Flash2"] = f + time_b[config, "Flash2"] = b + + try: + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "Pytorch"] = f + time_b[config, "Pytorch"] = b + + if attention_triton is not None: + q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + # Try both values of sequence_parallel and pick the faster one + try: + f, b = time_fwd_bwd( + attention_triton, q, k, v, causal, headdim**(-0.5), + False, repeats=repeats, verbose=False + ) + except: + f, b = float('nan'), float('inf') + try: + _, b0 = time_fwd_bwd( + attention_triton, q, k, v, causal, headdim**(-0.5), + True, repeats=repeats, verbose=False + ) + except: + b0 = float('inf') + time_f[config, "Triton"] = f + time_b[config, "Triton"] = min(b, b0) if min(b, b0) < float('inf') else float('nan') + + if xops is not None: + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + f, b = time_fwd_bwd( + xops.memory_efficient_attention, q, k, v, + attn_bias=xops.LowerTriangularMask() if causal else None, + op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp) + ) + time_f[config, "xformers.c"] = f + time_b[config, "xformers.c"] = b + + if xops is not None: + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + f, b = time_fwd_bwd( + xops.memory_efficient_attention, q, k, v, + attn_bias=xops.LowerTriangularMask() if causal else None, + op=(xops.fmha.flash.FwOp, xops.fmha.flash.BwOp) + ) + time_f[config, "xformers.f"] = f + time_b[config, "xformers.f"] = b + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + + +# with open('flash2_attn_time.plk', 'wb') as fp: +# pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) From 3e0c7c4276b92aea5ba3d6144a6600e5fc7d17b1 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Thu, 22 Aug 2024 09:11:27 +0800 Subject: [PATCH 14/46] update bench result --- README.md | 14 ++ .../Customflash2_a100_fwd_bwd_benchmark.png | Bin 0 -> 183246 bytes benchmarks/benchmark_headdim.py | 152 ++++++++++-------- 3 files changed, 98 insertions(+), 68 deletions(-) create mode 
100644 assets/Customflash2_a100_fwd_bwd_benchmark.png diff --git a/README.md b/README.md index 23487ffef..5a1c700be 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,20 @@ Feel free to tell us what else you need. We might support it soon. :) Currently, we do not provide prebuilt library, you need to compile from source. +## Performance of Customized FlashAttention + +We test the performance on A100. + +We display FlashAttention speedup using these parameters: + +- (qk dim, v_dim): (32,64), (64,128), (128,256); qk hidden dimension 2048 (i.e. 64, 32 or 16 heads). +- Sequence length 512, 1k, 2k, 4k, 8k, 16k. +- Batch size set to 16k / seqlen. + +### Speedup +![Custom-flash-attn](assets/Customflash2_a100_fwd_bwd_benchmark.png) + + **FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness** Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré Paper: https://arxiv.org/abs/2205.14135 diff --git a/assets/Customflash2_a100_fwd_bwd_benchmark.png b/assets/Customflash2_a100_fwd_bwd_benchmark.png new file mode 100644 index 0000000000000000000000000000000000000000..281f52420cd4e79962238d1e736f692a5c4175f2 GIT binary patch literal 183246 zcmdqIgZRP$0MkD^{dNd# z1S=jWZf||Gffu*Ye z4U2h;p{wu;vxKroSrsdS)S9V$E=37vHY^UudHYk0HC}(xoH4n-#dDqKGhf4A`lVsD z9&oSZ9_C8k-)?;m9?O&TJfz2%()Ree=B$q$LZ{Qo#y(dl$j6l}zk_Qlhv{XBm-nSz zN|5`>6S{7pxx0?`HUglr|EPTF>G<8f$6r*8iZQFlS-7*hd`c>1A&sU z7YZPZorf`j9v&W-z%n^!Y?prE$e-+wvfCy1n!?_yX(O79y>VZL2*u^HIL0=$?qw%Y zUFup*UO%EAReqvQO{SX9v-;)guo3mG=1G4?(Nv6TOe51B)B4Chv+_skpUJRD>DNOe zHhp=RFJtmAmO=z#h?t2JXuzB10 z;JrYA9~4 zy19I%2TwsV31x`_qc+)=%2%$nenR0d?{(r}M|MnfHS!pi7+jSfnJ8_stW2Sb{Ox*w z0GNEqAzn-v#Lw*AAVuvefa=Klg#r6a+vE~#Ic!0c;ideUWM7y@OKJ-78{2Xc6H@G0%zmIX zaDqatJJ5|^iq8Dj(t9AgWe_3Nxky$(xYJmdZ+*x)9z3H~MitN9 zNa>Dhje4spW$ZGZ6vv2obFS>YFDSgN4<3?XY_gIywI-~^o)h0A_ra8m6T!gs!E)+1 zskIT3YG^GOGMs(aOywtK{9NUNBsg5#Amf`w8!sd)Gye6AU@_qG%;D0FEXI+Hpn!nex+LWrtdPJ1w`<0` zv3Y^jGDVz_lW0F(wa~CWJ4&no_42;_Qj&yF@tV*RcU;$Q!i!Kj-~$$UI#sMR%Rkg^ z`Rd*J0%YH@AHRFZgI&=rQ3Gz872R4C-L1Wz=Q%W!u zHd61{8HjHSWm-&jKXw1t{?mTx{r5U(#t=Rh!(`u7cv^L8VT!EYQ_c)>?PqL}+Syvh&*q;w7MYcL>B#7b ze4NpTzNQmNlYqBInW#uBc4!9{7Zq8TuDvJy%%Y!NQKxKIJ?xjRl@9-8^2z6u^EAOH z-RZ}l7GG$8`lS=7SUgerDyYHMM8<{SbCgfcS@M~DV%28(fDW=^RHs6}!m#brnQnfO z>&G$O!V>dx^Y=k=w{>r-ah3X>g@5ju&ibtL(cK7{q1?s(((|o-iG9+h^=N($<Hac zkeKv(;+^!{Yumcjpp|afdHF-@wKsGph{pbDf{2EI^F&Rnm0@(pEN#_R8OIOq>BVAS z#ltiiMcL=F>cL-v!tb!Lm%7K87A!yY;D!t1cJwty`x9*u@zZw(U%XVD`Y|Rj;1;z? 
zeyekA8=VtrUj2J#TCgBY&tdbUa>#6x`9hGUR`6~(G7{XvHtybIpJyu4?(d1(;oYK;Br>jG;=ZlfwN#v70 zTUV=q-6p<~8#M=A?e0PZu!LnIjgy`*9K!zjc!;kCN;>t@ao({Lp7nQGmp|#7DiIaE zX19!{gE=j#INZXSqf~IM^?@(VOuA;hs5R;M2>92Il!xTkh zGg`9t-3Jw8RpS&K2T?TMcCnvUzvtI@;_JW|7+mT!^y>Y+ZIq{xrv>1tBQQ!3;i8Lg zC<%%aqaQnX3+^RGN8hRxJjhNOyzxNbKt)tzfRNty08EOZ-5{qs5Z+!qSUX8QZrLWlJk9d*8e=6it0R4J%Kub;!n zOdx=s#d8$BUbDGYQwBqns(d(U*w$ZT7TQ$iaCiZZ%!{n^ag)!@aRn2?*ZqUA*bpkJ;h#C z8;eVK){DpI$B!(u58vM0kaASJm7pbm77aR0Ry0H(zx?vQ<1(g{TvY3)@r!`>Rk6Dt zV^HcSH(W%4AhzyKwBP`ZE#`-19kX?6u8GRsz52%!EfM%j^<4!vlf>m%F}xlKbgzS*Qi5{Wwi>8W#+;wJL=~5w`51XhqG(d zE#Ah3A-HDo=SrbTw#sjf?#InT8W23qf9P6>Q&VA9c5}Mws5or*p;EFLU9;=$y*S?X zSKJqIo!(`Xgt~wQxEW%~I?i$Ab@gHc&7rYsOvEu@igS%uoS0Ht)ji?)y6(ToLE)cb z2+#)p*r&s~V-zg-MKBzw5u{$}$xpvwOhPa08|!SnTsLnO=gJA*?_Dn+f7iej$VuSV zDuRc=YOZkWrwf`ycx{!X#i(|efZv_-zmS^c@;>V$-V)_UJ${e1jvzIE#}Cz;3V zv8NkL>^?QKrYhQYBvX!E*_9Z9N*u}FO`OzsHsU5P=2zmV2UOkV_tPCX37dRchk-IL zqJpRiMh&*^i(guEEgK)QO=%0On zI;i{)7~HJ(`@35G_l|F_wlBwxtT!AD9IO2eP4JcIyQkQXcE#i-ypzhS`F(SXP>z=2 zHoXeNyPWQ>xLi!VK);*%r3xC`X{mLeZB3YTcF~~-s($q1^KV3>r8zbc`AUIp_4&IA z<7_=izULSFBVYzsXVAAKU}ejo-Bf3n{G&B-4B_2%V|*)yF!A!yv;vxRZAr4+)0^^C zB2?j=|A7}ogfw_Hu_g(but|y-{lIAc2@g5D3&q~6VAmzC$nCf?J2W(Y0fnO%Vw|&4 zMI;XQ%Gt!5Q~RS+@7zmklGQxGR5w-^iozSP=4$$9WutZ#p615^HfyuzBQoR|lL&>( z)%!*cqr%td$if$^^I!jJ)(Sag2)oMn6iWKp1=pusIQ)Z<~#KUy&d&9g_k&K^UhNDCMQ`UH)MG07c0F$ui=<6 z6EW*|aj)Oo!6iDzD#Vp%|8DJO7)ZOft8D5#ONx|%AG&~ zhT5$P?jc4e`6&rXB0*nS{36|Kf?NL4U*FQo+&(tr><Ep1%5ujH| zZ=E0DYc>7fld%K?p3XK;to5!IKd%6}s-!&wL*P_?3eJ#7aSUmoqS;}&B7`Abs%&EO zb@{z)>pn^>dxq3@SMVvFPPu=-`bvJ@&``-et^EsPO=uoFx#q0g`6WQ_asWyfb7Oofol8Far8tR|;r z(S*Aal}mitnFJUkMmEIr)%Kf8mCxw z%k#W-?a2^_)@Sl2Ar1uNp;sP)lr$D@BaWMzhRI3)BKSgD)Kk(mJPj4mZYCZ}S=c}p zP~mJp;1A!sE#z6sIW!JUy2QsxX1`jkC%-omlJq>x|2x;MEqz+vOd zQlBDzDx|u9+Nyqy-OKT5zwYV?Td6H*MLwNB1ez0E<2zKpJeZ71s|pW@5@D4GPFdV-`^(uYG;M z$d%BVpwbhg6=LYG>@hX|d<|danG0K}-EjocYL4STnbYlWVzktA5~(Z4;4QS^C)7fD z`=Fh~I>~oAHo@=Lw=!rx=|wP;WkXmY2wJ@eNETGS;UX6Rc}8CW@*2ZB6p(g}f?n~h zS5-c?3Hg?x@M3@#5AA<4=!YhamuTr0Z#|ha` zdaAT5AD)W%BfJ4wq5mgY5v}eXmc=8X*V`1i$8oU2wY%KeR0Xob44~+;7l}8U{gt1u z9D~*xhY{bE(rw&g6~VsGH}gzuUc(ro>)l}AI3x+~N&wh61Z-3{p-3=du9J8t}f3(@qY?K@M0W*}iMzkLSxD5OH7 z<5SDH*33u|O|ySxw6(i7;3WLHyI}il(jnWY@=w&a){X*@jn@*GBliR% zZBYt0e|)vL7>^ha*v4mGea}+>;J$(i=VQ~n#J9^+K*7PKky1;uJySbP{aJB`)P9a| z2w#=oj}WM4Dlif;*K2OLEt~g}*YDNI0jp>eYX}JL_)3&dvPl3vYZ$c;F;#u>Rcvqb z(-I=^KLtoAu5AHFh3m0`)msb!>i0H?l3b^Y$-TJ8m(o#p_eP>G7)h27wvq&?*Fw)CY}jlQ<7(nB(d>c}Yn6 zsK{_#5VOhbL2%dcsj69ML*Ve@mqX)lOBa;C?M0^6?R--O)r><&@(H~fO<$FjC3l{a zac6I$N#~hF%RxR+a=h~CAaNg&Z%P0u_)8AZWyQc6{Fcsy^=^(F^8uR&W@J<(j45huXHo_x84wp$gZuD)tC&XF$XSh|>}LD#z{1HB{DS z1Zu$IvbP%ce$b-9 zP9p&!4?^F zp!>b)ZQ;I~8!JOOPZlTJ%l&B>cC2Yi14F~p=_8Dk2l#tiE$g?FP;^xbW)qBG5ezDIzMU$~g9yV}PK z>Vqql+&YYu3{I0OE`W&Ave&QZ;XJ1KFnRy+q?hG-u?}bo()xF`ZkKL>ORPn;y^q6%S@^J-*?(y zWV0PG)Y-=h>|Xa}1zB&Fi)PoK<4b&HSH}8lJipAZ<@txIBEK5_xM~N&=BT>(?We|& z`eq+NKuofStqBe!bl?(AE0sysE>56)ds?;67k5tyM9yPP&)zB?UMUHpO zKW9V)$R@+tfz|knQ@x7H#~juNGS+a(-*wY|xh=tZ)8z3d^9<&TxmO10dHmCt)E*Cw zR{@s0atWyx-e%BJ?`KfUc5f}Z!+)~qD7>C8Yy^qQ1*sg4ev;^W1c1k9w5+Obfu_kn zLDQwTh%@W<2?$7bQh%=pQCm=%j#@ND1Hba!Q!jHM?f?I0{Qvd%lP~yxW`B^*F;j_- zn$c3FooN~aMOH1-T~{ua>g5{s-DMS+2GfSyn&+^$%>Z;N-aO^(}!?ag&Af$W3 zSSe#b5G^+Z(4<3;Yy-Au9)c{eTJz*zo@O9Va>kph3Q+iAt}j>pQTVnG6StDLDs*N( zsS@~-9)v`IhW76{_C0ya3oX(EARgFO`>d29UTAutGZU0OitO_$8-6X%*h}c;xRi~o zP9`~*+I>3E=>fR6DMgVdnY=P`dyXPUj?M5{neEQh-m?Zn2M}=S>09PL*#_t@|IkJ` z=0f>p`TYQn+@KQv8#tOYg+R{KaaD_nV)cmuDx*gdb9Ep#XvVmMA52LzdJ(J@Pj^tD z3V8Q=W7)V0hCNP)-(F8c=0hyMIKJ`jq4JbC19i4$)6f1x`aQ(a;R&OId$kmYuC+u< 
zd*#EEWH0un&7riuv?t9i=Nc9_z+zkP1avKDq1u znC|;1ruA$*(GakuIl%P-u*7tFM_CW3dq_#cLrftd`!9s|pFIRGg{}wK-i>@#utUP! zU^n}Z&mWi&MuQY?e7#h@MYI8XWbXUxhwCr_MS|L>4?H`TJ*Kzew%xQuq;jPik>VL^ zeK(eRW=1xDm!t3^MfhrQ)R{z_cTqKrl9zfPw?c>nPPLnDdi3Z`P$&9_edcCJ=j^*O zM<+;G=k_4NWcoj1zQ5(ge+bbBiP7bOO8S5B=)&Se{R|Yo@uA?%=TPY~1Re$n9fM)c zWv~etsEM~0&Q<|^J+#IwRY9hb>mT2vyM)+U{ZAL9#Z}x_hR?KknzWwa-Wv5h@Aod4~Xxt zJp8o!EXxLPuWgJ&wIRFT_~Vp~%Nzr~xCuTfJI+5CM~HyS+VfFx3ae-%JQWO14SSXcn$8U8aI!}7x70~0o(1%W$*2Mh~J0C+C=Dv_E`U)U< zXX*XybQJ#TIrVJNU4WX40_NpPYlsKu&D!u?Qd6hyvdC7ZX>8Bi!ACP3d4|vs#-CRM z*CPu_>DT~XZ=+cJ01BpuNBr3C%QW%gQR3rW0OGS# zJ(8GYD4K~4P}=WE<~J@jt??^^_U=l5b1_HrM^P}>v%6X$($(DhAL4WVv=-M%&5DD2(qIV1^Cf7Lf={;@@7eX<91wr`Xnk;1Fyl$r zGsjp;_ORU1zM_74-}(s15t)J~P|{hK=bvzsLp2(&+JPLUR=WO>lbl_GlCEJk{oM&9 z^Y_%;-`mYj@+`<=;$rD4y%)c*^+Sc$)twI}#6bV92Rr01cU_8#hW;?kS_^R~!~;SM zb#@V>gA8POc*vn=n;f#V866euUOE_{Q9^h^;PhdT;qZQ|Rs+Cn?CjXHDxbDa_$upm zb39U`pX9xGHTD#n(i^d!*n>nqUb!JSLCPMVN1)oQc@FyDU&;{R-s-dex-0^}%>shZ zC43%Zf363i@xZJO<(MN5T@)PiS@%of+B)jzz|QZo1{^@xplIMxlIfW5zgZQec7&F+ zErs{rAbN--bvAsGM%O^QQ(PE;O3$eb{NcS7u498a#Yph~_VI-=h#3!D2H^j{M+EGJ zFHJ%UC}}%r#RUNjHv#-vr);zlE!9n1vc{5ji-rX=*1xI+;i19xoa}Ao z{Ahjed!}VLR#knr10~lj83b=c$hu<(-uY?8N*!Q`8z(64?7_-&{(&FhqrZ zP9Rba@LW^^?d8V%3|=(ps!q!s1WceI*|`uJ^3xlhm~!jHYU%{@@~5o)1*dHNP`syI z_?6rXt;!JobP)>dw$(o!H3UxK`{LeYJ3+UV(_NMBa-{I%#uhxBLA^#~`0@H%dSs#E z(ZZxZ!Yc@XV4K40WPIKaFFpKWW!LV%NO^>EII&eBJgdOSMYR{yT=d;_W95tc znu}NoA4;xYQ8eaG&iIjf_TV2(99F>!s8pSlEyeZUuMbLmV=jHOl%4N(jF)k+qfg9G z$)*+=+ha6=rNIK$&Gg8XE(`!rk;P* zhI^+TnwuW83~%#(tLIm)Jhf|oi%I|FWq}S4qI}kXxK-@A>Qg}IPJlX>?NuR;6My(AwJE-1<=6x!%(}3{J32$sETrjd3rQhxD4UPkXRUwTL=L@1o{t5n6b9NOC~IS-J{4ach?X4 zQxvW>abyFT7g&5L((WZW+VcG|pLZ+~(UXU!Yest0jH7zb4Lq{ZvW9HBH{R$)8LQnu ze|IBQa3s4GuRz*#@iM*FeN*&+C{f>)PZNxO%Ue7M9`-75c=r(hK*bx;9bhWm6&$UI zA>gk`Ih{u38!$Ob08O9HMgz*PO1t{Qo!{!L{G9<67idaXceNcEV*x`#<1~zp_E?>R zh8U$an6J1qIc=MmkjS{_!uUI>7sC)O9RS6{=lsa;RvX@o?i=um$xTcuXQ_X7xJ)DN zaM4pWxsPA|UTmw~Q)}hDC|6~(iKs#JiHQdJd8|^H7pqeLpizA>)*Tf>U@5qtfc?Ii zfWn_)$rZ~3N6TlfXj|xW4BUwyW~6vWR#t!*VzHl;3-nCnjtVj zWxzjUPBww{=K09H{p5FjFCaB;1{))3f7Hfgi}v`qMSIA*0_njOU2z%;dC_s|f;y@_ z^!?ihj1(TtQAbEf54y0rfmN?Jf3GdWO+pp#B>YxeOqJQIVA@X;B1kig#dLj!8EED3 z_La``K(hoQ-_OA8o-!EBUtg+fa**_hW)9J?=f@gbXxd*c1W4I(f8G9mSb+2bJCJ2k zQHUB2Kl)}d6hLroc>6OvNu4{-tc}givP%=BE@_4L&~C z%JrP-$N!?8tW~-SAkMSL#LEzP;_8xY^(@35<(FUeCsqAVuS;J$ z>)DjQiZD>1y0b#U7$QM~v~Z1?_n?cFA~0!)G*H}O!4ic(d{6DV+Wp<9L#@};+A}nx zmDBh)V%|mf-S*GKmJk#?p?i9|R@WP3-?vvyV8NkLef5*d;4?%9KK>ov1sk8>d+#|xywMqUli(vu=tsR# z0B?nuWApuNR&u@SXZEqMc+d6GR^1vuUxr{ipn)%^S9n1n(RuRyq^+z_7)slXMgg?m zn@dv@*6Su9oU{m60J!S|y4CtvzV17@T%s5JB$1oiKs5PzNWWref|VbC3$DIDUVXj@ zO6cYJ=jUTBOkUk6gwIhTHlhwLkAe{_2RQz2=nH;R6@={PZ2v!9YdPRrDJ-6BM5P8}4 z@auy+B zpIK*Vqog07oSy|cR=-`mR9Aqg;lzz2#YAA*Rag{oVANnB55N=|62S|sP1Dfvsa^}i zO?mk4zjvrk^858t5JC+Ay5nn5v^xTe+?}e%9IanqG*Qb4?}E;1CM*pp|H&?7huJiuqgXr~hHo($P?{s<+?`T&+0xO{stzKHLj# zV7p@iVj2Ed04k9$_ES?hLVV0HAypgV0{)9X&7vbSUweB7?l6l*G`iYXF#W8)dIV~) z2ggm}8x1qiIp;BT=N+&esA|}pt;c>+XpEug6xcQbE|y%7^MlIj^Hm2 z^G_Ws*9eVMH`;e094;a#tW6^pPRfCvZ3d0+ zi=|)xOm&2cUP_o%ZsXCVS0@7KMcMP3=YUrJGKHX}El=_&n0f2$ej38*Wh62j*&}^O z8%uh`;EX7GeZ@8co{UNaU#N4B>F5dg!&da?@`G@<{&lAlMBN^Uu-Cm0k2V1kD~yDctYBHE{qux+^yk2IwCm6})u( z&2OSUpok2rnuGK@B(lrf6DX+}x6#$F9RInXNf9_bJzQM{ zl|8;2>gqRZFzn=i4?!4$ob~$xP{))kb0O+DHOS&>sK2HLE8UeA_7+1C#BY*)Ef?aU zyCJ-UDTL~tbAcpeF24?gRd{I0V-_^+j44(`zJah4!;p-twzU;7$Hpqa*^<5egchiIE9;!Ir1ImRhCb*L|0|9kf;SbkGKA%!BQ{;3Fhp` zbtv{0c0@a!K;@V0e)pP>tf@D$V3)4|;n0fJEmte)*ig_p5vIqxWlmuBx*K>6`m=Q2IV5oCdQwftxU?lO`S|#%*)|`sw6vR%MkWYKqB7^B@|GFj4*J7 
zX|GNkD{%c!0So~Ccx>ax6#;O?MgN8)r)Dj`kcOn>s=xJe{HYg~!qWUKz@80k$ulUd zYqe|6c_HlkBXaRIv-Oc&{!@2u_n#}*f!}6P&*bwhJ9;`OAAe@wm2}_W8!#O6oA<*{ zS+O|6gBvGZ`JntV?zCM3I_2a3yvyLy+aK!-O%;9iQFELa0+1I7FUF?~lGwi0cOp;u{z>nfQ2l$4q3<}=8r8eO_+sxCiqP5oRQ^wsfP-W>_*B> z$cdlBN^Wm_H*Isb2STkBq`-!Qj#E=vDrDm#zk3g2Wb1@teyiF=?w&Ol>&jlK?6>R5 z5u6W!2}>~GLJBk>Qy8AKJ|*Y`U_Iw6mtutm0&W0&$-y0O&-v-1q`e<}s=El;ZzDY@95SEqfoO+QsQh%Hvo+jgO{3AGC4C9v z3NZ8nR5r_d9gHYEV4Z{mrS@wrDPo}3=`Ux;SU@IZGUplwlDB}Ikp&4uG^DW{LlAoV zyAbA4!Oj5FLy^$Yn+CxBJN)?CiCjQp0QOG@fV@iz1rktZNEjp-gW<+!3E5UitIuD0 zA^@3ks}^3QC%eV#!;6c;KHoB;5xu=2fK9onY6>Z|B2}O~x%SdRQk)20GJXm0=KdEu z1s$4-vO(N3NZj)uLMUE9oD*V-1FnsNtj3*<;1oS&Rmuro6h86MNnKQXc-trxnRRx1 zYrsB9YRE9oR6sc|e))bn~;#RDRMeR=D!V zPXqhZzx!!4)H7*C(5DcmRBEySe?ZbOVT#LVkpQ2XyJgL^3&ER>NsSW6}sMd&i9Pu}h7 z3htU|2RN}srQ>{=^v5?n* zPnt;$l_~;S+$fY+axjorm4w_Tc(s@J)%~~GC;TQ^#w2mv_JDsOP3e#}ZJ9>`!mookGp+lf0D+uRhpc z-m-xOvCs?bhgHe_lxy4r^BLzJ@mbog0w)GUzzUEDO_$#qfsm}HB&P>9HSN?l$j2hZ z#vk<;-=Ts~$tDr0(PF~*(ceGEx{Itd1&+9wHEUvrPXsn1E)GSRTWv~{y}&4ZqYt?a zC1O9NHed!4l~>u9q|J4kEi8sxuWMxf4}W)HB!Gp(3=7mZdv17w8c{r4?V}-d{lQor zQc$bSYQVP5xL*q5S&o}^4ww$oPTxZan)CfncpzcJN+)fu3PT8Lxk`}eVK(Me;S}+L zJ9sikfun*%G9ZCIxR-Nm`kVs^xgAofAGtTx15Y@8tAj1k8=|O3u=`FSV&G67zPcar z#Ow1WXR0yn&48$LxjD)dCWfuP+FJ<`;H*bWwDt1@TZTGNoiTEV3di#ryN1NQmetD*gJ`RQ6)0 zY_=s5Gk1Go|F;b@>bL?uDSe=x+9jxXmBD!%0HEav97UVJ!>a^sUe!UEsy|KoiL|6B z7E4}Yt6In@ZgWfYPPeZ;H3w1F#Zg;&hP$X`o^inNwcbZWvXbAqCi3M<222L;tjtB0!g&uqGrXH*Hnit{mntUPI7U>l7 z0{^Ys7WQ--#p;Q8A$;JyvvEZhTXB|U7EyziVJv+=VuwuMlEGSnd(t$*pG3=$Xp9$t zJS>1YP9JwDID91z^4r9l1E5F>{biOM!_n`z@vD+_bG#p>-_hjPh44qx}2W|!W{POe&CXrZIl)!829L9w5twxFH+DWe@;T&l{N8(68%~QC-XUcoX%@T7iq{B7J)?{~;gV9-XrJ^uwJ>A7ZfS?%KP* zB486Rsx#IEd=ZH)s~^gpN0KWL5-HLfYNKNHvrix_pZwJ2!8`PL!av$5@idswMGrVv zKHVy;EHABqR$2_)MEcJ|bj#*R$9$1XLL`P0$L~W|rblN$gmv8`{u_BP z_S}6;h(JNF!8PgEk=g(60+ytZ>VjYNekr`995?kW$On(k7xF+&pw-;O1Bs9iYlU^F ztt+Er=BglrX6~Q!b+=K_C(jP!ps8aGE`K$LS0lT?#Px-oso>&FA^Rg>k@q{S z?LoCKK=HrdA*v1Chud_o^gxe;QCvFmAh|AmkbM!Fk_L0H*%(U!uwm_8Wiis=EgrbM z&#*Iq&X|8~0hHQuWEfmubT1XIfe32h`3X0(4ug|%Es3qmXfM2p-S4&=??RJ$)5ESt zZe@u5fX70^40SI%qJ)nB3Or4%7e46UdDw5A5M6MjH}QwPE}HOFgK-9NH-jV5Ym@~x zuV*EFG#FW-Ma*@8Uuot;hGt<7hVb_CH9TmOf1B#;k$@}M&w#h{eKZJt8tJbldLV^y zpxCRZK&88DV^JkYL4gcho1D=79i(Y(Zx(9D6`J%z?J-iMK!iN@{Zn={GRoU4Qf_=7 z?N>oqQzx?{F`X+f_LN4Qg|gy2)7vU=y5f(fbwS~wmhrQhk@EJ1RoAmD9v z1IfjMo)_);TR&?Mj z{n}dzX9@CkzA*|ZT+4)aJv(iQn}5T-PcUt^|Z9apY zS+L2H7_&{XKmsNVo{r|rpYH5`bOB!Djj1;^(0=TvCHDy$IM1dKR+uSovu&RABVa*a zAn#|epl3`R=Tq(Kt#%-Hsr#2s@RQnNOrCNMAT@?)|J&j@fFquj%EUmxH8)0I?LBeq zON`}U3f8t(7&SK*eq7Pj+CVH-SY6$aWGcy-2K)zGbLv3bXUhHY17%% zt5^Pq<+r7__7iWBSz# zY`Bgsoe8?^;SbhVeWARH*Cs$CLDhG%D^@s=mhw`gf_?ym;Ts>jtPtW&f6l%1Q=XxU zM@lnE1NeZv#Q&KOKtiG*XoR94ZqX))>{Q}#(EO|mOm3Dfm({$-gVcCUtNnX`ygP@G zcU!Xq{Jis5JU|3oT8KG<2k#GAVrm#+_KL_#>TYTpgNt7t~~(izezQ=~9J^6&Cvsy+V#3 zn)DiEiib0NrQ}PTb;`BBS!Xr)0P)+S4b0W>KX9Kg;em~Wlr-MntCc^kjG@CU`(D{z zB%8YRN9lup3rV#KlB%JvW)INzMv>zq`T|4-RRJZpbR@?;C^@8gV0I;J82!1)pUVnO zdnj(^`0sgk1ll`5`*Yh*q_GQfTr*=8g`FM8HH(LF$$3UBhui#XL;{zgVtR>#;!~{H z%JiAzZ|%LG{e9o({XYA7Kkx6q=fAx^TWj6-bzk>69LI4U7epV^*F6A6{cU8l7?9lK zT_~^jcasWDYgw z#3z3f3hG-WrGID7YQLl`q706o*N0T}WU>8_O9k*Yy#PrY$X}B;R*PtgKSt+sr@!an zD*V41+d8&x#{%rcvlQd2#`ABAt^@e1Je+jb^A}}#Cv8%AZM%Py-?phv`MvO~;~ngy zykkAhpT_{f$IQNpd*feqE<q*g}L4L}R~E5{!%iX3Em=>>_V9Al~zu__$)|e#THfN)U?tfn1I0k_jQ6{GfKgCnoSp&+ zAE@8H$+ndz?U7_J5t6X!V11N#=QY^Pp)9g*F^s^3Qcdgw1DH$v>!gCDFW@OtsGZLM znuBM^G`+}~3Y8dMPhiLEHV!5h1X5_-r&|P5h(`rSd*bpB#WMU{|#ApojSIod$w$jYONxLsQhWEUFkmZ|S6dTp(vzcA*|hH0$L3Wyqao(W^>K(YPrpTFp+-z%*L-_KKB5_q`&8T=a0NGGG!4G>-rS5V*a{hqhL~*dLbmtN_+x3{Fk7YTkI|;t2W! 
z&u-!BQPF@6HLt@5^x*J%s>3}61}M=_kucL$(XAhR>HXbbKaZ*&QO5?111kiz`WC#_ z{{|ETp1DRcaPi_&62ETtD2STJFHk%s1Qo8A?!EKv5_ysQRozKZ>q(f9Du2S74`|} zWjqkN?hSg6i*o}-h@6M}Ex?zZ^N)&Qq~Cp>yIgBma#2o7+%JDI6Sd}gy!@U>SQuM2 zJ0gfPY!cC~a&_6aeXP$vsd=U9ZrT(my>d;f_W8{~=~tzzb#|EM?DtD7FW$mDQnlu~ zP@OCw^CDV8bR~eb6$6!DX2RQ;Wr8J;aSnV%c>_yo{*1)}WS);l%-6#>VzuV=ktKAU z`q%IyqaeB)Ls`SF2$W{!;#++@oNl~(lN5So-12YZH|=F{Rl8IDfVk745O)xzt8`UI zRq9BrUY^lrMUJCNXp-BaOxGtbO6WBh3PGicW(8aYy{}heEsL9rtq$2zOpxdF-3seA z$~kI-dQYcn>YDvCd!7P%I)0LGViYk4`7)Rju?U9=I`a-8+LtU2s+9>ek#Jzf`TD!6 zqU(qST;n&Xd=qqoUr^{bqg_zy7jTdN4wOP+gD__|KRsQNi@X#E!tHl*4Pd4`hG_l) z2J`_4c8&%u<4tILn|5Ltyug0f=d;Cg3vbKKmn3)(I=m@NI_%nU)VYcXbE9=XS5Y|t zrM}a?c-RhBSD=>`9^#{qs6kNYyz3TiUbtdwKM;gOQ}1!WsC$1IOp_}b30swbt4lF- zr{mvy=W-p;#N!XwpAN=V6Z}~oG{jfSE`b=~zYa`5Bw*@^fOv>q@%fM*i2;%Q&V_T5eIY8 zr}0$9v};FE5kEVSGl|Msl86t226|9)Ht*RGJl^lS#2BRe-%1gvO2pkY0WV7Y6)VTw zkg4)fYujfs0>GEREI4vx{M9%siU*#FMBl73OyA`TZ}coIzI!EE^bmV~-rCF2!j`Ma z()p1P&$g=njjLMENn{>8gI|54JqwB9z~FcJ$mX9bYIo*iDqfH5&W_cV0Ja`4@a8u7 zO>`wagJSpr1CIf;y2!ww5YF%tDk5nO8P-7Ri0=F2(A&0bJ$!D?0pczBtN!21pTK13 zc?wy`y0u@x(dU%3mgoShdaBIZkFOrt%fcz25BfFtn0)L>vcXHIe(nt91Ivgv8(Vu%J~}NT2TM=8snkYHBn9i^FKC($Y_a)HP)TcJZp&m1Y(+RpW+d(7PRsD zB8Svdgl6cQ`)2P^xcdxfxc)u6h?bGRhdurnDr}^`cD_*!Es)4K*Q~^Yo59EzJec6O z(fJ7i$+I9RGC3e`zQ$g8XIk}SZHhza8KXEpY zCoCdmCw^k{tGT4h;?HPlSFD$DTKB^G;fK{q9tUS{xCe4@;_Qv@_U}oEVZ-*EUhCkB z7=@tmMakrAyUn})^zAFGyCw;nw(U>~!TN*4o+!Pst^we>I?1gd@`BWn1gV3Pl7J|x zc6-yi4!=NE=1x&>SY%?ym*(f1q?XXS2Z=D;oBH`@00icb>497{?!mJ&0eVjt^(*L1 z>FE%zaa1?Z^)^^dulb?QI% z%co+;qgtfFsU$<6inifA<%3LQ!X6?#lVcfTB!F1lH2rKl#0Y>BsN%=l0NGMwx(;oK z(QJ&mMF#1M~w|pQrUGTzE6ZR7$#= zM2(rl)ZN_KsVV%G;L7bDFM4p!u&N}%_GF%L`>1~CtWQN)x(h^TkIw^bW7RGk8qW?u zPmQolO8*thM3QhUjr#8fS5f2db8 z%A&zXDNiHDNI^R<_W?5RYAujWH#QEMz39XqjF*`_FJf=CcPU%r(Z>f>i6UC<4oFxy z;Teta-A{t1457>R`Co*EP>iEIgL*ZgU^wIci6A3YT>J5$3kkS#LyzMuBU=F|`lB&F zt*_hD5grYViKmZ_vV+f%5mufIURTxtcwOl$nUO(cb$D8?B-nBHPVb9X6=HZb9D(1c zMiK@b2JZxP&~N%VMSER37`#z{e#&6bRrh`e{7!Udb^I9~It-m=iNy4QOV z)a4fySbosH0nZoLLFu%oZ~^&H>1*dEf-lA_4Ure5T*w#<@!=yux@CB;%HfCv;p~E# z+Jng3(9<~clHuP!)cts_6L&gawMTHcRFAucA=YW$BWySDXnxVlkD+yefr16CB(PuU z&b+j-b;@zn3gK3H_%%jTO@1%>=t+L-i|5In-+Mzk0IJQ@ z#iOdv(4o|Aw=4DQb!i)T*F3tdymBDPVYQ)h2o7e0ayqa|F93_8VU6z{XK zGAppa>Y|EZzO`^-amA!~`Av)rMy6a=VVn>4MnqK%DRwNgQbHYPFBm3zmlWI7!8SyW zwz!YMDi@T2li}Msj{?NaJvuF|#NzuoFmjNq79K#iC#+LA2HMW2+Xb$iKl@YZuo7W4 zlG^)FPg!e2T3i&*udyF7ym60O{T8P_V=xWtOOhyd%=s&QsYO>@(9;sm<3_?yZV>Qj z)>F=pg)AD3pc~KZB1!YW$OQY$2{9zC2XG4Vo^CkeR~coeYiD)$M0ASlY049RuRht6 zg9l%$qEN>5Ob0qb8!Zvl7n!>bjD@}wW#n=u4D{(V`gjD-*Ow^aF5>1!!^nwjYO~iA zF)~`S4;&uAWRJp_03G34jAY?uOalXG+7_t``Y{3VrW<#J^CZCS;hH=#b(9=Ai1p2vBvK~vvUzKO@x;&aG!eZjiJV4TxQFXe!#AVZCkv5tM^=DuQUiB@{Waa4o!WGL z=|77L=n3=$DNjy}%u5|_lPFZ5uGG_mT=+17s47b2@lsz;M3uCH5=dO()jrk}3T5Tx z!?CA(hQRbGI5~xWjXhZtzlZ%3LNiq+JNfw3nSl^dfO4-?{ACW6MJ;fsuZ8AWSZax! 
zNKRB=%Jfe_iJrUvRf_s2L5gPZnwWnTjHH%96`geYX2!gD>G=$AF1&aF{pB=(d{4u4 z9c2;k<(5Ra9}WUU4fWYG&rL@lx#%O5*#`6ynP6^md1$RE5fy>zdu9kX6W=u$N{UVG zVjVt!w$Qs@p??HF$m?X(LW<>~Bpj7Go8AAA{jAAxCK_!C`^Ptxv>7%_e|#-r1c_B_ zpX`p=tA!fiHnZgKQX+FJtEB*6dMNtv!_sd0arCKh2z4RLOzCWD#j~$ByTqRGrzM!~H6~+jU0Ms81Y1TIc!dM{_8Ht`r!6nJq7Mc8$c3e0rM*`v;p}q)@#!HV$n~#lByqB zq*zky!^Tj!VZBrXZu~dB-rRQB+p3vG7<8)SwU&dtctLFxt+;rk37NP!Y7lVWY*({5 zQOk1+!Y9-4iA(U!hT5lLW>>p5UbOMM4%s|14&etziB$rV4r?sryl<-~a86waBV&K&?i+a;ne+8^m>pZ=@=m&?T z4W2i^5T(%7O(91*!d(H@kBKPJ2geVFQ4mjtUbjgG3IP|Kx9`qgS_2n@n3x#-bI~X4 zx;l_P_Gv{bY)IIWsfxHcBLOWz27F8FZ=$zB@0 z+QR7Q=pl!S1T)wB?w<#E{IdgCy2KR(QE!RFv3IPg#qd#2k3l zv1DD(=n0449GwrKfVH3TCd0_|@q}NEM4n)9FAK#zF{75~De}N}feIM4jRAyIB;%t+ zDmR`B@3x=?PcUnxf)kyZS8-GSmS_pVr3Sy`;q z0~IlUmVg%USJCr*EPqJlbH^Mxk@E}VX*C?nr59;aq*xjcUcB3}*x+GuqRjrmK>po` zu}4+;OlTHf%q|`Fy(=tQ!Sgth&u8=GcOzree-#-~ua67UNG;HL1h?@*>J)`hiYOSP zr?4`Gs_ez2>SG*r;rXqAogV=LlrDhZxwcTB%4O@m5KI%d@oi49o-f5c+6=lXhbc$8 z5u3^}C^-t;MXp`D_Hi)lR+<#?XJZ>FcKgdjfM!5ag>&?md(sl0xV_T+s)tlcK8zXM z7e5awIN_uCZ-#Ej?M#B8%x<0d-XRntpCWcuo?T|&_|GvUCb4ZDM5o&1$rLH$=JFnE zhtm@jgWkKnZ_=q^?R%1<}OX#*$Vew@>QS=pAxAe zdzYZR(*`Z1hOW7hlpjAjRh;6Y!7))1adB>GiMF?7^bV{4nx!p*q)XoKM_c%zXkU9N z>->aFqa4A0RxjhV?DqPFIAp;5;8JX(3i9sw_LcJU;LGXTg5g!-P>__6!`U;m%$Z|k z(#TFFbKwQ)QM818uU$jxE({&IVn@glc^Cm&^0*IOv~x#bhkk46!#L3q$YMiQeBV)W z1x_eOz54c4?}49>+~w5ovZZpohsT&|Gwzhb{!uuQPe+g%RDso`oYj`t-9NYq2qT#& z_C1cEw7#^lJO3|%D=@1qyXeK=zzUbxK%As^y@wK2pi85nAo?+EmRunx{v3Wc4iChb zI$({0sJah-NWvku>#iRtR|}pU&G@RLt2+x*Xz|cKZcWo7VX*1+m}NNpx}NXizNq{D ziU#U8uz~)SWJ&=8&DQtS1p-dL7-@9UlfEf z0^>!0y3(v zdelSX!?RhQqKbuHdTEd_cGlpC)B`v;bT6V4M9(?d1BlWIv>KL`$wJJP>m$wEvtce) z;~k1ZJ{|WOsEfaU6sYK(fxwj=1B<=C>!|}NmYst24lAbNK0%}1yRdaN^BVQJ|l^i`QDTU*;)0_lvK43_vr zFZYsMSAr-<#R@#u3zZaq>|L3Npa;+Qp)K}VdtZtJU>_GOdb;pjSZgZ+&oqDuSeeVz zxk}Ch2K(w^n3QdTRbc_}C*xVwJfmkBwpXrD0wG2SqF}UzS^oi;`7s9R^lhKOVJETk zEwqv-nV}2CaTnRWxj%&szvj?pi(7MMNm@ zAQm}6y)5so2b+lNsLk;UFl27Tt_Rf%Euj(geFI4C`L-+zd^U6ubxmd0C>WjaA=B_r zrSF_Far@Y#U8~b8yW=M}84DG72h5T2BKTc}^MpyS%s2H7(1h5+utq-nK9ZmB)tCv+ zFpVV8Ov$+cQKFN*N+&_dd?rs)OGPz$E9*szq|ReLsA`eD9pJ$SVcW&+Iw|&w#Lu_C zU$IltL~Ew4K)qw5FRvtEW{IpWJ8H=7MqGUELKkuzoGG7wcVV*xvyQSG@8!uBBEP^C z4g7ozlpuw<6qp81e$Hh&g5sN(Um#*x2p*hZN6O=dLOC1@2Oi0^$;{dy9awE&y~|>8 z$)ymyRgRT3Z`VCFCB^*!E-`PW9gIJ;EfWDE3qvv!BH`%<-D07ENXwxKsPa11L=X3- zkm9OATy+Gu*Ax=GEv@N&Q1qXSXCyfxZhQ4g-NNyevvm_bl?y!m=gtaG6y-&GpGBuO z|G10lvuu1-L{CT>CfcX%R@?#3*ck&PLvi|s{4yp*TRo3U61UGk61cG7$fS-d1K2H} zfKlJ8rLGkR zTD;+ed?LQVh-OEpb{=M1{x~y}J4Z7!)!Wsj6G)NBx*w|v_~V1Fk#OY(kHNbo>5 zg-*5bzfi=8MENNQ2LUE~O^}PGTEG1jLI|pFhUTFcxZ@rJU-57u<1&TPt8KueWrAef z6$U=?V$XKm5B}7UuV0Y=Rt^oF2AU`1CoevXGxB8)1yP21C{PL`vzOBw1OyOK_HKwN zQ&vm8^1vKI;UGIk4i^{}u=&93jm~jesU=Kjw6*HYt`lhujd_1Wne) zSyHUeDs`nMjwL@N;3d>R*A8F350@xPplXMdgprAnK0T>OL^EHOYn_;G3$p`1$pMgu zKF9dNE~i7CBV{vlf34_nvk?b9?B6mH#B-{7JR3TdBub`G9(gOi%`;n--WAoDsUTTz znb$;lhV))ICLP^n{n=A!?%_CN`XP9t7P(qMXL<2dg5hr%NwFrgl?V9n<;v5hXmqNJ z4vT~WyrhOR`$(~G>nWiBqZ?U*)UD5L0V52yFlG%E|9*bUrUvAcJOPeWDdj*&&#h2W z++E3qGT@?oyA@Ojd!EVUs^SbAP<@NSzpLj9|E_vQwwcs{z>7G0*S(AN@MS!^_3cHZ zzGXPE2ll2|$gG?`vRJg^$0QkjtHXLWFWHWlKQp)2mF9yAL5X<_swQ7^_XGyQXM@wD z=g=0V?6=tI2^)E9@2hrOS&NHsXaDdA;Pv}Fs}BLUB=*58ctIU~YKtHQ8$4ex0VKt; z9$SLkmG}nD?>D4knSme`3z;tc5<3(s-;_HJ^Z;COBz~_x1;hiH8&5~o;d6F^A!Mxv zofDGMyFXARiB2^o`_P;Uoz>!WWAyI6h5tp0<@G9^I;(fAETE(#P-g3!SggqCGd+ux zt>T_>Nzr=+3k4RJmNB2-Rh7i5h}c-Zzbe9m|Mp`E?zZ5R3H&~k(6CeiXM3+gvheEE zze9>`dnyfKtBw)=q`FoDK%s-%3ygpyB*}#TeqGk;$!a;TXNMGSzTNiQmqoR<9BJs) zmyoo#s_jOcKdim4c~2IVj&4>!+rA{drpL9Xi+^cSpXr?7n-caI_|od(f4?aNJVxJL 
zz|EC>L~df+=kaUU63fvAdnh1Lc>O1d0@l#`;O{kD@Nzp9rtWJvlQ6Q__wo|ulkX0* zp(n}6*i(geB@u-9=OT9&eKjz7#WvUcL%uFJe|omaQ_BBA5#) z!K<#_hMl#aBb1!zNB4G!6rHNFFAe^&AM^Othr6rh@#n*W!hJ;19y$U|kD@9HVqx+p zk)!y*q8p`1>DOQtL3|Huvp32q`N+Si@R8Lx{IA!jip$Y8FMy3$z+0yVc!MCDV;T}W zM!Eb>Lm4S&`-qpE=&0~DefR4%H0TIPtwiqxRGy z4-bHWG{4JT}6NxBD>A}8VU>WV)Wz3Y)BdII8mohbjb(0)z;3g z6ZC{UQc}jx8({uJ4MIN;KYyMw8Y#?PKw)-wL}>(?Li4V%cmK=gCnpwKq}Rd=RgS;h z0~wlL$qh2}mB0(g2lD9;N71Pf);q)dcX#?1RT6wl^HEt@%jfmu<7OK`B-wVw^1}dq zP6vQsGXR1nfHYL&N>AcZ1f~ydr&5L+xy1Pb5Loi*t7jB(IVz*auTT(QkUs_Ubgic~ zlG(^x>^Ls(LXRv;i!mRvT$8X|jhpvO;Ddf^Gpx@}1x*E#>{y6oAG}~4*5{5;x@@g2 zo&b5aF^sUs3Y!`z62N=Q5Gk6CVQ^Xf@oDYJTU$tmqBEsZd4d5yam`Bw0#qepx-@c7`TkDonXkKpsMnQZaffd%lp-PlPiI6GYSHAYR zhtf(YTl4%K)BcJV5HSp|A~8tl#({67*vmn$&D3!@#soU}aP;;%7QmCIqX80%NKRa- z8$c21{DT#RB6nyumOyYT0<-wF-{*(bX6s3MS}MliS36-U>In3Ug8#q#IwcjA9cV#s zU#%Y+(t{IQgJb!1mBy-^1k3IF;=w(!3#588<8FO%%)%zh*4EblPnW+zhk}TjURhbO z1zsB|;jY1KccDe2NnlY>5b3qi()1yiI-GXNLKBW+Xb0`*`qD}`zvk=ZWo-Wf5L(YJ zQ9g|6+M$zIkMzH2Kso~QaxEN-(t&V5ybiWiifQ2Nxk5ieee(O!L$JL`@hg}VyD3q1 z9m*##v$I1!NLIt^YS1!;bX2QK&`f2~5YR6UpT2Q^HehQxU`G&I!|-N^^Grl>6h?b_-cEXjp9Io+bi} z!WnsQ5H3t1+zP2UL~r|kl7g4$;>cA4Kt=g8=OAaYo3|MAAaBh!yfwcL23MWO8P22F zIs4;9%mrkB67QJYue5Fpbp@5Q8eoB0E>lgw8Hpk%6^)gc*H^8Oo87sVljt1Ec5}Jm z+e0|5Is_&9{o6G0v>=z-X5})UhQO(t*fwpXncaB=m@!f@Y|n>7*8VyC7)h;`5#mTC zW9~Y91a0y4lP3Jh67nZb&hSqP@5ye#@&GvyO@k_i)1q8^#9rq90=hvNqM7Zd{`A`+ z4ZpU2bR$NI$hgpB_W_lzzT=JVV-?pZe_lq10BqDreBTm5gZnXeG>T+Pf$~TFc8~2` zaSS^nM>O@8WNjH-m3kAI(!&>qf`+k4cHO7zkH{BM&6^+bf43HQqI1bf@oF|!6REk7 z%F5>{6jcELFTq8Re0Tsi;+J41p^P(>*{!*D|KyP=#N2}SeNdd{kzj%JP5_9)KzIx{ zYyf`gMSJpLkTw>Eehec?XoyQ7jfjKNTTu1_Df@X2*O#-vR>1(uQE)onehmY?%Lbav zZ%@8lwddW!JAZz5?Xlyr9yhx%XyE)vvAymL|2bL!mxm9x7KbMKk%h%DFaDcq=jg(2 zZTfDE6zk0NV8;j0z{;>6s+b0yy+k0uL>PJ1=@f>4fk;Duav8({I>ch2fCOYG!GoU* zkvQrVnMicFJ?v5xI?l`X<8enJ?d@&huWYt_d})b%rD>!xfNRvV;VwM z*rZf&UJ6vQYw)-)N!0+*W_8w`11T#$uZe}h{tx4B+D}K=;4IIgM?RQP0nXla=rs^z z3f>iux$%M?DACd*WT;7)kkr?k03kmw08#i?zCyIhGO2WMa998VH8(%M3Y@FJXV+!qc6YZl^6&6WL-iRj z7v24j{wkvphtjHVm7}JEyCjne;($DSE7#HK4pNA5@E{-jGGQUOOx7EL$*i81#~J#d zEo6FkQ`E2vLKgjvpX_0DSe>TZ#+wz#XJ9%ZgljF$i~ec>MajwK`mAHRs*zE7`y}} z_ylEeHAK?iTpO^FO}pvJb??8eWd(1+xix8`mj0`8NIjFSl_rAdLH&MAcnvgoyGgAM z=?+U3p7aE9)7~kW4ype{q8Zt3C|D-w(Wg)<& zX;^^&5t3|&ip@>IA;y(;&V=Gm5Vntm{=-QKF*!FxM0N|!TH1fm)|6n;lYooaK@z?_<-4{J z1(DklUa;(yNjp0`6)=FB0nMCVn34P9nEph=`I*Vy6lpu~wkflEj#+I{6uRv%{RVc; zTqw{6BcXso6v^kaL;9riZ@Bf#M@3ByBUng6gwHEjtGAffG=bpfqY}oX3^lOPy#22bhS4p{u)A1~##{k1; zdE4Y)4@v||3zP91E!2vlXX)#Z4&UM;O{2arD>+cyTG} zf3lwIsMHNj(({v0v}IW|`Uue9^z^LSzkk1STPV}8o(aC8AL;!?hxlHwLuDSvJ4SIn z0~9D-poP~T1~*7mh$wb|ikAUIr=q2m0A;UAoPa(z#MV(Z1Yi+kV`65`XE}uJ3~ZvW zm5ZX<{~BAv2NO@hUG$q!v6uqd=R0}=wgsiHSsYS;9EJh}l3*>xKe>KeR0E-Y`seN& z`m7pQjCg?xP80+4qvz9NM#uLYS%wLNxVuy=svv!_gKLdWP8Jxk@GSyRKmT#8qU5gV z!ecM((~fT&uyc;#D3yN@?$G8Bp@DsSYVvdM5hkkb{fz{GDD|%_&odIdbFy^Rc3XPO zqq~=R=a&dNwNd1Z8U@ik|9T}9a>EK(UT|rjUvW#?gNE(Mhy8Sa2BzVLl%@0(7D_}*T20@d8 zXia}g-hAEX)~zNX^E$oR7mp;a$Gb02ngd{()rGp0{tND<(A_W_(AH!58~c&=!UrHJ{++1+ib0;JrCdvFaubef?s-yni=f@r0du|MYh?ObtB%GKh1ya6Qs8eK5 zWF7B6h_*O?zvhtd%#5A;=+5ZxB13wK@b{F}-2mQo#C@)0ha^vgs zbaV2L9ak7fJMUwcF#`SbQv+dztn*OAJ5=!(+BMA#Ba|z!1Tt5@lCSflflO6FZyzB*^%N4_dfeN1awsQ)d zx|6)SIU0ULR{wN!FnfbVtiTTz&M4GzMCP~Z3$6kV9z%k~H7^cQfc&9)DUny&x|KX` zhej#iphzGm#WT49q(cr8LxqJxWSZ&H1{uIknL7MnVIb6w9Jin%#eNlF+4;S2%m0lR zj1*g}sAS9FT6T2#÷Xe@pd?eRs6i9s@OkYfLf9e_8 zUhYX06EL9|JOjS>Ma(A3nwsYvAEWweJ0vC0;cymFMmC)Y2%r?f(Pl&w!syeLyLMNu zsKP|cEZFy4+ISwSejHPxOlj$L`tBRU+HhNM&PQ`$4e#D~)Ze{#_nO_nOs|bYO;ucU zBIn`qGhJ$n7OqYOtp|?)8T@AVrTOPiv^SzO5wlWy8DDed)j?wzm>P!@K@kzxt{@8z 
ze-w~CU=Yy&dLjKK0lSpnH-XS6aI?{10ivp-6K%En3d&q6D|Ulfg_A~*O84VizK>;u z0~P_dfDlXmPl%B{mep$8)!Z+&f0f2P*v_4wv>R~h4ZmD)`m7voPA{){o}SQlEO9S{ zIjHBHk>VC0`F~qc?P8O27md^rkODNM}(-z1fK1{wCgZsN}_AXF~o#N5jMZq z=xKDSTa=T-p6tHqQyB-v;u!mBbRD8bIZPcK@(0ddJxT@mri4G>n{M2$+kx^MSJr&I zH*(q@&IcDk#%T|@(rq8GMrqpi8Gz6n)nEFMB9)%Y1ef|JyGa<*_KO zoH~Pdq*wzD_XN5>s0ZL!N-ws0l7>|KtU7D}f;H^^3P$`5go}?ukjgWyE>3=v6cu$8 zIY+@HT^I1FFb&%h2S$RFf$Myz(}~NSS2_hzFK$*>PW?=OoFcB*#Wwb(tG{7rf9*tK zSM3O{(z)6|!n70rkmQcrYOTcl%BKGo)$F&ak*>ihksH@OYh6o@6n;9GdgI!~Ah>u6 z$SM1K?#%D*`BS^xUhO~H^LT0Xh+o@V%fsJCL2_<6FgQ3nmlhBYtY#P3#N`qu*S~(4 zh5Iw4*pBmn!i&?_7zo5?$t^i{-2r?>k&E72hb8C$ls9 zgpmdLT@FZm7E3_|`;wF6%=a9}u{crFA8e@yzpTqY3XNmAb!>QUEn%o3T+-e~@L*)p z(F@{(cEM41`WD|5*ol4~$PLTP$*Px_eY?u>tIA~ZK(l(4H;TBVvT5SO8xj3I$j>l< z1D&c11o>luS?{yEYBGn&-SoiyPnm%mw_;;ro@Ok&y15O1Hx^3lhivw?IiMMW%LR?J zw)qMu7~;X(Hg8x5S!1}7(?-WQG>wHVdrg+tlW#EXMD3)Q#q4D1?YLcsR4Fac;(m&(j4ZitdeAXkwo zREQDUV|pk1W;AgVFo4M|Z@Y?o?Y^nl(@KiBhc(BUE1fGqk@|Z78P`!Tc(#qLgQt*L1iZ|O@he)CfPm~mOhX>8~1Ddr2; zG<7okqN7AE2wZ7Lm^7hGqTZt%ZXW)&CV8<~1_INaPf9WHjJ^QI)%-EF8j#uAOl@ee z;azwf^iK=AwToM89I+~DRUMBQ$~EzS5{-TVnBOQ;rzUmRp!&ty z+TY;rG)G`HxhwL3jX*;4KMo?>z?oUFOu!}9F;xN1@E3N!X$$liU%JMt-(MB@&!53= z*$rguU{Pr!wBvvAZG4a+&QUly-244i|1>s~%5oySq8IRr&UY7~jdQ=V!ws~O2G&VDn`x*>k#KIW!40$k6n1FL>(whL0k{_~d zgFw8GOppA5P5z;nL$nm~nIJZe8hY*Z5T%t?6$Nj+qRz!WU;)q8Z92e)GOX3N*~MX@ z4eSE#O+HAeCZKW51O=?lGiycGGw!)Fb8~{FhQML1F7N3PcR_CbNZczTf*QFOy0DsVK*nsJ^<hdE&rbrDFaxF|F<@z=hHeD=9aFGXDnU*D`Y|O6XKq9TwPj&`dlSm)7(nt>LF9!A z05NrZMMVX!GkVK+*)!^9-~FCV{OVL@PRMgL-GPb9s9z@LvZrjG_oKjiA3b`C&a~P z^9zcj5qRf`+?`O=B!ZID$`En8D7h#NyUa0h``%bEsO?7J81Sn+0jGarWVW<4h6V~B z2z?%DN;{c1&h0Sqv>4RuJJk?PEMGxqG<*p3P#XnONC;t!0O?^y^bZemV#CPLeVEAL z6Yw?-M)jGV4Qm5cz)l&+!O!5T0hozfE_R*m+7-Tm3y3ZBXDi^F(FGMp$qJ!yd-DsT za?As+V?|1VFbka!z&`qAnR9In=R0%`Jv-{H)wnx7-ZvhXI^RifkMjce{o`tHP zBt3svu5s+;>wpCVwX4!k`5JPcgruQDz`cgle$SolUjsEaZSsuEvkQ&Om9nVL8-w0w zh*Q?CE0OaQ=<82x0;M$<(Hm$0pb9M^-A2Bcq3|?}f;e|}bVNe8oO>$&Q*30nEgWiv zbX^h}oS(W0C5J+Mz_2X3Gls7lOeiw;_y+oZnFZ04=&-vpFcq~17H>e?s-t0F1trvH zy^QOye*%2Ti2${`eK}8x4dB29`-a;GLDwwsIa1 zX^S=4f~9R{6S)YS6+$)(XIJopWM12n0jcVJTEI+&g2*r%^T3@|M@(EVIWeOCb_jvfLXWhP zc)&{9f+W*HJ?yU3ap42>;&O)!)2X9dKFbj-v>r)n2wUA|3Vv4838c|1K&OURo=ByN z7B{YqaoqrxA+I&w2kYD-&<8?z{p6NrIMK}vfgVFU?ySKu>?2gR1*ps}-WGt=U~+~Z z7x8rGyuu0~=T8hKlq~v@1{-x}?n-PLcKW{Raz-rh-z~p zUn;pDmJho4;K0!c+3gwx=Fo(xBC_wO6`)KxLF=Gf%{cOYuO&Ab8QE^dzB*#b8p!hM z?bFB532{b#{*LR}$C{iZBE7HKqkh%lt=bcRACo#$-BfefgGsRusgaqX{2649W@qwu zCro2=LCE%Xy_btOoG9yO-ijFvpr^n0iCdw^Y9G~&AmuDLgZWr)5X%twcbxpy)((Qj zQg)h%8zoWV^txxz;lzDk0U4xRB1siK-eUiA{p-98-a^|QYRe0ETMX~KAztf=#N@UB z9d-`E$9mFaEfw!03T8+NJP$ta%YIG6an9_seJr^H)fr;-7yx^%J$l5(#HgxBKXvnrNqeKXq8sgcV+IUgl|)6tz-vvKH{ z#K-P?bVf7GWZ031M4BcRSnQmHt0fn4Ip)^CfPwlva=K!d`HIon&xgP8E|KZBpH)bx z+{qp|lP}$hp}jEC36Vm79h@|9dWBJl*(4N_b}baaLl!@w6SAK?sk z^r_HnHwHpmZfRv5D*>(AvP`cYcvX4kTFc2k=oHv$Bk8dhD?)5U_Y=}fC}y6MbV!2( z?d)T>UTyCSao(7Q(|VrX0I8~>Z5kevY?^35eQLWHZm8Ekcga!)yX)2%{?y01A=Efo8uP zB$XTr>8aUtx9%zt1S#gaI4bt|BhuHG&-#47B(6lqboy%l5eb0#MW4i;YQBJG2S<%i zzKj*H-72IV(G!^9^Wa0}v51%*W8~A+lVe8o??NUO%0Mv{bGD+?p5gapJP);q zhSdW%QmB}}B2e}Y++fh;RLw{S*&d_d zys{3)dwy?G*j>`@GdRHrnJ_U$c{_kgV5|MSDXA$k6-H)oq){8j;7D;4gYC~Yg>Gq{ zUDaEaF|1ezYThilAP2_Z&VY$-kfo%gH$Vu8wgTU0W13_~3lQu}TB%FxOxaVxvdOOT0Xj7r) z!y|^5xp|K(8LRIWOxFOu_DoM6vb`WO9`divEfcbpL+VVyCRvr@j+x^`y_cc37)}kb zIxu;Fco4-viN>BZ(8*tsE&a)l#fR%IO?DS2=e|CY#0;}yGpSXbW~K0hNaf1JbZ;et zbO%;cktUBKt^GC=@g~jz#dV%1)+U)Rn?C`>K}DNk#~L!Ob%UTr^^9~-#NO{pc&dzp z;1HOHKiozCo;(F;wBi+iW#n=pv=4-l!Ge+$00f}rv9vNX zHlv4R;r*CCB}o4gTJ(5IDe!m>bw|(Z!h|O>Lm*(u)LsS8ReVBjuD<^eet$9-Cj}9c 
zJZ+3FB{|eZM;aLwm7Xf52Nikr^N}4O-vxki`42s5!!!xj0@0EHCUy(hKRte6=I2bj z^#Sk^93se<}NEpOjRGx_lVHI)=Y|uY;LGOU8F_VTUH*ysm z5LNu>q%uM2PbYVh1GsGFQ7H3M!;c+=DPAZCY78x&%dtw>-;r4xmO-XsI%Chenh%hA z%ghdjtUi9+qXK(9gK-{sA$=IEtkszuhJ>KmZCwtR*Zlpjcinzd+!`cmVH%>9gy=xM5^Y&9`|8!z4^yn=jLD)e3s9-KuqsLn!u^tZ>|qTNaJhdl8NLk zgRU$ol1%T(A*{xJ^HYCK9R~juJKvol#ZJx}3%vpLM#5iYNESjV>mHf<`OHG;)x_1h56_kj zgvLd7Mz@O0DqR#7Sc1*8lhKoF4H(v+-2$IwZz^^)pIK{(VTkV}QL$9r%B+26=~$Oh zLDEqB#k1>+O?*dRcUAKC@Uvw}O(U)dUVni(-e2@vZAani?vgVsL8yLw!Uw=a0T7J zF-A7-8we&;mL|M0GRqb2)`;4_?|vF=RPD|!Wbv-gx55}HtZNahYf);OHE5aw-tI!; z7IQ3k=}TDU?2<9wzHpPP4riMtzF(*vIJ?7D`yH?KmG!u}BFqJH^*{x4;Ie!uJ*ehc zI72^PK-6VlPHO$B7e@4dMxYojOkUT9cp2RFMiTk(Tb5i(3;4_#wcNUfpas0g82PRHFsEGK8ovkG`0AR-Y!*-x@60hWrp7{$tp8;GjVuA8*sQ25S4aM0jwRg= z5frVv!MJEdZ+a0jc$l#RLe%E=F~C#?MV@7$cLRm)AHb9+^ydc5Y7l4@85YT4WS{5F z2nvy@F9M_hK-q@kRaWFax7h$;c9 z59kg$^|4&M{KwEEvyjn|`}gl70&sHowry~tw1cywkSSHZ$2V?QI1HVRt}W`)5kpEEg6X>+0zmIHLb({rBn2v!%zNB6~I2_AJN>~$I6M|gyOxYFCcgv zBDpmKSKKoHEzcxLr{oG?l;!H6u+bZ~x@ z6Uh7C%;kdPz}$}0E@AlRV9Ng_Z?mg883ivJX5^YRWT;+XJ_AU^3F=h)kAPW?uuz-HeQ1sqGJ8v*s?JDg0641ub{ zc_eIYjzIMB!m0BKFrrZVDI+X)67*X!pnVcg3vl;!nl>9?0un3BvBB6Lt6NGV(A-YsGwjoruBUp6jxWa)R zUC^Bt;|4e9Z)ww80wi4$O0FiBUCxQ2)xMnAPEkYJp;zl_N45fIP#ha<+AMJ| zI$@~^BLM@Wex)}*DDApm^nq1CVYbi(!}EITcDF!^LpIxgXfVMN4t4W}Hc($Gdr$<% z8a2!jBkq|;&r7;5tR{4H4)1W|x721xZBXBHjET_=h}>sxNY+J_jq7a8MU}-?Crwcb zQ^64FO1jfQgcSC zBL|~8^at~!qNIU-Y8_d}#n1=Id_2?)l!moqSZI9;xzb9<>y=0HCWTMuP z<31cb3c;qfd+aO8Y!h?+Y;5wyrb`!+-Q11F(Cqk6APE@~uuY^y9|=9ded69WoCax_ zOwwLxR;$xE(Jo*IlhP3j@2a(CFFeP2gK*%)<<@#vtveE~28*&h!cyCAd<>Ra&=z2oFHaGkrTy%pU1A>Iz)wX-MjnAb>BBJ{@0|p1MHU{YJw=zg17>fX4-ZRcs>TU8MBZcSES(+&@gCFF7ef?5 zKWzHn)VkTPGBe-Db{FmF7Z*K~*DRTJcJPPxCNUX9@CIjzf(CKyR=9rKlT&*Jj6PHU zAh}Zk<5%kFCUB*fL(ZD^2#6%D1y$(^Ow@OXa>+(+MVN{%K4jtt_4ezjh_#sYhUrCV zgVOGfj!Wlol zFDV>1>8SwFFo=;NCc-Jl;$nMKb+Z(pySzTDuYpVgyF2fFZH7DFas9UP8TE*^*!xiT zO~63){g>yz6o#2WFUh2vbkGp83V2Q>=+%6+*VJO0 zK&30nw;lZz&V6Ef`i?{i?tKyPolUAYmeGN(qex-*7z{C&vN|4`8L7*y>@ zZZuw}4odET6Zf2mS>{>_iCX|PQ+$dG`fHX0*ze8ZhVqxW4ZDCDE^imIh&k`%{2ICF zD3sPw4FL9O4qJDxSH1h1JgDlYfVxU~viAc(RUZ6Z`~eWqKzfLxSM!AR#Peom`|n1+ z?HIepTyo)A;v*x5djNT?xE27*T81Kl)B*16mqUVoK3Rwuj&Cr64`0I5NEejn;%xd` zkg*813;pWE2)sm}^m~lPC#24`Fy<2wt^g4ZHr9gQca1CMhOs>^)-H_l-N&gxgEtGC z{8u@o*_gS~u0QLxt~>Dj-d!uHpqzap<2FjiOR$=^U*b&dT4H+Od z`cKVKn^-K}eHjA-?7rg~ke+7@DUq@#2LYt!%N~AOsMt!TyZ%k)-HdBc4DTlZhfV^B zNQFmJ*s2QRZrPU5SD^huyz9fZ$ck|u9zFW-bJ`U!1&JoM*?P!kaZ8%y-8JInp3-c! zfJj<@iwZ8oF}?*l#727d6+rXxK{)L};s*UM<2nIn8TV$hB*92XMKZQjW9bFDOOh~w z42h4rHYnyBjpKcA%-_bHpasiTIUWH`tFkyFy*&<;uDpb#! z%sLPCc=&d0{Lib!n?60E;l?kGitfZ$`)aJ&{qepK^?7Ewo`rSd-(hm%zbs*Ni?7?M zp5uh>@7w#+Da>Y#KOi4MZx{6XFFXsK*eo5l>u$TlnmW|3Ut1tmA2`y4y?e54|7`Tj zgYwM>>RqWBuY-~dBh}5uOj6%+v<>ogdDyhD*G|-X_NSCTYIqpC+tFJ~`RUGBGG8dE zoOIi&>7>JBLdK=I{Ou(mMC$aT$i6d;kB^h&=SX3l=_FytmduHhc=Ogt(p&v|{*%>W zw%F=Hly6Qg5}$0H7QT2k_|($&m4iopI#A}9)T~piOn7{fyt~}|#bd)c{myRNdycco z07JHph&O+CABFFJx~lkFzeD8l({-fF^HIg%)(lVrt~Qw^U3iMs$m1OiT*~C$Y66%6 zrAX9c@IP2D#2z6F<%f7~2~Ck<2y~=T`>n|)a-27BE3=5eM?mPQ6%u~8a#Gr`{p<03 zGz|~rxdiynVPIs0pROzE*WT-PnkD3Ro308~BxU&nbR6}mr)wM|_R{1`H6q(D(0LrH zE-{VwR82^aPu_o+Zpk**D#u8FZERZ=bhx!lG~|^&C9M%6MVj-IgiANFviy?(j$QO!D} zK`CM5>i4KRG5N)d)o&JFTb_lYCYrA=bjL6|Zdl(pME$RvChNMp+*9(*=SPQIhb2kB z9r@;XhbI&13YF#M-^5+{1@fDUPv_X-pQN3eH-rNl#&lz|ay~&n_Hu`fND4ZsY4r=9 z_Lx|m^_aDtcr(0ZB{*V7&5@Vs)gOUE!U+IrCX5kyT>th`&7z-rI&p=*%v}(Ro#pNKI}2rU~qRf@^_?%bjus z#!?kE2`n^_m&sqBYN!p_0xhcKE8y<5*3{+<37XRz?TkP9>7jO7+rC$8nX-%~3NRk{ zF7-UVlcgfE6mp7?a$? 
z+gSGJrA)i3^?75J$D4+P%im-esc&3eq4(CHiDl|fGPnQ?JXZu8xsLkCC}@d%dn>w% z%}{6y^pjxd!9NH>L6RbAAX;(_qo4F$FAHOQyG;mT$qNfQ+Z6Qy{$L;{SSQ`lm;ylRurM05yCLet4;0X~ouB;rl`e`Ub=TVJFhcdr zteqoCqGbgDkq$6`gw{`<@}O28FtjZhK@9(LO=uTCbr~P4(G^d>CE1>&2L?0v@Q{WE zX3v5iN2nI_w5mHoG~f)@Ur%P{X3R20$t_Xbg zn}}C2$@$pSN1Nj#k4H>=(yyCen?x$dVWmx+DTn;BVgU5F&=)lEU;N>BojvM%^k#Sa zcQ3Y}wK3^fE!AyQH_HUH*OT(j^M`|T+%bucfu^;N& z0jlr!((2ufTH+@UFAMKKGUl8ql>YlWj{Z#uynsPZd0WK8yCK)&uW`lu#?dV$z_Hsp z5SFDrnITrj$!d_Xjmq`8ih6Tzq-$)>OR{3i$89CjyT>&p-rfG=Q}brqs4j1EU&`R4 z&q8Rw+=pCa8~1!XbQO~PgT5bK=WS(gtw{F48}JZVBMz-Ke(jJoxq{u_0`fWNcEubY zqWbsIk9lozi@bj%)i*&d7|t0P-7+b48&dD*L661qT62UC;&@WHxpPHB;xP9hvir_j zPh+3syj=)q{XHYM{kz-;2u69#gXUvmVV3X1G70>br+`6RT;Fuv24y}6$asTY>&7zq zX`IG4aBKS~jD}3T(@jPNWgemrh=N#?qD3AW3l!eG8^619c`|Y$IOZPv`HUo@{i8-S zY2BPx+e@3mVX-ne7oF1YesR#DX^pLSy7F-|Yc+Y1RViVuYg8D^uc2;G5>G>a+AyVA z9{6dfzsMqw^6%ZLOuoH8n@Q)P5tEmDi^pLG9sMuW)CjVl@4Ij&UNEaEbKfYj@<|+} z>8_s`d0~+!;eK<~yqZg^z;4^6_#|7m8=>Z+_WKKu7sq;Rv<@dXug)z7@vccH)jU6k z)2iR6>3^IarA)5Z^1~iqHFULLBHxpjLE@FBWt5%S%I(3&i4#4nP7qr83JN6NJHC$hOQ!#)m^XZBM-gV1jeuV zT0nXBRSjMGuS>kFLM(kGkv8x#rQQM$bt58jmI04@8#bK z?+j7?oO^yDAAi47_O>|tWBEZB$NI~NPOI4*sOGmFPgv=3Dz(xPla1#$`B40BkQ`X$^zrukx- zv3|aX?}P16OrpZ^s!!D^{bdq-A&Be6cTqo2vt*Wg_dB)HNoNe+2Ql3tj3j*dV8R8a z_fbpKq3#KcI0*056CRe*W6*^mKOe*lDDWd-mklTAOYE0bq1OnQYvt3+y89k_O>qk3 zr|BTPv@oY&6TKnR92D`FJ@=^O3$qQ;97vP}X6P^O8CrCFsks(ByOS^IyZ`)JkV*=u zRrgd}SC6lmzfr42px|;XZLABB3%_qgrUq$gYR1X+1AgPf!GcFiz~w3#V4B_tT7~?+ z@j3AB*Hw-WvOR!$`aX|E3)Hp6K|OQma|e)H7z(*<>@f#wmMOqgL#=nl1_|D^gx^EiQVcLYc|ao+nlhKTw*wNiWB|nNzAj}0WKmK^ z1|PsV>Jme91XgIxq5Is|)lnjpe3LdjU`~cj4sWovDjlnX4FDlu1CXX#I1vp_J-#cM z-hocsa8n{bD_{G{#?7mzrsa8nU5zJ zaRqrgm9L+D;HikwVj6?;?Y0e#%Eyz90JgyCO>BDH8ePek_%LR~s89->r*Ab)3wiOUv4tZGz zNEV{09yp^=Pol*koU4&5lIR#3KEooKLZLw-44Ev1gI)I%oUwwJ-sCrVclT`pq(3x z=ZvY=rPz)+djP0hl1^c6ez-oO^{riME{uvgtlX#~Yu8;)>UbUSeaby>Dm$XSsET75 zUz35<0pohO8gY09&{)Ij%=_yg2bQ=^a~H_u4G)|kpN6L9wYBUgfKt}&6`9)xu{-e{ zbd%>Nd&3#B+GokhNw3rHb_Dh47%*ewHAc3wL2qVT7+y}wXoRu?i~A6@d}y>X3R zJH;lMYoLJO2viDYg5y5tsyilx_$xMM?#3z+8b2qZOK|@V@&hnco*k9?!7O(b-1Xv^ z8_8p-gKrbQe@|J)W}ilk92KQ1)fm>2&L|U8B;A-(4@%<(LGgC1`)b!QKVb2KQqg{g zJMk%Ha zXZY8_?nk^1O5<p#%=URM0DaZ}zpYypupYH$9lEVT;Z$N!nHuK z=3{}|{0;b&?1B5d^12mS{0*+82NSw^cA{DM;jxZZOmaX8ecJe$m z*NRUuygdW-vp0m3u2I2d#UD$rlHM2M1PK0So&8RHN{nWM&*t{a;jr{!bI9v8lE}gT zf-3EhJ9&Z2{C8;HCPnd4Ff2DB5Z{tfudg1L4}v@b#hzYJbT7{v5~yFKyiQp3ss94p zesW)W9{HVLvNlA*r*``c5X1oEOzx^hR?wx-13tV9Dl9;MTVeJAEh+u0(3QORchd9W zfKNYha3t9}OCmcuE2ok{#a!mipW`#TUAP71+P^GlULO`6k~knUXwSA;+qy51*d3XBB0`$ije}lL6ys=zoI9b=>%gbb7vcTqbf$zhha!jDpGNON;s_ZA z%aTBj>@s$h3vvrCHJ{5U@H_WT4xSudUJJ#uP_s}}N1GiaQ3OG;LR$*iZNsKu&m$AS zp@8AJ;Lnm*hCP>rNNI3}9b#U<=jT{jj()A=I0yehdZEs7RdIQs6pOY7v&~l*6(NpD zc=|jct1^zj`3MOHqPydRR=e~=SqRW}-E_%pq`jYl&8`{hM{bJiTa4dcb0>6Dfe$FsxuXxMkWilEjgM#jD` zvxzNDKC;_UCbaNC5J%Vg9FxqdlQhL$VmqU;=T_-wt=LEd=GKd%YmMsY#5#~zEA?Z1 zNPH*%6Gi9!zXL=8BwyNuday4t4|mRLZ|8AT~JJ+iWTYvR%e8MQb;VE(X3z?&~{LAiS$<7i(hH)wOT?+@H~$B zEnQSE;-B|RejSqOkrEv8@|(H4u^GEU(}RpM~BN zN}4kFgS!Xp|5EuFr1X^J)M39Kg@ju~%P`o;$`Qw$<=p8AK%^@-lWszdcBG@uLM9C% zI;fBAvz0I7wpIP(LG*qZVTa5&;mqF7y3!#J5Gx4Jb5{5&n170v z4!v@h1m&4$$HIB8I{^f8M3C6V-^X$A8Hn`GJ>33udU9)xh*g!w`^c*%paQ-2hUG%a zyjALK<8u_DlhLTR~vq51 zTEMq)lInyc%#Y!}U3iNQKuco}bnHp{vjk9p-R#c*0YA=oi|kw>y`+sI!zq=&g|!q) zhVe3bdkA9MGD;C36A0OWm?nT)3=Gziao<~yHE&hd?DWr>W z5MyScb^8FV^HJbln5a!oOzpVpeiRvvRUZi9Ybvb=fM|GG6G41U+=Ir4(^A7%T>bP$OmAf^r2!p5q36f8{;^ z>pTwb7{wZ&r$2BdVO;(i)1;@XbWVOw^;VwmIm1UQ^okC*i-k5_XYh8PoOn6;WHu`H zl2l7UWuGSal5N0S(Rmd`I-V=#!slK9-g4m&7J6e0ONILyFrABGsjM~3m}B1w>cfdi*m3mIH=o_Gz&G#Unw<&h<< 
z<9B;P@47JWhtFWk&AEp(;kjT3P~rbD{HYTn5A|o_i^`orXBf$`uSNu8Ls*bJY|Id< z)~g*_ip2W}hS1uYT6#2r0F@vL8{J`Q3dg~|=lc0Ja=Q=s9u})U-)}<>AvfQD(4@? zENwSxpe*v&eAbGQJn&flJ`2k@rqUk(tcoJyw2={wmQYA|{`Z4eCQARDxJ_H!{kG5t z9M%a7PzX5;VT;61z`+E+fOTC!kYT*x5I>Z$Lb@!Ps-TPAI>r=%ixDJ6L=@W(__l@- z-u{gXLm&`%b;wj>W&+=ZO?(b!=rKeyk%?AE1kr$-K2l*JOCt?L&4!r*2$8tEVvv_| z!izCzb!gbKjpqjlTJb8BKyY6&IKfSWul>FjoDUji>Ejia%=v{iF=^kkWSOU>mmV&F zMVYH5*Nn9ZRTjGQodkpV1c4<%yKbash5H4k;X-`VS%~T)o?Bww^!xb}LVknOje39# zNjA6*-GjXQl?_SAn+g5{aYuSX-ghzL)kWB27?2e$#Y@Wna|q>d zALapCoXivroZ2diO+k9cCDtE-sO#_$k@8reAHe3rnuEf~$9Xyx*$XD-Tf!jyV?T+) z2046TFb{|tDn{N-2kvPl!OrqY=mG8C3UvK4`1B>#P!f(e`7NPFFB>`gbF#>HkF;NwrixbD@Z~TrD{ZN2-`oqY-hq&PpYkC7-oaGwghL@MA>OEt6JZ zXZ@^KT9au>cZJ$B-q-eOTfs>~+@^7n_ke$8nG0#Hn(Q9Ga1I<(V%4BQDv<$)+{WwV zWUBqFc`Q}&VxymzM*;?%VJJBq4vJs`I2)EEc`!~R^AMfFkXWnOCUssrD4KWDxmer| z^X`f4#N~Ya-b}hFQ+@UJeUx5-67^+uABVC_SJ$`Y=m+Hr ztw8I8ELZcL86wlFsje4xD=n+h+V$#=cTD50mT}>yK(6ca!TJcfT^3VlJ4Z=RIdIiQ zsTw}xFr#?O|4gfkt?_IBMYDD@fR|Qj{RrwcBb|GLQ9izjqWmbWzNo!$%d?rPD7*_$ z;l~y(tIN7^LO!lE4VaWzz%<0$q*fkLiYYHS%KKqT%=)Jar>t=>8 zE%3ucVg2xsmBz`!8ZI2laabJINANKgRnPs5i zr8rb0pgZewz-l#~r-SN>H}}J4V{&A3;10g`4y%>t&Q<3|eN5ekORMu&yMs_)sbMQw z(&s~Gl#c`rR=x$kJ#9N&E8M(tC0;(#bEs;+Ax!eksNifyf(}xe zwIJO6uzu2QntuGF>eJ0^$@For9bxl#c8!|{;c0gIa~aLHaROe0!tDWnFEdZk{gKQ~ z%_H1vh4-drPSKZ4#d=r8C1BaWQ0P8+wD5R-4C*GY<6^!@qD_dm9dqHaAJ?7uX1-fV z8qco^zl+I#q$UfX3NSnG7|*+{QD+Di-O0@aq-gfVw1LX?r~Gpr!bW)sQRW;bYMboV zauo-1c)SiEa9De^p}H_H3==zCSraK<5PmHk zr;ZvVi}QOe#kB8+Lq+@HcV+fwrk1NeOzx1%Kq+kK)@J3qfOioTD^iEvfvpwO%asRs?$*4F=wdq)J-I({q=mEmN)Bs!(!-PP#(zxhKX21=g`4d`2+fSr z3)M6E(uKy)1UZ?_6STi8DLU$Q^EJ|GH93*6uKZHA0oY7_*9l*LH+zH59>BPYJ{~qW69CF!Mr%aK8eke{>?9CvY%l z1|Dt8DEqLxTlbwrq6Z$|j2cj)0M*vv4S?8Tb&_d)GoY_2)hA9|?t0)l?%DlLc{?1t>=Q(VP#ngDpEQ25_)0b9n zzLazHdRnq=G)K3c8a72*t;3rFLG7n4Rwag-W8L>l;zFD~CdJ5{VMTnSvh5YJ&=I)A z%)4;9t-9iw)WuMAUD&nMJ;dPa2-d*k(dnJAZAy}O zVDMN4NvUEH8aQ^SLe>IsCdo)P#SEy~$Mh2pl9uIZ zp1;=G2C^TF99+=&J?9Ap0#~eQw{K?+ z*LW{&gMgyUdE!KS)lCuFTh^&Yh=`ig%%Z~S2UK@l#emXs1hOt)qNYRnpPQ5{3K?nq zy3p`74MziYRJXOJe4J#Q+?ITwgtawzaunJcJbLtteHkp+=nI1M?~AHbx|V_+OJ2W} z=wgSPQPyVeOb+1ghBuPb@4k@+U-21ibk1KaMq7Z$vbX{Z0RjMY{)+tpbuYn5R|Qw` zl^^*|fT7?8w$)ghH`Pm7RBx>&fOJT#%P4Ty4HBFFk^gcK$k~Y*5()G5(5O6JNHFD} zh5bcE5#?g8_Xqpq@X6cc()$B0xRj@;S6ZO(FGe?#-M`WHPQ|4W)EPTm3DUwe*uS(& znoR&NGBKI;?0;^{g4;MTtlZv{x~_!`vM)bp!zDMfn7sFHxqX3T2bwt z(YR6}vt_l@60Od6IM|;acCrRB)&ITQu<>ck!05??_WL1u{WWwL0(S+sYAu3NcdFyM zWpLO|dV=o!J84h|Z8PA}KU&Uk&Uyz(W$$Cdl*{Ex62@%7`4>rBKal$W^AupzO2Cvq z4WnUl>tg?$!e>PGOLLTujtPI7kULY5wjhBj&bSt=!FYpa0{>kUfziR6VYJxl;t;1l zVSRN`_^@ODV*;z{xAffEjoxczr=DPUipzBG2Y`hit#sS~Pc4>a@O^QRlh$oiEs39w zRW$o%d%I$}#E9jCd+g1$A1h@$bB2GQLg3Q5m1|`+9ah`|EVWsk|%elEs>vcPw---aGMV{C+TKH`p|I;#i{j*xzCM z8*tXaq2es z<=7`v*JDSkKhtSSm-M<#ybv&&0lj5eg$VD%nvte|c>>t+0{Hujoft_)@LAY|8f10d z#WmD8QjS^X+IK$GY@^FJ_5bBNms79fR<8g$mVX_O|G2P{ngHBUxcSG}fBCzAgc|PZ zY*1&Xh9=_wD_s1Kdov~ikxQzA!0`VSfc@vIP|pP-``RZn;r|lZ|JRFx3tklU+_aZA z{|s&Z@tgmcI`kx^Dmr7Le7lwNAA!&R{H+{LDi%0bKMbf>3;iFl-+x_X=>$gWr{jqC z-2eV;>cL=WbXPx&)BmqSBUpPKmDQ{_VE%O){__hy$jx3sAdsu86t#`^|5AY8n8I&- zqBM+)fxQ+-a+!@W;ku;gyXC_WhZ{Q_ZHuPaon?2E*B4s{gMa zR*`5A-dWOKFSYmo{=z6ZBE%T%-(O=D11@U2X7T*HuY|MVHf~@?%5fwR+{F6df1m}G z!9`Y+VlPbo%~jwNu*?tt?c>2M={XayrUjic@89`<55o1`LIuwL-5tqs2*!Ymoase1 z--;X%oCC6MR&qCp3yiLuX8v6I{U+4IsQ9q}D;KvNon;c?AA-2z-*&Zqde=XsnSYeD zIQVFk0yvJBDfndbv^AA#Qw1HVmp%B(!Q>fvW%!gyZw$3?PkNH&!C?K978Y!J~+ zFb`(UA8VeC$2(re?OB@|tsHsR>zIE&FYk%7@jIL9{=-xFjimXYb2Zx4%(=H`;UAOp ziVS*$IrdnQ^s~2GTrypYm+~|9DqMJbs1bYTB88lhQy}Q`3qjrFZWZ$C><-=!nl)kX zo@`cdF9%)3$<0r1C)mEa2m=#&f%DW>SPFLSk!x^ydDLulPY@gnMxDVubsQ 
z@y7v4(OSA=y9w|k6ti$@EUr#RmOuyhu5;jCMlXQcPo?p5zvG;VVDT;TUy+E5x;*Ozvg~H5#@3PdK-79L76#l`Yp*xwXT={TWSMjS}Ur3KHv~r^QpO-xw^18+462S zO?~1uwj1Z&_&!T=C^Yot^UJjBcLGhIz;+(*eDvf(BNQz@6|Y>kK9~NOI+sZ~eMm7}bC=O`n+H&` zvnXt*FQeAckGhL50yjWg(a`6hVdi3Mx6ixDUH9^|-{NvN%?&6zrGo6cb~k7wSY~@+ zd)6#2cJtmqTEN?pM`nCA-!HfNh!vTrI!~OIx3_KU6Kl5*13VSin=9*m%+H#{X1t!w zKhvpkv})eeHM`pN$h=5wZq7J+M>Ubh)H%LYYZH41u$Hw`&D^^0f%AW8)79aQ;FUVg zD}rEZ%qxOpey>@W%bv0}NY|ThlQFyh5K74hrejr;pd+*-z3rn#6PM+?R_r$TNYsG+ zWjKvtk*LQX@tMw_5>BImoia03@#*fpcWSBQ(kzc6WY2TY_X}n3IQuU!^6{b?->%%> zi}dGgw)JUG1Iw|QviqmVL@9qI8Y^Y?OZMQ8{+YzLK4Z(?dy`oX_RCbAt=r%>1X#~T zJ9|K)!V;lz7(43vD$`QzXE)jNi@L*cd%og}<4##~8B8$w(A4H(hj>lIrd32hP0QK3 z#g(r-OYN9#hS;;;-hNq$TA4Fh7H2z$6Mv+(LC{ysLHf&~de|^S-*9nBa@RRa^e_4k z>rbhL#!=dp_Cs6q%eYPW2axuZMX7+?(%fe-WBRH0l$QQ!hiH>dZCUjnlj<3I)$53C ziCgwWZ469k4}f3VXNwvq#*0)r)=qhqTs&r^gf?GnI;UlW7h|cqBEWI%`TN)&Abx4m z-vG)ky$?pr^uRmx<~Cd4vT$~Bp6%zKq#;cwKb`;hlQ7Xf@%Z(r`&NcNM^tha$d{HP zxwCkjeR@rMR%DLqB`CCmE&#gkE+2q6dfe${dpYGfa>u^3O8ErrFcttQlKim7fjg7e z)U;BFvSrhB=G2hOD(GU}f^T_uV5PinKFaqA?`sM_pct3TdaarlWI7zj7%)37Giy z2j@!2xc;+6Zdvi~Smt|y0s>60D;CdWj$6nLXC6w?7g3+*mYfe$RkXkJ0>ugr^c3&T z)p&5(Pl@kUAx1#cz{&@}CrY-nD(Ik7Wxt>=(qxN+>kg2FKX=zK1KausCYU%al9r$ZcFTPj)#oBlLz`}OIZ%d(bdVAzA2P~j0wKet9=?jw30(Dep`l}iF{)Va z>7ID#d=DArtNRMriiR4PJ8A*TV7>Zg+F#pTl-tlq8E0jm4i&ykVDthdCztP_{#8>l z&!q8WukKwW1ZdDd`@U~)ab_aD!pVA~`UdQ5x8`78OWquU)CBoRQR*VQH=!YDS+HqY zqc@k;_ucjaxBX^JBd6ckWjzjmuo%BEzdxzNAw4JOZgje4oAPiWVQE~5OhDpu?T z-ZZz&PUKx&DV@i{_=(Zwwj=CRvgdQ;e2MhJwiDd<8~JzaGfKV_noLmBWVOk8KMW ztdq|0cr#z@AS)IPfvP%g1h8v)zo+_fW?#?w zsk8KfOvv^ET}Mrb`gOv2V0E4>F9MBsL5mXPV`%)F{iH9E#A6*ty|1;rWe-K0dsc{& zzs~H1f*jgz$*IqsBo3X8N9PiEehfP~g9p~^OdadrJR~IB>6~zCaP&Ewm@)5PT3_5| zI_WktM;I-xOaz+!mhtrBH>X-TH8pq8FW+qwp;tXDZk8A;xa%*yHTVu0v2ebd)?CUI zVqvf40OY4yNJU0-@aC|vYd$}I+%s~^!HzRyY|bY8v?{!R8T5%Q`+jbUFdUtNfDpEP{-iKD42fza309#W zSeJ6;Jj$f1G!W_Y24R@d+<-HaMj)by!O>Pa@Swz^%$QNKYS3Ba>-0gmDvn%>wVCT& zI9+)oy@j`Q@#j5^Zm_`-Nl3=mcc7Ja5SSb5J^rFFijM5#%qLzPsgYvK;MvM5QrCex zMHe~NV4=H(Q}?*A6k|awsyK!9_!7y6R(tzPadeiS;zsw+ch5cAVJiBamL~UumGmRT zY%rQJLJJY@3pd4|_w{Vt=u`ZHsC0Y_eh;dml2E%@s--rv@rc)m)oA*2Z7iA&_Xnd)MF z2xA!1VsVkbE2eb-A4sr3_+Rd2SHxw~W8=q7d*!Ruqwrf>)HwQoG9X;y@8{CYaa6kG zWV+_FVF&j1|G3+Z|7VwW(~FEd%CdFL>+?kdR0O6z7maAUG3MG>b2IO`7Mv(oo?qNA zWURO_%Ou?78INfOC-t}i<14ZJ0E#3)rYKHw(lpagX6_5c4Z*Vs(a}RIW)AU=<3p__ z**C%4o_&-4rRWM}S~dR8!;>Dln@egCu9_FD$E)h9q>X(qBXtShNKi~$76{NstVshI8Q&p5pJAG(4Amil1l`8mPH=Zx0AQj3CcvRTnuVKK-m(3)XIZm+<`W z^P5jKEwoeZVUOXhX4pN7ll%+vYhTJlsAE&Vd{3v9HwZecq6$umkUEI0OSR<2iXKpM zn`(jXRO_2&FvvZZJsI0Rg=-G$WK_Omdrr#|;w&gXt4I`?hMKK2ltX(KR6KHstd}Y0 zHf9^e3$wlYWO3!OY4?vzc9j8oWZTr4GezQb8Ew%T|AUee;o9%z?AhGB_Oaxn#oolX zY3WQ^4rL5+>eH`udwY1NSQ=Zuns(^V*FZ)4(QK|;GhHR~o$iPT6zV_H5`d_Q;=Q ze`%Rzf1lgE59!bKzx{aF_My5K`BtKhRiN|a3tgN>j18SRePlX{Z_cr@qrnQfy=pI( zlf!RX3*z>-0-9?~h&lSx2wyanR{Fqv`*CR_hUDP)pTlnQv^J~F!FbNUD;>;;w@qbQ z=lPp&m@8?XvpS2U3QxNPxx}JLjR5XWL%5nzSgyGg|GdJ;oumWeA5eK9*qvMcU$CiXNA_YM7^n?Dnm3Wuy0 zo5;rezS0l4kho2x9tQ!t+-?EHtH7*W#eK@Gc>0q4JpY&!v4GH5&!7QF}s8lsZjf*We;W z{+3P@wtvirigTjozCwvOwocX^Q-7>Lh4Z=I6KAOiRA=EsA#5!<6cTNoNhS+3qk^|8 zx)RFs*33u1;;oI<(*DUp5z?Q`U| z1}$8a7DT3w$oDYWW5ng_eh0kw3TQpnlX)rxZcrgvU7FWUtSKR8=VIgL^PaREJm(OO zrUL&!^LH*xlek8PvQ0)Z$7FMaEPdQk;?wnE7LKi*3*ItB+OLH(mN`hq1vfUpyV?gPo5QW?*Hx@B3{c-_P1R*wJb7${w_5yGY2JdPc}@ z@y|VM8QZyiVVZ4lx9Mq+i@eW5!)}M^Qdtna{VeJj_A7&ePcSNDJ#-`V^E&>@P(NpJ zh0VL@wuW0S!s#X1hoWs{&YYOGTXmv3zpII4)v~>U22{RCX2p+uRavESa1Wrum^IgB z3Pp7@#r;esAbEfVN|K($PfxHzkc?8?{lSzLa7PMu7{g+e@FLS>g18EnIovFvSRimG z`k><{FGr};C;`lkyVV;sd<;4ANN#d7iNiu(>zY##EV*uQh-KDpGU4c=*f{Xu_u{Dk 
zF4l=RsAfJPKx`Nk5DbEzh_2(T7)&8we!9<_DYwNZlls$p4IDk6OaYZ%p%MWsbPH@I z;f66!g!_KR1uDB)e37C8MFrmM7i=#nrNq?{>=r%p=1k=As$EF`L;){O7@SWtm1<>Z zm{n)#`Ph%hUK=xeJpF_|6g>w~4?BeN>4(oI!ow6MouBZ76*=bQjOFtaJOaXhV&>}- zpTex;m|mz*&*hYjtst5cQOhJ!7rY$nU63lG9iukU z6zV@geQF{oi7Xp4N8vVD1{ZbXH$8&(d*vH7z}!g565t%S^2!1oVDd`m`%7NX_#$QW zpv)R6s{&nKC)rbSb5y!KO=$)ei$cMZ>i1B>Jg%hwkE%0|hq8^^e#T(zGbm)8v6W=a zzVBqKgk&d6*+!v}u@6QVV^6kFA=%d=#!_}sWJ|^tS;yKCdN22L-|zFjf6ZU!GiI*y zysqDQ9LM)~leETupqP(LK*0oPy^da5KRr-14IAZy`h%_VhlMNT1BONR83pay5?`7g zH3v$OqzwcZKHZ{=Y6e^$ZJWQ*${3s3sHpe+)k8v?t6`7^47lGY1g~`Ged}b|XAjSp zpZ$75=ujhw!`y$)fqz4qJ}=ze>?Fgs85{`eBr#`Rsc`;Jm8@E1bCq~Lehs2w7YxM0 zN3f#Tnx~c09$-+?6m(AA;dcM&k3HW+;S*i9eR9m}6zOOHo(nrX|2d%^MEQ3vnj2kHeJ#X!U7TOBN`p-+8np-p7~<>tk+!xPCJC1LHSYH^WfE_Mk1XK?djcj z$6RWkd}I=kN_SXbv}t5eUZF~%JM!|ue5K^v?4iapq9Itnm5Dl8ix8#g#MDSoKKV%> z2N1rTala-8-0yz=Pmh#=(^~f8c;3Dli7QRb!kxk6@ zEIBZ%uD?|GH_WXfFRFD_80XMg%grb<%Fa{#`93?h$%pN` z0U`!-(8ziPR<&r)`KM&YOOIa{}$``S6ihdZPBBtKMjn;k)5;<^H@ zJ?{o4^Owucw}zv#wb+m*ZhO&ERwvU_g|wsj;wd|>W?m-1d!Wda z^l2@vm=8KQmd*GnszPFV+v_*roSZ7#IVQ$BnV;Rdqt9;L(dZ0iM+irifYfUz0hrG^ zHYrF#Z*B5D~=Ndp;>l(pd{iRlX1+@n#_0)E1*aEz8*)16^36zd<0VsrA5v6 zC|Y-Ybr?gM7;N-T<2t*Mw1 zRH#bcScN~qLQ3QxVn#VQp23&VI-PGtJ2<34tS>v&bm&999yGAGKMfZnA@umPK3rz7g>$b+@~3nghSEFZtpZp5Js2KvulkMl`zEjH3j{KX~5B z4Ow)e#Zg>H&cwgTZ~C;v8FH~#9u<-MgcJI#^o4_f|0drNx8DfiQ;7Bq0|U&-+w(rl zA3I7ZE`jyFt$zg}D+Dogu#x0|;ff6ZqYy?gV=0dejt*y+A!!k6FSZl8_~^ox8-rC& zwW=1vAoq86l))UwxY_wD(^M+X^h{bk1a>l0D^J^k_Tr!wbeK#XH)BGPGYHgaq3 zY7;0qu}$b!RYcNqR8G9iPBq96&EwD++RD^zF41tA%L=a zV#YglQA8#%qCg6AitBD?5bgQpko!O+@Z?aB8ISQ$l`PC_dBZt! zJBU|Aq;Ng!JfsRD6&VgeCU9S65q;Aehj8=~&}*PaLFglmOKvdVd%F8`^f}1BR>~=1 zg5R^}PBDgGU#eu7_&tSNhAE7=xaXuEYea$ ztNy9y;`&tQmV95CHVf{LkYlM~gFQ}mpFo7-^eZ3_ux~XDASmRhLwYG6Ytq~cm9})v zmB3trVKZXWAo)_g5-=JwRwwgRX?@${8>g0j+XnrENJ;!2)3nNg;Iw6stb=L2`jGBK z`^F{R%p*Vr(M>G~jnFQvt6gLNHRcevNKLa^I9MNhiW{g_W~q6t3YC7QG$gyOFy7kE zC6#@WF@^ZP`r3pcZ(TbE_E?b19{k{>Gw?)f_RC`@SE!3+^hV*trlwos3iE%D*zfhj zx8~i+{xT>e%kMi{?0m$$fl6321hv_{xq!-C9^2+Qz7j!Tu~DPZNB$M*@n<0+5uxp> zJe7W8e+_TU`69(zzn$3eHnGfVPt3&!Um?t^w;6t`g0WIS36|6 z4tn)cpW_ej7xK_4Pdc&T6XHZgoPzKRY(MQiLtfg>#{t)?q@xBD;W@HN^b|>d1(4eT zUSsxXdA`JamN*idGe570Mz{QK5=zSSZ3UShrbkHPh?9VYIf^+7qhkK-RFXM)=SJXY z^>t$Jcy_OTmFr`3!lTwF3*1Mr&`-skrme|H%>c*D(DPa~E!!Tp)V{Z@PRvgBXjPcP z+AH@ITd4Fq>PM;mMp~03(DY#)(IgRDuB6Ti^GwmfaIP7|bOj$vquCzVe%9i>AI z`}!DT*r-GTpW{yzg#3E?pk9NLB^Rd=M?XZZL!)K}1!}YmlUwmccKPm^e@{|)DkIvt zReQJ0v95lBblY%}A0nXbN)&I^O zWChA=C0-n-A%z`0aOsBj(58j1Wmn-dx8t{qIIE{Dl(}kMuY+SQo~dN-scMcx^}`|U zhJMeguMr0GCgy<{n6UIcw+w521Dc!RlmdG2o78KeT;xY}4Xh8K@<26I3^(Fx4z&0w zcibmW$<(XQiXG&ya_uvVg^C1PKqb(*R=#0zzm=X z;f^6QYM zCR^!P%#=A7aEd0yc>haxvXmfOTtsa}Xbrr;6QKIb3X<0l8HZl{;xyg zH-r>?3=MT84oDD@XxT75gZbYOEi_j?f#!Yw?&Zy8a>}HZj+$C#(jy7sJfb)CGKRX9 z-GLvqZN9m6e1)&bSdDLXK_7zHJkjAkj^JbSmnQ$_wulPN!R0R~XUmYtVTCUVFgX76 zqPZX6AuNx~q{h)LlZtof)F*2o*z%*csOtDGCY|w3trdE=Y zVuoZi^FoXrEozlD^z9W2vq`UT8PkFv7}M|gOH=}f3n;Kq!<-AqAN718E~KgKVE9Lq z5E?7m1j_I0o}Y1>r0<$iZ}=EIYBr1KtBA~=6F8!>ouhfAV$B8nGjaZych*UQm(MU* zS?^GR+O$)+<(xiocV8&rTjqoln#hgTRRrs}&eJMDS`%xP+077=?Tl7Y*I|VoeBeVxFQ9XxiyJt$0t~Bd95BI>x}S zWryUKaIfv*`Z5H;+_q%ch0F`-W2%nW4anOSYTMY&4jGDL+jmMuFQ^<=55B`~`L}xG zQXS!8`6XuSNe|OufKt_Y!Ie66D9$BqZOgRxaA>ZoSxN65F8-LOSPe-(K;&4`wI4E< z_){p}K*c+>@0XXMfUQ#IRNy@TCd|IKPS~IyS5@7(bZu zvSWPGarOo@z%jdR@>PXo&7k5=dfVwwbpDEGt3WUN_G2{`cc=ifoY79gnnnmWO{2k4 z!p_U8PQTSfn_CV+m--J%%x4Z~|2hu*D2ZK*<=F^ux2kJ<-!C&2Q+}WbbJfaMw;$Cy;%Up=u`cuKMpi-{4bt1;J)u2<%?GQrH!Q53ZN0$lC z5YycHsP`SQAycNgd$iN(`d}A&?)CVWLy_@rFP=kRsBn}v`qG}RZrKLCaX=~tuw8O1 zTdW 
zhZD=_jpYf09mk@YNqUFw_U^;bGM5SGY&n-Cmx-_IovZN43iazd^>*sX`MXF(^2h7X zUYqQ{JeGiQfWWs1T%h6F9$>n7)&P|xL3#89bvBao+OHe%e2IChRdA8Mc$^Pe@%@dcEBIBU>duQ1DJa4DD4!fnxrTjmOW#VydLx#~jq(&N7f4V?z` z^cRN?=ZEi3KgvH^Qw9#E_Ia9BWd8Az*~9szeGW0cM!}BAZq-G|QV}c(aFbVpcWHn}muPQ({yaNkHaiyoPz7wFY zPrv2=@5lH`2RfG6j4-?QuG^qQeL7@fCUs+V#yritK1QVqgOTl?q_gR!j&9bdd&7gMvAfD^%lLMT4F6vVZjl^$uS_?Ib5ox3Ox9EB^?DLTr zu+;Q)J|#=T!P0bR*;Sx@i*bMT^Wxms6dKy#O#{Kt@NK9gGnW^O zLO|RJW-@-z+y+JAhFI47rw5OBAODdWY8>p2^864Ix3aYHaz6Wo^uD%Ig-|EDLseX^V={k$kYKewNWJ#@;A1vtNa3KvrskyP>;NXhJPwmM>4BIu} zM$&Rgr0)Um_^JGkVb7l49p;PQ#D9e9k+rVZ70A!Qx?0KyZAK&CUfaDDm{$3{syM`E zm-|rEJP1zxSV?{;?nozZ#{2gRhwfn3x9-Ou6@Ta$|*quW(Djs(ypd>+d9@_n2pos&@E&zK)tU#ZktWWjdLP1FFA z+gM}iynG6~CCdCQu~|Nc?%9Dp3wBTX8k#CLe&IGrQ#EBMR|6LK8y z*d{+Rf8*4rl?{R0>TlF2W3QM)+IL9-#NVUuiXpu2dl_~TJ(45mGH3|9@0_x0@gTG^a|{p0CMNUVv5ll^M!Ni=b17b z3z$N?r=Kkv5vK2@nykwC9^X>yE4_TcNnfq>9HWUPQRpRiWrmk-)1{3rEs^#iqn~{PbkD&A zz=o%N>FiztEC$38A8gvLj@IB}?T5=6y3<|q19%wNfYU(IcpuiF#NHmus#OD!>MXUs zq=$FEzQiZK9zj1nG5SLftb+MQj4=4sv{JcBr#$h7#z|V13-|7l5XDjzz^FtZ?3Agj3qLOyfgqWW-U3k*LR;~S#k;AdnPB(Va zo4?)u&RHp7z#ly>@7soI$s8jh^Lap*#V< zuQGR&ZFU-^C%HFPIr4u9cB}#kMC&Mg-hHtfYOiG%n1ACNEw~*{R;{jNP67@hPn*Ym zRy;)&>o?5~$mHn;H|6N?*fQA7?F-%Q7Ix3Qc}_HWa~qUDOGU{4sbdcnh8U4<=N*yN z=CJ~AJT6eUF3WG5v(v`(?cQvd-3Te-ojAyEHO1CW8H9C*Y`ff9`Qt@UrR{0@VQJ%S z@$Nf3miyMBpw!4NZCnGW$#QV-^>;a$9e0GJ^Vz+$>zc(yr8Ec<0Y0yVnN-10-vpk@ z1evmkj!@B&0^sVXU;*5RsKRx?AZWp{a}kU5o+9cmSA<4NCt(VY`^WecZ0^!Kbj6RkF9qMF8xwr3rK2OKPda+jw zn41OXnVh1$b3|_G+y=a4ty2%m#?XbwbM`(V3|nsi9g9n=$CpXKr{R5bzHM|v4U zKmQ)Y1k! z+`5L|U(H4ZDtKylpavl$vR!f^QsAyYhP1L-fu~=9$j^=G8UEPtapR*Izy@zW9h+F; zt|c0Ahx3T$yCo1zd+xsb{rJDtenm^iNnDF6M`y?g%nCFeHi zt=q@lWHP;9+wZd}t#2f4SVGAUNw_8kL*V-$0Vi_F|PvKa1@S`h5iF~Mk_Ysg(*{Ixw;Qegp(O5=08z6dxQ zCKc%UZ5B_~X~nY}`NqY136hb=J-pj-;jvBFiw`?6moW70E;K2w@0GeYZ3BeRvc(gz zQjj!PDvZ)1on{SKXJ*k;*?GKgdfFNKRgv6y(kdin~)Lpm{zSp@qO4Jy%IC=a)~mLdVM?QjvNI$@1RVgpNCW z-ol^^wWn-0qJ0@u7w){ELrGn^!S`holD)84l(eQyEq}yI@hvoDG(4OXbiCa?j;)); zHlG&S_kDGLJmD03y#)|txnanAH!$oJF{Mh-7Hu}6s@1ZJM7LWPxLruY>EX*N<7Woz zI~jK$l!}@n%0fLKom}`e>z3^;y6w4@G$_6MpYoC%cdpI2YA#iUKJ$*|Ix!e>zKiWU z*qyiqp#d3Ugj((c0oBrZ#fR}g!6zi z+&bDwayj8dz-5Sabgz#L$Gr6`x43|zi)0Q<$gaswTKdDY=G(zV!6eerHcC^xnzp{& z=4YQnx4aMxAfDHrH`a`Dnzc&|26h`wv$m~PUyMYxmy^;Vb@SQe#P=+_jz1M59Y~C} zbdBwl$a5;pe37q0c2Z~xZgb6x!;Iy38HKApHrKt99y4sdFB90$OygGE8h&MBe5iUd z&bP+=1H=SLh$qP^Kri}PPIxpze2v##WOJs#c&6BJ#EKRRfTW2LU*-Ma(j*T9Cg>tL& z9a8n!(&5`DP~AcMO+J5bsHmMU%1S1y9b3tAxMzI;qtYgLK9o{1Q9BZ}0q5D#A4RTZ z4i{GvgoUqN>+fr0jTkx0*c6q@){0ZcoU7K;Ng1vMBI>51vD&&m3b%3ehjRvEFx?-} zL?B>altOvaybGMgujf|+p}E;JaCH&igHh;-eI(?D#K4KLgwFURPE=EUNp3`C2uY;fi-j z<1(h^_`-~E1+2HpLK9)gxdF0=_`B7!szOAL*MVBd+qw(^0xB%O zNt}gDbB1c`lFfNNUoOK|2Ese_Nj%PR{KYSX?e$umB>tL;%|%U9M(!R^P9&nC-*I)x;R~RnVE(0XTfzOX1E#12 z9y_0PLyaOfBuDXkq(5>+hcmE)q5-TD$s*R8l+I!cwhFUSoP{cRzn(@E+%_4e=&`8N{=Iyf0-^nDlNcO)j`5IjL8YOdO zSOmxytYRD0won8y)sZMXk8_LR`rE_2Q{2ScOpFc9F4EI; zELiH)-Wdd#9V9V0&b4D5W+0%|KJM3ac{A2#;r^+mFmGf(xty_1B+tM&nu_U@+R`;D zl~Z0CQh6>dY6{a7dxd4XF0)~ERCr1|tSXYyVcI$V>Vrd|6R#2k_;Lbkk0-HZox`gT z1=%1Q$4YgU@X5HSg%?%bCey&uAa!R$98mFcA0q}#zkZX<#JWg+%OhX0kLsVh7fCv5 zY>lIwTSQ%Ly?2WOek;3^*B;36IWEQScn%T1vO6X5Jz(8u(nRKm9F#VA(8~edglcQ$ zj^fmC_wV3*NLnnaGl!1KB5zDN;)bx65|`-S%49u0EPcj!Yk zALvLYW8CIjbmf$3QGJrlPg6zQ;Lmg5G((d$nN1xLh^q#XPxwW_EI2;cUgJwKx$5z4 z0~%=oI10ia!KkRqF#ypyhs78?dxRG(WI%9!oyiuwp@!TLB*NvuSBA0(an}^BviW@> z(1)E_0(lT@Pib3v{4)X2P!xrYE}SQJm|wGMvn)JzKmfO4_y@y5Un?AY|Df7B zUpm}_4&}C#l^%n?-z{v3x%x7iKOuZo{YbNTG(Ei7>)8W|5O4!Y3Ie2e?$ahhE~Y3K zoKM!b-FFIs-9!-KVFn5?b2jeFG6I^aw#s1Fn+KQAlLl74TG^TbTT=+eE$&4hbx15@ z#pMHmZphn%=ywM7t 
[GIT binary patch: base85-encoded payload omitted — binary file contents cannot be rendered as text]
diff --git a/benchmarks/benchmark_headdim.py b/benchmarks/benchmark_headdim.py
index 341ae4b21..5e5ceb2f3 100644
--- a/benchmarks/benchmark_headdim.py
+++ b/benchmarks/benchmark_headdim.py
@@ -1,5 +1,6 @@
 # Install the newest triton version with
 # pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python"
+import csv
 import pickle
 import math
 import torch
@@ -11,7 +12,7 @@
 from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward
 from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined
 
-from flash_attn import flash_attn_qkvpacked_func
+from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
 
 try:
     from triton.ops.flash_attention import attention as attention_triton
@@ -24,16 +25,17 @@
     xops = None
 
 
-def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
+def flops(batch, seqlen, headdim, v_headdim, nheads, causal, mode="fwd"):
     assert mode in ["fwd", "bwd", "fwd_bwd"]
-    f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
-    return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
+    f = 2 * batch * seqlen**2 * nheads * (headdim+v_headdim) // (2 if causal else 1)
+    b = 2 * batch * seqlen**2 * nheads * (3*headdim+2*v_headdim) // (2 if causal else 1)
+    return f if mode == "fwd" else (b if mode == "bwd" else f+b)
 
 def efficiency(flop, time):
     return (flop / time / 10**12) if not math.isnan(time) else 0.0
 
 
-def attention_pytorch(qkv, dropout_p=0.0, causal=True):
+def attention_pytorch(q, k, v, dropout_p=0.0, causal=True):
     """
     Arguments:
         qkv: (batch_size, seqlen, 3, nheads, head_dim)
@@ -41,13 +43,13 @@ def attention_pytorch(qkv, dropout_p=0.0, causal=True):
     Output:
         output: (batch_size, seqlen, nheads, head_dim)
     """
-    batch_size, seqlen, _, nheads, d = qkv.shape
-    q, k, v = qkv.unbind(dim=2)
+    batch_size, seqlen, nheads, d = q.shape
+    v_d = v.shape[-1]
     q = rearrange(q, 'b t h d -> (b h) t d')
     k = rearrange(k, 'b s h d -> (b h) d s')
     softmax_scale = 1.0 / math.sqrt(d)
     # Preallocate attn_weights for `baddbmm`
-    scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
+    scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device)
     scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
                        '(b h) t s -> b h t s', h=nheads)
     if causal:
@@ -59,28 +61,56 @@ def attention_pytorch(qkv, dropout_p=0.0, causal=True):
     attention = torch.softmax(scores, dim=-1)
     attention_drop = F.dropout(attention, dropout_p)
     output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
-    return output.to(dtype=qkv.dtype)
+    return output.to(dtype=q.dtype)
+
+
+def flash_attention_pad(q, k, v, dropout_p=0.0, causal=True):
+    batch_size, seqlen, nheads, d = q.shape
+    v_d = v.shape[-1]
+    if d == v_d:
+        return flash_attn_func(q, k, v, dropout_p, causal)
+    if d < v_d:
+        q = F.pad(q, (0, v_d-d))
+        k = F.pad(k, (0, v_d-d))
+        return flash_attn_func(q, k, v, dropout_p, causal)
+    elif d > v_d:
+        v = F.pad(v, (0, d-v_d))
+        o = flash_attn_func(q, k, v, dropout_p, causal)
+        return o[:,:,:,:v_d]
+
 
 def time_fwd_bwd(func, *args, **kwargs):
     time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
     return time_f[1].mean, time_b[1].mean
 
+save_csv = True
 repeats = 30
 device = 'cuda'
 dtype = torch.float16
+# torch.cuda.set_device(5)
 
 bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
 causal_vals = [False, True]
-headdim_vals = [64, 128]
-dim = 2048
+headdim_vals = [(32,64), (64,128), (96,192), (128,256)]
+dim = 2048  # qk dim
 dropout_p = 0.0
 
-methods = (["Flash2", "Pytorch"]
-           + (["Triton"] if attention_triton is not None else [])
-           + (["xformers.c"] if xops is not None else [])
-           + (["xformers.f"] if xops is not None else []))
+methods = (["CustomFlash2", "Pytorch", "Flash2_Pad"])
+
+if save_csv:
+    csvfile = open('flash2_attn_time.csv', 'w', newline='')
+    writer = csv.writer(csvfile)
+    writer.writerow([
+        "causal", "qk_headdim", "v_headdim", "batch_size", "seqlen",
+        "time_fwd_CustomFlash2", "time_bwd_CustomFlash2", "time_fwd_bwd_CustomFlash2",
+        "time_fwd_Pytorch", "time_bwd_Pytorch", "time_fwd_bwd_Pytorch",
+        "time_fwd_Flash2_Pad", "time_bwd_Flash2_Pad", "time_fwd_bwd_Flash2_Pad",
+        "flops_fwd_CustomFlash2", "flops_bwd_CustomFlash2", "flops_fwd_bwd_CustomFlash2",
+        "flops_fwd_Pytorch", "flops_bwd_Pytorch", "flops_fwd_bwd_Pytorch",
+        "flops_fwd_Flash2_Pad", "flops_bwd_Flash2_Pad", "flops_fwd_bwd_Flash2_Pad",
+    ])
 
 time_f = {}
 time_b = {}
@@ -89,84 +119,56 @@ def time_fwd_bwd(func, *args, **kwargs):
 speed_b = {}
 speed_f_b = {}
 for causal in causal_vals:
-    for headdim in headdim_vals:
+    for headdim, v_headdim in headdim_vals:
         for batch_size, seqlen in bs_seqlen_vals:
             config = (causal, headdim, batch_size, seqlen)
             nheads = dim // headdim
-            qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
+            q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
                               requires_grad=True)
+            k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
+                            requires_grad=True)
+            v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype,
+                            requires_grad=True)
             f, b = time_fwd_bwd(
-                flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
+                flash_attn_func, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False
             )
-            time_f[config, "Flash2"] = f
-            time_b[config, "Flash2"] = b
+            time_f[config, "CustomFlash2"] = f
+            time_b[config, "CustomFlash2"] = b
 
             try:
-                qkv = qkv.detach().requires_grad_(True)
+                q = q.detach().requires_grad_(True)
+                k = k.detach().requires_grad_(True)
+                v = v.detach().requires_grad_(True)
                 f, b = time_fwd_bwd(
-                    attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
+                    attention_pytorch, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False
                 )
             except:  # Skip if OOM
                 f, b = float('nan'), float('nan')
             time_f[config, "Pytorch"] = f
            time_b[config, "Pytorch"] = b
 
-            if attention_triton is not None:
-                q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
-                                       requires_grad=True) for _ in range(3)]
-                # Try both values of sequence_parallel and pick the faster one
-                try:
-                    f, b = time_fwd_bwd(
-                        attention_triton, q, k, v, causal, headdim**(-0.5),
-                        False, repeats=repeats, verbose=False
-                    )
-                except:
-                    f, b = float('nan'), float('inf')
-                try:
-                    _, b0 = time_fwd_bwd(
-                        attention_triton, q, k, v, causal, headdim**(-0.5),
-                        True, repeats=repeats, verbose=False
-                    )
-                except:
-                    b0 = float('inf')
-                time_f[config, "Triton"] = f
-                time_b[config, "Triton"] = min(b, b0) if min(b, b0) < float('inf') else float('nan')
-
-            if xops is not None:
-                q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
-                                       requires_grad=True) for _ in range(3)]
-                f, b = time_fwd_bwd(
-                    xops.memory_efficient_attention, q, k, v,
-                    attn_bias=xops.LowerTriangularMask() if causal else None,
-                    op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp)
-                )
-                time_f[config, "xformers.c"] = f
-                time_b[config, "xformers.c"] = b
-
-            if xops is not None:
-                q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
-                                       requires_grad=True) for _ in range(3)]
-                f, b = time_fwd_bwd(
-                    xops.memory_efficient_attention, q, k, v,
-                    attn_bias=xops.LowerTriangularMask() if causal else None,
-                    op=(xops.fmha.flash.FwOp, xops.fmha.flash.BwOp)
-                )
-                time_f[config, "xformers.f"] = f
-                time_b[config, "xformers.f"] = b
+            q = q.detach().requires_grad_(True)
+            k = k.detach().requires_grad_(True)
+            v = v.detach().requires_grad_(True)
+            f, b = time_fwd_bwd(
+                flash_attention_pad, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False
+            )
+            time_f[config, "Flash2_Pad"] = f
+            time_b[config, "Flash2_Pad"] = b
 
-            print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
+            print(f"### causal={causal}, qk_headdim={headdim}, v_headdim={v_headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
             for method in methods:
                 time_f_b[config, method] = time_f[config, method] + time_b[config, method]
                 speed_f[config, method] = efficiency(
-                    flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
+                    flops(batch_size, seqlen, headdim, v_headdim, nheads, causal, mode="fwd"),
                     time_f[config, method]
                 )
                 speed_b[config, method] = efficiency(
-                    flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
+                    flops(batch_size, seqlen, headdim, v_headdim, nheads, causal, mode="bwd"),
                     time_b[config, method]
                 )
                 speed_f_b[config, method] = efficiency(
-                    flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
+                    flops(batch_size, seqlen, headdim, v_headdim, nheads, causal, mode="fwd_bwd"),
                     time_f_b[config, method]
                 )
                 print(
@@ -174,6 +176,20 @@ def time_fwd_bwd(func, *args, **kwargs):
                     f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, "
                     f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s"
                 )
+            if save_csv:
+                writer.writerow([
+                    causal, headdim, v_headdim, batch_size, seqlen,
+                    time_f[config, "CustomFlash2"], time_b[config, "CustomFlash2"], time_f_b[config, "CustomFlash2"],
+                    time_f[config, "Pytorch"], time_b[config, "Pytorch"], time_f_b[config, "Pytorch"],
+                    time_f[config, "Flash2_Pad"], time_b[config, "Flash2_Pad"], time_f_b[config, "Flash2_Pad"],
+                    speed_f[config, "CustomFlash2"], speed_b[config, "CustomFlash2"], speed_f_b[config, "CustomFlash2"],
+                    speed_f[config, "Pytorch"], speed_b[config, "Pytorch"], speed_f_b[config, "Pytorch"],
+                    speed_f[config, "Flash2_Pad"], speed_b[config, "Flash2_Pad"], speed_f_b[config, "Flash2_Pad"],
+                ])
+
+if save_csv:
+    csvfile.close()
+
 
 # with open('flash2_attn_time.plk', 'wb') as fp:

From 3caa059110fa588901b4aa74056fff670bf63c97 Mon Sep 17 00:00:00 2001
From: chenfeiyang
Date: Thu, 22 Aug 2024 09:28:23 +0800
Subject: [PATCH 15/46] update Readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5a1c700be..a6a863aa0 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,9 @@
 Currently, we do not provide prebuilt library, you need to compile from source.
 
 ## Performance of Customized FlashAttention
 
-We test the performance on A100.
+We test the performance speedup compared to padding the qk & v hidden_dim on A100.
 
-We display FlashAttention speedup using these parameters:
+We display CustomFlashAttention speedup using these parameters:
 - (qk dim, v_dim): (32,64), (64,128), (128,256); qk hidden dimension 2048 (i.e. 64, 32 or 16 heads).
 - Sequence length 512, 1k, 2k, 4k, 8k, 16k.
From 493a4303f04ba130335e6008f38830a990a31066 Mon Sep 17 00:00:00 2001
From: feiychen <2394209769@qq.com>
Date: Thu, 22 Aug 2024 08:28:10 +0000
Subject: [PATCH 16/46] reorg code to reduce compile time

---
 csrc/flash_attn/flash_api.cpp                 |   1 +
 .../src/flash_bwd_launch_template.h           | 156 ------------------
 ...h_bwd_qkdim128_vdim256_bf16_causal_sm80.cu |   2 +-
 .../flash_bwd_qkdim128_vdim256_bf16_sm80.cu   |   2 +-
 ...h_bwd_qkdim128_vdim256_fp16_causal_sm80.cu |   2 +-
 .../flash_bwd_qkdim128_vdim256_fp16_sm80.cu   |   2 +-
 .../src/flash_bwd_qkdim128_vdim256_sm80.h     |  42 +++++
 ...ash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu |   2 +-
 .../src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu |   2 +-
 ...ash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu |   2 +-
 .../src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu |   2 +-
 .../src/flash_bwd_qkdim32_vdim64_sm80.h       |  32 ++++
 ...sh_bwd_qkdim64_vdim128_bf16_causal_sm80.cu |   2 +-
 .../flash_bwd_qkdim64_vdim128_bf16_sm80.cu    |   2 +-
 ...sh_bwd_qkdim64_vdim128_fp16_causal_sm80.cu |   2 +-
 .../flash_bwd_qkdim64_vdim128_fp16_sm80.cu    |   2 +-
 .../src/flash_bwd_qkdim64_vdim128_sm80.h      |  57 +++++++
 ...sh_bwd_qkdim96_vdim192_bf16_causal_sm80.cu |   2 +-
 .../flash_bwd_qkdim96_vdim192_bf16_sm80.cu    |   2 +-
 ...sh_bwd_qkdim96_vdim192_fp16_causal_sm80.cu |   2 +-
 .../flash_bwd_qkdim96_vdim192_fp16_sm80.cu    |   2 +-
 .../src/flash_bwd_qkdim96_vdim192_sm80.h      |  33 ++++
 .../src/flash_fwd_launch_template.h           |  92 -----------
 ...h_fwd_qkdim128_vdim256_bf16_causal_sm80.cu |   2 +-
 .../flash_fwd_qkdim128_vdim256_bf16_sm80.cu   |   2 +-
 ...h_fwd_qkdim128_vdim256_fp16_causal_sm80.cu |   2 +-
 .../flash_fwd_qkdim128_vdim256_fp16_sm80.cu   |   2 +-
 .../src/flash_fwd_qkdim128_vdim256_sm80.h     |  39 +++++
 ...ash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu |   2 +-
 .../src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu |   2 +-
 ...ash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu |   2 +-
 .../src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu |   2 +-
 .../src/flash_fwd_qkdim32_vdim64_sm80.h       |  11 ++
 ...sh_fwd_qkdim64_vdim128_bf16_causal_sm80.cu |   2 +-
 .../flash_fwd_qkdim64_vdim128_bf16_sm80.cu    |   2 +-
 ...sh_fwd_qkdim64_vdim128_fp16_causal_sm80.cu |   2 +-
 .../flash_fwd_qkdim64_vdim128_fp16_sm80.cu    |   2 +-
 .../src/flash_fwd_qkdim64_vdim128_sm80.h      |  22 +++
 ...sh_fwd_qkdim96_vdim192_bf16_causal_sm80.cu |   2 +-
 .../flash_fwd_qkdim96_vdim192_bf16_sm80.cu    |   2 +-
 ...sh_fwd_qkdim96_vdim192_fp16_causal_sm80.cu |   2 +-
 .../flash_fwd_qkdim96_vdim192_fp16_sm80.cu    |   2 +-
 .../src/flash_fwd_qkdim96_vdim192_sm80.h      |  26 +++
 csrc/flash_attn/src/static_switch.h           |  49 ------
 csrc/flash_attn/src/static_switch_headdim.h   |  65 ++++++++
 setup.py                                      |  75 +++------
 46 files changed, 386 insertions(+), 378 deletions(-)
 create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h
 create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h
 create mode 100644 csrc/flash_attn/src/static_switch_headdim.h

diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp
index a1a50f2f1..75ab69644 100644
--- a/csrc/flash_attn/flash_api.cpp
+++ b/csrc/flash_attn/flash_api.cpp
@@ -12,6 +12,7 @@
 
 #include "flash.h"
 #include "static_switch.h"
+#include "static_switch_headdim.h"
 
 #define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

diff --git a/csrc/flash_attn/src/flash_bwd_launch_template.h b/csrc/flash_attn/src/flash_bwd_launch_template.h
index 404643788..a1568285f 100644
--- a/csrc/flash_attn/src/flash_bwd_launch_template.h
+++ b/csrc/flash_attn/src/flash_bwd_launch_template.h
@@ -321,159 +321,3 @@ void run_mha_bwd_hdim256(Flash_bwd_params &params, cudaStream_t stream) {
     });
 }
 
-template<typename T, bool Is_causal>
-void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params &params, cudaStream_t stream) {
-    constexpr static int QKHeaddim = 32;
-    constexpr static int VHeaddim = 64;
-    int device;
-    cudaGetDevice(&device);
-    int max_smem_per_block;
-    cudaError status_ = cudaDeviceGetAttribute(
-        &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
-    if (status_ != cudaSuccess) {
-      C10_CUDA_CHECK(status_);
-    }
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        constexpr static int Br = 128;
-        constexpr static int Bc = 128;
-        constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ +
-            Br * Bc * 2 /*dS, P*/);
-        // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB
-        if (max_smem_per_block >= 104 * 1024) {  // 104 KB
-            if constexpr(!Is_dropout) {  // We can afford more registers to keep V in registers
-                run_flash_bwd, Is_dropout, Is_causal>(params, stream);
-            } else {
-                run_flash_bwd, Is_dropout, Is_causal>(params, stream);
-            }
-        } else {  // 96 KB
-            run_flash_bwd, Is_dropout, Is_causal>(params, stream);
-        }
-    });
-}
-
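As an aside (not part of the patch), the `smem_size` expression in the launcher removed above can be evaluated on its own; a minimal sketch, assuming the outer factor of 2 is the fp16/bf16 element size in bytes:

```python
def bwd_smem_bytes(qk_headdim, v_headdim, br, bc):
    """Per-CTA shared-memory footprint, mirroring the smem_size formula in the removed launchers."""
    elems = (br * qk_headdim * 2   # Q, double-buffered
             + br * v_headdim      # dO
             + bc * qk_headdim     # K / dK
             + bc * v_headdim      # V / dV
             + br * bc * 2)        # dS and P
    return 2 * elems               # assumed 2 bytes per fp16/bf16 element


# The qkdim32/vdim64 launcher above uses Br = Bc = 128:
print(bwd_smem_bytes(32, 64, 128, 128) / 1024, "KB")  # -> 120.0 KB
```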
- } - // printf("max_smem_per_block = %d\n", max_smem_per_block); - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - // Changing AtomLayoutMdQ from 2 to 4 takes the same time - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // This is slightly faster. We want to split M more so we need fewer registers to store LSE. - constexpr static int Br = 64; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // printf("smem_size = %d\n", smem_size); - // printf("max_smem_per_block = %d\n", max_smem_per_block); - - if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // A100 shared memory spill - // run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // This has a lot of register spilling - // run_flash_bwd, Is_dropout>(params, stream); - } else { - // if (params.h == params.h_k) { - // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // } else { - // } - } - }); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - - // run_flash_bwd>(params, stream); -} - -template -void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 96; - constexpr static int VHeaddim = 192; - int device; - cudaGetDevice(&device); - int max_smem_per_block; - cudaError status_ = cudaDeviceGetAttribute( - &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); - if (status_ != cudaSuccess) { - C10_CUDA_CHECK(status_); - } - // printf("max_smem_per_block = %d\n", max_smem_per_block); - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 64; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - if (max_smem_per_block >= 116 * 1024) { - if constexpr(!Is_dropout) { // 92KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } else { // 116 KB - // This is faster for dropout since we don't have many registers to spare - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - }); -} - -template -void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 128; - constexpr static int VHeaddim = 256; - int device; - cudaGetDevice(&device); - int max_smem_per_block; - cudaError status_ = cudaDeviceGetAttribute( - &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); - if (status_ != cudaSuccess) { - C10_CUDA_CHECK(status_); - } - // printf("max_smem_per_block = %d\n", max_smem_per_block); - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 64; - constexpr static int Bc = 64; - constexpr static int 
smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // run_flash_bwd>(params, stream); - // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). - // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. - // run_flash_bwd>(params, stream); - if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // A100 shared memory spill - // run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); - // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - } else { - // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - // run_flash_bwd>(params, stream); - - // run_flash_bwd>(params, stream); - }); -} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu index 010fbd630..9834bfbe4 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu index 53e334b12..8bfa8623c 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu index 1bbccb862..35ce26dbe 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu index ba1916590..17521c9d2 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h new file mode 100644 index 000000000..8fd4acf28 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h @@ -0,0 +1,42 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 64; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // run_flash_bwd>(params, stream); + // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). + // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. + // run_flash_bwd>(params, stream); + if (max_smem_per_block >= 144 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // A100 shared memory spill + // run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + } else { + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_bwd>(params, stream); + + // run_flash_bwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu index 621e9f679..7023d4741 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu index a87d7b453..0f6371b41 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu index 0f8b1fec7..285bca814 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu index 6d2f207fc..8be40bb82 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h new file mode 100644 index 000000000..9ce14f6a5 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h @@ -0,0 +1,32 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 128; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB + if (max_smem_per_block >= 104 * 1024) { // 104 KB + if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + } else { // 96 KB + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu index 740f0baa8..9d18044d4 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu index 34df4e575..0ceb99220 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu index 5e9428a4f..543f16045 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu index b0912ed91..771708192 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h new file mode 100644 index 000000000..b09d032a5 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h @@ -0,0 +1,57 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + // Changing AtomLayoutMdQ from 2 to 4 takes the same time + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
+ constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // printf("smem_size = %d\n", smem_size); + // printf("max_smem_per_block = %d\n", max_smem_per_block); + + if (max_smem_per_block >= 144 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // A100 shared memory spill + // run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // This has a lot of register spilling + // run_flash_bwd, Is_dropout>(params, stream); + } else { + // if (params.h == params.h_k) { + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // } else { + // } + } + }); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + + // run_flash_bwd>(params, stream); +} + diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu index 17f479dc5..4bd2c82dc 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu index 229078332..7536e95ab 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu index a502004d5..487006b5a 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu index ebd73992f..9544f59ab 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h new file mode 100644 index 000000000..79ca59f86 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h @@ -0,0 +1,33 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + if (max_smem_per_block >= 116 * 1024) { + if constexpr(!Is_dropout) { // 92KB + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { // 116 KB + // This is faster for dropout since we don't have many registers to spare + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + diff --git a/csrc/flash_attn/src/flash_fwd_launch_template.h b/csrc/flash_attn/src/flash_fwd_launch_template.h index 6f51d423e..ba4c29d8b 100644 --- a/csrc/flash_attn/src/flash_fwd_launch_template.h +++ b/csrc/flash_attn/src/flash_fwd_launch_template.h @@ -327,95 +327,3 @@ void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { // run_flash_fwd, Is_dropout, Is_causal>(params, stream); }); } -template -void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 32; - constexpr static int VHeaddim = 64; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - }); -} - -template -void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 64; - constexpr static int VHeaddim = 128; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - if constexpr(!Is_dropout) { - // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower - // Using block size (64 x 256) is 27% slower for seqlen=2k - // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, 
stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - }); -} - -template -void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 96; - constexpr static int VHeaddim = 192; - auto dprops = at::cuda::getCurrentDeviceProperties(); - bool is_sm8x = dprops->major == 8 && dprops->minor > 0; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), - if (is_sm8x) { - if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // These two are always slower - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); - }); -} - -template -void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 128; - constexpr static int VHeaddim = 256; - auto dprops = at::cuda::getCurrentDeviceProperties(); - bool is_sm8x = dprops->major == 8 && dprops->minor > 0; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - if constexpr(!Is_dropout) { - // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), - // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. - if (is_sm8x) { - if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // 1st ones are good for H100, A100 - // 2nd one is good for A6000 bc we get slightly better occupancy - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // A100 RuntimeError: CUDA error: an illegal memory access was encountered - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - }); -} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu index 795ec67f1..b20271f2d 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu index e1048791c..464e0b283 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu index 582a95236..5af5648fa 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu index bfc09dc6b..62cb67ead 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h new file mode 100644 index 000000000..9bf899ea7 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h @@ -0,0 +1,39 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm8x = dprops->major == 8 && dprops->minor > 0; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), + // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. 
+ if (is_sm8x) { + if constexpr(!Is_causal) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // 1st ones are good for H100, A100 + // 2nd one is good for A6000 bc we get slightly better occupancy + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // A100 RuntimeError: CUDA error: an illegal memory access was encountered + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu index 3f80a1fe7..f28444255 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu index e3dba404d..0aa49a111 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu index 5677fcef4..b88785f29 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu index 36b511f06..28c42a9b8 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h new file mode 100644 index 000000000..4c4941471 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h @@ -0,0 +1,11 @@ + +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + }); +} \ No newline at end of file diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu index 2869ccfc2..762253d09 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu index d9d444fd1..86a7616fe 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu index 2504c540b..0074f41ca 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu index a5270a3ed..7578c123f 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h new file mode 100644 index 000000000..3ec8ee12d --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h @@ -0,0 +1,22 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower + // Using block size (64 x 256) is 27% slower for seqlen=2k + // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu index a307cb29e..a140b8d33 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu index 9f00dd249..ee39b3da2 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu index 27bb08d03..8943b8922 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu index 7843c337c..ce4b051a3 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h new file mode 100644 index 000000000..bd106822d --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h @@ -0,0 +1,26 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm8x = dprops->major == 8 && dprops->minor > 0; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), + if (is_sm8x) { + if constexpr(!Is_causal) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // These two are always slower + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/static_switch.h b/csrc/flash_attn/src/static_switch.h index 8e663d8c3..3cb31f9c6 100644 --- a/csrc/flash_attn/src/static_switch.h +++ b/csrc/flash_attn/src/static_switch.h @@ -113,52 +113,3 @@ } \ }() -#define QKHEADDIM_VHEADDIM_SWITCH(QKHEADDIM, VHEADDIM, ...) 
\ - [&] { \ - if (QKHEADDIM <= 32 && VHEADDIM <= 32) { \ - constexpr static int kQKHeadDim = 32; \ - constexpr static int kVHeadDim = 32; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 32 && VHEADDIM <= 64) { \ - constexpr static int kQKHeadDim = 32; \ - constexpr static int kVHeadDim = 64; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 64 && VHEADDIM <= 64) { \ - constexpr static int kQKHeadDim = 64; \ - constexpr static int kVHeadDim = 64; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 64 && VHEADDIM <= 128) { \ - constexpr static int kQKHeadDim = 64; \ - constexpr static int kVHeadDim = 128; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 96 && VHEADDIM <= 96) { \ - constexpr static int kQKHeadDim = 96; \ - constexpr static int kVHeadDim = 96; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 96 && VHEADDIM <= 192) { \ - constexpr static int kQKHeadDim = 96; \ - constexpr static int kVHeadDim = 192; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 128 && VHEADDIM <= 128) { \ - constexpr static int kQKHeadDim = 128; \ - constexpr static int kVHeadDim = 128; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 128 && VHEADDIM <= 256) { \ - constexpr static int kQKHeadDim = 128; \ - constexpr static int kVHeadDim = 256; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 160 && VHEADDIM <= 160) { \ - constexpr static int kQKHeadDim = 160; \ - constexpr static int kVHeadDim = 160; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 192 && VHEADDIM <= 192) { \ - constexpr static int kQKHeadDim = 192; \ - constexpr static int kVHeadDim = 192; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 256 && VHEADDIM <= 256) { \ - constexpr static int kQKHeadDim = 256; \ - constexpr static int kVHeadDim = 256; \ - return __VA_ARGS__(); \ - } \ - }() - diff --git a/csrc/flash_attn/src/static_switch_headdim.h b/csrc/flash_attn/src/static_switch_headdim.h new file mode 100644 index 000000000..e1c18aff6 --- /dev/null +++ b/csrc/flash_attn/src/static_switch_headdim.h @@ -0,0 +1,65 @@ +// Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` + +#define QKHEADDIM_VHEADDIM_SWITCH(QKHEADDIM, VHEADDIM, ...) 
\ + [&] { \ + if (QKHEADDIM <= 32 && VHEADDIM <= 32) { \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 32; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 32 && VHEADDIM <= 64) { \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 64 && VHEADDIM <= 64) { \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 64 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 96 && VHEADDIM <= 96) { \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 96; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 96 && VHEADDIM <= 192) { \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 128 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 128 && VHEADDIM <= 256) { \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 256; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 160 && VHEADDIM <= 160) { \ + constexpr static int kQKHeadDim = 160; \ + constexpr static int kVHeadDim = 160; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 192 && VHEADDIM <= 192) { \ + constexpr static int kQKHeadDim = 192; \ + constexpr static int kVHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 256 && VHEADDIM <= 256) { \ + constexpr static int kQKHeadDim = 256; \ + constexpr static int kVHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/setup.py b/setup.py index f80af1c32..39093359a 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,31 @@ # For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE" +list_headdim = [] +compile_list_headdim = [] +if not SKIP_CUDA_BUILD and not IS_ROCM: + list_headdim = [ + (32, 64), + (64, 128), + (96, 192), + (128, 256) + ] + # "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu" + for ii in ["fwd", "bwd"]: + for jj in list_headdim: + for kk in ["fp16", "bf16"]: + for ll in ["", "_causal"]: + compile_list_headdim.append( + f"csrc/flash_attn/src/flash_{ii}_qkdim{jj[0]}_vdim{jj[1]}_{kk}{ll}_sm80.cu" + ) + + # "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu" + for jj in list_headdim: + for kk in ["fp16", "bf16"]: + for ll in ["", "_causal"]: + compile_list_headdim.append( + f"csrc/flash_attn/src/flash_fwd_split_qkdim{jj[0]}_vdim{jj[1]}_{kk}{ll}_sm80.cu" + ) def get_platform(): """ @@ -265,55 +290,7 @@ def validate_and_update_archs(archs): "csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu", "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu", - 
"csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu", - ], + ] + compile_list_headdim, extra_compile_args={ "cxx": ["-O3", "-std=c++17"] + generator_flag, "nvcc": append_nvcc_threads( From 0607e6cee3ef6c2c0bb014007ee03d99958f387b Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Thu, 22 Aug 2024 20:29:42 +0800 Subject: [PATCH 17/46] update (128,256) config --- csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h index 9bf899ea7..3900d1fd7 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h +++ 
b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h @@ -17,7 +17,9 @@ void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) run_flash_fwd, Is_dropout, Is_causal>(params, stream); } } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // slow on A100 + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); } // run_flash_fwd, Is_dropout, Is_causal>(params, stream); // run_flash_fwd, Is_dropout, Is_causal>(params, stream); From fd6fc295be5b836d9783b723425384f9c776be46 Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Mon, 26 Aug 2024 23:43:42 +0800 Subject: [PATCH 18/46] add (192,128) --- ...h_fwd_qkdim192_vdim128_bf16_causal_sm80.cu | 10 ++++++++++ .../flash_fwd_qkdim192_vdim128_bf16_sm80.cu | 10 ++++++++++ ...h_fwd_qkdim192_vdim128_fp16_causal_sm80.cu | 10 ++++++++++ .../flash_fwd_qkdim192_vdim128_fp16_sm80.cu | 10 ++++++++++ .../src/flash_fwd_qkdim192_vdim128_sm80.h | 19 +++++++++++++++++++ csrc/flash_attn/src/static_switch_headdim.h | 4 ++++ setup.py | 3 ++- 7 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..9eb4e066e --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..a8b7b2ad0 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..2889d1829 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu new file mode 100644 index 000000000..dee0fccd2 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim128_vdim256(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h new file mode 100644 index 000000000..8d259d6ca --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h @@ -0,0 +1,19 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim192_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 192; + constexpr static int VHeaddim = 128; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/static_switch_headdim.h b/csrc/flash_attn/src/static_switch_headdim.h index e1c18aff6..8d2e97b3e 100644 --- a/csrc/flash_attn/src/static_switch_headdim.h +++ b/csrc/flash_attn/src/static_switch_headdim.h @@ -53,6 +53,10 @@ constexpr static int kQKHeadDim = 160; \ constexpr static int kVHeadDim = 160; \ return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 192 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 192; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ } else if (QKHEADDIM <= 192 && VHEADDIM <= 192) { \ constexpr static int kQKHeadDim = 192; \ constexpr static int kVHeadDim = 192; \ diff --git a/setup.py b/setup.py index 39093359a..7f6f30d4f 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,8 @@ (32, 64), (64, 128), (96, 192), - (128, 256) + (128, 256), + (192, 128) ] # "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu" for ii in ["fwd", "bwd"]: From b6d7493d9e67b80203b6947c9350042bec2c708d Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 27 Aug 2024 07:22:44 +0800 Subject: [PATCH 19/46] add config (192,128) --- ...h_bwd_qkdim192_vdim128_bf16_causal_sm80.cu | 10 +++++++++ .../flash_bwd_qkdim192_vdim128_bf16_sm80.cu | 10 +++++++++ ...h_bwd_qkdim192_vdim128_fp16_causal_sm80.cu | 10 +++++++++ .../flash_bwd_qkdim192_vdim128_fp16_sm80.cu | 10 +++++++++ .../src/flash_bwd_qkdim192_vdim128_sm80.h | 22 +++++++++++++++++++ ...h_fwd_qkdim192_vdim128_bf16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim192_vdim128_bf16_sm80.cu | 2 +- ...h_fwd_qkdim192_vdim128_fp16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim192_vdim128_fp16_sm80.cu | 2 +- ...split_qkdim192_vdim128_bf16_causal_sm80.cu | 7 ++++++ ...sh_fwd_split_qkdim192_vdim128_bf16_sm80.cu | 7 ++++++ 
...split_qkdim192_vdim128_fp16_causal_sm80.cu | 7 ++++++ ...sh_fwd_split_qkdim192_vdim128_fp16_sm80.cu | 7 ++++++ 13 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..222fa2cb0 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..61e47b53c --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..0b5f0c766 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu new file mode 100644 index 000000000..3fe80fd91 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h new file mode 100644 index 000000000..624e55f2e --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h @@ -0,0 +1,22 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim192_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 192; + constexpr static int VHeaddim = 128; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if (max_smem_per_block >= 136 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} \ No newline at end of file diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu index 9eb4e066e..52dcca482 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu @@ -6,5 +6,5 @@ template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_qkdim128_vdim256(params, stream); + run_mha_fwd_qkdim192_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu index a8b7b2ad0..cfe937021 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu @@ -6,5 +6,5 @@ template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_qkdim128_vdim256(params, stream); + run_mha_fwd_qkdim192_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu index 2889d1829..82db5ae67 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu @@ -6,5 +6,5 @@ template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_qkdim128_vdim256(params, stream); + run_mha_fwd_qkdim192_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu index dee0fccd2..2c5d5c7e9 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu @@ -6,5 +6,5 @@ template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { - run_mha_fwd_qkdim128_vdim256(params, stream); + run_mha_fwd_qkdim192_vdim128(params, stream); } diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..db38107b7 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..62cdffd8a --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..566dbf250 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu new file mode 100644 index 000000000..9f3023f8f --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); From 85fb8d25fb21c88d3afde2a6105c69faeed77f7c Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 27 Aug 2024 07:30:08 +0800 Subject: [PATCH 20/46] fix bug --- csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h index 624e55f2e..71e550db4 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h @@ -16,7 +16,7 @@ void run_mha_bwd_qkdim192_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) if (max_smem_per_block >= 136 * 1024) { run_flash_bwd, Is_dropout, Is_causal>(params, stream); } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); } }); } \ No newline at end of file From f0644c268ace18f98d0136d879241692546c39ef Mon Sep 17 00:00:00 2001 From: feiychen <2394209769@qq.com> Date: Tue, 27 Aug 2024 00:57:26 +0000 Subject: [PATCH 21/46] fix bug backward --- flash_attn/flash_attn_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flash_attn/flash_attn_interface.py b/flash_attn/flash_attn_interface.py index ecb3515c0..9ce12cf9a 100644 --- a/flash_attn/flash_attn_interface.py +++ b/flash_attn/flash_attn_interface.py @@ -588,8 +588,8 @@ def backward(ctx, dout, *args): ctx.deterministic, rng_state=rng_state, ) - dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension - dk = dk[..., : dout.shape[-1]] + dq = dq[..., : q.shape[-1]] # We could have padded the head dimension + dk = dk[..., : k.shape[-1]] dv = dv[..., : dout.shape[-1]] return dq, dk, dv, None, None, None, None, None, None, None, None From 00922858e5782b7011531ad42592876c80e0949d Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 27 Aug 2024 09:02:22 +0800 Subject: [PATCH 22/46] fix bug --- flash_attn/flash_attn_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flash_attn/flash_attn_interface.py b/flash_attn/flash_attn_interface.py index 9ce12cf9a..d9c05a026 100644 --- a/flash_attn/flash_attn_interface.py +++ b/flash_attn/flash_attn_interface.py @@ -675,8 +675,8 @@ def backward(ctx, dout, *args): ctx.deterministic, rng_state=rng_state, ) - dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension - dk = dk[..., : dout.shape[-1]] + dq = dq[..., : q.shape[-1]] # We could have padded the head dimension + dk = dk[..., : k.shape[-1]] dv = dv[..., : dout.shape[-1]] return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None From 6e88a4da6ee14540b32e864b78d82869c86915bc Mon Sep 17 00:00:00 2001 From: FeiyangChen <92138383+smallscientist1@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:19:45 +0800 Subject: [PATCH 23/46] Add support for dim(192,128) (#1) * create bench headdim * update bench result * update Readme * reorg code to reduce compile time * update (128,256) config * add (192,128) * add config (192,128) * fix bug * fix bug backward * fix bug --- README.md | 14 ++ .../Customflash2_a100_fwd_bwd_benchmark.png | Bin 0 -> 183246 bytes benchmarks/benchmark_headdim.py | 196 ++++++++++++++++++ csrc/flash_attn/flash_api.cpp | 1 + .../src/flash_bwd_launch_template.h | 156 -------------- ...h_bwd_qkdim128_vdim256_bf16_causal_sm80.cu | 2 +- 
.../flash_bwd_qkdim128_vdim256_bf16_sm80.cu | 2 +- ...h_bwd_qkdim128_vdim256_fp16_causal_sm80.cu | 2 +- .../flash_bwd_qkdim128_vdim256_fp16_sm80.cu | 2 +- .../src/flash_bwd_qkdim128_vdim256_sm80.h | 42 ++++ ...h_bwd_qkdim192_vdim128_bf16_causal_sm80.cu | 10 + .../flash_bwd_qkdim192_vdim128_bf16_sm80.cu | 10 + ...h_bwd_qkdim192_vdim128_fp16_causal_sm80.cu | 10 + .../flash_bwd_qkdim192_vdim128_fp16_sm80.cu | 10 + .../src/flash_bwd_qkdim192_vdim128_sm80.h | 22 ++ ...ash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu | 2 +- .../src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu | 2 +- ...ash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu | 2 +- .../src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu | 2 +- .../src/flash_bwd_qkdim32_vdim64_sm80.h | 32 +++ ...sh_bwd_qkdim64_vdim128_bf16_causal_sm80.cu | 2 +- .../flash_bwd_qkdim64_vdim128_bf16_sm80.cu | 2 +- ...sh_bwd_qkdim64_vdim128_fp16_causal_sm80.cu | 2 +- .../flash_bwd_qkdim64_vdim128_fp16_sm80.cu | 2 +- .../src/flash_bwd_qkdim64_vdim128_sm80.h | 57 +++++ ...sh_bwd_qkdim96_vdim192_bf16_causal_sm80.cu | 2 +- .../flash_bwd_qkdim96_vdim192_bf16_sm80.cu | 2 +- ...sh_bwd_qkdim96_vdim192_fp16_causal_sm80.cu | 2 +- .../flash_bwd_qkdim96_vdim192_fp16_sm80.cu | 2 +- .../src/flash_bwd_qkdim96_vdim192_sm80.h | 33 +++ .../src/flash_fwd_launch_template.h | 92 -------- ...h_fwd_qkdim128_vdim256_bf16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim128_vdim256_bf16_sm80.cu | 2 +- ...h_fwd_qkdim128_vdim256_fp16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim128_vdim256_fp16_sm80.cu | 2 +- .../src/flash_fwd_qkdim128_vdim256_sm80.h | 41 ++++ ...h_fwd_qkdim192_vdim128_bf16_causal_sm80.cu | 10 + .../flash_fwd_qkdim192_vdim128_bf16_sm80.cu | 10 + ...h_fwd_qkdim192_vdim128_fp16_causal_sm80.cu | 10 + .../flash_fwd_qkdim192_vdim128_fp16_sm80.cu | 10 + .../src/flash_fwd_qkdim192_vdim128_sm80.h | 19 ++ ...ash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu | 2 +- .../src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu | 2 +- ...ash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu | 2 +- .../src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu | 2 +- .../src/flash_fwd_qkdim32_vdim64_sm80.h | 11 + ...sh_fwd_qkdim64_vdim128_bf16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim64_vdim128_bf16_sm80.cu | 2 +- ...sh_fwd_qkdim64_vdim128_fp16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim64_vdim128_fp16_sm80.cu | 2 +- .../src/flash_fwd_qkdim64_vdim128_sm80.h | 22 ++ ...sh_fwd_qkdim96_vdim192_bf16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim96_vdim192_bf16_sm80.cu | 2 +- ...sh_fwd_qkdim96_vdim192_fp16_causal_sm80.cu | 2 +- .../flash_fwd_qkdim96_vdim192_fp16_sm80.cu | 2 +- .../src/flash_fwd_qkdim96_vdim192_sm80.h | 26 +++ ...split_qkdim192_vdim128_bf16_causal_sm80.cu | 7 + ...sh_fwd_split_qkdim192_vdim128_bf16_sm80.cu | 7 + ...split_qkdim192_vdim128_fp16_causal_sm80.cu | 7 + ...sh_fwd_split_qkdim192_vdim128_fp16_sm80.cu | 7 + csrc/flash_attn/src/static_switch.h | 49 ----- csrc/flash_attn/src/static_switch_headdim.h | 69 ++++++ flash_attn/flash_attn_interface.py | 8 +- setup.py | 76 +++---- 64 files changed, 756 insertions(+), 382 deletions(-) create mode 100644 assets/Customflash2_a100_fwd_bwd_benchmark.png create mode 100644 benchmarks/benchmark_headdim.py create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu create mode 100644 
csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h create mode 100644 csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu create mode 100644 csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu create mode 100644 csrc/flash_attn/src/static_switch_headdim.h diff --git a/README.md b/README.md index 3e2e066cf..0db9b5f9b 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,20 @@ This repository provides the official implementation of FlashAttention and FlashAttention-2 from the following papers. +## Performance of Customized FlashAttention + +We test the performance speedup compare to padding qk&v hidden_dim on A100. + +We display CustomFlashAttention speedup using these parameters: + +- (qk dim, v_dim): (32,64), (64,128), (128,256); qk hidden dimension 2048 (i.e. 64, 32 or 16 heads). +- Sequence length 512, 1k, 2k, 4k, 8k, 16k. +- Batch size set to 16k / seqlen. + +### Speedup +![Custom-flash-attn](assets/Customflash2_a100_fwd_bwd_benchmark.png) + + **FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness** Tri Dao, Daniel Y. 
Fu, Stefano Ermon, Atri Rudra, Christopher Ré Paper: https://arxiv.org/abs/2205.14135 diff --git a/assets/Customflash2_a100_fwd_bwd_benchmark.png b/assets/Customflash2_a100_fwd_bwd_benchmark.png new file mode 100644 index 0000000000000000000000000000000000000000..281f52420cd4e79962238d1e736f692a5c4175f2 GIT binary patch literal 183246 [base85-encoded binary image data omitted: assets/Customflash2_a100_fwd_bwd_benchmark.png, the A100 forward/backward benchmark figure referenced in the README section above]
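
The README section added above describes the benchmark setup only in prose: the (qk dim, v dim) pairs, a 2048-wide qk hidden dimension, sequence lengths from 512 to 16k, and a batch size chosen as 16k / seqlen. Below is a minimal Python sketch of that configuration grid; it is illustrative only, does not reproduce benchmarks/benchmark_headdim.py, and all function and constant names in it are hypothetical.

```python
# Illustrative sketch of the benchmark grid described in the README section above.
# Not the actual benchmarks/benchmark_headdim.py; names are hypothetical.
HEAD_DIM_PAIRS = [(32, 64), (64, 128), (128, 256)]   # (qk head dim, v head dim)
QK_HIDDEN_DIM = 2048                                  # i.e. 64, 32 or 16 heads
SEQLENS = [512, 1024, 2048, 4096, 8192, 16384]
TOKEN_BUDGET = 16 * 1024                              # batch size = 16k / seqlen

def benchmark_configs():
    """Yield one config dict per (head-dim pair, sequence length) combination."""
    for qk_dim, v_dim in HEAD_DIM_PAIRS:
        num_heads = QK_HIDDEN_DIM // qk_dim
        for seqlen in SEQLENS:
            yield {
                "batch_size": TOKEN_BUDGET // seqlen,
                "seqlen": seqlen,
                "num_heads": num_heads,
                "qk_head_dim": qk_dim,
                "v_head_dim": v_dim,
            }

if __name__ == "__main__":
    for cfg in benchmark_configs():
        print(cfg)
```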
z0i>Z2rvELGAPE&K37PfFPo;Q_6)VS*jL-J}PD14#z(OHG?EmlDe{s^spsDERxEyu= z_lD4U&rO4W6Te@pZu6iqc7nxD%4|L=iy(`w1y{<(K<{as4`$|UFQ|0aNcO)>OB zoBQ#@e5P`F3F*({%=)Mw|hkJe^tYO49ZIYiwoiHwpHTz@8{tGOdOXqlvx=edw;2o zI*$@-e}6~RAYs>WwNufU#W}g<_Tg3GRbRok#>WdHS$0HIrFTsqV*8#>YD}*5MX^~x zo1UI)aydtpq_Ecf=nn$;nB?T4#Z&wMb9Pm=Cx4;2@jmG$?RefCHwdYnKdB|TUqUXT zV{XU2n{H_S@%q=rfLO-rCcU#&S`$%!M$Z#GwWCuS2sd ztwI(oKOjsv?d&v{fm19%N-ZL1iAp9tzLL9}jOJ_yU3r}-a=W{$*)Bd3nF~u}>)6M` zZ7a72YP<*Dm-A=qY1$dw)$MgRw}-*}w5}~yeuw@mi^bSM%RP#JkcIa*4wv7wgLHqL zcb=!?P#C?_5L@<2dpS8Owqj-uTa7$Ufd`ampcNPU^(+ph+4I`VAJH;T-+5duW(D8w zT5Lqox|Z)-*rmU?@}Emyzg}FqGdO77tui?q=B{3P5sf-I48CtmdauIZS5o<9GY1FN zyxek1xxUymcQ6SfUISfQcRk1xzAKqy82ox!lAx-i40K*B4p`}9SSfZMRGD`#n1}jL zy_fQ15jj`7)$x7%$genoGgmrw<#>T{FhaA|f?>mVHmYV-JNQbkgQ9)k1k~U%hw}wO_RYPFM-}AeA=aSoF~DxW4OlA3Bon`^)&( z<(O}VUFJ(4xqaKkJ`!28pc^Ek;q|OV@a>KsuKqR#N$36DHKBR!pP%ZPew*12b?R-` z2W=z8a2b;fWBY8O!}b#`AMu-gky$f6`vSksx&ay|4N&lPKbbc9*|_ZejUDPMWufyT z-kW!|mHPt>#8M+X^V{ZOI|0@1T2%GWpp)E@++Tl6C8`qYGtJ+8 zL|l%AQ`={~j~5|FZY&i>-t61mQ8GptJhIX~Nc(L)f4sdqodEP0?8oNl;Du?#sg`T{j#2gjTQj z=4xN8=Ts`(`vW=L5utbncwf1KFZvkypuyLR6(_$xnk5PTl*?DD=dpVN$G#ah?9^3i zZk#o5lc*`LzWjz#qchj`$%C!MZlDGk`B(1Fn%iQ$>l4&2J5Og!4Y9WC%dXfzwA=q; zC)PQWy+gr0`avpFXNPG09Np8qxkjtzwFg>;2B|+ABA3y^3HY*I6#ry?&SQj^TlDWh zhJTk8nh)LwUGE!GpjX2i9@HhC?1lWY%&eJbx+1Z7$#?OsJnxnIaT(JfGw`A-ZoP@j$&o7s@WDI4)=P=g4Qd;n!bt>?30v@CvOQdk} zQ9JNcswO0wGvIO~(@?i*#>6R|NObO962X=L=JkS8_2Z?<1|Br)DzAH>Hb)5zQag@w^78&~&FrA-;}3KaYPXl1!g1 z`>q5wE+4zrePd7Wuic0s)8dThYkf8ESe8G$S3hn}@SC@JNjJf9?aXJ<#n46Ke93df z;_f(eVMcs2D_lJLUnWK=23FNB+W1hJw}DH40(;F_MNd0{Elwv3?;5UTdMEU>rC;1K zFu(r#$Z+qNKIwe$F#7P^G;VxO<@*=Z0Iic{n&sg6G2=~Nsyd(XNi*XlPg$Tvz0BQi zp63|X{dw@cq0If|xFMrr`D@uymY~W?R_Axaq*leEn=*#2GF0;XU z88)o7M0U>h)6Y;|T9*c$4iY!Iq}aaIjGAuD*ql)rF&~u3Mhm>VLv+!({t0B|&F<%P z#k(EDu21JWxzJ8O<^abo_bC6nO|h#qNc*?S{M&evFkCjy4L?!c7>2*MLl)@`LX# zrzcu|-(3vKnisRG(0l7YVg!S=3WijSF8-9szGRJ|OChY4{q|Lq4v5!7t(-c$;7l}E_1DaJ=m1TwKdgHY|K!!ZnW`n z{5&RUp3<+6!h4?KRK+xKfuZSJHSh5y64r%Egz67=j7jYN;Y7H{2`Jkt)wQbkCk-dUdwW6nlF6-Gj(ZvL5aJ-Q*HC1TYeFHaMRbXsD0>A zuWe(N7An3dw&vIEson<656!GgfE zc@g!wwP@t3(^4PKx2D8bZ@Mw@raIXkv@s_yu`&I`r>T1kH!i_vQc%HIL@&YzsgDa= zlgavH-l@S9lM_H<8Y~$pkdN9jm+MdVsL3D#NK=`T6b!D{d|Zoj3ILFpx1<9|W_=`) zH0ekXXl5V$#BXnrmpz@*tEkzuw+MMw6DjMqOo>=+#z}UA z1SO5H>fPOJ$x_M-U!y{d9r7D-4WM1OXFG5}KR^t}^im<{Il~4ge6IH#f{4D~EWYrh z>-HI3!%e-3py-UD51_xkpz|5x!1%0j#yRfHG`LD?PydAEp4sCo^zsNZjCfV4t$}MN zI$vtoiZZG6wl)#$h|3VPoE8!tpDV7qx9#Xde}8vUc;8-G)BC{gdzN4$gJ?OSy>nD? 
z(@EoA$Roz8+l5bo&g6m!=$-7r>Tp)Hn7^iK%X+H%>kfznw@bsV% zvo))O{05R>))wmPi_}M|fVvvJ{4p_MUixvZcJllF&&0AfM{(F%k$pX?OcWg6c_>zX z+WZOuT}pR2@1B^?oYnUTsxtFfxmV8MS(Nr%-;y6dmhAD=>H?w@@)9Cu25in4@5kZh zN|HJr8dgUsNx?%LH0T5WOuXo~Ap}iO8B`11*hBLstBup|$wKc~(Kh{k?Qk9|IcCzr zsEEopYR@DVmW-l=Qln*V)02Q0>bccCMF;c*mtn?hi7SC{1Ax4 z8+{CY!O<)43AU$j-v&qvAA^P0A%1jN@+rdW0?FLOS(EGdF%a{{g_=1e#|rd3i-4ic zvX5FvP8QcDl#V%Nk5UZ8wp6C%-+`z1n^!uO&BdowYv7O@E0Ww?zap1o_qPycml@g8 z^LXdqBrT*blAp}Qh1KGi#{eEC3j{WxcjnociaR7Hh%6!!T*7NV%)Ch_Mq11>;e6YG zHpOOS6adLjdlQd=Os8($_${K_@Ni;@@&`><^F|>~G4>197JtM%iliWsL&Bv>d}yIW zKLel*zU-WF9^SdvtUCB9FL3GZ(0f@kL&aT|<|m$g&xdO(j}%xmdx?>q%|dm)APH*=x$4-_uLYEs8@#S-Pdvun|+$uC}~i*S*KO$Hae|y-K?8W|t-V=gAM3607+HEUJEPH6bGAg^`RGNawcWOmYFvfC_DqffbD0fE%GkKusVOW?+S>-%8Vf=$B zfbOGg;6{uU*7KXzh!Y?tK6O9m85wCqkB0jqp#DPF5A{R_&iDWYY;J%Ly$c);Av6!a zcI048DC_NehI^L|jmEZcl{~00Uy5JppJo9YCA!=aGVY7XggB5n9_6TN%9cyrL<0d% zcDeb7R5uY>0EflVGgge+*zqX-TEZEC+~w(GkW%m(8CGzFas~Fq>wpK0`q8bY;;!Wu zr&ckFS9s^LXLB|c_ydK5NOL+oLMg0)1I8Szg6@x^O7}+8-9A#7=L9{^yToTI18+l} z$}LQBc++D?kPTNH;4*(A*y;u>S`^_ZAWWgyou;y4^74HU6(SNGYso27PBeh0@hASS zkK|OAHo>zUH@g2k4|_PAl{nV0R?~c6fROYc0wN_&N^H+lKJsPTJj!y5?m1&nBqJ&1 z>1Ubt-=(=bt5?<<@rq8e*K=y$^`nFmW5Jo0TQa-BcU!?lJ8ypVgqFV=_$hJ*Nfqg4;&9x|y}-!hYXD&A6EmQo=Lr4HspyTVyDz0T`{N6n~S3Bmgx>`g4{R}^1<*b|^wEX8o7 z&crG66pQX-Sn9%WVihD-@RNpevx#C-X#qYeq#{Z`lik-J^+2BHV=wUOR#qcQTvPnF z=->17d(S;lF=wN1auxU!f3iFmL5m_Hh(C$RR~2qgB!R@a982)Cu_bOlP!Ho~*~9&6 zG+{W9gQ_&|QZfsi%{&Itp9&un^#PA8?GoC2sPslrjC23t#Op?|AX>-IL8fk$L3>vj z@7T(>TK6{dYNFCs6>h4S$O~N!z$xJZ5t6_;H~x*RiT**|CV~>n&QuMCiUWb8MXE}hyHC$!& zBMi$OIEzHOnRHs@UdU4chmOR|48AmRldPjFbI#<2s@PhQ@s;yP zQ8o8JM)Bkc1_4WkRB5;5Gv$m=`w|D)EfOG`YItLssbU`Z-$k5Myof>Hng(>%fpvU&3VU(a(@i;Ch%AxWCTJ{{L0(WkY~Tf~CejK3*uz8X!5jj;w8 z>aq*ccSZTRsen00TC5oh;^A$ELdvK0LipBTs>WK%_v zLkELL;FctpWo*#zrQDjqFY9^T)zdCFXd5_@$cWkjT#j-~_`;~)?DGXWwKbLtJW5cb zTNba-*s?5zSb6|eF`*vMM{JZIV$n6sE8+un*A#eDkqYnzH(S0|2%lCy8LLN2i-#0n zgDU0V{@Ok|$~^uB4pnLe+bGa`461!m^*m#-?5BZ*V9RaqDT2WoNr`FfGfcoda`^~E zSc98U1UZ{riPkdd)*jWFM-7B(oxojgJ;v+U!amx5=pkt8qa*=>t6&pn&t|Ij|x`pgv ziXDua_KYkP?5B`a%QU>H9dt?>y*^8_!!iXtUMHH+2y9~gF~CfFDpC|xLS>QU-6yBY zN>0r&K{8P3L!%e*7>jbvc`t5<)!F&6*Khvyz973E4U0(LBPE99AcY#^$N}uI11SOX z6b0MoFZ)f_OC&MJFP@@m!s);Fv!5q2BKxSBxEF($-7+=)^c;m^poG=!?`DZqIl zcF^lUweASU-1K7T_a)D?aA#WXAsUZz=Rq3i02oA?G6f$KY3XJqH3O-5Ge=%6`27a2 zn@Ou?iTPsY64kF6m-`T4S_p~*Y1lE9y5`VTKx1|mCg|#|O#dSY z@Q@GdhLJg{nynLocR|e;){IbqVKN|VK>(?ssANlJ;^4zp_~6m~4dC7-gE*+(&`DV) z_#LqfaWG5N(PbEP2U&RGcnb0cv|ey5UeYZ+;9sS@gjgcZSy6t>h+4z^Fx3_e zS~re0(iD~Xq)hm|I~_yc;iojClFp;ho07BR_!dA{v}dn^bW7379!JQPmfQnW9OGsM zD=G~vFc#a@Lr7vklA?IW4rN1dQCe7w6?B)8#<`Knm|nYk8XYf!(}V^Z?d1sYpE-+} z5|`_ zoD;886LZ{WNxN&64MuLR*x&Ea2x)zCAko<{(8wa*-+bS@6Af9(2J{gNF7% zCt*=_!M1&uvd5~PcR3f)k<>@8X;!oHgy}qBfPt>zYe#$_tN;}qt$xu~*MN5){^2CI zV<4e(;CjQeIupzMk7|(C6SS{g^NDqKFod}~wxvhK609(mUBCHVX$O1;JTkLwBRfi_ z9Gm}Ys^AmQT#_cdQG<`kH^nzR6o=1h6&?}|W8%-FlmtJe^gIP2-bNzC&DRrO^hRI&h2NNCCc~WW=E*x)3!5xfosi5BLR5o-zFb04-_2<1iuzNP8;&^9&KUb-F@r zEgk7j&u#kxW?BHXmhOB;SRE#XCd+{N82L9j+t?C(kMD270)>yCb+dAQgnKGKfvnxw zv+72MxaUDQZKJzi&5+?O^2GNk69b=yF>QS}xW%Mpp~2f}R;6FnWn?l^tr1j6Q2d^P zZB2|x@`a_y;|^b1NI9?eC4eFH!Nfx~Iej2cKe3<#+0CCXk4K%iwx@T&S#d+8Ut{{5 zm2uzF8cE{%khX)wL3S=4itphB>1g-Fkb0hS(ma+PLFbX&`rB&y0Z!*cl9HxCT!x?4 z86h$rX{*h-^hGQ}rLGYCO#lzlJh(qK8#t3laI3N{ECaG&d8`mw6SppTzvRZ0FvL<< zTD_drO{t}t*CamJFnGaomo1MdeYh&3NG6ktWch|1(DF~}Z{P{$}vY=9i0#+iQe317b%aK?xRps3VDuFPED(m)=rX!E4nQIC+>P@zk)J*4zNI zAB}|;tMSX<>%R(yMFOPmC^%L}-05W!yBr@Vz##arVKf^{5WH)(YfizeXT$Bq;Nq5; zs}g?C?nQmpzS7OA&86iXx$+H%g%*apvzt~>(vRqCl*?W}?@?}PNSW5%QsrY71|8C} 
zfvqDosbcD(uz2cYfJy*}-%=Zp-AEYSnD`@Xkc&)vfblICYB1n)6UfX>VhQuCr7I{6 z(XHI%g!fC6^QUV--rOr?={v+l*0L&9pP^M4-ooHYA6#?vuGjDA;W_RAhIyp6B!eh3 z0`zGv%o9PrPz`4@BROQdTGbff9oy9o3d?aXt^^)A&lUiW+}ssdm&uTksG1 z8PX`+80ez*Be(;8Wm&?TfO~EdQr-&c4SGqnulzovely53wq@!>dEzy{oeTBd@!z6! zwfh-bmrj5L^Dirc;ZwyHe}ca^O!?f}@;q7TNb^+AZS1Kv_}rpGy(lrrDt?&Xx3H&8 zjaAXo<;id_qLqgnP7qAgZgmLD_W@D5J0fX(eQpId-E7V!t7sRZc0nAcynMq!TNv&C z!)Qlf5V+)#JIHl=x^I=gd<7Ekr7T-7UDn_do1??TQ>$@*zx6OiJTFNI%y|u-^}&P# zSwDSz2vT3iG)C-+6Ic@u^3os`w2pwMYXRbscMo9r*eX3KaZH|5bWA;3+5M8i%n0|V zEYF5=0-S9rLzCyiRGC5njAI3?if`~~=*S)L@z!Iq2E?;-NXUrM(E%}>_Dwk*-1lk? zpUR{FQYG^o?F6<(aDg*3QR0lFZ(ms2f=p@R>4rwAaS50`deUoHhpV9Jku5Prq`oN> z5f~_Cx9JfOHzBoBU5s}Ohr3~+jVwmUnzji8KbW;}EuxU7 zDX&Cn(uRO6QI#}Z3t?|t-V2B#*37~_N=59(=(3nyebdmiqX{4T(w)^3JEgj#SdgM1 z%$&e`>CF0Cf67-yrYgW+pNwS<6 zWGqtSUx87@_z))<280AZLYdPo&?nqQvKC(!xpDzwiE@(e3~&3&1=8z9kVrPK&YM`SS!hpAR^x9sml^~W2gN?|L!7Y@F z!3%_qfdx?Z!dhOmz78-0*f^37N^t2PLTd_@l2H#%Ben|S&@HPpWOmP(Di@`+7Z+mg z*>%_wGH=xXR3U4n<$(-oSj^#$J;(c4OWO>Y?*uLpltG4VN1C0C&3)DMyD}JM;pvDR zq&pR|8_JT`baHt?D^G5?xtoi{P!-wM22ajBih%;HW6M%50?#R0VliCi6gcgh{eg6N zXM3|yXR65@qS8aQn~Wc^tm-YrZxUVFshV1%Zy>B;G_~^@_rN2>tM*TmV0MN+`M{tG zY)bEfZnZ8_BG#7Anr=*H(Q`w@MqgHhg7Rp=NvJicaC$zi5JLAC3^&w#;4~SQnZ-?CsG>2k43Idu6^Zj;lcB zz|YtQrYvFtg}n<+cJGY<_L75A%9XZjh(0odK`HnF5O6QnRyd3~B`@xAl4|ZyI(`P{ zV-`X@1w2qZQ)zx)sj5dqb6Zc*!~GYTXFi!=w+VRw?!*tiQ@4VCtu7G7MXes-As0K3 zBWr3eyEP+bbWFt36G|&VGLq~~sT87;Ss0T?9mz!P%a;OAi4i6Hc~zaP54Z{gAv)tU z5UllCMhMSZUIY)V-TEv8dOQA(+mAbu#thm&za zj6IpJmnS#T^OQv~qDu9+-gy2he{^7{g~G}^s5_Y+zX%ml!c;LtbPZxE8PmkVa!#Nz z>{km^?tg)oifj*!7s{t}v&v+H3npE#1P$<}?nx>2PTA2>84kOJ7ef;m*C_=fMIPe2 zUtyt?cYr=cUp%Mg1Z1X3Bu3_ z+c%W8GUyU7w2^#xnqHTv82I~*4Nq^a8fS$SI==c31drtp5R<~u`Atdovg@apZ7*lt z#V~bdy)l{Ula9!XLUzRj<49J;ZU(3?v=#|Ou*9o=w@@++R3FWUSw>(8SGhe5NYSR9 zB()20fM3Os-IUe2X)UIY-w%%$t_fWjv4x8E=H0-keKL=wTMW%Ke8tFk9`atJ>}IIK^<@7wd0HrJVKH9AQi$= zh94{cc53Ovu<(#dlB4VITSty;*kJ2`&IU*(2&x9O>$~P6$ySeG&jI(1cYxKl0=e`W{oxgC7@dxI; z@W;7KRJpd=Y7gOBVg==)q;L6Rcpb^y$-4%%(<8ei@97TBdf@+KJ7xm7S@V_C<-~uW zeZw~Pby<|swkY!BuROZ4enGc)d^PwHH+k%J5wF~vBIt>GekxPC0^+bYhR~id&ttw{ z&P@1jdl7gOGFpmWDh!lw76Z1PrlYYWi%NxQsX&^spFSHN5Fs02hjXF$iJ%+j&QmSDA0|n*Hg0=}D{S;@I6CQ1VA>Pjc-2Msh zWb;BFg%L|wP0aCI`cta#fkdIcj2gTudx~w9u(v__O>Z7=P!%6VKBlE(<)b^TO1zoKaNj5@96 z>}7saA5*`i=v;J(#Nb@H8PY?eqQhBiCc!R{pLw&Xr_g1fOrluh7RNu(rv0mrUw)y? zywwux)sFI?sg!?#>?~h_+(vx^H`7>aN|a~q&0?TPJU5nVGN@Y@^Px2w%Umzaa|Wd}zuZlaKM{Fg3;D|?fESg1=to&AG!AK#NDLB6F_|5nda8lcibW;+Um z>rXu|ERTVQ% z`@~Y)d=nZvxvdJohuM4ijYU7kwr{X|_J5(*sBZD`OHHcwCR33{ID%KE1-u&Q`cU*> zf60NqBgsOoAR$1u*!71#PY3@%H&o`sUj|7!r|DX(zli#M7VcN-2VpSe4`SwlJGcmnb&ylc6)VMy}wR_?6AhG zwmBj*rz48>mBUf(ucY6Hoxg_d3pa8Nc!##e>xO$apWQZk*PLV%aDEk<>3Oo!qAx0g zSIu4>9|Yg{SWGn8(Qe)gE8-Esm_k+Z_M}2P+n(A`)yBfid z-Pki?X!$#mBV3#oSbakJaKu}>x%Jzvl%hCH_E#+4o1R5Ws|RGr#pCHs&mAeG&;XIv zHcq5~ulB^=NvL(sAP1IWB-)%)jC9V(#n-?Dw0n;bQn&Ng^Vf!-q^_9eu)K*4fSphe zE)*nFK^>1XgzQK}vQK^zKzmL#l3`ZzZJMau;!}1W+iu$aUeMrOXn@D&FS;#Fw_iRB z8mb{;gKtoP_aIU9GIV>W{F4u$yZoznqq*7I@0!|L{^pp1o8R|{P6XPdz>ZgOtD4a4 z@|zI)iT*pluw=0>t)sR@+#F;DwPB_4x5;YGWcEXvK)=#f0@MK)=m2mP)c0iBbarVQ zn6R(H(2+9!Wi8PrT#W^{^sXUYzYi z=$icBXNY+z=NA9650Y9%Ll~=q{IB<$4@u2>(J&p@ijBXgsdIC+W7^i#+B-nysOVy4 zJnfKnL~mmIK6xSNRlj^2l0c^l$=21EG~1+cQ%HXWz++zfEqi0zIBihT`(CV#w0-Kc zIe#bc;XYr?qO;x<{oKfZN+HiI<@T;w4G#fWlw zyFt{6Mt^C##QQ%?BX!&D>DMo$F>U_!#_7>sCVK! 
z+M<`BKx1$vnw);tq=aGMb-y3@&Z(seh$gEam058?zS?Nc-(K_meuXlXnRqg&YHu{p zceSZ)KFFyqZLZFURbUM3c)ackmp&`+g^Mx8%K15!H`_$@xn%4&o--r3Bo}@*sGIhh zpVb%|qh|A^7v$u;Dz6%*-4{Su49D+exN7w6*#=3Eq$J07H}=!`1@!US0-|2ZOxv{~ zZ~MdK4=HoiaT_FU zN`BoRR=$<^#UA&~=?Sq5 zCzP*k+TJ)rGd|ye2eMlK;v!^eea|ixYmyDbK$3ekiu<{oD?$4)5?1v4ah>VS)sULW z>REGtmq$t<2n%N&zEhy%eJn?Tk;4m8`_ZcJ|B0afLy378E6eslTP)8r{G7V1v{4w)j_RNHp^F?Zpeb8nUu( z-gebci~jL7BhlIw=xJSk;9kQ^|2aM-f4ABL((AWGl>2(3j8)Do4?^QYIAY3Nj!!0z zMfC7t3kd370ZVxMunlY55pmXHQrUc!+2U_?oVfi*9} zn`xo#?v3v|G)`ZWIVVuxjJq$CC2aDvfHOikWG9}t(K#&DJ+A4Cw0h9iO+M39(MRi& z{uw$)NbABL$Jc@gv+FpWsQ_;)davHPExYv*jRc&G8oKLOUs-a&@2p8A6KL%!%4D-7 z3OM5?`Ov7+#+5(fsHFU{3N((W+QW6D4G%<)b{%?2{Dow{A^8wU565t@7U#eW^W36J zelRgI!q-;G_XW}p4%9mqcVHXYV@z5@!!oOd!`jD|1cub!<7-1ki|DA8}o+^pNI7*R~hY z{(k|R6ETT@(Q%E-9$2mGNU6|3KS>E5jScDH7I=Mh0M_A!fx(*AV5J+Gq8t&_{S35XUN8k)#tl!|NwG&fV zYr}6AOX__nI6;E1=KoWsKjwBQvlC*S3=X{SI1H?2u$C=)-$e{Ygh-SEM-NDkZP`Eg z)X#&T{<3u1WC4B71`1M@PyQrFE>O|W<`6hM1QCs5L_~^}#{SF=#OFrJc~Pwj&+ztz zRGdjhEiXh*QR@vAzS)BZL532TZN`JVExe~+q#rrOM?~~&qOmiTTRA8z(ZW{5g3R6O zj+qd5PnO)TxiaY;rHAx34{5QloX_SR_)Vc5-Vb1-Jt-^y0(iVp3PD@LM{QuTP>eF8 zz|-jpStLg+K>*c3C?#0=omO{Dcj)0N^j6Z4B0X7*Rx`h(=}}(M4?T(}4t)vsE>$xTI+L_U-RR zJ(0xdkJQ59mBxKD$F0@Z(TEx8Si#6}(mJ=N`v(UgF7yUvL7zJuF6ipeNDve+xoW1V zu{=&ycERQU*lY?s;nPknHU)11OD+3@ajPdOf?F75;FD_y0JGb5DM15t=pc{26CK4F z*)l5PF<;hzStL}W!G>kW(}?UbG8lMf?yKKv^_xojXPDKown2ybXf*cjRDoKXG5{-& zgkvn*LpSCTDAcn7F1$vC&sYleY#O3d7`Su`fuwqliI?4FM3dysIVe`htsul|5H0A$W`iv|9sL=G5*^4E@MDMihr!6R+6Ti;$G zC3Y+I26=J~UV2YI;{(|vF@MYKlox+W{vCyRDW)gqpQyLS*zFu8bLBEIe#roFpI~vt4l4Kse~JfZ+;XcITNM?BH2Yrwi&&exGs^!8V%Z}^q_l4fC##BX zGmsnAk(SyJ{J#CZ*ISMHYw-+jNY!3XKkDtwE&4%xIa_OD-iWyAkhS?lKj+&+=Q!3bjU9uCv_+BE-%y`=!c2ujWV- zGAaktxJ5l03YYBod5x7-+WxcHm5_DxIs%jjeoIjZwrTY9(U^zqTZNdKnCDwg{t|rq zAh4q4h?PKK4rX`@YwKErmunNi}_O37xD=9+%tWe0Pv(gazODfAM3-w>4!$P3Mpx{cW{*oTP4$yYsPUdfc($OrbyFG`(J z3Wf#Mb7jA)RHWU|X{VgMpR9?Ml5k`ygQi^hYTW*-xSrgrHi7IUAj_?OBhzrcG|Rm^ z;H4e|(F6wyJ&NQGaXW?tAcCcoeQs!cy2SP)fT&Q@D@b!Eg*NPNUL--bhs2elALxFx zyhm>qp%h%Q6^D#Z9OV`P)>$DV_$^=gv$(N!{t-4AsZ`cdv@+Krf5Zbp{0c3-=4ZD$aE(OrmYr= z4Lriu4eIzQ8g2WTcSI)yDv5!H?V|sxEJph?JF!i=iy=- zI{7?Xs`p4kw^3rNZ!fytgB6fFJXrINU`1Lf`bdW%R1eIXqI za9i0?Q5a6wr|yOddRVAX>yQ^Huxn$lISr1Be&{!?Qfgl+O(fH4&5&XZk7k7GF`?eF z^M-{F>^Tvr4eahl`IoFS1u9 zVf_#c$Cm4_V@M5ac+|8Lf8Uj*@MVAhzQ$vA{tP_CQ6<;*V(o$x;hn->#TZ&PWe4o% zJ}_LQgq~qIKAoZ+ISUTWepLDx6}qPu!Jb~yn$;+IXsR4hiPHb-8TNrQMG8F zUL~138r(!@%|@ZAAQn$93L(YBZHhP5Ye+YVsk^_WJ?yv`=4@(4k3ba@8^-0 zSNj|OHGBRda+Ftb>73{v%u9N#s09+Zpes1ps50EC=eB2ml=Lce(vi7Ux}f{q1KSK7 z6?|7u^^)Ce!rLtLk=YzLMjY+X&t_58mVtN%QaUB`NUo)ur&jngzkS~pqI$q*%6Ndcw%-z++UnK3goq41 zmSU0nGe}d9)Z#~r&64b;(g+#g%Z!!C*)MD^nV!n;GNP5GzR^Id>A0k-FB$wLkqW(Z z^6)m1yR%dyFKEz{w6=FnU&or1^l&XC1h2^{`8GZmtC~+29rp-Yi>^j1o(gPw@7WPM zC5xyq7)Hsjse9}@G@Z*r)~PTDv6NhO8C7;12D*I$2vFum7@f=#?4eM9xg0F@@^_D} z)IvC~KZ_4k1M#;p?X$vFl9VcYuyCiym$0tY_DRC@8F&_>v-%^>e3m`&D3*LyDgM2x6}1eu;SDzg#7} zUgb|)ds7$0K6R9!6H%R!Hnmf(o!r;MmqC?Z|0RTPve9OEMYi`12*xw8GEH>Sg-~zk zM}AA>RS9j63eJ*^B1hbN&U`>s6nDMtN`nULhJTv_!4^3vqDvdNkf)fk#AV`z(HFE7 zWrLZ&h3YBsAO^S&r2*U4(obR)F}fs!00QHn zcoDzg%5^*{u$155W)^RCA!}-lUoV#amdX?jR9(z8vXrwn?%?DuJv32n_w-aQbt8fo z9)qN5n+lvbi!9pMTEb1deh3|GHhb2vXl`Zj4V$h(o;Y6hO}tl_F6C?E`cG(Gy9JDn zMv^K@1ky$=B%gbR5--G&trUA;W3%6AW5vub+@5jBWy#m&<)whSz-njI!<}T5M~M54 z*zr?TXs|=Pvo7nEbiEMvNGY$P{Y|n{uP%iQ!hBkTpM4T3s$!EN4^2Ixe2S(dJSA7K zqY2VUM>5uO!37nQnd=kI*bu8e$rVSOsD-3WWq>#F9~OX~uy9O&6+9^ee#Wpp6&6O> zImt!~*6QR4d_g~~cxC$DI`_(X#0Afg* zUfI*+33CW%wO41r{vyjNye)e8_P0u-)SJQ>$6qghBA~sT?_#(;K4sUamv~F%da9vR 
z7F9NCvwp5MTr<@D5a`ts7L6%s%1--m|08pWN9;#tq;6rpL`-aSAe6qod+an_gJx$D z9&L>t_uk=rLJ>($T1X#SKnnlz{;{0{b16g$WWo$VlWX^8=HT%ueKL7-K{$nuQZd1a z{pl=;QwY=B6DgyEBa(K2`k+4Wmy59U&m>6A`sxB%R#%V^2(0R%ysSc>s&RBblAn$` z0yqPj+Wuf(41-r|NGFX6{c#JT6 zzD00Xf6`sBgHg`KniusQu$*)mw?S_!8&tx6d^YQ)yopu5j=KN7N{_VD-Y9fmzMP}; z-kvMl4aA1<&#h=6jGg!t4X38n zSUO3-G2!fm=@MOZGZ0L36W+EC*$H`)36g5^;8Ado@o*abeQ8lZ z66R2XTuI5=kS>mzD)E!kBSW1!JD|4E1TfLQGSyGlB6t-Rv_pf9Z(1q8rU=*hah|QK z8Y#Jyo>o0#nj)BgUj-{O99vpxSIV%+p<>+A6ddz|rufA(SZ8m>+jz*Z6x=NV`C^d- zy3r&&&3RD_TZT9*N@*vp6)k*r3+EPVnT(@tzo)43z-mzNT-w8qsEO^S>Q2YQ_(Zk5 z3zH+-CPlN}hC+W=aWP`&kQj9tQwVjq*4*5?W?`f9dKvuO3#`b(Oi;Z#;&VW_;1XdH z<--VLF=rthFc}hW{4YlUTf)l-BwYd$*2`uA{Q|nb)HxImu9M@sVpW(X34*iy?Wy0l z--hZqEU>y=;GP?iisFkcF*x6EJ&L$K_-c78<&!AW*p57ppsgWvj(TXRVR2mYK4;s? zQ4F#-Oa8g*k-`6aG-?B*m!$3mb>$tpoKfVh`nYywoWXGjNGUs1XJgyeY@?y|d?W6@ zslAei-{DEGR@P3>J>-m9IrldEyz6!= zb$kE|9;kKVqHE5&Ou=zDosDS+`|`8tKUQUAGOx%ug=^!$VEXf^5BzQ+B;N^DDj&zs z*UMPQCNb++>mEhxy2VzHL3*8v=cgHL@=r?WRr}vo|0!XNAFUYj2667wP0~ub+SQy? zJ@ms}`%z(jgEl%QZ}eDnut{kBP3k-Kb^8QaPOCdDaL#yL&=N?`H{Bz6;!OX2jB>K0 zqQ=NCSgZF^{^nDebKFwaab>RK)pyE>ml z9E@!+*yw*!$^VV<27mhjpihu03r@I_j?qzKm)nE>lT~;tf?$kdF|o`+$w|sEz9mCy z(wks*R=zGk?VXAmB(tiF?&YJ-Sw?Db_Kjj%CgF|ZT;|N7-~WvhX<=31hj4dmfL0NL zvK$?bVrK~1H1i8r&cfti(S?UiNnl~459IGYoq%XUdH6x8Gt#eRpo(cI?TXdh-ZJt>~cn-Hl3mvF77T zr~3yE^||)QBt)r$*0a@||I1{h2q{o%FBKJ=+n4%!Z~8nhRlrC-U0f)gug~RUL@J2R zHTx(R(s=B|kggh9lJPDrZS_l-0tety532%Ge0VpnRuDpa3`UGYZJFPx#^iEO*&9S#6?j_HmsfgJGeg}!B)(1uqyt&t>p)#>a7kS zEYl0TJA8Ph?TC->lPxdCe2ca|8VaP{O6c26tFQrwo1CcN%cWJ>0|W)y3~m$WUjxY@ z@O<`vY!CqN0;F?Tep>gayvm=LQ{7cH5XRyBUu6GZa-UWJ47P$Zgsjy_`pyHt2H}Iq zjQYJcwlOMtzlQO@3Ka0qPZ}JivE4^u4lI(P4FA3m%K1-JL`YtK!y>YyqABIq9DyKnTNJf^UEd zI4;R989h4|`LLFTb1wF*t|Fj}!FM_7{G=W0|KtoY&jrbb2FN9CtBoeeuIJwT@ zK7YN8n-(JJ`!T8e32{5*>0^oi5BTT>IeK;d`edd8?sED>OCa^CI6@B$x4gZW7!c`Y z_|1Wu%H@Hqli^51^5wbhbN{^$RT>;A!^0IRf5kIgJ~@t%Co`G8Ew05GKA-jI1k{{3FS>^PvjAjQF*$ zQ)}4s>RHVerC~nEdY*^NV>vi;TYmL`lqV6S*lro5a-#{lk;i#4eBk@;mO0@n4WoFp zkA((LT>>Ak;;mXTfBmG5A$!*vx)pKTFtJDQ73abP#}wnEMukj6OVR z$GtBMnPGR)x1o*Oeh~Yf)A~?G|G&ZRv6x^B-j&bt-wL!n=;ldTc=ErK5~n+VD>*BV*MeigMgWz#?t@g%4-ZQfOUNTPOh{M zkd^(@AkGW`BvwAYxA@YA2bc9c?P9+1Sb2NalkyXyS{58-ce^4K#G`%&Kno<_s6DH9 z1v0qUqA`nQ?g1_t){%4;l6Sq@BOI~+4O(IVzQ(1}5i5jbDc56FpB(OO+E_ka<}jFy ze7gK-eL3V7fFY##?X@&y1B$vmzpfG3iXmWvTS)_^DKA+z#TCv3gO`>Qb}ym6N=qvB#&8PwC>+a1=$Sh;6GRAa zJ~y#{2l({DA0g!7Q0ntb6oLFxHvepiq}`j#Jt6t)4c)I#Yd61qP2UsT*(NRe?+D;l z!)iN(@-tijpbs8h`Rmu$@s~Sow>POSw*WqdFI=vNN1!GEV)A(q zU}&t=j`=t8f z(Ddgtt^bpkfp8$<91?-%gJ8RAeO}8BiVA#uI{d* zFh}NvFdo$L*zYFUf1_zMTH$ON<8qZ(S5}8wIU3@TvZiA!3FCi=p%g}+*(#j<2qFIh zB6#T9PGBH+VkR%B+1a#cV2?4G9L%90M_5>n0Wa=sWe^TSi;dA1QTns5Hx?zu{JF5Xf?~dJbYM8 z@9YA;zTdQ#YbqjXJ91K><2v~MX)}|3z(iKCp^Ztj&moN`8yH8;(-QVYq%bixKuC=k<-*)Dt{wrkqsRu^8ILw&b(@HV* z-|rs4z>YSHWeN5GJCqyqe7`1K0;HtJ*MAO2>0aax)A9 zK*4+_6oZMUgqRm8eIfV5hrD~I%V8?L$~8Ql4p@`pv2zsN|$6Ne6WVWn`u^(!ait~W~~TNWdiI(u`H^Ksyc z4w8U)E?s6%b+!1I>-baf>6?^eU3Z+hBeB7!f-iA3ULJr zy*x=NrZ5neH>BTK+fP-F<5(F+<8T&+YROB`JE=8(NRczAgKeH?4lDk989`_GSqDEZ5s~ zE}T3faXDr^-m&uD(jpHxdP1{)DJbn2@aS<(UoAfrESHC`ugRS@-2;z1U^^aH(x{ES z_-Yq;dpSRm#kS*lL(ZDx*_AYzA%>2`Juemew+<|*1v5Wb%l3Spm=?(iQv0;!C!_CR&cW#Lqw;mrVNXaG;F)VukGe;hz5+1vc9PpCL1y z>8r4n7Wb^Pj5i3<(SfXu(yW_qp+G$*z>BT^V`OSb(bkT0XN*)84Uy2hFf=mxNJ{iu zU`2e3WJA;{QGN#T>ZTTVpY=YgmOB0`1USQOK`IZNUc|5Rk?g zd{4|lnAr!sU_YpUFM%|y;O|G0wrcd#Y*%d=QfO*feuUGt5K-XtrODnxkZ zv)fq$k#zDJlfjNCY{6{)w0J_&#_rsZ1;6(Rnc$ls5%?Zm( z`y8+F60wp@MgH?B-i@UseiYsO4v#~cS3^Z{^UzNG76^OqV*E)K0MIM&2hd5Svp=O> 
zi5KMM@Lr7oW0n`r=+TaSY<4$M+#4;q)Yl&#Cpp(RyvlYk3Avve6Yau7Q*6h!e~aPXPdVZSF{&`V{pP@pnwvKpX<8S3o;45zOHqnN_8 zY0C=|`8m%KS^aigr~h4wdER0LJya&V>FB-a(psnWuju^YX+Cdxd)?V(gsU5@v3$?$zPIOK5F%p~5bC(~{<*Qum~<&ATt zKY)TO6I)Fb!0n;m2frvR&h-pMHT4PewZF_A2xHN}&l82*XD5ZQ53(kM8Ci9Yjwtia zR0K?shREN6xRlPe3F_Qc!})(CW}@9xq*k8p28Tm~f6#X%k{qxi@T)UI)juXHJxR#Y zCFCh$VT=?T+-_Wflh9^>RrnU}X+ba+pG>p9>PUNZ(InYox(41MXl{{TWuf|KhKfhy4b zN)+ojLC;6&L1;U@iQS=nuJYa7$0HBSDd9I!mBWHskd}Izpt_Esb#zb|b zS5huy9eO+B61g95T`b%y(1sm;V7_*`4lmy72Eu$>A+ucB9!hub?LVBKY6b{(MoDoj z2wRG!6fX6A(i+_<0lF*))WQd2%K$HPJScp~0f%j_1Xgod+dXJwpZPaa49y$LI3>=Ox&O1q83oTV;h0Zr_Du!MBE#I zxRoLwj!~IF>W&z#_xag;fl7mFUhj-g(P)78XKqf$%H_)Hc<}U#Ra3kQ|91e5_>~rc zfb?azr!{MWlYi`z0W7@-VhK*_+O=k>D9%t9fd*T0AsVb>seOo`y+ZWS*ux;Bwlp5#C!|SNsTIn#$Y&RaogVzjZHoaFAsUX5EJOxJuM10 z{w{dSfx{RbY|<7Aad4C>78s&RS70%7A1w4ZtOKq(F2Oa#A;Y+5qjIFvBLQ6a}2(=Q)!e1^2bl9Nyxs7nC)HZ3kQe1j=9 z#dnJ?3jNkEyHS+9dE6D3x4ioEIcdn#nu8PISo`Ksy}e71Ftl8)pRJK+K)i2IUvBXS zTSDnTOl{cC3D4HqQbNVZrh$;W694PCUp}F~)5!Qn_#Z;8h~OU%-=vmdRACL}_Lgs# z-ikaz{GPYFna-WWFUfj6!79KilL@a7;+A<6^BZXv=vXal+Wi@n1x-oV6wn2{I7+ma4Bs|pO8U7iTl0Jo$A%L;Q~O9| zwmY?)RJk#))YH7f1gw+$J2DSN9GX}z z10Mxc0sRO*Ao`{&y#M(_{FlO%tr&J8m_J1^j50o06;=P27X-7@l*J{RKWM;^qgYv?RU&OOY%gy{a zksM5*?1hSZM=uvjSMGqHO|LXNqOFi#JvwO7{MeHGE@5;aB^t){WAly}8vHJ_sw#ZE zZte>SLn4{**F=xg^O2y|@etz|z<4rbKxz!(7_^|fICQ#ObB;G+A(qh# zqMvX-J*&l;aF_diY$p7E&m+g@+KtSEqv$Uiy7>K#!Og`;TiUoO_P3FrBV}T3&a0vW zM>WL0DF3Um1LEdAvvc((ba{v@u6u{Hqac^-x7P_H?P4?GkE8sNu>?g)>h%C{Z%m!z zAOhJqb+?JV)i*cEaTqc>3(h)t-ovn{)~=iehO5(es$#u9bd64-G_*)^S@r;K^JU7~ z7Bkr>9ak=4(@7s-XB%JV^kQNyn)?p%(SH${YZ5kXvl{jJ^UAk)uOx-a!Ah|j9jHom z9xV^=_V-uOp-gkJV{vXnS7J4fzfTsE2Bn!)_Ku+d2G1_2$?705}7{~$+0XKNsI5Joaour8YzozM?=xS+AMj6ZVtN- zHiIk}D_Ltsg#EV4j+HN=cUpbf<#B}+I9(Tb=N4C2yN)q6ai15-^xj&naQ1jZA=v63 z6vYR#R)o|>(v2k@e1n2rKGS&3baSG%P3O__DIY2V_`l?pj+i*4mv4W+DO@qH85SPh zpAI9-;Hd)2*T=5QBXCN}laUhsV4qh@D8)fS~?$&k=`2)Y<&`#0N4y#~ip5sEte&{r2!eNq&R-Wc$;S;LTk$rcUqx|wL zXX-UYApC40YGS^T8KP^IN#g2QA=h;YDja-`%dS+qSaxhpfPTSwjw@TRJvRt&7^%D2>nQ!swt|;f0yk7lu zP+@%g$7j{zRcwxn>-mpaW}e@V#A?NuF&7u+Tfvpi@jLu4-dvrwD9oLfpSHS1Rxdq2 zeD5=O|5_y@iT-V4mPt`xUuY>{+M4L7R#IN8j3e9&lYb;ObY(t@`uP+n_~ap5bacpE z|1$N`VNBXORoSHBN(nSUWdda_glM@!$Y=bYy|8CC9Sl@1;$K3eY)5tG1>!1~4cXHe zH&x!+o)FuV2B@c*1gCXsI?zD<5t+v)g8bp?p#>5h5iIMfxg=?c>8VRndU%Do>t!qO zX%PlOK^_zd!GXz|x=KC|Dr0{ub3t{+t8XZOE6ma5K?8(nLQv~K;MnJeD~UF`Mecag z|FWjxz3r~tr>h%g+@(9eTdk*GZ9Xebo0=D`9q3W}7TkNQ$vV3Y9MlZg8&pz$hD(xv zm~C!^ycEDFLVRjaqrNBt4J1n;#RBmNIy$fR4qT<`{q42p`OHAY*b?VyUc`nY)E%AcDj>EYA>m|LA9NivEL?lGnt=O zYRKVa?46X%GtaI2{N8sBkP$%xtVaK z(_Gc%24ue|PXb=zd@Swx5JiG03m$5pZr!3g(B} zX8qX~Auwu9k9*(zeXj}#!_Cu@ydh1Jg=X027R@3z#3@nI%IA zYh07nkKBL4E+-rZgo17d4x--Q{A|O0(|4SkF{e-M+Y7ZHmt321@(U^11!|6k!ts6N zZ?`y+hhwGMmAuxjE+(HcE>8$);n+AY991U(f_TfDM=!5MX5@_+@0Y#G&Ulfw1o$AA z#LWv7=uC;#(_sw8e0ckPEiWYf83ctZn>=?X2c|lZ_Xl2X7;1Ueh-#$kl9Jq60Q4R~ z+h=0G3H?Gh`o3Q>$pRW!+-UyVx&gNPPx?$eQbHhwKtfn*vd*pC?rY;L54ARmb-rY( z6_m2>OP=bncxB7eVz2LzdFIBfs4L8&p>j(Q;rKG!V~=oUZf$SpR#A4@EMWJ~fOm|$ zF*W=WM|Szg(7(H1a z9U=@ahv->`BryCV?}Y%tp-TMWp(k9weCa;*TAwGEZXd4SYZE9rR_7imxAa{}-CE*4 z;Gn+VDxgANME%ZR)x>sxypX*~qqYW5bMzt&CwIb7H^TY-?ZPOgH=kSSsF{y?Oa}7#y)FQj_k=eRDWAH)p|F-06gUMy z!uf8_{bCe}93e{+h@UHwh`076{A05yYLX!A8eDLUkBIXyxZf;X?3QaC; zx)&3c+z9XAcj6#C9%Xf~K9YaAFN|{!(BrGSNM&9peaMYfU|4Pe0RH{Oa>pf_&C48S zJOrap64rEpegOW4;fMI@WM%BxC1ib@k5t$U%>8!ij?*!6v=&%};!oTe^yv?)p!xK# zLl`#rzfJ@gbvLV6ck&BAT;0O`8c5jqBq>3hMtYP;FOw(XL*f$#Gnf+ka;KL;@jC*# zMDUh^Rms2A~b1}ud`m$p6 zNFxS4x=psLDW(y>X$-ns>5hQG7F}!*Fv0QPny3Q49x*6HzU8R+V9YX?SE%jaK;dmk z@38oxnSDK-1D|{LYMtLcHjR|#9pF|^`=DMIkdEckUJY4%ob_C7FZEk1{OIbh!&5UX 
zt|wg>^J5yLs{;yB*}X^Ej=8?PicmqQ3nI<`9XH#PP;yrGANLk^s(P8ey1z0Dx<2L` zg$7R*6Wip7E}xQX*{uShtBOKXm`Y$5iDYTGdQ-eU0S^1S&4m$YOkr9etTxCtKq76uan2+7%l0&ezgnFmsNNh{ zIv>h{i$rU?vpiuSX6IiTZ{wt8t%f3&+m~K6(-PER~_zubnzjJ|( z6~9-2P6?mwQSMSXx*=u^hBDEi+a%ZtEb!5XG=_83Hdz_bbL7pyd^F1S1;F-3H%<}@ z0PNV}q54zs!Q#j1N}#)h!Ao))Bq3qq_uX$UMDQ@*13MFp!oCfjy_(4y;5p5El6yYm z((x{Yw)WV@^u2gbsh&f2x%eSWKLOJ`hV8y`!w29&EMr0@LKuUYud13Kg%J^JeghJ1 zmZn0Ycegh%91t0BH=E zF9TRl1(<;ZgsMq}xXxtC12|@_4;S-GL!6+%x$Zc_bfA91b!HZ3GGCnK@U40^U3@^1 zkNLhyukIrPz?6Ws0G2!&C6`|3{WjRKu8XHQ`^iJGnt6|UU}AnJf}hclh9@j&f`o0d z==>VVWkU=*sV(k`p%E^YXPq?txMZ-$P18oE zZiZWIV+*bh*qJ*~LPU$ziWz`%KfDEM1{U4YgNL1qmq7QG=XYK_mngV$?ILtFIt@b@6Dz;grUJe)b<*Y3qY=rK}#9n+)Xq0sWmlw)2Q&KxET*2c!;P@GX! z5BNa_1##D3ORF`_=d1`jF%4qa;-EFfw1i1?*Sser8DWJE8-`#EgrHA(( z>L%%GIT5-z{5E;AAXz|WVvxH;E?aY@5~oaSm&E~v{-O}=Q%$luuhqKmgApx9Z6Gf7 z!22cUdKP8q99>WMvFF`KHIT*HJNmmbgQCLmh~6r5O4pi^6YT&D106VLKa2;~2An^w z1KQXX?y(PKqqc%7M%uHKw49=i96pnT08!UP~cG{XNKB94*3g!65VB@#Kruy+As8ZUAlPcgptS!jN652 zUt%Z~O9z*lAmqw^?CUs)gOrc&sOJlrJUf0l z0K7N+{5x>KVQ89CVVtuN^)DsdeO3QvrU???@HZynAokk`GS%QykN$Mzx? zAiq(RFD~T^Y@iGbTqRUGtrL!@M+F?U4)c)y)|=xBpCM|tXOJ4eyBRU4isN?32$MhO z&uQGQ9@rcdP132ESJ99~(GnhKQ!axR6t~ImlW|c0&HhR8Bau_@X(5e`!zzZjJ zciy1~a!M{T@`J4hBG-D>E!6q~Kew9jxjIF)TD~p4HS&qcfQi7GupikfQ(^P@YL>Ag zMG1vSou%~Vq41JOnY!rZ5Xi2AoA;G7XHIpJ{)&m5br8P*@FxbDKOA}rwUNfB2Sj*Y z;28dkN0e90^nuDxuP`NIF1zoFA~&(RHnmyU(8_)1ZRz4gf6;QFF2&CYSnniNm6O=; znhO-__C)pDx9C6UjEwT&FzFF-%>u0pc7+@8=_96e>9C`_bEuzJy-k&D`{s;qHP3Wq*@zF@;i8 zm~2kU-XInF6KwU*hEHg4lr{qK3}2B}0YE4rd5{s6s&KYaRxc(&SB)=GH&6?QRmf&- z_bYIcww*4LLi&&xQnM=uw8@#U#|ytRYCdph5XJVkD&%v498UJi9VCg~GYM8*hvVpB z2}cyGw?iANEeNJDMis4mGg$juqP}`YVn>uOVKUOyBrrz1dPc4&#zPIsmiF1$&V)Xx z`L=h?w_Qsw=lV9u0#xLqqC$R`->T~cxr+h+?dJ|!lDB%6AMmweIlZGRRO2+L!WhN9 z4PxISpuB|$Xm=>WU_(6$+(9LWd9Uwhxr*QE-6J?;eYL=HS3H#CeY>GO#UVYozxw zTdpUrJuS z9B15ofxfWCX2JJJJTW@@H}9}4?&mD> za|F{AXF|3XKP6G40vdX@|F&fWNbuUk@w^qE@7P|4hE*`w3J9loY;j48^x(}1j|-z{ zqmhRsU16hYv7y+w_)={6pChnw&AQ;&npXMP%UHp*7PBCc$sdFSgSG}KUa?SOQB`9e zGEfnNqyaH`d^en3CG|vvTI8{Ce{gvAojbXq*<2op**^TU`F&oUuJ*Lf%X3}pP? 
z>oa?Eu03R|?UJrSqlaIr*i~8`zO-bIXFI$4jfM$AR8x$yc8gB`_kS7@DBS|~?siL(y}djsC?k>%DXc;!O&TUTU;d@|Pg z@Hs*+wVOzsKIki8jRdIyPvL_!K332I#~C#06p!rBLUl(yO599MHQ4wo_{iV*{@ue{ zH&W)b>jyp0$_kTa?%8)h*4~bG4e`-hgWbE@I}rztaB2L-VShN|~-m zGD_H}#~*5l%%cMw#nr}oB!qt**vmSAJ)3fTz~+bsViOeZrl zZau7c5z!y?D(m9b((GHUz@=g~j6v+e{AKrD!9H&@^2S%Zv^%AE=)w&uA0F)H#=CJw zd6-#F)I`-rf&-86k03go1)+&fr`HFG)UdZ*a{I91u)M?UcrK=e8}0k*PPahAyloUy zJdM<=$7u|!A<60s{+XRJl-RJ_2^B1%xK&!i<#84hL0^84;%>^wo%RQv+y$rF0=k0Y zX0HheR#t=3m zE7G}wvX;q(V^|{eSS%H6xpd*BcTo1NrI@u^voIXXg5rxqxjvRqjcSC25>~~}ipLAN zz=v9}pwBT`;3mh1hYu!;M$hiTWtjcZ9mDus?lOCXQ(P%&j)A{;Q`ZpsQhBAq!zx%K zw_g8-wCY_ZxB|@K-6A$au-a$(WGQkX-S|4OVW}?P;aQ+mLoh9u!JU0LVm)~*E#ljm zEjCI-16MMERXMUT>Q()|u&xE!U;b?E3!w;{BR1@2szoZa!|q+66tp4hbHtU@vi|b3 zcRufol^in`;+vI-L5XTfJD5AjKpkzk6AfC8AE8g#{xy97c)KFs?w|}~cVE``2Usw{ zU-CW+*aSwS+{ClI`+j)v`qep2LFe3xK2x1m&D;$-a<0XXrVTG7{7zgY`)o#XWzDDW zaq=QhIAf_%#T+fbSlc+(u&KbZ02yL?Yg3MtnhsL*(Od)7yRR!rg~ zy&psf3r~LDd0ZNdWl=?C zVGKey_nXhAW4=iBeuyP>{1NwYk}K0?h6(!z{^fdT4hiMEb6Rq;;Uo=(={gsh6)I0& z<$jW1=jeXo3@k8qb9T{~;vW@k&P3~9>Id4Cj8P4Y&Q2?eY7((5hA;|mhm@oxoi1r z=mDm~)r*s_fHXH**FI#mJKOX^Mv{z}A|DtmQ+L`Cg!r_QqugxF;mkHt1tR4mtp|6p zY@D+)M&5nvAV9r0erNJK9{5hV4+2kE9s_5G$DMm$u`EvB9Av}pO?|g2Q{0Y&Uw|tuNSv78jIy z*apz@F+;(ZrB=75uQ50oV5X) zY_saXdB3;{)YPYne$-KiZu?Eo)C1a(L5vex}kx%+(#`M$C2;2zpkRL zrS1EvBpXNA4f*}4r;FMS|2h;f;GQ~ZU;C@Ra%yn?&H3vzcn#kQbQT{X9d;sY6EA-Q zpTP0LJH}(J1&SYEG-0`a%`~0*b*$y~C&>7t^!LHu9nfNB4GW z%fM*S?TO9{XQ&VyTt4IAh{*LA>#jJ850P(*b@O&(xjK`?1lBcZp~u3EQmT+OwZF8< zv$4|ha;8f?aWNzeFXm>E=U25uFT;gik?K(^6JcK0~8c4~M5LTrDl~l?s%akGW5SBzs zh@#M7$UK$I^N=MAg~(Jg4@*eNun-dRT|dwB?7iRpzI(s>`|~@F@2}_RIclx-yMOn6 zUFUV4=XKpTTc=|R><d z;d^|0cK&NSFtEk6_`Y35#qcLvCA4d)+*&E9DOf^TmxMlObVpCV6@i{bOSfUSo!%t| z#K#wL^MM~jCTSc@Zj4v4lr##67o0`Euzl#*EX;<&1-y?}`go!ixt9~w{xWMbowz;I zmX4OXZi+^e4nJc^9*2v23kxk9l`R8Fx*Ed1C2~$bYd0H@{~om%kG=B?Xgv2}Z|tsV znq|6@qKf@YOypsWLp*J*^{`_wy6fC1UX@?YGtuv*uZsk zqfDYWO;g=Z^W7YHS%a7$?f_QbjwkO<97$XxM|{SrANHo!o~d_vBQu7RaR9^E_8!km ziUYE@3~sK>&ZtZFHDah~j(IeW7G?_tf!a-z_0*I5SWb5v%B0&05Y%f_x%;mjj;51T zx*N~bRr|YUb7PJ^Cj)$?`R)11I+5}lx=y@ms@9)Zm;XG9CQWBZXYTo_)lN%QZ|8T^53AXo zzvzUAfuOg>>$_jNzBsgGb`eb>UL_uyJ~7SiU`Xax;8$fKE<>wIFWH~QL}kmb&~{mt;328(VDiU`dYmT;=EvTPn_S`qZ?*J7{GW%NmSco4w%pC4KvCOwLHcw*3%8-49+UN%OA{xq2dGkte z|F6eNQcMt_B)2zU=D7hZBoQFh9>pfEeB$ zF4#&we(BL-z0mloIdp&&-{an=9MSBsWP4%YRyX_c^^OGg9!x7OwfTm*V+7`M@}2D* zCR(2m$t&EHe_GhFOE%zznpg^FYnM*WYoR5z3AYQ)c zM~Y!J17*nLtH}fgsu4h7yi0};^k9DM7Fh6KPND;(`VfPbHi3p|@5=3Pd zLjBd(u*XoxaNA`FGQ9xCAJ!LqUPOJCNwtmp{>BFLYTmm^rHO`Xzz{{iUU{Mf?8Q*& zTgI`QAEhQAULY*F9!v^fq-$_xi|0;aIHdr8De{z9nq-PP!*Zao``KC`Yg7uTcQ<{Iy!&QN`t=h#$fkJLKT?INtVvFte9 zbmH;0P%%czgp}`zora94aP%pzYRdp{%GXGA(-qK%U~~`#qOD;A?!mPHq3?CEV?lVq zNP$;jeF3L*S`9Z>66Z%XV+@Xjnu!L4b7`Ls0Wimj8{q=F88g8uM z6wO-HB|~~segmsfm0;Fwy)=6-^ZIc4gfCa9_tq27mDDX;s?ap|#_iE~c*8@c1=o%~ zkj8Pl|vzhQERvLQCZ8Z1O zld!sB>5l|B((Y=XRpn{GE?K%DmzFw}L|&yOimPApx5S>?qaNn+9eWt-9wkw*ull+U)#`hP>w5YPZAwXL;;JL zU{d79&c;l6kw@+mN0J4Tm|2Cr$2nK1%im2mjUYXkdjF1lbt8*)$I+&4XMEDi*||^F z(gMS`3G-g^b4=N`bjWTpM|M;7Ve(jQ;z%pMoOQ(riav($b8=S_$0-zsoywA4z1CWV zAyjIba%k{k$n`=3$8J>=YoI)O1LOH*SmHQhSZM1#t6-Yq33J8Dg+2E!^9QCA zxHf>H%F&+ZQj+v+Yx#7V3;+1C6N%0Zpu8!el&3Lvf_-{H0Nx~SJ|`_tRY#h ztPn*d#t>hjDqMxa>J|)TqdI~;y)S*fh2^9Z;d(X?Y>~2fh<}iFuy&Kgo+5wkMw{Kj zyUiXh?+<7QJ7yVy*(du!Mpa4rnDmTU%m*>N5Bt#{5pwDCQ}x7gWpf9Kz%tBhlA*$W zVswU?>~=nKi`MB`P28J>=2LR)FLT`4uU*2lTayARFS;Ioevtjcpj<-q;vKK)+5KJv zPcp8!srYwjZK48Ek(3H&|e?L>B*m%{Nq$x z(42BW!g(hDp$5v>Qjh5z{f>J^sB5FS^n^OU)Ec+;WK%iYb4Pn`>`Nr14qSXgzs#EG zIle5qeJ8PzB6^#Jgj<0RsDyYj&jv-s5}!LI3(s`ULVMfj>yN$Rs=g<06?#=N$k;wC 
z7fFm*AiGDO-#qL8gegK`Im1wT;%Fr|E6cQO2isQ{)ymcJD~YP|1G}JxxWi0Y?HS}3 zz794NP|hBA5ag4T2+2REMc!W_sy4Xp7Jj7-DcCeF;Xd5H->pUU)}4v}eaI zUR9N_gXEp+oOjvjIU2S%u*!r*o)0^!+7C*=_|!@Ju+w4ELJ9$#`Y54>19RIlD7&`r ziE2M>I|D6!-`j-QPYXSv^3obuT(Vr}YzQ>J&mhE3BFvb_n^$0#Y?-xWZy6p*Y*&}! zZlkMb@3>cZ*P3`G?DP$k@*%EPB@UyL8(4n~;X?Dd#Uwa`nxe$$xG#i_1@hUiVk?@n zbAvtsm;C(Vs0O3E1f8Q5L+Q^^4XkAX{C9X$tnyWAc)qfd%qYzHJ8yg`zc;eyMg#NC zd=EcnL)4By7L0m@(44fd=^w>L$az%0U;Qry@A+#!X1{2l(nwQ&;BY2smsGUfxBEvJ za9;}=v6pBm<^rQi^f+sgN8+@<4tio~?hi^AxDu^TGn>tIDx$Bgn}0Ox!E97IU!os) zh&G#6fvdL6S81r6=C#5K0@?xxiOOzo&3!C_&AmqFgNNR)&{i>4aS1$eAH%;Nz8&hJ+cRgG!q(l=S(B6pzvLu7F5j*{KfkS*M!wM3YZuf)-7u*%#S;E@T%*>70i|ldpHPlr15ZWA@kAle>` zm$ zq6^j|o-grc;AJYmY*;`rUSx0bb{hVW;L(;6tA6OiDROTu`j5)DL4=F_7mm27 z!eC2L|)^))h z9tFzs-=7$>PNH2tMuyKK7O)?N*M9Ao{G zwzHXVD)S_{5-rln7bjJ~(;2XMf~MJZ=hEflGs)6coe$f!1iQkI!Qu4;3O*j_IX{&< zs6^B1>p;ubd}$Lo!$8fkMLD)TQz~=VC?d7xoyet^qJ8O)p-(*}%wJg)lxAmO5KmBGD6zN2#)~PSr)AqRerH>;c=xSZSDP?s-h5XoU%O zEAa*d(pd&NVsj4?{LRv-iSDEE#PHhrlGaIbOsZ3u$P_sSQ&iY?XP)68Gm8RCF%+p+ zX#yU+`Z@jNRH#{kdxq38xrV)K9buN&L+LDe(%7qI>W(<8iYEEb#H*Nb(QuyP)RLuN zb-=zZxi{KuGkWAly+Imoruf29c;wkTn-_s`73Ctgn7GfLWodTVL{nUAMuh@bn1>q& zd$t9PB?d7llEXQq(WF87_LZjcFVzs-@HU>Q2$S9*<%R5<=YvlC`(}H4*=Y~y#ESefDWhuZsgOXS*h4S5&eDU9t0vs9esf% z^(0E#D`5ES(_Jn%!WWGoP4dUTz>yD=pOl?F#0V$mq*Yu zkh8=ux-ZY7NzLx^$frxdryp+IBar{4fI%qWEJ0^1QX_MpP*b`(`8$@VVTbxa7vVWt ztu}X^D|9fu{ku4tL>iX}3AlpB?dBwYCH%6F#ANU!kGS0a^7{M{$&J%W)N|SCE1$H{ zXk3lH34Hr-i5P;iN>Jf4?IJsiF&DJ7^e}WX{fvuN<4-$wEJfE<+rq==KS1G~o^eNA z%sg0mq_RB1{c%voFV>{%1W>s`IRhgOp#K zj~N+{#At?UD+fl6c#~Ofr(-*J)18x06&}U63i}4G6Nos;%gL`K03wz%Fj7`O#dX7r zvd2r2_f$%z<&N*%eeGoE_A?nMcGYpNy}}|X@+7C-qQH)^zpIAaIeZEXYsecrUds-Jk*H5JSfWtm} z^uudz@)btM%z$(ZA!WDGQ_N*mkcC4)$5PvV;JF2cAiFJAO&&uq4B^q&PHq~1_jo3^ ze7FcSj_m<46;t|0FN-8#1OMF%j$U@0U>XfETXb`bG_1$!w@oP&#vg9E~2En zj?+?4TG;9`ok8!9cmaRSn<-{u6n+j9?@BRI%^M zP!CMWb%|4SPHai4b9fA+U}|>ue(x)OVQiDi^5{XiL*EBw&*`tYA4N;QLKoL1u9#p5 zeI}AZ*WNL_a_Bo0O-k4qj)iQk>3daCsa;Rey^-@J zwU+cYJp`o;`<_h44V9Jh6b2Sa8)u%r62Hp6XKL0Xlam`8ZABNSXARr?pXkah@;)Ca z+n652Pa3UQ4=v_pi%4*gxyj5Ppj$*>|5*F+mms1fVywL@Q23}jZ!cX%Nz-!{=R#OA zeyekA8=VtrUj2J#TCgBY&tdbUa>#6x`9hGUR`6~(G7{XvHtybIpJyu4?(d1(;oYK;Br>jG;=ZlfwN#v70 zTUV=q-6p<~8#M=A?e0PZu!LnIjgy`*9K!zjc!;kCN;>t@ao({Lp7nQGmp|#7DiIaE zX19!{gE=j#INZXSqf~IM^?@(VOuA;hs5R;M2>92Il!xTkh zGg`9t-3Jw8RpS&K2T?TMcCnvUzvtI@;_JW|7+mT!^y>Y+ZIq{xrv>1tBQQ!3;i8Lg zC<%%aqaQnX3+^RGN8hRxJjhNOyzxNbKt)tzfRNty08EOZ-5{qs5Z+!qSUX8QZrLWlJk9d*8e=6it0R4J%Kub;!n zOdx=s#d8$BUbDGYQwBqns(d(U*w$ZT7TQ$iaCiZZ%!{n^ag)!@aRn2?*ZqUA*bpkJ;h#C z8;eVK){DpI$B!(u58vM0kaASJm7pbm77aR0Ry0H(zx?vQ<1(g{TvY3)@r!`>Rk6Dt zV^HcSH(W%4AhzyKwBP`ZE#`-19kX?6u8GRsz52%!EfM%j^<4!vlf>m%F}xlKbgzS*Qi5{Wwi>8W#+;wJL=~5w`51XhqG(d zE#Ah3A-HDo=SrbTw#sjf?#InT8W23qf9P6>Q&VA9c5}Mws5or*p;EFLU9;=$y*S?X zSKJqIo!(`Xgt~wQxEW%~I?i$Ab@gHc&7rYsOvEu@igS%uoS0Ht)ji?)y6(ToLE)cb z2+#)p*r&s~V-zg-MKBzw5u{$}$xpvwOhPa08|!SnTsLnO=gJA*?_Dn+f7iej$VuSV zDuRc=YOZkWrwf`ycx{!X#i(|efZv_-zmS^c@;>V$-V)_UJ${e1jvzIE#}Cz;3V zv8NkL>^?QKrYhQYBvX!E*_9Z9N*u}FO`OzsHsU5P=2zmV2UOkV_tPCX37dRchk-IL zqJpRiMh&*^i(guEEgK)QO=%0On zI;i{)7~HJ(`@35G_l|F_wlBwxtT!AD9IO2eP4JcIyQkQXcE#i-ypzhS`F(SXP>z=2 zHoXeNyPWQ>xLi!VK);*%r3xC`X{mLeZB3YTcF~~-s($q1^KV3>r8zbc`AUIp_4&IA z<7_=izULSFBVYzsXVAAKU}ejo-Bf3n{G&B-4B_2%V|*)yF!A!yv;vxRZAr4+)0^^C zB2?j=|A7}ogfw_Hu_g(but|y-{lIAc2@g5D3&q~6VAmzC$nCf?J2W(Y0fnO%Vw|&4 zMI;XQ%Gt!5Q~RS+@7zmklGQxGR5w-^iozSP=4$$9WutZ#p615^HfyuzBQoR|lL&>( z)%!*cqr%td$if$^^I!jJ)(Sag2)oMn6iWKp1=pusIQ)Z<~#KUy&d&9g_k&K^UhNDCMQ`UH)MG07c0F$ui=<6 z6EW*|aj)Oo!6iDzD#Vp%|8DJO7)ZOft8D5#ONx|%AG&~ 
zhT5$P?jc4e`6&rXB0*nS{36|Kf?NL4U*FQo+&(tr><Ep1%5ujH| zZ=E0DYc>7fld%K?p3XK;to5!IKd%6}s-!&wL*P_?3eJ#7aSUmoqS;}&B7`Abs%&EO zb@{z)>pn^>dxq3@SMVvFPPu=-`bvJ@&``-et^EsPO=uoFx#q0g`6WQ_asWyfb7Oofol8Far8tR|;r z(S*Aal}mitnFJUkMmEIr)%Kf8mCxw z%k#W-?a2^_)@Sl2Ar1uNp;sP)lr$D@BaWMzhRI3)BKSgD)Kk(mJPj4mZYCZ}S=c}p zP~mJp;1A!sE#z6sIW!JUy2QsxX1`jkC%-omlJq>x|2x;MEqz+vOd zQlBDzDx|u9+Nyqy-OKT5zwYV?Td6H*MLwNB1ez0E<2zKpJeZ71s|pW@5@D4GPFdV-`^(uYG;M z$d%BVpwbhg6=LYG>@hX|d<|danG0K}-EjocYL4STnbYlWVzktA5~(Z4;4QS^C)7fD z`=Fh~I>~oAHo@=Lw=!rx=|wP;WkXmY2wJ@eNETGS;UX6Rc}8CW@*2ZB6p(g}f?n~h zS5-c?3Hg?x@M3@#5AA<4=!YhamuTr0Z#|ha` zdaAT5AD)W%BfJ4wq5mgY5v}eXmc=8X*V`1i$8oU2wY%KeR0Xob44~+;7l}8U{gt1u z9D~*xhY{bE(rw&g6~VsGH}gzuUc(ro>)l}AI3x+~N&wh61Z-3{p-3=du9J8t}f3(@qY?K@M0W*}iMzkLSxD5OH7 z<5SDH*33u|O|ySxw6(i7;3WLHyI}il(jnWY@=w&a){X*@jn@*GBliR% zZBYt0e|)vL7>^ha*v4mGea}+>;J$(i=VQ~n#J9^+K*7PKky1;uJySbP{aJB`)P9a| z2w#=oj}WM4Dlif;*K2OLEt~g}*YDNI0jp>eYX}JL_)3&dvPl3vYZ$c;F;#u>Rcvqb z(-I=^KLtoAu5AHFh3m0`)msb!>i0H?l3b^Y$-TJ8m(o#p_eP>G7)h27wvq&?*Fw)CY}jlQ<7(nB(d>c}Yn6 zsK{_#5VOhbL2%dcsj69ML*Ve@mqX)lOBa;C?M0^6?R--O)r><&@(H~fO<$FjC3l{a zac6I$N#~hF%RxR+a=h~CAaNg&Z%P0u_)8AZWyQc6{Fcsy^=^(F^8uR&W@J<(j45huXHo_x84wp$gZuD)tC&XF$XSh|>}LD#z{1HB{DS z1Zu$IvbP%ce$b-9 zP9p&!4?^F zp!>b)ZQ;I~8!JOOPZlTJ%l&B>cC2Yi14F~p=_8Dk2l#tiE$g?FP;^xbW)qBG5ezDIzMU$~g9yV}PK z>Vqql+&YYu3{I0OE`W&Ave&QZ;XJ1KFnRy+q?hG-u?}bo()xF`ZkKL>ORPn;y^q6%S@^J-*?(y zWV0PG)Y-=h>|Xa}1zB&Fi)PoK<4b&HSH}8lJipAZ<@txIBEK5_xM~N&=BT>(?We|& z`eq+NKuofStqBe!bl?(AE0sysE>56)ds?;67k5tyM9yPP&)zB?UMUHpO zKW9V)$R@+tfz|knQ@x7H#~juNGS+a(-*wY|xh=tZ)8z3d^9<&TxmO10dHmCt)E*Cw zR{@s0atWyx-e%BJ?`KfUc5f}Z!+)~qD7>C8Yy^qQ1*sg4ev;^W1c1k9w5+Obfu_kn zLDQwTh%@W<2?$7bQh%=pQCm=%j#@ND1Hba!Q!jHM?f?I0{Qvd%lP~yxW`B^*F;j_- zn$c3FooN~aMOH1-T~{ua>g5{s-DMS+2GfSyn&+^$%>Z;N-aO^(}!?ag&Af$W3 zSSe#b5G^+Z(4<3;Yy-Au9)c{eTJz*zo@O9Va>kph3Q+iAt}j>pQTVnG6StDLDs*N( zsS@~-9)v`IhW76{_C0ya3oX(EARgFO`>d29UTAutGZU0OitO_$8-6X%*h}c;xRi~o zP9`~*+I>3E=>fR6DMgVdnY=P`dyXPUj?M5{neEQh-m?Zn2M}=S>09PL*#_t@|IkJ` z=0f>p`TYQn+@KQv8#tOYg+R{KaaD_nV)cmuDx*gdb9Ep#XvVmMA52LzdJ(J@Pj^tD z3V8Q=W7)V0hCNP)-(F8c=0hyMIKJ`jq4JbC19i4$)6f1x`aQ(a;R&OId$kmYuC+u< zd*#EEWH0un&7riuv?t9i=Nc9_z+zkP1avKDq1u znC|;1ruA$*(GakuIl%P-u*7tFM_CW3dq_#cLrftd`!9s|pFIRGg{}wK-i>@#utUP! 
zU^n}Z&mWi&MuQY?e7#h@MYI8XWbXUxhwCr_MS|L>4?H`TJ*Kzew%xQuq;jPik>VL^ zeK(eRW=1xDm!t3^MfhrQ)R{z_cTqKrl9zfPw?c>nPPLnDdi3Z`P$&9_edcCJ=j^*O zM<+;G=k_4NWcoj1zQ5(ge+bbBiP7bOO8S5B=)&Se{R|Yo@uA?%=TPY~1Re$n9fM)c zWv~etsEM~0&Q<|^J+#IwRY9hb>mT2vyM)+U{ZAL9#Z}x_hR?KknzWwa-Wv5h@Aod4~Xxt zJp8o!EXxLPuWgJ&wIRFT_~Vp~%Nzr~xCuTfJI+5CM~HyS+VfFx3ae-%JQWO14SSXcn$8U8aI!}7x70~0o(1%W$*2Mh~J0C+C=Dv_E`U)U< zXX*XybQJ#TIrVJNU4WX40_NpPYlsKu&D!u?Qd6hyvdC7ZX>8Bi!ACP3d4|vs#-CRM z*CPu_>DT~XZ=+cJ01BpuNBr3C%QW%gQR3rW0OGS# zJ(8GYD4K~4P}=WE<~J@jt??^^_U=l5b1_HrM^P}>v%6X$($(DhAL4WVv=-M%&5DD2(qIV1^Cf7Lf={;@@7eX<91wr`Xnk;1Fyl$r zGsjp;_ORU1zM_74-}(s15t)J~P|{hK=bvzsLp2(&+JPLUR=WO>lbl_GlCEJk{oM&9 z^Y_%;-`mYj@+`<=;$rD4y%)c*^+Sc$)twI}#6bV92Rr01cU_8#hW;?kS_^R~!~;SM zb#@V>gA8POc*vn=n;f#V866euUOE_{Q9^h^;PhdT;qZQ|Rs+Cn?CjXHDxbDa_$upm zb39U`pX9xGHTD#n(i^d!*n>nqUb!JSLCPMVN1)oQc@FyDU&;{R-s-dex-0^}%>shZ zC43%Zf363i@xZJO<(MN5T@)PiS@%of+B)jzz|QZo1{^@xplIMxlIfW5zgZQec7&F+ zErs{rAbN--bvAsGM%O^QQ(PE;O3$eb{NcS7u498a#Yph~_VI-=h#3!D2H^j{M+EGJ zFHJ%UC}}%r#RUNjHv#-vr);zlE!9n1vc{5ji-rX=*1xI+;i19xoa}Ao z{Ahjed!}VLR#knr10~lj83b=c$hu<(-uY?8N*!Q`8z(64?7_-&{(&FhqrZ zP9Rba@LW^^?d8V%3|=(ps!q!s1WceI*|`uJ^3xlhm~!jHYU%{@@~5o)1*dHNP`syI z_?6rXt;!JobP)>dw$(o!H3UxK`{LeYJ3+UV(_NMBa-{I%#uhxBLA^#~`0@H%dSs#E z(ZZxZ!Yc@XV4K40WPIKaFFpKWW!LV%NO^>EII&eBJgdOSMYR{yT=d;_W95tc znu}NoA4;xYQ8eaG&iIjf_TV2(99F>!s8pSlEyeZUuMbLmV=jHOl%4N(jF)k+qfg9G z$)*+=+ha6=rNIK$&Gg8XE(`!rk;P* zhI^+TnwuW83~%#(tLIm)Jhf|oi%I|FWq}S4qI}kXxK-@A>Qg}IPJlX>?NuR;6My(AwJE-1<=6x!%(}3{J32$sETrjd3rQhxD4UPkXRUwTL=L@1o{t5n6b9NOC~IS-J{4ach?X4 zQxvW>abyFT7g&5L((WZW+VcG|pLZ+~(UXU!Yest0jH7zb4Lq{ZvW9HBH{R$)8LQnu ze|IBQa3s4GuRz*#@iM*FeN*&+C{f>)PZNxO%Ue7M9`-75c=r(hK*bx;9bhWm6&$UI zA>gk`Ih{u38!$Ob08O9HMgz*PO1t{Qo!{!L{G9<67idaXceNcEV*x`#<1~zp_E?>R zh8U$an6J1qIc=MmkjS{_!uUI>7sC)O9RS6{=lsa;RvX@o?i=um$xTcuXQ_X7xJ)DN zaM4pWxsPA|UTmw~Q)}hDC|6~(iKs#JiHQdJd8|^H7pqeLpizA>)*Tf>U@5qtfc?Ii zfWn_)$rZ~3N6TlfXj|xW4BUwyW~6vWR#t!*VzHl;3-nCnjtVj zWxzjUPBww{=K09H{p5FjFCaB;1{))3f7Hfgi}v`qMSIA*0_njOU2z%;dC_s|f;y@_ z^!?ihj1(TtQAbEf54y0rfmN?Jf3GdWO+pp#B>YxeOqJQIVA@X;B1kig#dLj!8EED3 z_La``K(hoQ-_OA8o-!EBUtg+fa**_hW)9J?=f@gbXxd*c1W4I(f8G9mSb+2bJCJ2k zQHUB2Kl)}d6hLroc>6OvNu4{-tc}givP%=BE@_4L&~C z%JrP-$N!?8tW~-SAkMSL#LEzP;_8xY^(@35<(FUeCsqAVuS;J$ z>)DjQiZD>1y0b#U7$QM~v~Z1?_n?cFA~0!)G*H}O!4ic(d{6DV+Wp<9L#@};+A}nx zmDBh)V%|mf-S*GKmJk#?p?i9|R@WP3-?vvyV8NkLef5*d;4?%9KK>ov1sk8>d+#|xywMqUli(vu=tsR# z0B?nuWApuNR&u@SXZEqMc+d6GR^1vuUxr{ipn)%^S9n1n(RuRyq^+z_7)slXMgg?m zn@dv@*6Su9oU{m60J!S|y4CtvzV17@T%s5JB$1oiKs5PzNWWref|VbC3$DIDUVXj@ zO6cYJ=jUTBOkUk6gwIhTHlhwLkAe{_2RQz2=nH;R6@={PZ2v!9YdPRrDJ-6BM5P8}4 z@auy+B zpIK*Vqog07oSy|cR=-`mR9Aqg;lzz2#YAA*Rag{oVANnB55N=|62S|sP1Dfvsa^}i zO?mk4zjvrk^858t5JC+Ay5nn5v^xTe+?}e%9IanqG*Qb4?}E;1CM*pp|H&?7huJiuqgXr~hHo($P?{s<+?`T&+0xO{stzKHLj# zV7p@iVj2Ed04k9$_ES?hLVV0HAypgV0{)9X&7vbSUweB7?l6l*G`iYXF#W8)dIV~) z2ggm}8x1qiIp;BT=N+&esA|}pt;c>+XpEug6xcQbE|y%7^MlIj^Hm2 z^G_Ws*9eVMH`;e094;a#tW6^pPRfCvZ3d0+ zi=|)xOm&2cUP_o%ZsXCVS0@7KMcMP3=YUrJGKHX}El=_&n0f2$ej38*Wh62j*&}^O z8%uh`;EX7GeZ@8co{UNaU#N4B>F5dg!&da?@`G@<{&lAlMBN^Uu-Cm0k2V1kD~yDctYBHE{qux+^yk2IwCm6})u( z&2OSUpok2rnuGK@B(lrf6DX+}x6#$F9RInXNf9_bJzQM{ zl|8;2>gqRZFzn=i4?!4$ob~$xP{))kb0O+DHOS&>sK2HLE8UeA_7+1C#BY*)Ef?aU zyCJ-UDTL~tbAcpeF24?gRd{I0V-_^+j44(`zJah4!;p-twzU;7$Hpqa*^<5egchiIE9;!Ir1ImRhCb*L|0|9kf;SbkGKA%!BQ{;3Fhp` zbtv{0c0@a!K;@V0e)pP>tf@D$V3)4|;n0fJEmte)*ig_p5vIqxWlmuBx*K>6`m=Q2IV5oCdQwftxU?lO`S|#%*)|`sw6vR%MkWYKqB7^B@|GFj4*J7 zX|GNkD{%c!0So~Ccx>ax6#;O?MgN8)r)Dj`kcOn>s=xJe{HYg~!qWUKz@80k$ulUd zYqe|6c_HlkBXaRIv-Oc&{!@2u_n#}*f!}6P&*bwhJ9;`OAAe@wm2}_W8!#O6oA<*{ 
zS+O|6gBvGZ`JntV?zCM3I_2a3yvyLy+aK!-O%;9iQFELa0+1I7FUF?~lGwi0cOp;u{z>nfQ2l$4q3<}=8r8eO_+sxCiqP5oRQ^wsfP-W>_*B> z$cdlBN^Wm_H*Isb2STkBq`-!Qj#E=vDrDm#zk3g2Wb1@teyiF=?w&Ol>&jlK?6>R5 z5u6W!2}>~GLJBk>Qy8AKJ|*Y`U_Iw6mtutm0&W0&$-y0O&-v-1q`e<}s=El;ZzDY@95SEqfoO+QsQh%Hvo+jgO{3AGC4C9v z3NZ8nR5r_d9gHYEV4Z{mrS@wrDPo}3=`Ux;SU@IZGUplwlDB}Ikp&4uG^DW{LlAoV zyAbA4!Oj5FLy^$Yn+CxBJN)?CiCjQp0QOG@fV@iz1rktZNEjp-gW<+!3E5UitIuD0 zA^@3ks}^3QC%eV#!;6c;KHoB;5xu=2fK9onY6>Z|B2}O~x%SdRQk)20GJXm0=KdEu z1s$4-vO(N3NZj)uLMUE9oD*V-1FnsNtj3*<;1oS&Rmuro6h86MNnKQXc-trxnRRx1 zYrsB9YRE9oR6sc|e))bn~;#RDRMeR=D!V zPXqhZzx!!4)H7*C(5DcmRBEySe?ZbOVT#LVkpQ2XyJgL^3&ER>NsSW6}sMd&i9Pu}h7 z3htU|2RN}srQ>{=^v5?n* zPnt;$l_~;S+$fY+axjorm4w_Tc(s@J)%~~GC;TQ^#w2mv_JDsOP3e#}ZJ9>`!mookGp+lf0D+uRhpc z-m-xOvCs?bhgHe_lxy4r^BLzJ@mbog0w)GUzzUEDO_$#qfsm}HB&P>9HSN?l$j2hZ z#vk<;-=Ts~$tDr0(PF~*(ceGEx{Itd1&+9wHEUvrPXsn1E)GSRTWv~{y}&4ZqYt?a zC1O9NHed!4l~>u9q|J4kEi8sxuWMxf4}W)HB!Gp(3=7mZdv17w8c{r4?V}-d{lQor zQc$bSYQVP5xL*q5S&o}^4ww$oPTxZan)CfncpzcJN+)fu3PT8Lxk`}eVK(Me;S}+L zJ9sikfun*%G9ZCIxR-Nm`kVs^xgAofAGtTx15Y@8tAj1k8=|O3u=`FSV&G67zPcar z#Ow1WXR0yn&48$LxjD)dCWfuP+FJ<`;H*bWwDt1@TZTGNoiTEV3di#ryN1NQmetD*gJ`RQ6)0 zY_=s5Gk1Go|F;b@>bL?uDSe=x+9jxXmBD!%0HEav97UVJ!>a^sUe!UEsy|KoiL|6B z7E4}Yt6In@ZgWfYPPeZ;H3w1F#Zg;&hP$X`o^inNwcbZWvXbAqCi3M<222L;tjtB0!g&uqGrXH*Hnit{mntUPI7U>l7 z0{^Ys7WQ--#p;Q8A$;JyvvEZhTXB|U7EyziVJv+=VuwuMlEGSnd(t$*pG3=$Xp9$t zJS>1YP9JwDID91z^4r9l1E5F>{biOM!_n`z@vD+_bG#p>-_hjPh44qx}2W|!W{POe&CXrZIl)!829L9w5twxFH+DWe@;T&l{N8(68%~QC-XUcoX%@T7iq{B7J)?{~;gV9-XrJ^uwJ>A7ZfS?%KP* zB486Rsx#IEd=ZH)s~^gpN0KWL5-HLfYNKNHvrix_pZwJ2!8`PL!av$5@idswMGrVv zKHVy;EHABqR$2_)MEcJ|bj#*R$9$1XLL`P0$L~W|rblN$gmv8`{u_BP z_S}6;h(JNF!8PgEk=g(60+ytZ>VjYNekr`995?kW$On(k7xF+&pw-;O1Bs9iYlU^F ztt+Er=BglrX6~Q!b+=K_C(jP!ps8aGE`K$LS0lT?#Px-oso>&FA^Rg>k@q{S z?LoCKK=HrdA*v1Chud_o^gxe;QCvFmAh|AmkbM!Fk_L0H*%(U!uwm_8Wiis=EgrbM z&#*Iq&X|8~0hHQuWEfmubT1XIfe32h`3X0(4ug|%Es3qmXfM2p-S4&=??RJ$)5ESt zZe@u5fX70^40SI%qJ)nB3Or4%7e46UdDw5A5M6MjH}QwPE}HOFgK-9NH-jV5Ym@~x zuV*EFG#FW-Ma*@8Uuot;hGt<7hVb_CH9TmOf1B#;k$@}M&w#h{eKZJt8tJbldLV^y zpxCRZK&88DV^JkYL4gcho1D=79i(Y(Zx(9D6`J%z?J-iMK!iN@{Zn={GRoU4Qf_=7 z?N>oqQzx?{F`X+f_LN4Qg|gy2)7vU=y5f(fbwS~wmhrQhk@EJ1RoAmD9v z1IfjMo)_);TR&?Mj z{n}dzX9@CkzA*|ZT+4)aJv(iQn}5T-PcUt^|Z9apY zS+L2H7_&{XKmsNVo{r|rpYH5`bOB!Djj1;^(0=TvCHDy$IM1dKR+uSovu&RABVa*a zAn#|epl3`R=Tq(Kt#%-Hsr#2s@RQnNOrCNMAT@?)|J&j@fFquj%EUmxH8)0I?LBeq zON`}U3f8t(7&SK*eq7Pj+CVH-SY6$aWGcy-2K)zGbLv3bXUhHY17%% zt5^Pq<+r7__7iWBSz# zY`Bgsoe8?^;SbhVeWARH*Cs$CLDhG%D^@s=mhw`gf_?ym;Ts>jtPtW&f6l%1Q=XxU zM@lnE1NeZv#Q&KOKtiG*XoR94ZqX))>{Q}#(EO|mOm3Dfm({$-gVcCUtNnX`ygP@G zcU!Xq{Jis5JU|3oT8KG<2k#GAVrm#+_KL_#>TYTpgNt7t~~(izezQ=~9J^6&Cvsy+V#3 zn)DiEiib0NrQ}PTb;`BBS!Xr)0P)+S4b0W>KX9Kg;em~Wlr-MntCc^kjG@CU`(D{z zB%8YRN9lup3rV#KlB%JvW)INzMv>zq`T|4-RRJZpbR@?;C^@8gV0I;J82!1)pUVnO zdnj(^`0sgk1ll`5`*Yh*q_GQfTr*=8g`FM8HH(LF$$3UBhui#XL;{zgVtR>#;!~{H z%JiAzZ|%LG{e9o({XYA7Kkx6q=fAx^TWj6-bzk>69LI4U7epV^*F6A6{cU8l7?9lK zT_~^jcasWDYgw z#3z3f3hG-WrGID7YQLl`q706o*N0T}WU>8_O9k*Yy#PrY$X}B;R*PtgKSt+sr@!an zD*V41+d8&x#{%rcvlQd2#`ABAt^@e1Je+jb^A}}#Cv8%AZM%Py-?phv`MvO~;~ngy zykkAhpT_{f$IQNpd*feqE<q*g}L4L}R~E5{!%iX3Em=>>_V9Al~zu__$)|e#THfN)U?tfn1I0k_jQ6{GfKgCnoSp&+ zAE@8H$+ndz?U7_J5t6X!V11N#=QY^Pp)9g*F^s^3Qcdgw1DH$v>!gCDFW@OtsGZLM znuBM^G`+}~3Y8dMPhiLEHV!5h1X5_-r&|P5h(`rSd*bpB#WMU{|#ApojSIod$w$jYONxLsQhWEUFkmZ|S6dTp(vzcA*|hH0$L3Wyqao(W^>K(YPrpTFp+-z%*L-_KKB5_q`&8T=a0NGGG!4G>-rS5V*a{hqhL~*dLbmtN_+x3{Fk7YTkI|;t2W! 
z&u-!BQPF@6HLt@5^x*J%s>3}61}M=_kucL$(XAhR>HXbbKaZ*&QO5?111kiz`WC#_ z{{|ETp1DRcaPi_&62ETtD2STJFHk%s1Qo8A?!EKv5_ysQRozKZ>q(f9Du2S74`|} zWjqkN?hSg6i*o}-h@6M}Ex?zZ^N)&Qq~Cp>yIgBma#2o7+%JDI6Sd}gy!@U>SQuM2 zJ0gfPY!cC~a&_6aeXP$vsd=U9ZrT(my>d;f_W8{~=~tzzb#|EM?DtD7FW$mDQnlu~ zP@OCw^CDV8bR~eb6$6!DX2RQ;Wr8J;aSnV%c>_yo{*1)}WS);l%-6#>VzuV=ktKAU z`q%IyqaeB)Ls`SF2$W{!;#++@oNl~(lN5So-12YZH|=F{Rl8IDfVk745O)xzt8`UI zRq9BrUY^lrMUJCNXp-BaOxGtbO6WBh3PGicW(8aYy{}heEsL9rtq$2zOpxdF-3seA z$~kI-dQYcn>YDvCd!7P%I)0LGViYk4`7)Rju?U9=I`a-8+LtU2s+9>ek#Jzf`TD!6 zqU(qST;n&Xd=qqoUr^{bqg_zy7jTdN4wOP+gD__|KRsQNi@X#E!tHl*4Pd4`hG_l) z2J`_4c8&%u<4tILn|5Ltyug0f=d;Cg3vbKKmn3)(I=m@NI_%nU)VYcXbE9=XS5Y|t zrM}a?c-RhBSD=>`9^#{qs6kNYyz3TiUbtdwKM;gOQ}1!WsC$1IOp_}b30swbt4lF- zr{mvy=W-p;#N!XwpAN=V6Z}~oG{jfSE`b=~zYa`5Bw*@^fOv>q@%fM*i2;%Q&V_T5eIY8 zr}0$9v};FE5kEVSGl|Msl86t226|9)Ht*RGJl^lS#2BRe-%1gvO2pkY0WV7Y6)VTw zkg4)fYujfs0>GEREI4vx{M9%siU*#FMBl73OyA`TZ}coIzI!EE^bmV~-rCF2!j`Ma z()p1P&$g=njjLMENn{>8gI|54JqwB9z~FcJ$mX9bYIo*iDqfH5&W_cV0Ja`4@a8u7 zO>`wagJSpr1CIf;y2!ww5YF%tDk5nO8P-7Ri0=F2(A&0bJ$!D?0pczBtN!21pTK13 zc?wy`y0u@x(dU%3mgoShdaBIZkFOrt%fcz25BfFtn0)L>vcXHIe(nt91Ivgv8(Vu%J~}NT2TM=8snkYHBn9i^FKC($Y_a)HP)TcJZp&m1Y(+RpW+d(7PRsD zB8Svdgl6cQ`)2P^xcdxfxc)u6h?bGRhdurnDr}^`cD_*!Es)4K*Q~^Yo59EzJec6O z(fJ7i$+I9RGC3e`zQ$g8XIk}SZHhza8KXEpY zCoCdmCw^k{tGT4h;?HPlSFD$DTKB^G;fK{q9tUS{xCe4@;_Qv@_U}oEVZ-*EUhCkB z7=@tmMakrAyUn})^zAFGyCw;nw(U>~!TN*4o+!Pst^we>I?1gd@`BWn1gV3Pl7J|x zc6-yi4!=NE=1x&>SY%?ym*(f1q?XXS2Z=D;oBH`@00icb>497{?!mJ&0eVjt^(*L1 z>FE%zaa1?Z^)^^dulb?QI% z%co+;qgtfFsU$<6inifA<%3LQ!X6?#lVcfTB!F1lH2rKl#0Y>BsN%=l0NGMwx(;oK z(QJ&mMF#1M~w|pQrUGTzE6ZR7$#= zM2(rl)ZN_KsVV%G;L7bDFM4p!u&N}%_GF%L`>1~CtWQN)x(h^TkIw^bW7RGk8qW?u zPmQolO8*thM3QhUjr#8fS5f2db8 z%A&zXDNiHDNI^R<_W?5RYAujWH#QEMz39XqjF*`_FJf=CcPU%r(Z>f>i6UC<4oFxy z;Teta-A{t1457>R`Co*EP>iEIgL*ZgU^wIci6A3YT>J5$3kkS#LyzMuBU=F|`lB&F zt*_hD5grYViKmZ_vV+f%5mufIURTxtcwOl$nUO(cb$D8?B-nBHPVb9X6=HZb9D(1c zMiK@b2JZxP&~N%VMSER37`#z{e#&6bRrh`e{7!Udb^I9~It-m=iNy4QOV z)a4fySbosH0nZoLLFu%oZ~^&H>1*dEf-lA_4Ure5T*w#<@!=yux@CB;%HfCv;p~E# z+Jng3(9<~clHuP!)cts_6L&gawMTHcRFAucA=YW$BWySDXnxVlkD+yefr16CB(PuU z&b+j-b;@zn3gK3H_%%jTO@1%>=t+L-i|5In-+Mzk0IJQ@ z#iOdv(4o|Aw=4DQb!i)T*F3tdymBDPVYQ)h2o7e0ayqa|F93_8VU6z{XK zGAppa>Y|EZzO`^-amA!~`Av)rMy6a=VVn>4MnqK%DRwNgQbHYPFBm3zmlWI7!8SyW zwz!YMDi@T2li}Msj{?NaJvuF|#NzuoFmjNq79K#iC#+LA2HMW2+Xb$iKl@YZuo7W4 zlG^)FPg!e2T3i&*udyF7ym60O{T8P_V=xWtOOhyd%=s&QsYO>@(9;sm<3_?yZV>Qj z)>F=pg)AD3pc~KZB1!YW$OQY$2{9zC2XG4Vo^CkeR~coeYiD)$M0ASlY049RuRht6 zg9l%$qEN>5Ob0qb8!Zvl7n!>bjD@}wW#n=u4D{(V`gjD-*Ow^aF5>1!!^nwjYO~iA zF)~`S4;&uAWRJp_03G34jAY?uOalXG+7_t``Y{3VrW<#J^CZCS;hH=#b(9=Ai1p2vBvK~vvUzKO@x;&aG!eZjiJV4TxQFXe!#AVZCkv5tM^=DuQUiB@{Waa4o!WGL z=|77L=n3=$DNjy}%u5|_lPFZ5uGG_mT=+17s47b2@lsz;M3uCH5=dO()jrk}3T5Tx z!?CA(hQRbGI5~xWjXhZtzlZ%3LNiq+JNfw3nSl^dfO4-?{ACW6MJ;fsuZ8AWSZax! 
zNKRB=%Jfe_iJrUvRf_s2L5gPZnwWnTjHH%96`geYX2!gD>G=$AF1&aF{pB=(d{4u4 z9c2;k<(5Ra9}WUU4fWYG&rL@lx#%O5*#`6ynP6^md1$RE5fy>zdu9kX6W=u$N{UVG zVjVt!w$Qs@p??HF$m?X(LW<>~Bpj7Go8AAA{jAAxCK_!C`^Ptxv>7%_e|#-r1c_B_ zpX`p=tA!fiHnZgKQX+FJtEB*6dMNtv!_sd0arCKh2z4RLOzCWD#j~$ByTqRGrzM!~H6~+jU0Ms81Y1TIc!dM{_8Ht`r!6nJq7Mc8$c3e0rM*`v;p}q)@#!HV$n~#lByqB zq*zky!^Tj!VZBrXZu~dB-rRQB+p3vG7<8)SwU&dtctLFxt+;rk37NP!Y7lVWY*({5 zQOk1+!Y9-4iA(U!hT5lLW>>p5UbOMM4%s|14&etziB$rV4r?sryl<-~a86waBV&K&?i+a;ne+8^m>pZ=@=m&?T z4W2i^5T(%7O(91*!d(H@kBKPJ2geVFQ4mjtUbjgG3IP|Kx9`qgS_2n@n3x#-bI~X4 zx;l_P_Gv{bY)IIWsfxHcBLOWz27F8FZ=$zB@0 z+QR7Q=pl!S1T)wB?w<#E{IdgCy2KR(QE!RFv3IPg#qd#2k3l zv1DD(=n0449GwrKfVH3TCd0_|@q}NEM4n)9FAK#zF{75~De}N}feIM4jRAyIB;%t+ zDmR`B@3x=?PcUnxf)kyZS8-GSmS_pVr3Sy`;q z0~IlUmVg%USJCr*EPqJlbH^Mxk@E}VX*C?nr59;aq*xjcUcB3}*x+GuqRjrmK>po` zu}4+;OlTHf%q|`Fy(=tQ!Sgth&u8=GcOzree-#-~ua67UNG;HL1h?@*>J)`hiYOSP zr?4`Gs_ez2>SG*r;rXqAogV=LlrDhZxwcTB%4O@m5KI%d@oi49o-f5c+6=lXhbc$8 z5u3^}C^-t;MXp`D_Hi)lR+<#?XJZ>FcKgdjfM!5ag>&?md(sl0xV_T+s)tlcK8zXM z7e5awIN_uCZ-#Ej?M#B8%x<0d-XRntpCWcuo?T|&_|GvUCb4ZDM5o&1$rLH$=JFnE zhtm@jgWkKnZ_=q^?R%1<}OX#*$Vew@>QS=pAxAe zdzYZR(*`Z1hOW7hlpjAjRh;6Y!7))1adB>GiMF?7^bV{4nx!p*q)XoKM_c%zXkU9N z>->aFqa4A0RxjhV?DqPFIAp;5;8JX(3i9sw_LcJU;LGXTg5g!-P>__6!`U;m%$Z|k z(#TFFbKwQ)QM818uU$jxE({&IVn@glc^Cm&^0*IOv~x#bhkk46!#L3q$YMiQeBV)W z1x_eOz54c4?}49>+~w5ovZZpohsT&|Gwzhb{!uuQPe+g%RDso`oYj`t-9NYq2qT#& z_C1cEw7#^lJO3|%D=@1qyXeK=zzUbxK%As^y@wK2pi85nAo?+EmRunx{v3Wc4iChb zI$({0sJah-NWvku>#iRtR|}pU&G@RLt2+x*Xz|cKZcWo7VX*1+m}NNpx}NXizNq{D ziU#U8uz~)SWJ&=8&DQtS1p-dL7-@9UlfEf z0^>!0y3(v zdelSX!?RhQqKbuHdTEd_cGlpC)B`v;bT6V4M9(?d1BlWIv>KL`$wJJP>m$wEvtce) z;~k1ZJ{|WOsEfaU6sYK(fxwj=1B<=C>!|}NmYst24lAbNK0%}1yRdaN^BVQJ|l^i`QDTU*;)0_lvK43_vr zFZYsMSAr-<#R@#u3zZaq>|L3Npa;+Qp)K}VdtZtJU>_GOdb;pjSZgZ+&oqDuSeeVz zxk}Ch2K(w^n3QdTRbc_}C*xVwJfmkBwpXrD0wG2SqF}UzS^oi;`7s9R^lhKOVJETk zEwqv-nV}2CaTnRWxj%&szvj?pi(7MMNm@ zAQm}6y)5so2b+lNsLk;UFl27Tt_Rf%Euj(geFI4C`L-+zd^U6ubxmd0C>WjaA=B_r zrSF_Far@Y#U8~b8yW=M}84DG72h5T2BKTc}^MpyS%s2H7(1h5+utq-nK9ZmB)tCv+ zFpVV8Ov$+cQKFN*N+&_dd?rs)OGPz$E9*szq|ReLsA`eD9pJ$SVcW&+Iw|&w#Lu_C zU$IltL~Ew4K)qw5FRvtEW{IpWJ8H=7MqGUELKkuzoGG7wcVV*xvyQSG@8!uBBEP^C z4g7ozlpuw<6qp81e$Hh&g5sN(Um#*x2p*hZN6O=dLOC1@2Oi0^$;{dy9awE&y~|>8 z$)ymyRgRT3Z`VCFCB^*!E-`PW9gIJ;EfWDE3qvv!BH`%<-D07ENXwxKsPa11L=X3- zkm9OATy+Gu*Ax=GEv@N&Q1qXSXCyfxZhQ4g-NNyevvm_bl?y!m=gtaG6y-&GpGBuO z|G10lvuu1-L{CT>CfcX%R@?#3*ck&PLvi|s{4yp*TRo3U61UGk61cG7$fS-d1K2H} zfKlJ8rLGkR zTD;+ed?LQVh-OEpb{=M1{x~y}J4Z7!)!Wsj6G)NBx*w|v_~V1Fk#OY(kHNbo>5 zg-*5bzfi=8MENNQ2LUE~O^}PGTEG1jLI|pFhUTFcxZ@rJU-57u<1&TPt8KueWrAef z6$U=?V$XKm5B}7UuV0Y=Rt^oF2AU`1CoevXGxB8)1yP21C{PL`vzOBw1OyOK_HKwN zQ&vm8^1vKI;UGIk4i^{}u=&93jm~jesU=Kjw6*HYt`lhujd_1Wne) zSyHUeDs`nMjwL@N;3d>R*A8F350@xPplXMdgprAnK0T>OL^EHOYn_;G3$p`1$pMgu zKF9dNE~i7CBV{vlf34_nvk?b9?B6mH#B-{7JR3TdBub`G9(gOi%`;n--WAoDsUTTz znb$;lhV))ICLP^n{n=A!?%_CN`XP9t7P(qMXL<2dg5hr%NwFrgl?V9n<;v5hXmqNJ z4vT~WyrhOR`$(~G>nWiBqZ?U*)UD5L0V52yFlG%E|9*bUrUvAcJOPeWDdj*&&#h2W z++E3qGT@?oyA@Ojd!EVUs^SbAP<@NSzpLj9|E_vQwwcs{z>7G0*S(AN@MS!^_3cHZ zzGXPE2ll2|$gG?`vRJg^$0QkjtHXLWFWHWlKQp)2mF9yAL5X<_swQ7^_XGyQXM@wD z=g=0V?6=tI2^)E9@2hrOS&NHsXaDdA;Pv}Fs}BLUB=*58ctIU~YKtHQ8$4ex0VKt; z9$SLkmG}nD?>D4knSme`3z;tc5<3(s-;_HJ^Z;COBz~_x1;hiH8&5~o;d6F^A!Mxv zofDGMyFXARiB2^o`_P;Uoz>!WWAyI6h5tp0<@G9^I;(fAETE(#P-g3!SggqCGd+ux zt>T_>Nzr=+3k4RJmNB2-Rh7i5h}c-Zzbe9m|Mp`E?zZ5R3H&~k(6CeiXM3+gvheEE zze9>`dnyfKtBw)=q`FoDK%s-%3ygpyB*}#TeqGk;$!a;TXNMGSzTNiQmqoR<9BJs) zmyoo#s_jOcKdim4c~2IVj&4>!+rA{drpL9Xi+^cSpXr?7n-caI_|od(f4?aNJVxJL 
zz|EC>L~df+=kaUU63fvAdnh1Lc>O1d0@l#`;O{kD@Nzp9rtWJvlQ6Q__wo|ulkX0* zp(n}6*i(geB@u-9=OT9&eKjz7#WvUcL%uFJe|omaQ_BBA5#) z!K<#_hMl#aBb1!zNB4G!6rHNFFAe^&AM^Othr6rh@#n*W!hJ;19y$U|kD@9HVqx+p zk)!y*q8p`1>DOQtL3|Huvp32q`N+Si@R8Lx{IA!jip$Y8FMy3$z+0yVc!MCDV;T}W zM!Eb>Lm4S&`-qpE=&0~DefR4%H0TIPtwiqxRGy z4-bHWG{4JT}6NxBD>A}8VU>WV)Wz3Y)BdII8mohbjb(0)z;3g z6ZC{UQc}jx8({uJ4MIN;KYyMw8Y#?PKw)-wL}>(?Li4V%cmK=gCnpwKq}Rd=RgS;h z0~wlL$qh2}mB0(g2lD9;N71Pf);q)dcX#?1RT6wl^HEt@%jfmu<7OK`B-wVw^1}dq zP6vQsGXR1nfHYL&N>AcZ1f~ydr&5L+xy1Pb5Loi*t7jB(IVz*auTT(QkUs_Ubgic~ zlG(^x>^Ls(LXRv;i!mRvT$8X|jhpvO;Ddf^Gpx@}1x*E#>{y6oAG}~4*5{5;x@@g2 zo&b5aF^sUs3Y!`z62N=Q5Gk6CVQ^Xf@oDYJTU$tmqBEsZd4d5yam`Bw0#qepx-@c7`TkDonXkKpsMnQZaffd%lp-PlPiI6GYSHAYR zhtf(YTl4%K)BcJV5HSp|A~8tl#({67*vmn$&D3!@#soU}aP;;%7QmCIqX80%NKRa- z8$c21{DT#RB6nyumOyYT0<-wF-{*(bX6s3MS}MliS36-U>In3Ug8#q#IwcjA9cV#s zU#%Y+(t{IQgJb!1mBy-^1k3IF;=w(!3#588<8FO%%)%zh*4EblPnW+zhk}TjURhbO z1zsB|;jY1KccDe2NnlY>5b3qi()1yiI-GXNLKBW+Xb0`*`qD}`zvk=ZWo-Wf5L(YJ zQ9g|6+M$zIkMzH2Kso~QaxEN-(t&V5ybiWiifQ2Nxk5ieee(O!L$JL`@hg}VyD3q1 z9m*##v$I1!NLIt^YS1!;bX2QK&`f2~5YR6UpT2Q^HehQxU`G&I!|-N^^Grl>6h?b_-cEXjp9Io+bi} z!WnsQ5H3t1+zP2UL~r|kl7g4$;>cA4Kt=g8=OAaYo3|MAAaBh!yfwcL23MWO8P22F zIs4;9%mrkB67QJYue5Fpbp@5Q8eoB0E>lgw8Hpk%6^)gc*H^8Oo87sVljt1Ec5}Jm z+e0|5Is_&9{o6G0v>=z-X5})UhQO(t*fwpXncaB=m@!f@Y|n>7*8VyC7)h;`5#mTC zW9~Y91a0y4lP3Jh67nZb&hSqP@5ye#@&GvyO@k_i)1q8^#9rq90=hvNqM7Zd{`A`+ z4ZpU2bR$NI$hgpB_W_lzzT=JVV-?pZe_lq10BqDreBTm5gZnXeG>T+Pf$~TFc8~2` zaSS^nM>O@8WNjH-m3kAI(!&>qf`+k4cHO7zkH{BM&6^+bf43HQqI1bf@oF|!6REk7 z%F5>{6jcELFTq8Re0Tsi;+J41p^P(>*{!*D|KyP=#N2}SeNdd{kzj%JP5_9)KzIx{ zYyf`gMSJpLkTw>Eehec?XoyQ7jfjKNTTu1_Df@X2*O#-vR>1(uQE)onehmY?%Lbav zZ%@8lwddW!JAZz5?Xlyr9yhx%XyE)vvAymL|2bL!mxm9x7KbMKk%h%DFaDcq=jg(2 zZTfDE6zk0NV8;j0z{;>6s+b0yy+k0uL>PJ1=@f>4fk;Duav8({I>ch2fCOYG!GoU* zkvQrVnMicFJ?v5xI?l`X<8enJ?d@&huWYt_d})b%rD>!xfNRvV;VwM z*rZf&UJ6vQYw)-)N!0+*W_8w`11T#$uZe}h{tx4B+D}K=;4IIgM?RQP0nXla=rs^z z3f>iux$%M?DACd*WT;7)kkr?k03kmw08#i?zCyIhGO2WMa998VH8(%M3Y@FJXV+!qc6YZl^6&6WL-iRj z7v24j{wkvphtjHVm7}JEyCjne;($DSE7#HK4pNA5@E{-jGGQUOOx7EL$*i81#~J#d zEo6FkQ`E2vLKgjvpX_0DSe>TZ#+wz#XJ9%ZgljF$i~ec>MajwK`mAHRs*zE7`y}} z_ylEeHAK?iTpO^FO}pvJb??8eWd(1+xix8`mj0`8NIjFSl_rAdLH&MAcnvgoyGgAM z=?+U3p7aE9)7~kW4ype{q8Zt3C|D-w(Wg)<& zX;^^&5t3|&ip@>IA;y(;&V=Gm5Vntm{=-QKF*!FxM0N|!TH1fm)|6n;lYooaK@z?_<-4{J z1(DklUa;(yNjp0`6)=FB0nMCVn34P9nEph=`I*Vy6lpu~wkflEj#+I{6uRv%{RVc; zTqw{6BcXso6v^kaL;9riZ@Bf#M@3ByBUng6gwHEjtGAffG=bpfqY}oX3^lOPy#22bhS4p{u)A1~##{k1; zdE4Y)4@v||3zP91E!2vlXX)#Z4&UM;O{2arD>+cyTG} zf3lwIsMHNj(({v0v}IW|`Uue9^z^LSzkk1STPV}8o(aC8AL;!?hxlHwLuDSvJ4SIn z0~9D-poP~T1~*7mh$wb|ikAUIr=q2m0A;UAoPa(z#MV(Z1Yi+kV`65`XE}uJ3~ZvW zm5ZX<{~BAv2NO@hUG$q!v6uqd=R0}=wgsiHSsYS;9EJh}l3*>xKe>KeR0E-Y`seN& z`m7pQjCg?xP80+4qvz9NM#uLYS%wLNxVuy=svv!_gKLdWP8Jxk@GSyRKmT#8qU5gV z!ecM((~fT&uyc;#D3yN@?$G8Bp@DsSYVvdM5hkkb{fz{GDD|%_&odIdbFy^Rc3XPO zqq~=R=a&dNwNd1Z8U@ik|9T}9a>EK(UT|rjUvW#?gNE(Mhy8Sa2BzVLl%@0(7D_}*T20@d8 zXia}g-hAEX)~zNX^E$oR7mp;a$Gb02ngd{()rGp0{tND<(A_W_(AH!58~c&=!UrHJ{++1+ib0;JrCdvFaubef?s-yni=f@r0du|MYh?ObtB%GKh1ya6Qs8eK5 zWF7B6h_*O?zvhtd%#5A;=+5ZxB13wK@b{F}-2mQo#C@)0ha^vgs zbaV2L9ak7fJMUwcF#`SbQv+dztn*OAJ5=!(+BMA#Ba|z!1Tt5@lCSflflO6FZyzB*^%N4_dfeN1awsQ)d zx|6)SIU0ULR{wN!FnfbVtiTTz&M4GzMCP~Z3$6kV9z%k~H7^cQfc&9)DUny&x|KX` zhej#iphzGm#WT49q(cr8LxqJxWSZ&H1{uIknL7MnVIb6w9Jin%#eNlF+4;S2%m0lR zj1*g}sAS9FT6T2#÷Xe@pd?eRs6i9s@OkYfLf9e_8 zUhYX06EL9|JOjS>Ma(A3nwsYvAEWweJ0vC0;cymFMmC)Y2%r?f(Pl&w!syeLyLMNu zsKP|cEZFy4+ISwSejHPxOlj$L`tBRU+HhNM&PQ`$4e#D~)Ze{#_nO_nOs|bYO;ucU zBIn`qGhJ$n7OqYOtp|?)8T@AVrTOPiv^SzO5wlWy8DDed)j?wzm>P!@K@kzxt{@8z 
ze-w~CU=Yy&dLjKK0lSpnH-XS6aI?{10ivp-6K%En3d&q6D|Ulfg_A~*O84VizK>;u z0~P_dfDlXmPl%B{mep$8)!Z+&f0f2P*v_4wv>R~h4ZmD)`m7voPA{){o}SQlEO9S{ zIjHBHk>VC0`F~qc?P8O27md^rkODNM}(-z1fK1{wCgZsN}_AXF~o#N5jMZq z=xKDSTa=T-p6tHqQyB-v;u!mBbRD8bIZPcK@(0ddJxT@mri4G>n{M2$+kx^MSJr&I zH*(q@&IcDk#%T|@(rq8GMrqpi8Gz6n)nEFMB9)%Y1ef|JyGa<*_KO zoH~Pdq*wzD_XN5>s0ZL!N-ws0l7>|KtU7D}f;H^^3P$`5go}?ukjgWyE>3=v6cu$8 zIY+@HT^I1FFb&%h2S$RFf$Myz(}~NSS2_hzFK$*>PW?=OoFcB*#Wwb(tG{7rf9*tK zSM3O{(z)6|!n70rkmQcrYOTcl%BKGo)$F&ak*>ihksH@OYh6o@6n;9GdgI!~Ah>u6 z$SM1K?#%D*`BS^xUhO~H^LT0Xh+o@V%fsJCL2_<6FgQ3nmlhBYtY#P3#N`qu*S~(4 zh5Iw4*pBmn!i&?_7zo5?$t^i{-2r?>k&E72hb8C$ls9 zgpmdLT@FZm7E3_|`;wF6%=a9}u{crFA8e@yzpTqY3XNmAb!>QUEn%o3T+-e~@L*)p z(F@{(cEM41`WD|5*ol4~$PLTP$*Px_eY?u>tIA~ZK(l(4H;TBVvT5SO8xj3I$j>l< z1D&c11o>luS?{yEYBGn&-SoiyPnm%mw_;;ro@Ok&y15O1Hx^3lhivw?IiMMW%LR?J zw)qMu7~;X(Hg8x5S!1}7(?-WQG>wHVdrg+tlW#EXMD3)Q#q4D1?YLcsR4Fac;(m&(j4ZitdeAXkwo zREQDUV|pk1W;AgVFo4M|Z@Y?o?Y^nl(@KiBhc(BUE1fGqk@|Z78P`!Tc(#qLgQt*L1iZ|O@he)CfPm~mOhX>8~1Ddr2; zG<7okqN7AE2wZ7Lm^7hGqTZt%ZXW)&CV8<~1_INaPf9WHjJ^QI)%-EF8j#uAOl@ee z;azwf^iK=AwToM89I+~DRUMBQ$~EzS5{-TVnBOQ;rzUmRp!&ty z+TY;rG)G`HxhwL3jX*;4KMo?>z?oUFOu!}9F;xN1@E3N!X$$liU%JMt-(MB@&!53= z*$rguU{Pr!wBvvAZG4a+&QUly-244i|1>s~%5oySq8IRr&UY7~jdQ=V!ws~O2G&VDn`x*>k#KIW!40$k6n1FL>(whL0k{_~d zgFw8GOppA5P5z;nL$nm~nIJZe8hY*Z5T%t?6$Nj+qRz!WU;)q8Z92e)GOX3N*~MX@ z4eSE#O+HAeCZKW51O=?lGiycGGw!)Fb8~{FhQML1F7N3PcR_CbNZczTf*QFOy0DsVK*nsJ^<hdE&rbrDFaxF|F<@z=hHeD=9aFGXDnU*D`Y|O6XKq9TwPj&`dlSm)7(nt>LF9!A z05NrZMMVX!GkVK+*)!^9-~FCV{OVL@PRMgL-GPb9s9z@LvZrjG_oKjiA3b`C&a~P z^9zcj5qRf`+?`O=B!ZID$`En8D7h#NyUa0h``%bEsO?7J81Sn+0jGarWVW<4h6V~B z2z?%DN;{c1&h0Sqv>4RuJJk?PEMGxqG<*p3P#XnONC;t!0O?^y^bZemV#CPLeVEAL z6Yw?-M)jGV4Qm5cz)l&+!O!5T0hozfE_R*m+7-Tm3y3ZBXDi^F(FGMp$qJ!yd-DsT za?As+V?|1VFbka!z&`qAnR9In=R0%`Jv-{H)wnx7-ZvhXI^RifkMjce{o`tHP zBt3svu5s+;>wpCVwX4!k`5JPcgruQDz`cgle$SolUjsEaZSsuEvkQ&Om9nVL8-w0w zh*Q?CE0OaQ=<82x0;M$<(Hm$0pb9M^-A2Bcq3|?}f;e|}bVNe8oO>$&Q*30nEgWiv zbX^h}oS(W0C5J+Mz_2X3Gls7lOeiw;_y+oZnFZ04=&-vpFcq~17H>e?s-t0F1trvH zy^QOye*%2Ti2${`eK}8x4dB29`-a;GLDwwsIa1 zX^S=4f~9R{6S)YS6+$)(XIJopWM12n0jcVJTEI+&g2*r%^T3@|M@(EVIWeOCb_jvfLXWhP zc)&{9f+W*HJ?yU3ap42>;&O)!)2X9dKFbj-v>r)n2wUA|3Vv4838c|1K&OURo=ByN z7B{YqaoqrxA+I&w2kYD-&<8?z{p6NrIMK}vfgVFU?ySKu>?2gR1*ps}-WGt=U~+~Z z7x8rGyuu0~=T8hKlq~v@1{-x}?n-PLcKW{Raz-rh-z~p zUn;pDmJho4;K0!c+3gwx=Fo(xBC_wO6`)KxLF=Gf%{cOYuO&Ab8QE^dzB*#b8p!hM z?bFB532{b#{*LR}$C{iZBE7HKqkh%lt=bcRACo#$-BfefgGsRusgaqX{2649W@qwu zCro2=LCE%Xy_btOoG9yO-ijFvpr^n0iCdw^Y9G~&AmuDLgZWr)5X%twcbxpy)((Qj zQg)h%8zoWV^txxz;lzDk0U4xRB1siK-eUiA{p-98-a^|QYRe0ETMX~KAztf=#N@UB z9d-`E$9mFaEfw!03T8+NJP$ta%YIG6an9_seJr^H)fr;-7yx^%J$l5(#HgxBKXvnrNqeKXq8sgcV+IUgl|)6tz-vvKH{ z#K-P?bVf7GWZ031M4BcRSnQmHt0fn4Ip)^CfPwlva=K!d`HIon&xgP8E|KZBpH)bx z+{qp|lP}$hp}jEC36Vm79h@|9dWBJl*(4N_b}baaLl!@w6SAK?sk z^r_HnHwHpmZfRv5D*>(AvP`cYcvX4kTFc2k=oHv$Bk8dhD?)5U_Y=}fC}y6MbV!2( z?d)T>UTyCSao(7Q(|VrX0I8~>Z5kevY?^35eQLWHZm8Ekcga!)yX)2%{?y01A=Efo8uP zB$XTr>8aUtx9%zt1S#gaI4bt|BhuHG&-#47B(6lqboy%l5eb0#MW4i;YQBJG2S<%i zzKj*H-72IV(G!^9^Wa0}v51%*W8~A+lVe8o??NUO%0Mv{bGD+?p5gapJP);q zhSdW%QmB}}B2e}Y++fh;RLw{S*&d_d zys{3)dwy?G*j>`@GdRHrnJ_U$c{_kgV5|MSDXA$k6-H)oq){8j;7D;4gYC~Yg>Gq{ zUDaEaF|1ezYThilAP2_Z&VY$-kfo%gH$Vu8wgTU0W13_~3lQu}TB%FxOxaVxvdOOT0Xj7r) z!y|^5xp|K(8LRIWOxFOu_DoM6vb`WO9`divEfcbpL+VVyCRvr@j+x^`y_cc37)}kb zIxu;Fco4-viN>BZ(8*tsE&a)l#fR%IO?DS2=e|CY#0;}yGpSXbW~K0hNaf1JbZ;et zbO%;cktUBKt^GC=@g~jz#dV%1)+U)Rn?C`>K}DNk#~L!Ob%UTr^^9~-#NO{pc&dzp z;1HOHKiozCo;(F;wBi+iW#n=pv=4-l!Ge+$00f}rv9vNX zHlv4R;r*CCB}o4gTJ(5IDe!m>bw|(Z!h|O>Lm*(u)LsS8ReVBjuD<^eet$9-Cj}9c 
zJZ+3FB{|eZM;aLwm7Xf52Nikr^N}4O-vxki`42s5!!!xj0@0EHCUy(hKRte6=I2bj z^#Sk^93se<}NEpOjRGx_lVHI)=Y|uY;LGOU8F_VTUH*ysm z5LNu>q%uM2PbYVh1GsGFQ7H3M!;c+=DPAZCY78x&%dtw>-;r4xmO-XsI%Chenh%hA z%ghdjtUi9+qXK(9gK-{sA$=IEtkszuhJ>KmZCwtR*Zlpjcinzd+!`cmVH%>9gy=xM5^Y&9`|8!z4^yn=jLD)e3s9-KuqsLn!u^tZ>|qTNaJhdl8NLk zgRU$ol1%T(A*{xJ^HYCK9R~juJKvol#ZJx}3%vpLM#5iYNESjV>mHf<`OHG;)x_1h56_kj zgvLd7Mz@O0DqR#7Sc1*8lhKoF4H(v+-2$IwZz^^)pIK{(VTkV}QL$9r%B+26=~$Oh zLDEqB#k1>+O?*dRcUAKC@Uvw}O(U)dUVni(-e2@vZAani?vgVsL8yLw!Uw=a0T7J zF-A7-8we&;mL|M0GRqb2)`;4_?|vF=RPD|!Wbv-gx55}HtZNahYf);OHE5aw-tI!; z7IQ3k=}TDU?2<9wzHpPP4riMtzF(*vIJ?7D`yH?KmG!u}BFqJH^*{x4;Ie!uJ*ehc zI72^PK-6VlPHO$B7e@4dMxYojOkUT9cp2RFMiTk(Tb5i(3;4_#wcNUfpas0g82PRHFsEGK8ovkG`0AR-Y!*-x@60hWrp7{$tp8;GjVuA8*sQ25S4aM0jwRg= z5frVv!MJEdZ+a0jc$l#RLe%E=F~C#?MV@7$cLRm)AHb9+^ydc5Y7l4@85YT4WS{5F z2nvy@F9M_hK-q@kRaWFax7h$;c9 z59kg$^|4&M{KwEEvyjn|`}gl70&sHowry~tw1cywkSSHZ$2V?QI1HVRt}W`)5kpEEg6X>+0zmIHLb({rBn2v!%zNB6~I2_AJN>~$I6M|gyOxYFCcgv zBDpmKSKKoHEzcxLr{oG?l;!H6u+bZ~x@ z6Uh7C%;kdPz}$}0E@AlRV9Ng_Z?mg883ivJX5^YRWT;+XJ_AU^3F=h)kAPW?uuz-HeQ1sqGJ8v*s?JDg0641ub{ zc_eIYjzIMB!m0BKFrrZVDI+X)67*X!pnVcg3vl;!nl>9?0un3BvBB6Lt6NGV(A-YsGwjoruBUp6jxWa)R zUC^Bt;|4e9Z)ww80wi4$O0FiBUCxQ2)xMnAPEkYJp;zl_N45fIP#ha<+AMJ| zI$@~^BLM@Wex)}*DDApm^nq1CVYbi(!}EITcDF!^LpIxgXfVMN4t4W}Hc($Gdr$<% z8a2!jBkq|;&r7;5tR{4H4)1W|x721xZBXBHjET_=h}>sxNY+J_jq7a8MU}-?Crwcb zQ^64FO1jfQgcSC zBL|~8^at~!qNIU-Y8_d}#n1=Id_2?)l!moqSZI9;xzb9<>y=0HCWTMuP z<31cb3c;qfd+aO8Y!h?+Y;5wyrb`!+-Q11F(Cqk6APE@~uuY^y9|=9ded69WoCax_ zOwwLxR;$xE(Jo*IlhP3j@2a(CFFeP2gK*%)<<@#vtveE~28*&h!cyCAd<>Ra&=z2oFHaGkrTy%pU1A>Iz)wX-MjnAb>BBJ{@0|p1MHU{YJw=zg17>fX4-ZRcs>TU8MBZcSES(+&@gCFF7ef?5 zKWzHn)VkTPGBe-Db{FmF7Z*K~*DRTJcJPPxCNUX9@CIjzf(CKyR=9rKlT&*Jj6PHU zAh}Zk<5%kFCUB*fL(ZD^2#6%D1y$(^Ow@OXa>+(+MVN{%K4jtt_4ezjh_#sYhUrCV zgVOGfj!Wlol zFDV>1>8SwFFo=;NCc-Jl;$nMKb+Z(pySzTDuYpVgyF2fFZH7DFas9UP8TE*^*!xiT zO~63){g>yz6o#2WFUh2vbkGp83V2Q>=+%6+*VJO0 zK&30nw;lZz&V6Ef`i?{i?tKyPolUAYmeGN(qex-*7z{C&vN|4`8L7*y>@ zZZuw}4odET6Zf2mS>{>_iCX|PQ+$dG`fHX0*ze8ZhVqxW4ZDCDE^imIh&k`%{2ICF zD3sPw4FL9O4qJDxSH1h1JgDlYfVxU~viAc(RUZ6Z`~eWqKzfLxSM!AR#Peom`|n1+ z?HIepTyo)A;v*x5djNT?xE27*T81Kl)B*16mqUVoK3Rwuj&Cr64`0I5NEejn;%xd` zkg*813;pWE2)sm}^m~lPC#24`Fy<2wt^g4ZHr9gQca1CMhOs>^)-H_l-N&gxgEtGC z{8u@o*_gS~u0QLxt~>Dj-d!uHpqzap<2FjiOR$=^U*b&dT4H+Od z`cKVKn^-K}eHjA-?7rg~ke+7@DUq@#2LYt!%N~AOsMt!TyZ%k)-HdBc4DTlZhfV^B zNQFmJ*s2QRZrPU5SD^huyz9fZ$ck|u9zFW-bJ`U!1&JoM*?P!kaZ8%y-8JInp3-c! zfJj<@iwZ8oF}?*l#727d6+rXxK{)L};s*UM<2nIn8TV$hB*92XMKZQjW9bFDOOh~w z42h4rHYnyBjpKcA%-_bHpasiTIUWH`tFkyFy*&<;uDpb#! z%sLPCc=&d0{Lib!n?60E;l?kGitfZ$`)aJ&{qepK^?7Ewo`rSd-(hm%zbs*Ni?7?M zp5uh>@7w#+Da>Y#KOi4MZx{6XFFXsK*eo5l>u$TlnmW|3Ut1tmA2`y4y?e54|7`Tj zgYwM>>RqWBuY-~dBh}5uOj6%+v<>ogdDyhD*G|-X_NSCTYIqpC+tFJ~`RUGBGG8dE zoOIi&>7>JBLdK=I{Ou(mMC$aT$i6d;kB^h&=SX3l=_FytmduHhc=Ogt(p&v|{*%>W zw%F=Hly6Qg5}$0H7QT2k_|($&m4iopI#A}9)T~piOn7{fyt~}|#bd)c{myRNdycco z07JHph&O+CABFFJx~lkFzeD8l({-fF^HIg%)(lVrt~Qw^U3iMs$m1OiT*~C$Y66%6 zrAX9c@IP2D#2z6F<%f7~2~Ck<2y~=T`>n|)a-27BE3=5eM?mPQ6%u~8a#Gr`{p<03 zGz|~rxdiynVPIs0pROzE*WT-PnkD3Ro308~BxU&nbR6}mr)wM|_R{1`H6q(D(0LrH zE-{VwR82^aPu_o+Zpk**D#u8FZERZ=bhx!lG~|^&C9M%6MVj-IgiANFviy?(j$QO!D} zK`CM5>i4KRG5N)d)o&JFTb_lYCYrA=bjL6|Zdl(pME$RvChNMp+*9(*=SPQIhb2kB z9r@;XhbI&13YF#M-^5+{1@fDUPv_X-pQN3eH-rNl#&lz|ay~&n_Hu`fND4ZsY4r=9 z_Lx|m^_aDtcr(0ZB{*V7&5@Vs)gOUE!U+IrCX5kyT>th`&7z-rI&p=*%v}(Ro#pNKI}2rU~qRf@^_?%bjus z#!?kE2`n^_m&sqBYN!p_0xhcKE8y<5*3{+<37XRz?TkP9>7jO7+rC$8nX-%~3NRk{ zF7-UVlcgfE6mp7?a$? 
z+gSGJrA)i3^?75J$D4+P%im-esc&3eq4(CHiDl|fGPnQ?JXZu8xsLkCC}@d%dn>w% z%}{6y^pjxd!9NH>L6RbAAX;(_qo4F$FAHOQyG;mT$qNfQ+Z6Qy{$L;{SSQ`lm;ylRurM05yCLet4;0X~ouB;rl`e`Ub=TVJFhcdr zteqoCqGbgDkq$6`gw{`<@}O28FtjZhK@9(LO=uTCbr~P4(G^d>CE1>&2L?0v@Q{WE zX3v5iN2nI_w5mHoG~f)@Ur%P{X3R20$t_Xbg zn}}C2$@$pSN1Nj#k4H>=(yyCen?x$dVWmx+DTn;BVgU5F&=)lEU;N>BojvM%^k#Sa zcQ3Y}wK3^fE!AyQH_HUH*OT(j^M`|T+%bucfu^;N& z0jlr!((2ufTH+@UFAMKKGUl8ql>YlWj{Z#uynsPZd0WK8yCK)&uW`lu#?dV$z_Hsp z5SFDrnITrj$!d_Xjmq`8ih6Tzq-$)>OR{3i$89CjyT>&p-rfG=Q}brqs4j1EU&`R4 z&q8Rw+=pCa8~1!XbQO~PgT5bK=WS(gtw{F48}JZVBMz-Ke(jJoxq{u_0`fWNcEubY zqWbsIk9lozi@bj%)i*&d7|t0P-7+b48&dD*L661qT62UC;&@WHxpPHB;xP9hvir_j zPh+3syj=)q{XHYM{kz-;2u69#gXUvmVV3X1G70>br+`6RT;Fuv24y}6$asTY>&7zq zX`IG4aBKS~jD}3T(@jPNWgemrh=N#?qD3AW3l!eG8^619c`|Y$IOZPv`HUo@{i8-S zY2BPx+e@3mVX-ne7oF1YesR#DX^pLSy7F-|Yc+Y1RViVuYg8D^uc2;G5>G>a+AyVA z9{6dfzsMqw^6%ZLOuoH8n@Q)P5tEmDi^pLG9sMuW)CjVl@4Ij&UNEaEbKfYj@<|+} z>8_s`d0~+!;eK<~yqZg^z;4^6_#|7m8=>Z+_WKKu7sq;Rv<@dXug)z7@vccH)jU6k z)2iR6>3^IarA)5Z^1~iqHFULLBHxpjLE@FBWt5%S%I(3&i4#4nP7qr83JN6NJHC$hOQ!#)m^XZBM-gV1jeuV zT0nXBRSjMGuS>kFLM(kGkv8x#rQQM$bt58jmI04@8#bK z?+j7?oO^yDAAi47_O>|tWBEZB$NI~NPOI4*sOGmFPgv=3Dz(xPla1#$`B40BkQ`X$^zrukx- zv3|aX?}P16OrpZ^s!!D^{bdq-A&Be6cTqo2vt*Wg_dB)HNoNe+2Ql3tj3j*dV8R8a z_fbpKq3#KcI0*056CRe*W6*^mKOe*lDDWd-mklTAOYE0bq1OnQYvt3+y89k_O>qk3 zr|BTPv@oY&6TKnR92D`FJ@=^O3$qQ;97vP}X6P^O8CrCFsks(ByOS^IyZ`)JkV*=u zRrgd}SC6lmzfr42px|;XZLABB3%_qgrUq$gYR1X+1AgPf!GcFiz~w3#V4B_tT7~?+ z@j3AB*Hw-WvOR!$`aX|E3)Hp6K|OQma|e)H7z(*<>@f#wmMOqgL#=nl1_|D^gx^EiQVcLYc|ao+nlhKTw*wNiWB|nNzAj}0WKmK^ z1|PsV>Jme91XgIxq5Is|)lnjpe3LdjU`~cj4sWovDjlnX4FDlu1CXX#I1vp_J-#cM z-hocsa8n{bD_{G{#?7mzrsa8nU5zJ zaRqrgm9L+D;HikwVj6?;?Y0e#%Eyz90JgyCO>BDH8ePek_%LR~s89->r*Ab)3wiOUv4tZGz zNEV{09yp^=Pol*koU4&5lIR#3KEooKLZLw-44Ev1gI)I%oUwwJ-sCrVclT`pq(3x z=ZvY=rPz)+djP0hl1^c6ez-oO^{riME{uvgtlX#~Yu8;)>UbUSeaby>Dm$XSsET75 zUz35<0pohO8gY09&{)Ij%=_yg2bQ=^a~H_u4G)|kpN6L9wYBUgfKt}&6`9)xu{-e{ zbd%>Nd&3#B+GokhNw3rHb_Dh47%*ewHAc3wL2qVT7+y}wXoRu?i~A6@d}y>X3R zJH;lMYoLJO2viDYg5y5tsyilx_$xMM?#3z+8b2qZOK|@V@&hnco*k9?!7O(b-1Xv^ z8_8p-gKrbQe@|J)W}ilk92KQ1)fm>2&L|U8B;A-(4@%<(LGgC1`)b!QKVb2KQqg{g zJMk%Ha zXZY8_?nk^1O5<p#%=URM0DaZ}zpYypupYH$9lEVT;Z$N!nHuK z=3{}|{0;b&?1B5d^12mS{0*+82NSw^cA{DM;jxZZOmaX8ecJe$m z*NRUuygdW-vp0m3u2I2d#UD$rlHM2M1PK0So&8RHN{nWM&*t{a;jr{!bI9v8lE}gT zf-3EhJ9&Z2{C8;HCPnd4Ff2DB5Z{tfudg1L4}v@b#hzYJbT7{v5~yFKyiQp3ss94p zesW)W9{HVLvNlA*r*``c5X1oEOzx^hR?wx-13tV9Dl9;MTVeJAEh+u0(3QORchd9W zfKNYha3t9}OCmcuE2ok{#a!mipW`#TUAP71+P^GlULO`6k~knUXwSA;+qy51*d3XBB0`$ije}lL6ys=zoI9b=>%gbb7vcTqbf$zhha!jDpGNON;s_ZA z%aTBj>@s$h3vvrCHJ{5U@H_WT4xSudUJJ#uP_s}}N1GiaQ3OG;LR$*iZNsKu&m$AS zp@8AJ;Lnm*hCP>rNNI3}9b#U<=jT{jj()A=I0yehdZEs7RdIQs6pOY7v&~l*6(NpD zc=|jct1^zj`3MOHqPydRR=e~=SqRW}-E_%pq`jYl&8`{hM{bJiTa4dcb0>6Dfe$FsxuXxMkWilEjgM#jD` zvxzNDKC;_UCbaNC5J%Vg9FxqdlQhL$VmqU;=T_-wt=LEd=GKd%YmMsY#5#~zEA?Z1 zNPH*%6Gi9!zXL=8BwyNuday4t4|mRLZ|8AT~JJ+iWTYvR%e8MQb;VE(X3z?&~{LAiS$<7i(hH)wOT?+@H~$B zEnQSE;-B|RejSqOkrEv8@|(H4u^GEU(}RpM~BN zN}4kFgS!Xp|5EuFr1X^J)M39Kg@ju~%P`o;$`Qw$<=p8AK%^@-lWszdcBG@uLM9C% zI;fBAvz0I7wpIP(LG*qZVTa5&;mqF7y3!#J5Gx4Jb5{5&n170v z4!v@h1m&4$$HIB8I{^f8M3C6V-^X$A8Hn`GJ>33udU9)xh*g!w`^c*%paQ-2hUG%a zyjALK<8u_DlhLTR~vq51 zTEMq)lInyc%#Y!}U3iNQKuco}bnHp{vjk9p-R#c*0YA=oi|kw>y`+sI!zq=&g|!q) zhVe3bdkA9MGD;C36A0OWm?nT)3=Gziao<~yHE&hd?DWr>W z5MyScb^8FV^HJbln5a!oOzpVpeiRvvRUZi9Ybvb=fM|GG6G41U+=Ir4(^A7%T>bP$OmAf^r2!p5q36f8{;^ z>pTwb7{wZ&r$2BdVO;(i)1;@XbWVOw^;VwmIm1UQ^okC*i-k5_XYh8PoOn6;WHu`H zl2l7UWuGSal5N0S(Rmd`I-V=#!slK9-g4m&7J6e0ONILyFrABGsjM~3m}B1w>cfdi*m3mIH=o_Gz&G#Unw<&h<< 
z<9B;P@47JWhtFWk&AEp(;kjT3P~rbD{HYTn5A|o_i^`orXBf$`uSNu8Ls*bJY|Id< z)~g*_ip2W}hS1uYT6#2r0F@vL8{J`Q3dg~|=lc0Ja=Q=s9u})U-)}<>AvfQD(4@? zENwSxpe*v&eAbGQJn&flJ`2k@rqUk(tcoJyw2={wmQYA|{`Z4eCQARDxJ_H!{kG5t z9M%a7PzX5;VT;61z`+E+fOTC!kYT*x5I>Z$Lb@!Ps-TPAI>r=%ixDJ6L=@W(__l@- z-u{gXLm&`%b;wj>W&+=ZO?(b!=rKeyk%?AE1kr$-K2l*JOCt?L&4!r*2$8tEVvv_| z!izCzb!gbKjpqjlTJb8BKyY6&IKfSWul>FjoDUji>Ejia%=v{iF=^kkWSOU>mmV&F zMVYH5*Nn9ZRTjGQodkpV1c4<%yKbash5H4k;X-`VS%~T)o?Bww^!xb}LVknOje39# zNjA6*-GjXQl?_SAn+g5{aYuSX-ghzL)kWB27?2e$#Y@Wna|q>d zALapCoXivroZ2diO+k9cCDtE-sO#_$k@8reAHe3rnuEf~$9Xyx*$XD-Tf!jyV?T+) z2046TFb{|tDn{N-2kvPl!OrqY=mG8C3UvK4`1B>#P!f(e`7NPFFB>`gbF#>HkF;NwrixbD@Z~TrD{ZN2-`oqY-hq&PpYkC7-oaGwghL@MA>OEt6JZ zXZ@^KT9au>cZJ$B-q-eOTfs>~+@^7n_ke$8nG0#Hn(Q9Ga1I<(V%4BQDv<$)+{WwV zWUBqFc`Q}&VxymzM*;?%VJJBq4vJs`I2)EEc`!~R^AMfFkXWnOCUssrD4KWDxmer| z^X`f4#N~Ya-b}hFQ+@UJeUx5-67^+uABVC_SJ$`Y=m+Hr ztw8I8ELZcL86wlFsje4xD=n+h+V$#=cTD50mT}>yK(6ca!TJcfT^3VlJ4Z=RIdIiQ zsTw}xFr#?O|4gfkt?_IBMYDD@fR|Qj{RrwcBb|GLQ9izjqWmbWzNo!$%d?rPD7*_$ z;l~y(tIN7^LO!lE4VaWzz%<0$q*fkLiYYHS%KKqT%=)Jar>t=>8 zE%3ucVg2xsmBz`!8ZI2laabJINANKgRnPs5i zr8rb0pgZewz-l#~r-SN>H}}J4V{&A3;10g`4y%>t&Q<3|eN5ekORMu&yMs_)sbMQw z(&s~Gl#c`rR=x$kJ#9N&E8M(tC0;(#bEs;+Ax!eksNifyf(}xe zwIJO6uzu2QntuGF>eJ0^$@For9bxl#c8!|{;c0gIa~aLHaROe0!tDWnFEdZk{gKQ~ z%_H1vh4-drPSKZ4#d=r8C1BaWQ0P8+wD5R-4C*GY<6^!@qD_dm9dqHaAJ?7uX1-fV z8qco^zl+I#q$UfX3NSnG7|*+{QD+Di-O0@aq-gfVw1LX?r~Gpr!bW)sQRW;bYMboV zauo-1c)SiEa9De^p}H_H3==zCSraK<5PmHk zr;ZvVi}QOe#kB8+Lq+@HcV+fwrk1NeOzx1%Kq+kK)@J3qfOioTD^iEvfvpwO%asRs?$*4F=wdq)J-I({q=mEmN)Bs!(!-PP#(zxhKX21=g`4d`2+fSr z3)M6E(uKy)1UZ?_6STi8DLU$Q^EJ|GH93*6uKZHA0oY7_*9l*LH+zH59>BPYJ{~qW69CF!Mr%aK8eke{>?9CvY%l z1|Dt8DEqLxTlbwrq6Z$|j2cj)0M*vv4S?8Tb&_d)GoY_2)hA9|?t0)l?%DlLc{?1t>=Q(VP#ngDpEQ25_)0b9n zzLazHdRnq=G)K3c8a72*t;3rFLG7n4Rwag-W8L>l;zFD~CdJ5{VMTnSvh5YJ&=I)A z%)4;9t-9iw)WuMAUD&nMJ;dPa2-d*k(dnJAZAy}O zVDMN4NvUEH8aQ^SLe>IsCdo)P#SEy~$Mh2pl9uIZ zp1;=G2C^TF99+=&J?9Ap0#~eQw{K?+ z*LW{&gMgyUdE!KS)lCuFTh^&Yh=`ig%%Z~S2UK@l#emXs1hOt)qNYRnpPQ5{3K?nq zy3p`74MziYRJXOJe4J#Q+?ITwgtawzaunJcJbLtteHkp+=nI1M?~AHbx|V_+OJ2W} z=wgSPQPyVeOb+1ghBuPb@4k@+U-21ibk1KaMq7Z$vbX{Z0RjMY{)+tpbuYn5R|Qw` zl^^*|fT7?8w$)ghH`Pm7RBx>&fOJT#%P4Ty4HBFFk^gcK$k~Y*5()G5(5O6JNHFD} zh5bcE5#?g8_Xqpq@X6cc()$B0xRj@;S6ZO(FGe?#-M`WHPQ|4W)EPTm3DUwe*uS(& znoR&NGBKI;?0;^{g4;MTtlZv{x~_!`vM)bp!zDMfn7sFHxqX3T2bwt z(YR6}vt_l@60Od6IM|;acCrRB)&ITQu<>ck!05??_WL1u{WWwL0(S+sYAu3NcdFyM zWpLO|dV=o!J84h|Z8PA}KU&Uk&Uyz(W$$Cdl*{Ex62@%7`4>rBKal$W^AupzO2Cvq z4WnUl>tg?$!e>PGOLLTujtPI7kULY5wjhBj&bSt=!FYpa0{>kUfziR6VYJxl;t;1l zVSRN`_^@ODV*;z{xAffEjoxczr=DPUipzBG2Y`hit#sS~Pc4>a@O^QRlh$oiEs39w zRW$o%d%I$}#E9jCd+g1$A1h@$bB2GQLg3Q5m1|`+9ah`|EVWsk|%elEs>vcPw---aGMV{C+TKH`p|I;#i{j*xzCM z8*tXaq2es z<=7`v*JDSkKhtSSm-M<#ybv&&0lj5eg$VD%nvte|c>>t+0{Hujoft_)@LAY|8f10d z#WmD8QjS^X+IK$GY@^FJ_5bBNms79fR<8g$mVX_O|G2P{ngHBUxcSG}fBCzAgc|PZ zY*1&Xh9=_wD_s1Kdov~ikxQzA!0`VSfc@vIP|pP-``RZn;r|lZ|JRFx3tklU+_aZA z{|s&Z@tgmcI`kx^Dmr7Le7lwNAA!&R{H+{LDi%0bKMbf>3;iFl-+x_X=>$gWr{jqC z-2eV;>cL=WbXPx&)BmqSBUpPKmDQ{_VE%O){__hy$jx3sAdsu86t#`^|5AY8n8I&- zqBM+)fxQ+-a+!@W;ku;gyXC_WhZ{Q_ZHuPaon?2E*B4s{gMa zR*`5A-dWOKFSYmo{=z6ZBE%T%-(O=D11@U2X7T*HuY|MVHf~@?%5fwR+{F6df1m}G z!9`Y+VlPbo%~jwNu*?tt?c>2M={XayrUjic@89`<55o1`LIuwL-5tqs2*!Ymoase1 z--;X%oCC6MR&qCp3yiLuX8v6I{U+4IsQ9q}D;KvNon;c?AA-2z-*&Zqde=XsnSYeD zIQVFk0yvJBDfndbv^AA#Qw1HVmp%B(!Q>fvW%!gyZw$3?PkNH&!C?K978Y!J~+ zFb`(UA8VeC$2(re?OB@|tsHsR>zIE&FYk%7@jIL9{=-xFjimXYb2Zx4%(=H`;UAOp ziVS*$IrdnQ^s~2GTrypYm+~|9DqMJbs1bYTB88lhQy}Q`3qjrFZWZ$C><-=!nl)kX zo@`cdF9%)3$<0r1C)mEa2m=#&f%DW>SPFLSk!x^ydDLulPY@gnMxDVubsQ 
z@y7v4(OSA=y9w|k6ti$@EUr#RmOuyhu5;jCMlXQcPo?p5zvG;VVDT;TUy+E5x;*Ozvg~H5#@3PdK-79L76#l`Yp*xwXT={TWSMjS}Ur3KHv~r^QpO-xw^18+462S zO?~1uwj1Z&_&!T=C^Yot^UJjBcLGhIz;+(*eDvf(BNQz@6|Y>kK9~NOI+sZ~eMm7}bC=O`n+H&` zvnXt*FQeAckGhL50yjWg(a`6hVdi3Mx6ixDUH9^|-{NvN%?&6zrGo6cb~k7wSY~@+ zd)6#2cJtmqTEN?pM`nCA-!HfNh!vTrI!~OIx3_KU6Kl5*13VSin=9*m%+H#{X1t!w zKhvpkv})eeHM`pN$h=5wZq7J+M>Ubh)H%LYYZH41u$Hw`&D^^0f%AW8)79aQ;FUVg zD}rEZ%qxOpey>@W%bv0}NY|ThlQFyh5K74hrejr;pd+*-z3rn#6PM+?R_r$TNYsG+ zWjKvtk*LQX@tMw_5>BImoia03@#*fpcWSBQ(kzc6WY2TY_X}n3IQuU!^6{b?->%%> zi}dGgw)JUG1Iw|QviqmVL@9qI8Y^Y?OZMQ8{+YzLK4Z(?dy`oX_RCbAt=r%>1X#~T zJ9|K)!V;lz7(43vD$`QzXE)jNi@L*cd%og}<4##~8B8$w(A4H(hj>lIrd32hP0QK3 z#g(r-OYN9#hS;;;-hNq$TA4Fh7H2z$6Mv+(LC{ysLHf&~de|^S-*9nBa@RRa^e_4k z>rbhL#!=dp_Cs6q%eYPW2axuZMX7+?(%fe-WBRH0l$QQ!hiH>dZCUjnlj<3I)$53C ziCgwWZ469k4}f3VXNwvq#*0)r)=qhqTs&r^gf?GnI;UlW7h|cqBEWI%`TN)&Abx4m z-vG)ky$?pr^uRmx<~Cd4vT$~Bp6%zKq#;cwKb`;hlQ7Xf@%Z(r`&NcNM^tha$d{HP zxwCkjeR@rMR%DLqB`CCmE&#gkE+2q6dfe${dpYGfa>u^3O8ErrFcttQlKim7fjg7e z)U;BFvSrhB=G2hOD(GU}f^T_uV5PinKFaqA?`sM_pct3TdaarlWI7zj7%)37Giy z2j@!2xc;+6Zdvi~Smt|y0s>60D;CdWj$6nLXC6w?7g3+*mYfe$RkXkJ0>ugr^c3&T z)p&5(Pl@kUAx1#cz{&@}CrY-nD(Ik7Wxt>=(qxN+>kg2FKX=zK1KausCYU%al9r$ZcFTPj)#oBlLz`}OIZ%d(bdVAzA2P~j0wKet9=?jw30(Dep`l}iF{)Va z>7ID#d=DArtNRMriiR4PJ8A*TV7>Zg+F#pTl-tlq8E0jm4i&ykVDthdCztP_{#8>l z&!q8WukKwW1ZdDd`@U~)ab_aD!pVA~`UdQ5x8`78OWquU)CBoRQR*VQH=!YDS+HqY zqc@k;_ucjaxBX^JBd6ckWjzjmuo%BEzdxzNAw4JOZgje4oAPiWVQE~5OhDpu?T z-ZZz&PUKx&DV@i{_=(Zwwj=CRvgdQ;e2MhJwiDd<8~JzaGfKV_noLmBWVOk8KMW ztdq|0cr#z@AS)IPfvP%g1h8v)zo+_fW?#?w zsk8KfOvv^ET}Mrb`gOv2V0E4>F9MBsL5mXPV`%)F{iH9E#A6*ty|1;rWe-K0dsc{& zzs~H1f*jgz$*IqsBo3X8N9PiEehfP~g9p~^OdadrJR~IB>6~zCaP&Ewm@)5PT3_5| zI_WktM;I-xOaz+!mhtrBH>X-TH8pq8FW+qwp;tXDZk8A;xa%*yHTVu0v2ebd)?CUI zVqvf40OY4yNJU0-@aC|vYd$}I+%s~^!HzRyY|bY8v?{!R8T5%Q`+jbUFdUtNfDpEP{-iKD42fza309#W zSeJ6;Jj$f1G!W_Y24R@d+<-HaMj)by!O>Pa@Swz^%$QNKYS3Ba>-0gmDvn%>wVCT& zI9+)oy@j`Q@#j5^Zm_`-Nl3=mcc7Ja5SSb5J^rFFijM5#%qLzPsgYvK;MvM5QrCex zMHe~NV4=H(Q}?*A6k|awsyK!9_!7y6R(tzPadeiS;zsw+ch5cAVJiBamL~UumGmRT zY%rQJLJJY@3pd4|_w{Vt=u`ZHsC0Y_eh;dml2E%@s--rv@rc)m)oA*2Z7iA&_Xnd)MF z2xA!1VsVkbE2eb-A4sr3_+Rd2SHxw~W8=q7d*!Ruqwrf>)HwQoG9X;y@8{CYaa6kG zWV+_FVF&j1|G3+Z|7VwW(~FEd%CdFL>+?kdR0O6z7maAUG3MG>b2IO`7Mv(oo?qNA zWURO_%Ou?78INfOC-t}i<14ZJ0E#3)rYKHw(lpagX6_5c4Z*Vs(a}RIW)AU=<3p__ z**C%4o_&-4rRWM}S~dR8!;>Dln@egCu9_FD$E)h9q>X(qBXtShNKi~$76{NstVshI8Q&p5pJAG(4Amil1l`8mPH=Zx0AQj3CcvRTnuVKK-m(3)XIZm+<`W z^P5jKEwoeZVUOXhX4pN7ll%+vYhTJlsAE&Vd{3v9HwZecq6$umkUEI0OSR<2iXKpM zn`(jXRO_2&FvvZZJsI0Rg=-G$WK_Omdrr#|;w&gXt4I`?hMKK2ltX(KR6KHstd}Y0 zHf9^e3$wlYWO3!OY4?vzc9j8oWZTr4GezQb8Ew%T|AUee;o9%z?AhGB_Oaxn#oolX zY3WQ^4rL5+>eH`udwY1NSQ=Zuns(^V*FZ)4(QK|;GhHR~o$iPT6zV_H5`d_Q;=Q ze`%Rzf1lgE59!bKzx{aF_My5K`BtKhRiN|a3tgN>j18SRePlX{Z_cr@qrnQfy=pI( zlf!RX3*z>-0-9?~h&lSx2wyanR{Fqv`*CR_hUDP)pTlnQv^J~F!FbNUD;>;;w@qbQ z=lPp&m@8?XvpS2U3QxNPxx}JLjR5XWL%5nzSgyGg|GdJ;oumWeA5eK9*qvMcU$CiXNA_YM7^n?Dnm3Wuy0 zo5;rezS0l4kho2x9tQ!t+-?EHtH7*W#eK@Gc>0q4JpY&!v4GH5&!7QF}s8lsZjf*We;W z{+3P@wtvirigTjozCwvOwocX^Q-7>Lh4Z=I6KAOiRA=EsA#5!<6cTNoNhS+3qk^|8 zx)RFs*33u1;;oI<(*DUp5z?Q`U| z1}$8a7DT3w$oDYWW5ng_eh0kw3TQpnlX)rxZcrgvU7FWUtSKR8=VIgL^PaREJm(OO zrUL&!^LH*xlek8PvQ0)Z$7FMaEPdQk;?wnE7LKi*3*ItB+OLH(mN`hq1vfUpyV?gPo5QW?*Hx@B3{c-_P1R*wJb7${w_5yGY2JdPc}@ z@y|VM8QZyiVVZ4lx9Mq+i@eW5!)}M^Qdtna{VeJj_A7&ePcSNDJ#-`V^E&>@P(NpJ zh0VL@wuW0S!s#X1hoWs{&YYOGTXmv3zpII4)v~>U22{RCX2p+uRavESa1Wrum^IgB z3Pp7@#r;esAbEfVN|K($PfxHzkc?8?{lSzLa7PMu7{g+e@FLS>g18EnIovFvSRimG z`k><{FGr};C;`lkyVV;sd<;4ANN#d7iNiu(>zY##EV*uQh-KDpGU4c=*f{Xu_u{Dk 
zF4l=RsAfJPKx`Nk5DbEzh_2(T7)&8we!9<_DYwNZlls$p4IDk6OaYZ%p%MWsbPH@I z;f66!g!_KR1uDB)e37C8MFrmM7i=#nrNq?{>=r%p=1k=As$EF`L;){O7@SWtm1<>Z zm{n)#`Ph%hUK=xeJpF_|6g>w~4?BeN>4(oI!ow6MouBZ76*=bQjOFtaJOaXhV&>}- zpTex;m|mz*&*hYjtst5cQOhJ!7rY$nU63lG9iukU z6zV@geQF{oi7Xp4N8vVD1{ZbXH$8&(d*vH7z}!g565t%S^2!1oVDd`m`%7NX_#$QW zpv)R6s{&nKC)rbSb5y!KO=$)ei$cMZ>i1B>Jg%hwkE%0|hq8^^e#T(zGbm)8v6W=a zzVBqKgk&d6*+!v}u@6QVV^6kFA=%d=#!_}sWJ|^tS;yKCdN22L-|zFjf6ZU!GiI*y zysqDQ9LM)~leETupqP(LK*0oPy^da5KRr-14IAZy`h%_VhlMNT1BONR83pay5?`7g zH3v$OqzwcZKHZ{=Y6e^$ZJWQ*${3s3sHpe+)k8v?t6`7^47lGY1g~`Ged}b|XAjSp zpZ$75=ujhw!`y$)fqz4qJ}=ze>?Fgs85{`eBr#`Rsc`;Jm8@E1bCq~Lehs2w7YxM0 zN3f#Tnx~c09$-+?6m(AA;dcM&k3HW+;S*i9eR9m}6zOOHo(nrX|2d%^MEQ3vnj2kHeJ#X!U7TOBN`p-+8np-p7~<>tk+!xPCJC1LHSYH^WfE_Mk1XK?djcj z$6RWkd}I=kN_SXbv}t5eUZF~%JM!|ue5K^v?4iapq9Itnm5Dl8ix8#g#MDSoKKV%> z2N1rTala-8-0yz=Pmh#=(^~f8c;3Dli7QRb!kxk6@ zEIBZ%uD?|GH_WXfFRFD_80XMg%grb<%Fa{#`93?h$%pN` z0U`!-(8ziPR<&r)`KM&YOOIa{}$``S6ihdZPBBtKMjn;k)5;<^H@ zJ?{o4^Owucw}zv#wb+m*ZhO&ERwvU_g|wsj;wd|>W?m-1d!Wda z^l2@vm=8KQmd*GnszPFV+v_*roSZ7#IVQ$BnV;Rdqt9;L(dZ0iM+irifYfUz0hrG^ zHYrF#Z*B5D~=Ndp;>l(pd{iRlX1+@n#_0)E1*aEz8*)16^36zd<0VsrA5v6 zC|Y-Ybr?gM7;N-T<2t*Mw1 zRH#bcScN~qLQ3QxVn#VQp23&VI-PGtJ2<34tS>v&bm&999yGAGKMfZnA@umPK3rz7g>$b+@~3nghSEFZtpZp5Js2KvulkMl`zEjH3j{KX~5B z4Ow)e#Zg>H&cwgTZ~C;v8FH~#9u<-MgcJI#^o4_f|0drNx8DfiQ;7Bq0|U&-+w(rl zA3I7ZE`jyFt$zg}D+Dogu#x0|;ff6ZqYy?gV=0dejt*y+A!!k6FSZl8_~^ox8-rC& zwW=1vAoq86l))UwxY_wD(^M+X^h{bk1a>l0D^J^k_Tr!wbeK#XH)BGPGYHgaq3 zY7;0qu}$b!RYcNqR8G9iPBq96&EwD++RD^zF41tA%L=a zV#YglQA8#%qCg6AitBD?5bgQpko!O+@Z?aB8ISQ$l`PC_dBZt! zJBU|Aq;Ng!JfsRD6&VgeCU9S65q;Aehj8=~&}*PaLFglmOKvdVd%F8`^f}1BR>~=1 zg5R^}PBDgGU#eu7_&tSNhAE7=xaXuEYea$ ztNy9y;`&tQmV95CHVf{LkYlM~gFQ}mpFo7-^eZ3_ux~XDASmRhLwYG6Ytq~cm9})v zmB3trVKZXWAo)_g5-=JwRwwgRX?@${8>g0j+XnrENJ;!2)3nNg;Iw6stb=L2`jGBK z`^F{R%p*Vr(M>G~jnFQvt6gLNHRcevNKLa^I9MNhiW{g_W~q6t3YC7QG$gyOFy7kE zC6#@WF@^ZP`r3pcZ(TbE_E?b19{k{>Gw?)f_RC`@SE!3+^hV*trlwos3iE%D*zfhj zx8~i+{xT>e%kMi{?0m$$fl6321hv_{xq!-C9^2+Qz7j!Tu~DPZNB$M*@n<0+5uxp> zJe7W8e+_TU`69(zzn$3eHnGfVPt3&!Um?t^w;6t`g0WIS36|6 z4tn)cpW_ej7xK_4Pdc&T6XHZgoPzKRY(MQiLtfg>#{t)?q@xBD;W@HN^b|>d1(4eT zUSsxXdA`JamN*idGe570Mz{QK5=zSSZ3UShrbkHPh?9VYIf^+7qhkK-RFXM)=SJXY z^>t$Jcy_OTmFr`3!lTwF3*1Mr&`-skrme|H%>c*D(DPa~E!!Tp)V{Z@PRvgBXjPcP z+AH@ITd4Fq>PM;mMp~03(DY#)(IgRDuB6Ti^GwmfaIP7|bOj$vquCzVe%9i>AI z`}!DT*r-GTpW{yzg#3E?pk9NLB^Rd=M?XZZL!)K}1!}YmlUwmccKPm^e@{|)DkIvt zReQJ0v95lBblY%}A0nXbN)&I^O zWChA=C0-n-A%z`0aOsBj(58j1Wmn-dx8t{qIIE{Dl(}kMuY+SQo~dN-scMcx^}`|U zhJMeguMr0GCgy<{n6UIcw+w521Dc!RlmdG2o78KeT;xY}4Xh8K@<26I3^(Fx4z&0w zcibmW$<(XQiXG&ya_uvVg^C1PKqb(*R=#0zzm=X z;f^6QYM zCR^!P%#=A7aEd0yc>haxvXmfOTtsa}Xbrr;6QKIb3X<0l8HZl{;xyg zH-r>?3=MT84oDD@XxT75gZbYOEi_j?f#!Yw?&Zy8a>}HZj+$C#(jy7sJfb)CGKRX9 z-GLvqZN9m6e1)&bSdDLXK_7zHJkjAkj^JbSmnQ$_wulPN!R0R~XUmYtVTCUVFgX76 zqPZX6AuNx~q{h)LlZtof)F*2o*z%*csOtDGCY|w3trdE=Y zVuoZi^FoXrEozlD^z9W2vq`UT8PkFv7}M|gOH=}f3n;Kq!<-AqAN718E~KgKVE9Lq z5E?7m1j_I0o}Y1>r0<$iZ}=EIYBr1KtBA~=6F8!>ouhfAV$B8nGjaZych*UQm(MU* zS?^GR+O$)+<(xiocV8&rTjqoln#hgTRRrs}&eJMDS`%xP+077=?Tl7Y*I|VoeBeVxFQ9XxiyJt$0t~Bd95BI>x}S zWryUKaIfv*`Z5H;+_q%ch0F`-W2%nW4anOSYTMY&4jGDL+jmMuFQ^<=55B`~`L}xG zQXS!8`6XuSNe|OufKt_Y!Ie66D9$BqZOgRxaA>ZoSxN65F8-LOSPe-(K;&4`wI4E< z_){p}K*c+>@0XXMfUQ#IRNy@TCd|IKPS~IyS5@7(bZu zvSWPGarOo@z%jdR@>PXo&7k5=dfVwwbpDEGt3WUN_G2{`cc=ifoY79gnnnmWO{2k4 z!p_U8PQTSfn_CV+m--J%%x4Z~|2hu*D2ZK*<=F^ux2kJ<-!C&2Q+}WbbJfaMw;$Cy;%Up=u`cuKMpi-{4bt1;J)u2<%?GQrH!Q53ZN0$lC z5YycHsP`SQAycNgd$iN(`d}A&?)CVWLy_@rFP=kRsBn}v`qG}RZrKLCaX=~tuw8O1 zTdW 
zhZD=_jpYf09mk@YNqUFw_U^;bGM5SGY&n-Cmx-_IovZN43iazd^>*sX`MXF(^2h7X zUYqQ{JeGiQfWWs1T%h6F9$>n7)&P|xL3#89bvBao+OHe%e2IChRdA8Mc$^Pe@%@dcEBIBU>duQ1DJa4DD4!fnxrTjmOW#VydLx#~jq(&N7f4V?z` z^cRN?=ZEi3KgvH^Qw9#E_Ia9BWd8Az*~9szeGW0cM!}BAZq-G|QV}c(aFbVpcWHn}muPQ({yaNkHaiyoPz7wFY zPrv2=@5lH`2RfG6j4-?QuG^qQeL7@fCUs+V#yritK1QVqgOTl?q_gR!j&9bdd&7gMvAfD^%lLMT4F6vVZjl^$uS_?Ib5ox3Ox9EB^?DLTr zu+;Q)J|#=T!P0bR*;Sx@i*bMT^Wxms6dKy#O#{Kt@NK9gGnW^O zLO|RJW-@-z+y+JAhFI47rw5OBAODdWY8>p2^864Ix3aYHaz6Wo^uD%Ig-|EDLseX^V={k$kYKewNWJ#@;A1vtNa3KvrskyP>;NXhJPwmM>4BIu} zM$&Rgr0)Um_^JGkVb7l49p;PQ#D9e9k+rVZ70A!Qx?0KyZAK&CUfaDDm{$3{syM`E zm-|rEJP1zxSV?{;?nozZ#{2gRhwfn3x9-Ou6@Ta$|*quW(Djs(ypd>+d9@_n2pos&@E&zK)tU#ZktWWjdLP1FFA z+gM}iynG6~CCdCQu~|Nc?%9Dp3wBTX8k#CLe&IGrQ#EBMR|6LK8y z*d{+Rf8*4rl?{R0>TlF2W3QM)+IL9-#NVUuiXpu2dl_~TJ(45mGH3|9@0_x0@gTG^a|{p0CMNUVv5ll^M!Ni=b17b z3z$N?r=Kkv5vK2@nykwC9^X>yE4_TcNnfq>9HWUPQRpRiWrmk-)1{3rEs^#iqn~{PbkD&A zz=o%N>FiztEC$38A8gvLj@IB}?T5=6y3<|q19%wNfYU(IcpuiF#NHmus#OD!>MXUs zq=$FEzQiZK9zj1nG5SLftb+MQj4=4sv{JcBr#$h7#z|V13-|7l5XDjzz^FtZ?3Agj3qLOyfgqWW-U3k*LR;~S#k;AdnPB(Va zo4?)u&RHp7z#ly>@7soI$s8jh^Lap*#V< zuQGR&ZFU-^C%HFPIr4u9cB}#kMC&Mg-hHtfYOiG%n1ACNEw~*{R;{jNP67@hPn*Ym zRy;)&>o?5~$mHn;H|6N?*fQA7?F-%Q7Ix3Qc}_HWa~qUDOGU{4sbdcnh8U4<=N*yN z=CJ~AJT6eUF3WG5v(v`(?cQvd-3Te-ojAyEHO1CW8H9C*Y`ff9`Qt@UrR{0@VQJ%S z@$Nf3miyMBpw!4NZCnGW$#QV-^>;a$9e0GJ^Vz+$>zc(yr8Ec<0Y0yVnN-10-vpk@ z1evmkj!@B&0^sVXU;*5RsKRx?AZWp{a}kU5o+9cmSA<4NCt(VY`^WecZ0^!Kbj6RkF9qMF8xwr3rK2OKPda+jw zn41OXnVh1$b3|_G+y=a4ty2%m#?XbwbM`(V3|nsi9g9n=$CpXKr{R5bzHM|v4U zKmQ)Y1k! z+`5L|U(H4ZDtKylpavl$vR!f^QsAyYhP1L-fu~=9$j^=G8UEPtapR*Izy@zW9h+F; zt|c0Ahx3T$yCo1zd+xsb{rJDtenm^iNnDF6M`y?g%nCFeHi zt=q@lWHP;9+wZd}t#2f4SVGAUNw_8kL*V-$0Vi_F|PvKa1@S`h5iF~Mk_Ysg(*{Ixw;Qegp(O5=08z6dxQ zCKc%UZ5B_~X~nY}`NqY136hb=J-pj-;jvBFiw`?6moW70E;K2w@0GeYZ3BeRvc(gz zQjj!PDvZ)1on{SKXJ*k;*?GKgdfFNKRgv6y(kdin~)Lpm{zSp@qO4Jy%IC=a)~mLdVM?QjvNI$@1RVgpNCW z-ol^^wWn-0qJ0@u7w){ELrGn^!S`holD)84l(eQyEq}yI@hvoDG(4OXbiCa?j;)); zHlG&S_kDGLJmD03y#)|txnanAH!$oJF{Mh-7Hu}6s@1ZJM7LWPxLruY>EX*N<7Woz zI~jK$l!}@n%0fLKom}`e>z3^;y6w4@G$_6MpYoC%cdpI2YA#iUKJ$*|Ix!e>zKiWU z*qyiqp#d3Ugj((c0oBrZ#fR}g!6zi z+&bDwayj8dz-5Sabgz#L$Gr6`x43|zi)0Q<$gaswTKdDY=G(zV!6eerHcC^xnzp{& z=4YQnx4aMxAfDHrH`a`Dnzc&|26h`wv$m~PUyMYxmy^;Vb@SQe#P=+_jz1M59Y~C} zbdBwl$a5;pe37q0c2Z~xZgb6x!;Iy38HKApHrKt99y4sdFB90$OygGE8h&MBe5iUd z&bP+=1H=SLh$qP^Kri}PPIxpze2v##WOJs#c&6BJ#EKRRfTW2LU*-Ma(j*T9Cg>tL& z9a8n!(&5`DP~AcMO+J5bsHmMU%1S1y9b3tAxMzI;qtYgLK9o{1Q9BZ}0q5D#A4RTZ z4i{GvgoUqN>+fr0jTkx0*c6q@){0ZcoU7K;Ng1vMBI>51vD&&m3b%3ehjRvEFx?-} zL?B>altOvaybGMgujf|+p}E;JaCH&igHh;-eI(?D#K4KLgwFURPE=EUNp3`C2uY;fi-j z<1(h^_`-~E1+2HpLK9)gxdF0=_`B7!szOAL*MVBd+qw(^0xB%O zNt}gDbB1c`lFfNNUoOK|2Ese_Nj%PR{KYSX?e$umB>tL;%|%U9M(!R^P9&nC-*I)x;R~RnVE(0XTfzOX1E#12 z9y_0PLyaOfBuDXkq(5>+hcmE)q5-TD$s*R8l+I!cwhFUSoP{cRzn(@E+%_4e=&`8N{=Iyf0-^nDlNcO)j`5IjL8YOdO zSOmxytYRD0won8y)sZMXk8_LR`rE_2Q{2ScOpFc9F4EI; zELiH)-Wdd#9V9V0&b4D5W+0%|KJM3ac{A2#;r^+mFmGf(xty_1B+tM&nu_U@+R`;D zl~Z0CQh6>dY6{a7dxd4XF0)~ERCr1|tSXYyVcI$V>Vrd|6R#2k_;Lbkk0-HZox`gT z1=%1Q$4YgU@X5HSg%?%bCey&uAa!R$98mFcA0q}#zkZX<#JWg+%OhX0kLsVh7fCv5 zY>lIwTSQ%Ly?2WOek;3^*B;36IWEQScn%T1vO6X5Jz(8u(nRKm9F#VA(8~edglcQ$ zj^fmC_wV3*NLnnaGl!1KB5zDN;)bx65|`-S%49u0EPcj!Yk zALvLYW8CIjbmf$3QGJrlPg6zQ;Lmg5G((d$nN1xLh^q#XPxwW_EI2;cUgJwKx$5z4 z0~%=oI10ia!KkRqF#ypyhs78?dxRG(WI%9!oyiuwp@!TLB*NvuSBA0(an}^BviW@> z(1)E_0(lT@Pib3v{4)X2P!xrYE}SQJm|wGMvn)JzKmfO4_y@y5Un?AY|Df7B zUpm}_4&}C#l^%n?-z{v3x%x7iKOuZo{YbNTG(Ei7>)8W|5O4!Y3Ie2e?$ahhE~Y3K zoKM!b-FFIs-9!-KVFn5?b2jeFG6I^aw#s1Fn+KQAlLl74TG^TbTT=+eE$&4hbx15@ z#pMHmZphn%=ywM7t 
zJhr$uJLFh8^kXLM@xZ0d6SAyYvij|?<(lz#$0|~iw;Tz}3l*zT0x(0!Wl+np!VgVJ zfed1pi7BEISS~7%%n10ljE>OqLPu3i&Zu8xwqzF%?e%q8ms?fY=iEPCRG!qov(vt@ z&VFgjkkDB+O5X;dU~cQM0;97pXAFy)PwJ>0b7zo*g*$ZB+zI@+@)2r{bLZXZH_M$U|VWp-}DtL7;t+aUSOf#%^`D%{Tpd) zeBcR2VxXRjd*31iqk(OlCsY`9D~wA4UWAde(0Nh4Ypwms({uXRJ-DgCb88Royh$xL zLD({ugS*bRwhSs%yNjraQGq344Y;qiw@Wp^hVhKGZQQW~P*2Eh2-O!%4mKem6R$T9 zeaNOIP!N47`Wze%X2Rv)(1Zx4`1L{f5k+WRiutlFt=ww2<-PE#X#T3WJF*qBx_p`m zIt zVM$Hc+clG7lJlt>J?FI?+n(^_s}Q1a-YMLK0YLyb=vvy~UbIu23g2eKW&Fni(0FA( z<#0G8a#Tg3HzTxYs;*_3?g|YLCqNGXV+5YjYunM~p!<;@e$!w_ z`}J7o3MPrTbcZ7=fY#+i_f*GbVy*3n`5_gJchMlsxL*W+DTtu0uGJ#k;KtF>wSKEZ zvIX|wr2Pv6%uV5v`6RG7SZIY;YLX{B6w6vi%$~k(hKET zeiBvx{@oBmgYCEpI#^Wsx5)oU9tqsw?^}L6zr%7MvvV?F6!}&Q@IYTl6c660w_@C+ zb=!98f~mx+{}iF3IrLG({}owHvPjJNdM&3SxwPmDG$LKDVh}@*{{ls2Z zbTVap&MJF*355`wxH4gIu1hsNwz-7wrcv%5jRK^rzs)?`gP9LKAfSkiXW10tv;FZd zMJ3L1XAcf^nW(sW#2Yl>UlklhPd;6Jm0vG%l7TV+ch-((h70kR;z$3%tQ zxC>+^znqE1FB=`F)IA_bX9Ic~Y=U1EH%IZ)0YmtYiP&M-CP|loyxdv6+7V&uK^w&Fg||BBpMHnLi`Yzz8k24^HC{>mK}eXqy3W2pLmtWBNJAM~iuzXl7!if;H8oCML@F#;*O;aOwA8DyH!NWSwu zdnTijMzevf2y`5O$*z}xG(>0n)y~{F_vd98C)vN7r2_pVm4uHgfALI*z<+CwzAuHW z&K*yJp(FX^*81NyunHW?1qkAxX<%5KL}cG!ms>w8)fXh!=WUrHTD>mbecR2SO9Ah&*7fS`X3);1b6O- z?lA1TbZT75gax7ll|%2;^&mXZDXv6I2TrGp$b(v=rPD=KvgY(h5!Cczftd)X1PS|f{d4FTh5fbc&B3w`cl<@6LDQR@1%M}EfR-qus_YqwR# zL=lux@vL6t!9~NgiIh?7=%<5Y)BlIU@`8urm{EUMfjPaNNB$7X3E|#Rt*={pzb6NI;Lk8ttKnc)j7)KP3VxqNEtVmfTw$ z6q0@F`#Nh2_|tP=#@V(O@y|>ZaY1@#0csAxM&`8G4&13|ZIJI)rHV{Gpn-Ed0V5g9 zE3mCEOyDWFfgpS-=$9I~=He9a=`|NEq~Q1VatHtX4{ zl|g)w31N4>s0gR@=55PP-~M1^i1ww=|H$LK?pR;yjuEqn)>I(KjIvdVcC<< z!9UC`tzFY8XnSxT6wDfGSk!Bs7s_CU7-z|EGk)(R{_jI6KlwM%6PyOrE>?)WUn#T@ zJUd-o&90;k3_I+1c-nX^*s`}Pj`d10|J2npY>I4U&|tVq+e_AtuPNcN&AJaxd8UEo z=YfDCo}lG_FU|h@X^QwLY}@G{p%Z?6nD|Jx$A1$GdxB>_nB4UJJQINPt{spzQVozr9Tsn!fn29(uK*c^F6x>6PSVNFa{|%&Hc|o8GfMy0n-raziss_MPj)uQordjYm z3yE7hMo{&+tLW=QAukJanP_@exG#2UY&bJ@7T#=g%Msx4=47e*wP2m`6cl;(Lj07trnMfv;~X z6_;~rhd63|fy+p|5@1;D&ixHpX9Xid9RbaxJZY}#t>!#m=w1*;76F7_IU=@9Jo2>$@||Qj7%14ShnSQNINIWS+Tm^#C;_Gyin2 zcnRPJIGx`K1$Z8r%z$*WE&%e8IM6DnHzHRC$Y3vMi@!b(cMqaxk_Wo#nWx0MVZ$b0yQo1@ zc~!?VPq{Rca-zr2UHWT$wi!8I)FvkD^OkCkM=>dth$o)(=&Rl0OF}?7WJut4AtTQu zg?VgE7mE>Fx1umri?|X)?4L6r%y)pNDI58rotle{G6~>^l&Js&Q40fDfNlc&U5|o? 
zKz*SVh3RN8fGm0=`XKn*3@z%LEbFO#=(~NDmcMin8_Q&PEWQHJ7+3MG!oW7n&ktQv^mGd;2NuviMdLB%&)Yv~@&Q0k zQiBh>>!N%;qT|I zubHmP`LzRTy9t#F`OvP@Gc1+fD&slhuY31}8!lp1k4c^Np>B}COm4nX#T$}%g{3s} zzO#H_bg_bPEA06hoQf_QG7q3irk+0XRVt2%cL)6PH7Mz z*>n4o^6cTz-M>{u&AybwfNWY*p&Hh2^W)1*0cC>(RSN|WL)klQM9W3{gE#KDpgY6s?9&~(x08^o-Z!_O0GZC<6QxkmT0zbYmjx| zr0W`ss~YKy>64r8K3PLR_{@{dLU&S^HUKKri>F3kYr@B$(q*{k{V}UTK2TbID`V>5 zwnms;G=v~H=p5$_QK{$Ll-)ae#a?~lc~W~%M!`0)OwM0wJ*y<8~;xkVzm8 zV6qW3xElcZ_h^6<=1U8I58T^C4Taa?EZJT)hSmVWwJ-a}-s>Y1emZ1CxUqnFmtfx; z&S0z+o&)s5xOTv@;>3C_NOpOa{p0Un!4;DZcH%z*52b%HR_3I;$z{DVI|CC-tw{5e zG<0r;>^rO5(E37BFJ}W=W^fkZ67gi4{c>qvb6AYC8m+@C$&Wuo$)((T{j9~mM*_eH z>~nwB9K6Tig?;I8^d<8_vEY*$SAYQ(v#WDCy;#iX8Q@(RRCDDI8@CVk{)eq2W%YUk zpcd=fiByQE#D;AFyxjmo6e;dmu)<61JEiQy67B<*lhHqkzGxR72uy11v$kLyehbyD z{2(vtgirYq^$*9Qyrna%;bTZ0J&Jl4V41lIapaS`L{>vaKd4c)Y~EO(Q43u;-u)~A zl&u6aYLuS=*S0^hSGEH8jtgbL_}yWGWbl0{v|B1XH_~%4Nh&&8zhgNbwPz#2I8Eb<9SP}RM zsNoFn^QI@(cRn1~t{Ae=q`-rVXglPxs{h<@1{892_G~D3m-rM!4NcL6P~cRLE{ZQ5 zsvc!8HlcWn9!*C7R$_j<2g(3i`6?e?(Y)un;Ev)BEmg0b0|W#KV4->};V^0x(*VM% z?3;CK-%W`iqa7>ufZgpDcmz2&aO!@Kf-N1Y{{BweKYPvV`(QFiEzWXo$z_jSi)XWI zd4v=B6X2d#QxCV~c%2!3+I=akTcts^l(d3jDsR9l=e5{csBqAF0KaAN#i{h_4#q z%C$$fQRhhxRf2uKO>SWX_YNlS#1KCGkJENqX2UWQCe`lnl-KGwr4ul_Y zb-|VxrFmS!J#4tSVC7C0rA}T9pn3Z~^`_7C)LE1BKQ?sf+=`kj-|H*m+THTtFxCzz+^gOK{ScEGx*=z`$>X|Je__+3CV2de z{-DU3><%r9e$(!u7T%_r}gMd_MW5Gv(Dh9r2e^-qmmYe+t|?4Kn8m+ z%gd)Ljjp=XvKgVQm*V_z=lw6}e4G^ju=ho&c{C*3q^aq>C{1IR<{2`$mp918t?FUS z)6Q4nJ4loN*jcjcB|N6YA%2;j ze7lVKP`));4-{;hm2_4(>-plKTm95;-(^Z~egWy}Ej^rL%9Uh5ku;~KL;@gucJ4^H z@09)UL3fwN7@$UnM zUEi_JU+oip5RXTo^?8HtN%d>Zj2hl{sLMCdyCdT=lzkH|*mbKl?_<>WEWai|h_h;0 zoGX~Bil6jzBf?k&QoIhURW1)9-nN=>?o0aX45#qdvePDFZim%cPeUgmk@lU&2?9jx zoi_~TW|LRnp#})rx67~S?}-B=#RHj$5o7hkteIl*DbqXv6a5#TtR8a7ojCTh|1V2~ z6~CjSD9TyV(3s_hhV&+2%Gx1NGLHr)FB>(vPySTf_~9ej*g_YT1ouLvU^j}oX1jSi52 z0egXj_3ojachqnu!mk^If1UeqWJRUw&`HER@K$!}Nf~(-!EUlTv6&Yk|IElEE`iM+ z#E#HC)Lon~_PAya3ZkcAzPbBbZraP7rr8a~vTREb_rRot$MCmPZKpsUU?gsOQo4a= zd4t!b?g!?aWWeL0T<31>F0IY2AW9**_W~mtYi{Hv>&{>PVn|^9r9XvNJA{2-)J@p+ z(z<7n_T&udhDqHx=v_M_x$$_2vcm)~TwIJ_N1!v9`#+26B8WG1ea#7FGyK4|q=evi z*6n(f%QAaLbz@Jy(oUBogWQsw2nb_klkBP601)hDl&UgT&X&k-(8Gm$kZ1d@at(jV zBT9pXoe1GmAHHru9A}W7i~;Aw`;6F-W(Q5+Zr6(;XNN~spgjFplpuDkSHKX-{yoM+ zn7n#~zz0>+&T&^(`;QjseE&mIsJ(=g>;FEOAatB%9m@sVCh_(Lgq~ro3I^9@3%T>6 z!#F4AqA?mRL;JqMIh-JcoL75p({rgiUA^C3{7j8v!9G-0$R>bgRXn7`&-NsU+sTYD zrkd@stX5=vMRo$`!b(Q^AAb#JrHQ4&{rur_;Sv*YgIq#KwZ)$8B=Sd_0oFx2j@S4# zI+2x+`Q#6Z10XHKV#Y|R)}Nl=`>A3{p0>{~ARvMYsc4^Tdzjue$wDfXmz{MXMzBi* zh$jY5f1Ma$v$#YJnzRrD9oiS255^a7q&-ZfK3`>cLKdy2iT)dv7_XMb|Q9*hH z^tqoQ_caY!`WuIBU$V*3*GJmstIB1qc2~zoYXb zNgI9U`nBa_oRyulesaIVlI}`fs<*vkH$ZG&DU5$eyK#F)b7%4FwlD}m=}_cr5M%_B ze2J9viiaRU;o&=)nxxKxLr!jGcjA79d&}NVSVqUe=1N7<^NPAtli&Gf+9`_@*Fpxd1uBVLRq%vy`f;)fSU@|{KI zh+S^wT24Dsibt6WVF$uOg$2?`3xX0N z-Cc`D8U!hUMJpgIy5Y?AeDAy8z4!U{Z=c`!$9jCc=RN1VM_l6?*C=D{YxIcd-34Kz zcVUVZj-P72-KS@5>$WS;EI)bP&*7>Ps}s)cb(X+QKQdu!#|`Mug38d&FsMI{mtISN zu~$WZE6VmlY#0Sv_zwFK-wsmyy~gpO6t*P6fsUGwm`^bg-m6?25T+khIw;XNC1pHC z>%cjE3%DKbj}T4lWnA?#!A|GXgSQ_!fAqGwI2#?)7pG7cFwTT%*{2(1hn=QwnsFC1 z2I~1+Z~_x|82uwPCN(5**8AAk*V!#h_f%Xx6)Aul-ZhPp2_|=5a60t5yFEZk_@h>yy+3-=Do3>%s+3u%bOOkWgAqX(HjBEi)b3_^T6=vs5|MBg9b+W4g6HQk} zgH^jZ9zEQe+C)Nc6i4c;4|@=wuYK$=Oqz~=qt*&bimCWoWlazn058*SUbU1Hf?0|~ z3tSq=;VF88DkEIoG<$oOc@Dna7*j@r~;zB26Yy47q+H&$oPItd2Ppx}^B^#bPB zBksfZn-!JQ*vp>xGF(&$L{wwLEh6u*#Dw%YONGvBktb$qFQ@w6`8;dy9}z^N;9nKi z>Por~*9;w0p{~5**{S3iaBu4`yAV?`a<+3b?_le@7j;voeJ=AA>vEx{nMO9^w$RXA zn{GH`o#|D!5N<&!{RFxEhACvp#{`Wi!JXFxQUE8pA0pH6WC6K!n;BL7@{}|xwC!lmbA( 
z>N9DKYhG$Dz>7R2GN{8X!s+e4+akSpB`uETWY*^6zcS1I^`lDM?bun>K?_%^q2?_k zkIjMZ#b)Y>HgvE38Cq^LwcjUIt6QN*r;QDM>~$$-&lI+1b!_wAr2V{3Z}hD|6+0^8 zM?968(Xh&~JZ;VIekS>QCrPSn5;CswTrlXpw=I%ph+0Ylo4*o67FTtY=eDo?wo>Y3 z$Evj6S!U_5Lz-@XMEtq>wQY8w9Ih%6TXe(%2SU?daQV4yXfP1hEJI7#>>%;}#Xhtv zux&~_RDw3$M#VuK;Vk&b1t9yOZ{f_NSr{!}@Jgty8>jtE=`BXoZ7}Es2^dk$e=;r! zDX2*5mKecxt_XGMAfbV&sN<&Le8ZVb3*-6CBxgq{xF9z+i2ty!hM_qb`u=u#cEJp+ zAa>*>kLA><`{tTI!EZiWLKUc+9M`AxQ+!m5AAWTD_XkRFU8C3T>Q#AQI7!3E-Tfe4 zBGEV_23r}iUi0bJ8E(RytaOI-0A<8GvqhcQoBi?i!giS{)Eie$e+W$WrZg$=6EXf; z28yxePR>npubt9eY>8LW8yAbV&U;YVd9cV=Y_|+~gq3X;m5^AkMh%!Z(jeoRE8aXT zrz{79vCGCm?;UaUi3m;(UD0zwM|W@q|I92m`ccFF7TMidI023QgqPgn0CH_ohQT)T z-Ox-(x>xjug6cjHWV6l9+ND&(MVG>5152Kk_-r@5`9%hzBP!LaO-am_@B95`xx_#l zK-nVKHkmYxR7CHUBT?v^b?<3LJGa7X8H^xIwV7at4Rs~#(f*S?euZ|0e&7Wvr`=jvkLy_Ehx4M#4jdRZsreTdH)ah}hF>Pn4aWNhHA%47 zNlpvUsvmOCZjajdb09DWKz|`vNb2o1D2Uz>@hs+hdbU?QYh@gw9Sb%fah3mWyDmJ# zGpQ?XRk*uR-@qt88sM~TGw+D&0SDON-e`#=Ul7ugf4-%^cMr@GYHI-)N=AvfKjE|; zqB7H_OPliZD+hbMh~rBsZ;&pwP5zS#yoZT6-Qh_RzQ&n zs+$T`-Eb4|B2Unec_VN?Dq?-uHY9NSt~@@+LCIad!QYYg1Ghj%n9tC}LuuBdkDk4C z_Ktr_oEuT*DORm7Q0#9FSj6W?qM9d08}=VM;VvH=JoM>7-;qut>md4Iztp^VTy|mG z6rd}5d(S>)&X4TZ%ujO*jqQ%C>tyo}ZeRZL31Q1HS~hHwq(gJEFMjJ?l>NPk6y@GF ztE0Am*C0iVUKCcoKbC|%GRrP=dwFpg6`JjcPWjR6i@@m4;ZtUTsq~nMJHWhLz0-uc z4e+yfBU8!PG#-GGrD^axqU;@iX%y@dyj{4tic9R;u>9_Ld129dt49RTY3+lXP2FET zc#*X`wb1&u?ig)W_vS<+bK-Q5+IVzr2fk$8?~A+_6s^d#sv?z&Q{LGepms${*f%Yv zf){p&y>oq%QKkC_2HWxOh?UP;_ zwR(n+7m#8i6(k~C9k(Y*IZJ+QS41XH2J(%yOuIO)h37NWW>*69hs2k-MufaZ z1kJU;;HV27I}Aru+2|j+V3Y!16`0~PU0?5ic*oNI-KYB1n`NTV2ROV6Z)U)(9>L_b zT_E?Koauj7-BJdyNVTf?2XAZ=HAM>FoV_8_`d0omrJUN`*saxjnCQvTPt{HJZY9CX z8LugQ!zshNEas%0pgBk7=K;>^H*yKyYoz@3LrRv6fh5xRsi+^Zxq+lpjSZDSPOr{* zqM)GgnZBV%XkTzzHdnsMwh#y1Pwi*cUT1ovS|XJc{%|g~ViDZ9brhOTFEsh;V!0jI zA85DyP-pFm67EH1>V0ld(A4Mt*?x!am`AuErC_caGjPYABFwJt+>8C&U@$b%^{)KUXYDW}jjlDK*2{cRo;~dGnC+jWlS>W5ZNE??F`{cTW%E zF{w-{N$ZJQPJr-#bS#6OS9TQxQeo0swxSC*f_w&weXV|9x%yZ-M6 ztg@v(72FA;WZUVh%v_C$THGj(y-*2dwINg4*OwAd9KIQlJ^aTVe6Gj&wZAPn@I_Xl z1`8e-sAbsaaOIR!5z4Q27|=5iq?0akLhX*5QGLl zE*{`T`&#tggKLy^+pZfMtnCzVd-$p9_d*@D9goeQQZ=LNLHalfUAsFr<Byk- zJsG=!mLHfDIEmmad!(4S2WFGK5U07P7Sb(dvlx83AZ+Fnu!tB}VU&JsQeCz3EjPBv zZQq-4IeI1>`{4b^I$7(!P6hrg`6O#mrDlkj2{_EPgtk z5vzAC?~@qP&jOum-?}q4?ymrK#ozb`bzMf`J3<6$z*jW@I6$018qaSr$ETOt?&GV}+911?+eZeCkD^$LSaIxLFXVT9^G8*FMdnh83lWfC@bKiqGXSUy9 z2+mUG(ssU){ES%xHHd(Rh=awGNsW1pfwOhNXI`InjrxhD{{04{Hs%kJGw(~P-ZDL? 
z+VyMit4kU91tu>h!-H38m%fb~GJoD&i0BTWrC;eITv-A}&p34@Qi#P5RlHyg#E~Rz z8ds*BQsv39XQNPvfJeS$d4+skctJw1^yz&D2cp(@$YHyFyHmic8V^-m@dV20Kigd4 z)|5p~-?^pGnb7Ln>~ugqb?@>qo?mNUk!-96v|Lsvvq}mWaDf)At>4S{5%fXGj^rqW zCR%zB&Xc(8jz6e5hZ2G@NY)C@iud@-d_hF*Rw>kFDoJDw-5D6N@5B7k5%*o|0600B zOwrvA2ov?hhZy!Ag#f;7_q?Au8YL~IpoB1ppC0l=`^Y-TFBuE7Q_g~MbrA4*(J}OD znRub9OU(33g)(hc@3Id{7Kxv(n+=nbUy-`kI--d_wd0aOipc-YaBV958AI^uM{!w` z;|qZpSyFk-sfbs8pFD>;ySCjM$dOoEjsrntVTR0OnH}3%8v!EfKWk%Enz5Txdwfo* z<~3<57sPkXK*S9K$1HxD2o{v%0y>RseYq-7A|ruE!=6T>GNO;>aeTnT9M0#a+YIX8 zE6%ZfbfC;&C2{=_L3&{G{hd8$!eBkMPxXj|(+fgX;u?&wh_^V*PZdShp5z(U{3sE& zVR2U^c%t*_-pYN_zyXeer^^p*5$W>2s4{;Cw=I;}SW!k@_C8t;ZG#u_4=^mgb>yAk z>lm1dkUa4e)wtfG0zS{v9{75yS(^m@alJF;@}rwc+(yt6>E$$;(xp>&oz=4pLMP1}2$zZutXjUNcX!kx#%+~Kz#6@KX^V5ZWTDQqz?8lh$Y0M379oLNA_#ufrn|5vGry+0(Y9m{SmmHZ`_axFMu17$46B;~sRJN@=}?zoR1gz4nLYw_}EN z-8q=kVeq=vc9vv}8%oL4+g=h8&-aC7sq5Z;B}(NL@54>(tPg`Ee;V<)y!r6ra(Zg?;v5NQ#(5UWH zRwjq5R$2Ac_Vant__HJ-hgoZC{lCuM-g(7MFRD5bdZ%QFd`$L8^SXb@mY*1g>xX4> z@yY2P%lp=C32?fE1ve6$m@PL7Rj23R^#e<=4*Hjw5zCqek_di9KB&4N0>=+E?qi}ie4wBpm z9HR!wNm~gKs{ms!UX%;Q`ZXr8yK$|D_x)FBdbq7-1jb3+WVar|>T{YFPd+SSl4Hm# z=|G|KJ?|cfypq0~TiJJ3k10%pFCx_YiXc~+;Nwh>t0f;3v$_0}BB2h21m4f&`CEO} z*I0YM0te`Jvxe8fv`2swff0IvBa&UM+fsq0wcA3x9&8W8%mRjn&|2&v;+dio%!EMy z8M{@L6+YfU-8tJyZABpvQn>Pf#0Wbc&fi^WX-kGw{>UL%CH5@O6RA$o7yaCG=F7hN zM;GS9v-RjlYHHYUzM?@i^HnKZ#rGGvP+}IbyZHsAMtwv4khqT_GaJtvc;#@c5H%lc zJCF77-zFK+a72xl2zU?J4{boMzmcIUE=#C3+I*$7OWW=xiOxA4k=W+;Wzf1ma{uEO z{&=>kKN3j?j8kx~d{~$^`@(}rOUTB)4UCUO(f+_D%r3(&Is#|$@b=xRRJI}+A_>V| zxNaZ^;j5Qvi^D^l8wj;aLtCzB-u_Owg9x_s8QpbCwYauFeFU@Ec8rrS7nJI+jRszs38jaH=` z)(zukJik06o8NvZ?B$ho*YrJ_*mkqC&`C_TfH;{mca5qQ|I+ zh&O66pMHz$d*hA4^9MWczT|5sEPK)?{Y#c+1Pf{I$}Cr=y-yR-bU){zUF|Y8r7rPd zgh6)CgM0Sw+-RdAXZlX%KGTyCUNJCRras23y~4vmtWQ6PVLtj%8*e|D9k6lg z8VGre9TPf)Rwq)7#BZ?@uC^p#N5e+54=0n4qQf_*sHPHodL?kb2U38`9&460WFXM2 zwr_?&t8QxnO?#OU!b})-4q{<+eNQ?(EH+Z!X&m4xQEx!szJ6)691ZES^)|9u1>+Ap-u<>~$W z)Cgg=G^GBSB*xIn9%-mA%mEPMS#n}V2bTD+SlI+wZew;Zm~D_O``3ieE^RUl1=wM4 z6;oExNtL>|2*yk=zql-yUxWJ2Eitw*9Jo{gy2lr}y($*1`+LwS!xUiv=Ci+Wq53)TSDx(L-Ar zOxLjyrQRWvd2(!+7)XJ_oZyTmG&DbpYzVZaB0%sX#MEzCFy?MYQzpdiuVVH`^8P#+ z#FS0HMjP*{2-0qK9*UNUUL(j+0SGY7!ABpZ1MAdZ+X-Yo#$@&?EFF1+B~Pb_IDfO- zZfbNM$PTNw*e!&Zn)rezGt6;@gX@{xl~EM;)Db*fjmJVB3j>MwVNl%;D#a&|G8X|c zEoA+bM}UKHnW#5olnFx%8oj5Ms`|{)htxqOkDuC9kvy?OVUl|+mSzJYt%jk)_P$?o z!r^t#?jyDaP7~K!pq6kJT3P8=sJ-4&igI}FLa7b0^H}*QK;HPe1P^W zWWup9Q00P~<2J9Wc-;^fqjqJ7BVu?|F(-^_`jPQEFzhW=p?Xy1&zyfyYFEDn>B_AJAy? 
z!QyE*>TnS~-c<0Y?aC*v&~FbgG!uTaX@0zaafh2(9>@DBxA^$u{NLm+Sd=2tsnv>= zqP&r67Pgdv#Bg1U=9X@j-1xBmSSq(q%*6w>+gZ1wW zlf&_gMRRcxZ>ObxM4 zUs_xa=cLBrhx6{Q#xh2(A62@>HF(dDX8U4jHw`patE&e{io-UXV3K&nRm>AWWa4+z zq0wEk&1n6X4o8<@K3rz+%%&je%EGrNxrzZ$fxyPkj{eDtX!$q z60M(u^*IeHw~_@^%)IEY5WdJqU47mr4+*nh2bbi1mofP9!t8HlIU6Q8t_LyT-sC@VyB?*0 zM>!d~LQHFErglF4*?U$_6wo7U5oUZ`JXIeaav1NTXO}^0w*rTJz5zpGB~dYqQ4N+D_MEHs?ee1hzIG4c-bY7p@14JUIzcoczdQ|C`Yb>Z~yMmM2-^vrUn!{ z%tq`bn7Hokj*sZQ?2VzFudwLak-a^S^Zql@6Ej+*5Jk!33EKhKK)q45Z6iO+nSOy5 zy>W$k7;QL_QWC>>0O0o~UV~wy+11edxwn8D_X(68(BO;; zNaD!~225`-rDw!F1KQi_3PBXj^+;k_{>DtWcr99bRo*6wxw1aV3J}&ifPUclU zli!}W-jT5?HYm+Bi;#9!x%iB2HId^nmgZ3rCOcUB!wfR|Jufs*P6HZZ#{0ajKg8sE z@QvIKI4q5j$X-T{3jzSS8eyIoZaqq4Uxuk~P+ zg!B?(#$w814)8_*#15JU?JHrAx$nUAhkq)_XHwDSv&2bFbjasy-hhF)k`Hn2cirpy zf*MNd*cDDB%0wB<496c6wI5^hWf=E_@n;sP^R-SAN+0Q9cKOxqKHaX3_!1J9M(*J^^#f9Nd)s zlb85Q2Ev%l2$?C=W8dtC3UX9k05GXBr-U)|suI0Wkv2!%>d;6f`tI@(!t`@GBrEY{ z?wJaN@5+S~*xJVb*_L2TAY|9k{Gtqb>mFsvb{kwBsau<{NuHnV_Z;RKu+N(L-U0(* zf-Z9>gup#w>TcL$@}R!*&S^_hif?ajU&mocF;_zyFg|x;Tdk1wzmrwjz$B|U1EMV) z!A5^c%))OnRl35v47+y09<0gDJyD@wAL%-!eP|BbjvDQG!xKW1;4$x99dqb}%tofq zs94h^e%Dw-;jp_B`^R@_&wA|g`GNzEelG{zFM~41RGwc&Yi^+5kWIgr9Tes@X&BS| zn4^!ekw(|Up8H$+M!(#&_1CkbKFT?Ik8TnzNeG9P#z&#==UHbtXYQ!VOCZ)*^l0yz zNPr;Wbc|_2N_5v|>q;aS2C~7!QI{zSZK?chk%8f~BK1I!N5svdU?(WDb}`30WxR$K z#0(-sUQEEzzCby6ux(%l9yoSS5<4)Xbh$~;BLwmU2Ijvu4Tx(3@VfXD5L_f&1Ey(M z{IM7>gx6LMH3xe0{>>`mZAJUlI^PI12j4h( zLtF@`O~iY_-Ld9t99A>)aS;56@AcGP(#GEZOo)tl-3Zv>lACc(ZD3;Yf;U16N|~pzA zS(REz!b6k>Dh^)JLR*mast~@C4dAVgZH2&ZKibh$azmu9h_Buc@-`2hOKYS|z|`dm z0TX`~%JSJ@sV%rP`0`|Ax|0!A+@e=S|N6pfSlCs#51@&|n(;E%M?VOMmJ{PSV0+`$ zcG{c8%j(Nc3CWPVU7@RB2FCt`f~%IK4%ug`R6-oxq0 z?5;JoF(lYp@5GEVeZ~1Fq?o^P{JKN~06_Ig`!P8K0gP(a);UO_ETe?S5Wtgg}ip(Jli51bqwGXjigsKr{k+R`Ll72|@&GKbq z0g|Ab^Udb%ZV_1%>{W|64{guzA>kb8##pg^=5gOc1I_I#$)!OJet&XyT5L_CKEDz+T&E)?-%c>Y@pVBW?+5keG7VBRaP z!GxBCTt)tkaJ1@7AU6oAKy~U6Y9(OG*+L!?>qLzD0U?|w{{3u#(_1tH+0Gr%DCe12 zWG#vqr1P1vApAg58cwA8ly-CF@}W%I9V05aE?cK+{AWRTWDmoz87cti-E;1%v9$=w zDui#KqX5Z&k;-*lvG-k(EBjN(nSlwJ zc?_;)ED4W_8G`krJ8u)@8WMiyIRn)Y7}wkqNUZzaai1@qOqR@yV#)RP`qURf{4G+J z2lnqbx;&b1F18o!`u?y4 z5UC_5S?(+=DnsIMB%N281Q=Eq0jNGgf`PXfdV9^=llog4Vwu#baR2@^56p09nYSwg z&2QN)7I?2=Ol0TQus!&#@Ticb5yTHB!FG)d?}x$Jmz$8=5Mzzc0`GBmEEN*_L)ggE z{jAZ*?qGjn?m|+ec1Qt1Uc~e`*4@}>Wh6Fz$NaMz zt6pe7VfN!Qa%ymz5T4+4)gCh&(i70F){35J{Rnb}B*-lxWQ$HOu^~-yQsAo+hGg6K z)-r-l7gx_f1U!tF#b#@6ZtU#4MvboIM-OVU7yA~_0PFV%7Id8Nq$KdcBU!b}JUAan zj^2inVjw%6MnJygu`oygFE*`iipi9mV4%KLy&|J*&~H|AX|7q#|5>5j1wx>mWe9ZC z{WruE^pdMu)KA3m7VC>5uZDvL=9 z2Sne#{6~lf0#=bM>asGlF(Gn7bZQW`Y-u(14$84RGdQr-01wA>eawsf#>e$c3USXL z2*FoDJMyPPwIjM1r!_Bu=4k1cYjn2x9>4EmS45or9uA>1J%azSKA~%d>uz0s;S$`N zf!681#&Y=}k{P%wOehe5Y@F#k_9%rLA9d_o-_ecmFiEep{Yn=zdB6p7T{D&?X#hiT zeVosNn7Rf}h;hPF-xa-`*T6$IwT_ie3L~|gy>nhP%$Kujsd555u+xSxi@%0tyoa2qj%4yPLDK{Of{fLL1@g~bef*wTWwJY64lG=cE(YJyZ7MjOdAdTC+$&yDh^j{I5U zqh}?YrNQH==rQCTGc&*$UBdX3F>ljgT} z9{mVacG$aB#a0R#qL`3q{D=3aQk;~DUPhQxS{-CN_Xtbr#WKaj^o@=`H>JlqTU2if=S{GD5;A`3Oa*#nK4D#YCo2IBbd z+Dvpecdi}IsH0z(hmXox(_5`_EyvM=j$~KMRF5=O>nD6cnH^4FrvEs7VV1%5jt;Ys zlV+oDWVn(Pkjjzhok7D{=~Du^K)TUpS!>7jV~j!o4WtoEV6$It6tUWC(HUvUCfRs}-V zR;#o%@l>ILz;|nhpg1lQ)tN4u+42X(wFG=^y^1e6K-K~YygUGrLo&Lij6 zUCTq}eOs3YYwMdEMVtvn)d6l9gf>q-fDIatAFqCcn9>y4)^os8Uu4w8;vn+#oxyBs zh?M3_im!jZP8vhp^qS@`O5Q$Tf*D3U54iw1{CK4f7chf81DJ5Dp8}D9#wz9VIBWvi zqAKfHt26y}PJ9SWuN<>NrJFG!amYaM^+<_8f^Xi(_@5e0cgFo38!3QqE!Cz`!9-5z z2seXZWM?kVTM@#?V}oDK1v5-D-J(Y!Z2;2ClS84Ad5R5&&=&XSRP>J_<=<#*X{F)l zImBApbFV*=KPZEk(ZQS*5eWw!UR5%J*Gf=^@vOYqcm;J%g2L*&pE`{3H2#W+Cw=jA 
zVv~6!aIO$THCKE@^5p%$(rA7TTy*t(s1UF_1!9Vmd>EI2k4W%*+_ZUd^_6L6>z*+SVF=|*f0O@{mX-Y<7#G=vY)p&Jud8%K*yujK6j1_8>|F9j zr2v9C0DL`-VLTbu{->H!WY?ILJI2_MW;Yr4chPE3gw z9y>$g9{5n2o6BQqzGT(+`mr6JJ-R}9W?(HuE`Zzor!Zf;g@XfRZZ}}7-%e+epkM>H z8VFH%+&e2(zg7S<=R=5Hf-_~97A6bz7VsQq>@dX)ELPRA5ovF^KZ?5T8X#G9P=gS~E6gXW7k;)V>$0B*G_^|Sg)Sswo+hNbk28=oo*ZYcY zO8|fg73Npo#f*X?!<+oNHo$+d11)!r3qZ-kwYScX(M27~QQ^UfddW5brcMDPUa}a? z2BR`P)L^ki^Lrv2yPwO3>B#}^au}|+}ypt0CDgRO-q08GFz1oQruESc`Ink zSgBb7yfiASAQ%Jkmmlu}$Gp&jXGY|gm3G8<7>V}Q>?Vh)>bLHD9fVKuNU^hXo^X4lUZOvUrun#-WXonvAruBt?2h<(OSnF5X>K%$1 zbyDAcJsj2;a(&U5a~n?YleTQVSMx)wEM*2P!OqYuxOO;8dOj;~R~&HX=1q#2Q)z~W zlp%YOTxB{KrdJG(T$q?YuWEV?%8VSGLQM4>w<$!6?WZ{Pf9!Y=vSK1VBN8X>L0gjE z3|ksN_@W*f`H*>xcFLa-fLePC%7pW(G0H9j9!?c_*3`|-X?rtH=x2stfZ=#h&4%ta zZZYye>V-kDp`s+OH;>eEvFu=El4jddZ(yL<~Dsh5elp2pGCEh<58pSd<&~-X)dX zgXgNJ^151Nc(SQ;GJ-6O!pMt;0zSy0?T8INlZmfr$eep={^Ee6_+ripz;-4_^zGjY zS$cB1y50D=5=-iV;nwKl6!@Bw1m}v=Dh( zvvV22we7cI6N&1{4c~9guf6BnGW5%I8Yk-pih(&p`huN8C4q1MXeMEeZ2}gVr`cB8 zSr;SRjEOYvdm9Q!o2z%*GeBc+%6=df`D@h|iTMOl{&4sYfphK=E?I?Q&rZW?Ebps# z*T3WVlxN9;By#MK>l;Jqt60pQ?=H%FG9acNN;N!~1LbH}p#{o9!e7##-BY7+C;Dt< zY1{D%(^WRXAJPJVo4%=p4PSBZ{B!!wJ-2^VHv4^vpi)bbY}M?*MswVSucit;@h@S+ zjM)BB5Vst^8bNgTA>6g~*ub#=E^NBlch4IVcOxe|VeiiQcrbXYumA(Oa{9zLivsds z;pWd@5zE~OUE|%R7lzvljrCETT(y;+j=TOF-wO?0z2# znMR2*L~UH45W}qvfl8I9vSYZi%Ru$P!wlnAd@K8>8|crkwC%YdxVasbs9s3-alP~^ zb?h;t9OX=|q+=Hp|BkT<)DGrJZ4^B-H;#$T+Bm{qeN`0yaL=RvUbt8Vb_2`6K1k?m z+JZ0V>gHXY;-pW~)a}Yl8mqWs6h2ul{-~g6MYFmEN-Gc0Mmcgo{CI9?zS!cw>x=Vn z7nSa0K$ryf0Pm%~(QEVh=E$cP5`S;O{r2X%<#o#w_CTq)jY88_ z8~9f}P6$y!O!{-%=b%Xz4b?C9>yE_y&qb_F`3)_%OxwPjy974MNFkOBd0@7*m{ z_re)V1Ns6wNQ59r8c*>NJ$t!zQXt;nG#ZoIz(5}0Bz0N}rYW zVgrb;R0@uWJ>sarTqN^OMWf9{>wz+4hv6&&ahJxI&u>ggyZN1G9!mcH{5pDx_B9rP zXQ&{N>=iO*p9Q*?bB$*dhCm49CmI5*@Nv+dybZyUi;K8VsV@ew=@PKSGDb43t6@uM0k{N9$~@GpFhkfHxzH_; zTOhcFv_oKqfM4%U&c^WxWgcazXC!9-g9PA9ARAFCv&eIOw z^IZ5~;SprCKIi)ms`jZyfFKc8`HM~`7Re;)w)l(Ua~AN!>g7!NB1Oq)Ljc?u&GcHV zv{E^xX&*9U-p4>j7U%q}AxBj}XNP<+kpkNvHcY_{oW#5X-QR6JHN{?gru6A~$@S%F zJ<#PDC{0t_Vk?xLJX==BPry{9g$TD5ApxrZ`r&Zh_4nA9vIg;lOT|g(c9DMZMv2tX z*ds#jgkEaRGibzvap=SK#b43~uFw`Vp#mjS zcWs{X%WAZ|1n4!Un?{Dna2o8MfG^l^4`HNgp^fagUlOqx5{~{t(fj;q-(CSJ{IJ-L z)CtY+@zaAP;6}cQkWb9K>UHd}49xBI)zH{A?N+3P6M5tCRfCL4f zz0dTU(E38RU!Ub)r+LRht!wdIG$>tE!#u&8FQxBTLKWKTp&Jnz<+}NigKaBfHpF$a zwzQ|089knDHy2K)&2QWD31~%3P6OLa@YePp;UfIW1;21WTiz)6?}Ctv>gj)_3ot!Y zh5I##$qd|-$>7Ga!z{3R6iO(kQbRAi8zCkOc;!23RnJIl%S58hB=JV`g}g!dnz~Pc)~7qqRM5ZO0KpnTO1fOzDWI2QnX4S*nUGMz z^!;W<6H?y5JePu|D~Fe43ViB>)3a}w&=v)XLr_`cE6W7|Ig!g_XaUG$X0YY|6(bs^ zZR8P=Dk(lC1vk-~mD6D)Q=HJRLP6qfec|#2=s}V6s|V^VM&X3?CePz3D<5dMF}7kN z@eE03u<}>}-q}miki<+D<&Z3^x^Pi?+~fR!qc^c67FYG)h-VR;9aoME2u||k{s5!T z)%x!3bMVtse%WgfUr68~dXC3Z$lXyfV=yrY*8V~E-a2RzNI6zjijgHKEB;JSQx9g; zg`tA6dS*_Ctdb$yMPwtY%!f zky?e_7`P$x+@V^{Am5Ur_}RU-&lbDYBCWX~P~HRevwvCE=Ks@XmQ;y*uBjTvXA2fC z|0vSus(*$_qRwqC|CIE2rZ<|}ihE5Fy`T8)=^x*0JcPK?!Ihv}&TQcAvy9^N;H=nl zqNUn^tm8}nFQpY`l(zk$${WB-rh7kbGKBCI>Ta=PinwC3@uXSjZp2h&hujoCWwgE= zuCISVvtwCXG|35wIU!Y`jx5Oyf`2b%@_&^T9vSPxQ900wJn_F77gIrFNAlo{y`O#Z zW!9@8ohKtZ&v!(|cWVCE#Z;jSmm2jj8!T0CqXk5Q2WokQx}QxRAiLkN@02C@QYZN3 zJE(}t&IP>uZK3HX$`O<9847p`XU6~M$AVlSr$grU%8ORiHIqwVbf6&UvF$j9L7a8| z2+*eaPLW=qMa$Llq&Sk1(#-zH*O*q3W8pw#HMYy#io;uitS`Rht${|T^i2g>mT4Bh zV{B6)rk%#DumbYAH*t|okD(kob-?uE2g?5Y9{oG7L4oBU`OE|s-hvSW*yE$G4D&OU zR)2xVehJw_MJ>jdp1>i9`0kpZQUp|jG38)fW&K|Blf)E7)@CNEo^X<}bfpXqVa+M( z#F#_xA+_UJv~THXMCeR3tKf&&%v!V2Sn1Ns*o9J`O70N*j^Rv#yIPiQhl_D%v@nqI&=<6M!OAm_= zi$<4sp3BT%a8tVp{=AB>)t&&XlJ7~ncrF5;CU*15=V~ZYf;n_V{@FXm|J&Xj|F}Hg 
[GIT binary patch: base85-encoded binary data omitted]
z7K?;t(9hUVA%DlEb40j25&ysp`_$^ul?@#sOfvOTFWuQ%l^G2wnY6r5&*9s-SAzPa ze&zAhz*`x`n;qYT`oNu_3F|x~Mrf53@rCzy?lF&()%%%gKS2u~1B=hcPbv$cKKfP} zUp9DBMRtVmr~n+vH>a=NOK=1jlh~Ng|#2n{(rDhhUn|J?uy867vjXvUMjhi;vA8K0wy|@9l5@rex5g&Y#eD z-~l=FQ2j|6P%fpOUW6E-i?%mtjn1i*Z8|$PF@E29Gal z(~sB^`Xk_EA%HDPB3vD>tWiA#QYN2S+;OmZ602t#C!51}sQm(RcjJE`&R)D6_lm>KIFvqvCeH^}X^*5}}fGk}zxx^v^{5-rb+Z0OkU%{@HthRUOAD_m~ai zr5?-K*S#5>agz5@&ftf_PzECarn6luLhNbJW#xt5CsI3Qv)rEAn~G%mPQZ)YA6fp@ zsLXcJWVzdOn>!!^u6KRAxr!Y%&Ag=c@~gE;5((xgh~TcOW)pZ~>S4S$OZR2*R4xsz z$@%BDyXEhx?(Lb{;=kjcCY5!c@Rm8V(fQ{Zip;D26Ml~;uV4-^g^Kdl3eotcdz&Xl zUNTuzxy&5N^y;cEgJbk9t?SV{ZId4taev&!cZFZId#T`hS{~0E^?NPM^}G`4A7wBU z+zi%IwjVA}awp4gu!fy?|K5Fig>0=h`m;c7Rz10|&B#?B&s68odR}AwX(0QG()#aj z6kmzLzd7zicDqVMH(o6-xayXSBw>bRQ;qGjaL)#X^COG#U!o;t6r-qd))W+`n_iX=8Sk-=EC}(V!BdAOu(uxvO5U1g5?? zfeC)vZ$oRX4_+?J2%EL`PSKsni(9i!N4op00j5Z(uO7QKEO5P{RB%!%fXj9M(EvV6 zNx@l@R9j3AO>Yf9XYXeVr1YJ-BfEeb`^z|* zTDAeBIdzKC!;0d&z*)E=of)fKmv@SHR=R9m+ThOfWWnP*pLcCsviH=X4HdwuufOm4 zo!WJYai`||i1j&s)BS+49;1KHaz3MjeqFi~gXUOS_MH4OAJo#Tp9rk^<5{jf1bU}V za0cV!lGSgozbQN4HrsE4b!-FE;g>sZ*ShURL{L4az>_O2kJkua_gFqP@<6Wo)v40` z57hFMs{~(M5fS@euy3X38&l2grzU-H&kYG} znq3m98oK22t76&DLGCj=p2ws{@h!iwD)CIByUY`w64|YvI4Yi5u-f$5JPqhsd2Z9J zYT!!bj{>|){|K>mcD;HRnJ|w@ZlXS8e*OFVX=06!ofH_~@2CknYYYo@a5J{!fSiy* zj^JsF$2mHV0?M-l-+mE3t`k?ZU}d=a)-cW~k&lj~#RGSS{C)8_<#E*G7Uf%oPc6f^ z^b&!!se@if{(5Fu{HOAbO!>ce zH)3ng8Mj^E61)HW>oXglZDg^RUMBc?8N&*(`BwFxB_5w`|NK?@Jh;90U#ywf-KQVA=8dO84Y-r?!^MNImRZ9nibmTW@E|k~`;iRl4j3x5UmdK II;Vst0DFzYiU0rr literal 0 HcmV?d00001 diff --git a/benchmarks/benchmark_headdim.py b/benchmarks/benchmark_headdim.py new file mode 100644 index 000000000..5e5ceb2f3 --- /dev/null +++ b/benchmarks/benchmark_headdim.py @@ -0,0 +1,196 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import csv +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func, flash_attn_func + +try: + from triton.ops.flash_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def flops(batch, seqlen, headdim, v_headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 2 * batch * seqlen**2 * nheads * (headdim+v_headdim) // (2 if causal else 1) + b = 2 * batch * seqlen**2 * nheads * (3*headdim+2*v_headdim) // (2 if causal else 1) + return f if mode == "fwd" else (b if mode == "bwd" else f+b) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(q, k, v, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, nheads, d = q.shape + v_d = v.shape[-1] + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', 
h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=q.dtype) + + +def flash_attention_pad(q,k,v, dropout_p=0.0, causal=True): + batch_size, seqlen, nheads, d = q.shape + v_d = v.shape[-1] + if d == v_d: + return flash_attn_func(q, k, v, dropout_p, causal) + if d < v_d: + q = F.pad(q, (0, v_d-d)) + k = F.pad(k, (0, v_d-d)) + return flash_attn_func(q, k, v, dropout_p, causal) + elif d > v_d: + v = F.pad(v, (0, d-v_d)) + o = flash_attn_func(q, k, v, dropout_p, causal) + return o[:,:,:,:v_d] + + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + +save_csv = True + +repeats = 30 +device = 'cuda' +dtype = torch.float16 +# torch.cuda.set_device(5) + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [(32,64),(64,128),(96,192), (128,256)] +dim = 2048 # qk dim +dropout_p = 0.0 + +methods = (["CustomFlash2", "Pytorch", "Flash2_Pad"]) + +if save_csv: + csvfile = open('flash2_attn_time.csv', 'w', newline='') + writer = csv.writer(csvfile) + writer.writerow([ + "causal", "qk_headdim", "v_headdim", "batch_size", "seqlen", + "time_fwd_CustomFlash2", "time_bwd_CustomFlash2", "time_fwd_bwd_CustomFlash2", + "time_fwd_Pytorch", "time_bwd_Pytorch", "time_fwd_bwd_Pytorch", + "time_fwd_Flash2_Pad", "time_bwd_Flash2_Pad", "time_fwd_bwd_Flash2_Pad", + "flops_fwd_CustomFlash2", "flops_bwd_CustomFlash2", "flops_fwd_bwd_CustomFlash2", + "flops_fwd_Pytorch", "flops_bwd_Pytorch", "flops_fwd_bwd_Pytorch", + "flops_fwd_Flash2_Pad", "flops_bwd_Flash2_Pad", "flops_fwd_bwd_Flash2_Pad", + ]) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim,v_headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, + requires_grad=True) + f, b = time_fwd_bwd( + flash_attn_func, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, "CustomFlash2"] = f + time_b[config, "CustomFlash2"] = b + + try: + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "Pytorch"] = f + time_b[config, "Pytorch"] = b + + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + flash_attention_pad, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, 
"Flash2_Pad"] = f + time_b[config, "Flash2_Pad"] = b + + print(f"### causal={causal}, qk_headdim={headdim}, v_headdim={v_headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, v_headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, v_headdim, nheads, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, v_headdim, nheads, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + if save_csv: + writer.writerow([ + causal, headdim, v_headdim, batch_size, seqlen, + time_f[config, "CustomFlash2"], time_b[config, "CustomFlash2"], time_f_b[config, "CustomFlash2"], + time_f[config, "Pytorch"], time_b[config, "Pytorch"], time_f_b[config, "Pytorch"], + time_f[config, "Flash2_Pad"], time_b[config, "Flash2_Pad"], time_f_b[config, "Flash2_Pad"], + speed_f[config, "CustomFlash2"], speed_b[config, "CustomFlash2"], speed_f_b[config, "CustomFlash2"], + speed_f[config, "Pytorch"], speed_b[config, "Pytorch"], speed_f_b[config, "Pytorch"], + speed_f[config, "Flash2_Pad"], speed_b[config, "Flash2_Pad"], speed_f_b[config, "Flash2_Pad"], + ]) + +if save_csv: + csvfile.close() + + + +# with open('flash2_attn_time.plk', 'wb') as fp: +# pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index a1a50f2f1..75ab69644 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -12,6 +12,7 @@ #include "flash.h" #include "static_switch.h" +#include "static_switch_headdim.h" #define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA") #define CHECK_SHAPE(x, ...) 
TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") diff --git a/csrc/flash_attn/src/flash_bwd_launch_template.h b/csrc/flash_attn/src/flash_bwd_launch_template.h index 404643788..a1568285f 100644 --- a/csrc/flash_attn/src/flash_bwd_launch_template.h +++ b/csrc/flash_attn/src/flash_bwd_launch_template.h @@ -321,159 +321,3 @@ void run_mha_bwd_hdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { }); } -template -void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 32; - constexpr static int VHeaddim = 64; - int device; - cudaGetDevice(&device); - int max_smem_per_block; - cudaError status_ = cudaDeviceGetAttribute( - &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); - if (status_ != cudaSuccess) { - C10_CUDA_CHECK(status_); - } - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 128; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB - if (max_smem_per_block >= 104 * 1024) { // 104 KB - if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - } else { // 96 KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - }); -} - -template -void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 64; - constexpr static int VHeaddim = 128; - int device; - cudaGetDevice(&device); - int max_smem_per_block; - cudaError status_ = cudaDeviceGetAttribute( - &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); - if (status_ != cudaSuccess) { - C10_CUDA_CHECK(status_); - } - // printf("max_smem_per_block = %d\n", max_smem_per_block); - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - // Changing AtomLayoutMdQ from 2 to 4 takes the same time - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
- constexpr static int Br = 64; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // printf("smem_size = %d\n", smem_size); - // printf("max_smem_per_block = %d\n", max_smem_per_block); - - if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // A100 shared memory spill - // run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // This has a lot of register spilling - // run_flash_bwd, Is_dropout>(params, stream); - } else { - // if (params.h == params.h_k) { - // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // } else { - // } - } - }); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - // run_flash_bwd>(params, stream); - - // run_flash_bwd>(params, stream); -} - -template -void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 96; - constexpr static int VHeaddim = 192; - int device; - cudaGetDevice(&device); - int max_smem_per_block; - cudaError status_ = cudaDeviceGetAttribute( - &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); - if (status_ != cudaSuccess) { - C10_CUDA_CHECK(status_); - } - // printf("max_smem_per_block = %d\n", max_smem_per_block); - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 64; - constexpr static int Bc = 128; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - if (max_smem_per_block >= 116 * 1024) { - if constexpr(!Is_dropout) { // 92KB - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } else { // 116 KB - // This is faster for dropout since we don't have many registers to spare - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - } else { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - }); -} - -template -void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 128; - constexpr static int VHeaddim = 256; - int device; - cudaGetDevice(&device); - int max_smem_per_block; - cudaError status_ = cudaDeviceGetAttribute( - &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); - if (status_ != cudaSuccess) { - C10_CUDA_CHECK(status_); - } - // printf("max_smem_per_block = %d\n", max_smem_per_block); - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - constexpr static int Br = 64; - constexpr static int Bc = 64; - constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + - Br * Bc * 2 /*dS, P*/); - // run_flash_bwd>(params, stream); - // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). - // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. 
- // run_flash_bwd>(params, stream); - if (max_smem_per_block >= 144 * 1024) { - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // A100 shared memory spill - // run_flash_bwd, Is_dropout, Is_causal>(params, stream); - // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); - // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - // run_flash_bwd, Is_dropout>(params, stream); - } else { - // run_flash_bwd, Is_dropout>(params, stream); - run_flash_bwd, Is_dropout, Is_causal>(params, stream); - } - // run_flash_bwd>(params, stream); - - // run_flash_bwd>(params, stream); - }); -} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu index 010fbd630..9834bfbe4 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu index 53e334b12..8bfa8623c 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu index 1bbccb862..35ce26dbe 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu index ba1916590..17521c9d2 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim128_vdim256_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h new file mode 100644 index 000000000..8fd4acf28 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_sm80.h @@ -0,0 +1,42 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim128_vdim256(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 64; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // run_flash_bwd>(params, stream); + // This is faster, in the case of sequence-parallel bwd (where we need fewer registers). + // Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why. + // run_flash_bwd>(params, stream); + if (max_smem_per_block >= 144 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // A100 shared memory spill + // run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd_seqk_parallel, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + } else { + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_bwd>(params, stream); + + // run_flash_bwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..222fa2cb0 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..61e47b53c --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..0b5f0c766 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu new file mode 100644 index 000000000..3fe80fd91 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h new file mode 100644 index 000000000..71e550db4 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_sm80.h @@ -0,0 +1,22 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim192_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 192; + constexpr static int VHeaddim = 128; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if (max_smem_per_block >= 136 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} \ No newline at end of file diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu index 621e9f679..7023d4741 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu index a87d7b453..0f6371b41 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu index 0f8b1fec7..285bca814 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu index 6d2f207fc..8be40bb82 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim32_vdim64_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h new file mode 100644 index 000000000..9ce14f6a5 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_sm80.h @@ -0,0 +1,32 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim32_vdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 128; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB + if (max_smem_per_block >= 104 * 1024) { // 104 KB + if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + } else { // 96 KB + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu index 740f0baa8..9d18044d4 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu index 34df4e575..0ceb99220 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu index 5e9428a4f..543f16045 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu index b0912ed91..771708192 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim64_vdim128_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h new file mode 100644 index 000000000..b09d032a5 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_sm80.h @@ -0,0 +1,57 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim64_vdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + // Changing AtomLayoutMdQ from 2 to 4 takes the same time + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // This is slightly faster. We want to split M more so we need fewer registers to store LSE. 
+ constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + // printf("smem_size = %d\n", smem_size); + // printf("max_smem_per_block = %d\n", max_smem_per_block); + + if (max_smem_per_block >= 144 * 1024) { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // A100 shared memory spill + // run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // This has a lot of register spilling + // run_flash_bwd, Is_dropout>(params, stream); + } else { + // if (params.h == params.h_k) { + // run_flash_bwd, Is_dropout>(params, stream); + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // run_flash_bwd, Is_dropout>(params, stream); + // } else { + // } + } + }); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + // run_flash_bwd>(params, stream); + + // run_flash_bwd>(params, stream); +} + diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu index 17f479dc5..4bd2c82dc 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu index 229078332..7536e95ab 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu index a502004d5..487006b5a 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu index ebd73992f..9544f59ab 100644 --- a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_bwd_launch_template.h" +#include "flash_bwd_qkdim96_vdim192_sm80.h" template<> void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h new file mode 100644 index 000000000..79ca59f86 --- /dev/null +++ b/csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_sm80.h @@ -0,0 +1,33 @@ +#include "flash_bwd_launch_template.h" + +template +void run_mha_bwd_qkdim96_vdim192(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; + int device; + cudaGetDevice(&device); + int max_smem_per_block; + cudaError status_ = cudaDeviceGetAttribute( + &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device); + if (status_ != cudaSuccess) { + C10_CUDA_CHECK(status_); + } + // printf("max_smem_per_block = %d\n", max_smem_per_block); + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + constexpr static int Br = 64; + constexpr static int Bc = 128; + constexpr static int smem_size = 2 *(Br * QKHeaddim * 2 /*Q with double buffer*/ + Br * VHeaddim /* dO*/ + Bc * QKHeaddim /*K, dK*/ + Bc * VHeaddim /*V, dV*/ + + Br * Bc * 2 /*dS, P*/); + if (max_smem_per_block >= 116 * 1024) { + if constexpr(!Is_dropout) { // 92KB + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } else { // 116 KB + // This is faster for dropout since we don't have many registers to spare + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_bwd, Is_dropout, Is_causal>(params, stream); + } + }); +} + diff --git a/csrc/flash_attn/src/flash_fwd_launch_template.h b/csrc/flash_attn/src/flash_fwd_launch_template.h index 6f51d423e..ba4c29d8b 100644 --- a/csrc/flash_attn/src/flash_fwd_launch_template.h +++ b/csrc/flash_attn/src/flash_fwd_launch_template.h @@ -327,95 +327,3 @@ void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { // run_flash_fwd, Is_dropout, Is_causal>(params, stream); }); } -template -void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 32; - constexpr static int VHeaddim = 64; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - }); -} - -template -void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 64; - constexpr static int VHeaddim = 128; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - if constexpr(!Is_dropout) { - // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower - // Using block size (64 x 256) is 27% slower for seqlen=2k - // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, 
stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - }); -} - -template -void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 96; - constexpr static int VHeaddim = 192; - auto dprops = at::cuda::getCurrentDeviceProperties(); - bool is_sm8x = dprops->major == 8 && dprops->minor > 0; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), - if (is_sm8x) { - if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // These two are always slower - // run_flash_fwd>(params, stream); - // run_flash_fwd>(params, stream); - }); -} - -template -void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { - constexpr static int QKHeaddim = 128; - constexpr static int VHeaddim = 256; - auto dprops = at::cuda::getCurrentDeviceProperties(); - bool is_sm8x = dprops->major == 8 && dprops->minor > 0; - DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { - if constexpr(!Is_dropout) { - // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), - // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. - if (is_sm8x) { - if constexpr(!Is_causal) { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // 1st ones are good for H100, A100 - // 2nd one is good for A6000 bc we get slightly better occupancy - } else { - run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // A100 RuntimeError: CUDA error: an illegal memory access was encountered - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - // run_flash_fwd, Is_dropout, Is_causal>(params, stream); - } - }); -} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu index 795ec67f1..b20271f2d 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu index e1048791c..464e0b283 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu index 582a95236..5af5648fa 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu index bfc09dc6b..62cb67ead 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim128_vdim256_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h new file mode 100644 index 000000000..3900d1fd7 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_sm80.h @@ -0,0 +1,41 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim128_vdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 128; + constexpr static int VHeaddim = 256; + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm8x = dprops->major == 8 && dprops->minor > 0; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), + // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM. 
+ if (is_sm8x) { + if constexpr(!Is_causal) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // slow on A100 + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // 1st ones are good for H100, A100 + // 2nd one is good for A6000 bc we get slightly better occupancy + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // A100 RuntimeError: CUDA error: an illegal memory access was encountered + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..52dcca482 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..cfe937021 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..82db5ae67 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu new file mode 100644 index 000000000..2c5d5c7e9 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_qkdim192_vdim128_sm80.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim192_vdim128(params, stream); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h new file mode 100644 index 000000000..8d259d6ca --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_sm80.h @@ -0,0 +1,19 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim192_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 192; + constexpr static int VHeaddim = 128; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu index 3f80a1fe7..f28444255 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu index e3dba404d..0aa49a111 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu index 5677fcef4..b88785f29 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu index 36b511f06..28c42a9b8 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim32_vdim64_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h new file mode 100644 index 000000000..4c4941471 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_sm80.h @@ -0,0 +1,11 @@ + +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim32_vdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 32; + constexpr static int VHeaddim = 64; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + }); +} \ No newline at end of file diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu index 2869ccfc2..762253d09 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu index d9d444fd1..86a7616fe 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu index 2504c540b..0074f41ca 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu index a5270a3ed..7578c123f 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim64_vdim128_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h new file mode 100644 index 000000000..3ec8ee12d --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_sm80.h @@ -0,0 +1,22 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim64_vdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 64; + constexpr static int VHeaddim = 128; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + if constexpr(!Is_dropout) { + // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower + // Using block size (64 x 256) is 27% slower for seqlen=2k + // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu index a307cb29e..a140b8d33 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu index 9f00dd249..ee39b3da2 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu index 27bb08d03..8943b8922 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. 
See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu index 7843c337c..ce4b051a3 100644 --- a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu @@ -2,7 +2,7 @@ // Splitting the different head dimensions to different files to speed up compilation. // This file is auto-generated. See "generate_kernels.py" -#include "flash_fwd_launch_template.h" +#include "flash_fwd_qkdim96_vdim192_sm80.h" template<> void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { diff --git a/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h new file mode 100644 index 000000000..bd106822d --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_sm80.h @@ -0,0 +1,26 @@ +#include "flash_fwd_launch_template.h" + +template +void run_mha_fwd_qkdim96_vdim192(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = 96; + constexpr static int VHeaddim = 192; + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm8x = dprops->major == 8 && dprops->minor > 0; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square), + if (is_sm8x) { + if constexpr(!Is_causal) { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + } else { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + } + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // run_flash_fwd, Is_dropout, Is_causal>(params, stream); + // These two are always slower + // run_flash_fwd>(params, stream); + // run_flash_fwd>(params, stream); + }); +} diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu new file mode 100644 index 000000000..db38107b7 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu new file mode 100644 index 000000000..62cdffd8a --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu new file mode 100644 index 000000000..566dbf250 --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu new file mode 100644 index 000000000..9f3023f8f --- /dev/null +++ b/csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/csrc/flash_attn/src/static_switch.h b/csrc/flash_attn/src/static_switch.h index 8e663d8c3..3cb31f9c6 100644 --- a/csrc/flash_attn/src/static_switch.h +++ b/csrc/flash_attn/src/static_switch.h @@ -113,52 +113,3 @@ } \ }() -#define QKHEADDIM_VHEADDIM_SWITCH(QKHEADDIM, VHEADDIM, ...) \ - [&] { \ - if (QKHEADDIM <= 32 && VHEADDIM <= 32) { \ - constexpr static int kQKHeadDim = 32; \ - constexpr static int kVHeadDim = 32; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 32 && VHEADDIM <= 64) { \ - constexpr static int kQKHeadDim = 32; \ - constexpr static int kVHeadDim = 64; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 64 && VHEADDIM <= 64) { \ - constexpr static int kQKHeadDim = 64; \ - constexpr static int kVHeadDim = 64; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 64 && VHEADDIM <= 128) { \ - constexpr static int kQKHeadDim = 64; \ - constexpr static int kVHeadDim = 128; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 96 && VHEADDIM <= 96) { \ - constexpr static int kQKHeadDim = 96; \ - constexpr static int kVHeadDim = 96; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 96 && VHEADDIM <= 192) { \ - constexpr static int kQKHeadDim = 96; \ - constexpr static int kVHeadDim = 192; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 128 && VHEADDIM <= 128) { \ - constexpr static int kQKHeadDim = 128; \ - constexpr static int kVHeadDim = 128; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 128 && VHEADDIM <= 256) { \ - constexpr static int kQKHeadDim = 128; \ - constexpr static int kVHeadDim = 256; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 160 && VHEADDIM <= 160) { \ - constexpr static int kQKHeadDim = 160; \ - constexpr static int kVHeadDim = 160; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 192 && VHEADDIM <= 192) { \ - constexpr static int kQKHeadDim = 192; \ - constexpr static int kVHeadDim = 192; \ - return __VA_ARGS__(); \ - } else if (QKHEADDIM <= 256 && VHEADDIM <= 256) { \ - constexpr static int kQKHeadDim = 256; \ - constexpr static int kVHeadDim = 256; \ - return __VA_ARGS__(); \ - } \ - }() - diff --git 
a/csrc/flash_attn/src/static_switch_headdim.h b/csrc/flash_attn/src/static_switch_headdim.h new file mode 100644 index 000000000..8d2e97b3e --- /dev/null +++ b/csrc/flash_attn/src/static_switch_headdim.h @@ -0,0 +1,69 @@ +// Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` + +#define QKHEADDIM_VHEADDIM_SWITCH(QKHEADDIM, VHEADDIM, ...) \ + [&] { \ + if (QKHEADDIM <= 32 && VHEADDIM <= 32) { \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 32; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 32 && VHEADDIM <= 64) { \ + constexpr static int kQKHeadDim = 32; \ + constexpr static int kVHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 64 && VHEADDIM <= 64) { \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 64 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 64; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 96 && VHEADDIM <= 96) { \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 96; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 96 && VHEADDIM <= 192) { \ + constexpr static int kQKHeadDim = 96; \ + constexpr static int kVHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 128 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 128 && VHEADDIM <= 256) { \ + constexpr static int kQKHeadDim = 128; \ + constexpr static int kVHeadDim = 256; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 160 && VHEADDIM <= 160) { \ + constexpr static int kQKHeadDim = 160; \ + constexpr static int kVHeadDim = 160; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 192 && VHEADDIM <= 128) { \ + constexpr static int kQKHeadDim = 192; \ + constexpr static int kVHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 192 && VHEADDIM <= 192) { \ + constexpr static int kQKHeadDim = 192; \ + constexpr static int kVHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (QKHEADDIM <= 256 && VHEADDIM <= 256) { \ + constexpr static int kQKHeadDim = 256; \ + constexpr static int kVHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/flash_attn/flash_attn_interface.py b/flash_attn/flash_attn_interface.py index ecb3515c0..d9c05a026 100644 --- a/flash_attn/flash_attn_interface.py +++ b/flash_attn/flash_attn_interface.py @@ -588,8 +588,8 @@ def backward(ctx, dout, *args): ctx.deterministic, rng_state=rng_state, ) - dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension - dk = dk[..., : dout.shape[-1]] + dq = dq[..., : q.shape[-1]] # We could have padded the head dimension + dk = dk[..., : k.shape[-1]] dv = dv[..., : dout.shape[-1]] return dq, dk, dv, None, None, None, None, None, None, None, None @@ -675,8 +675,8 @@ def backward(ctx, dout, *args): ctx.deterministic, rng_state=rng_state, ) - dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension - dk = dk[..., : dout.shape[-1]] + dq = 
dq[..., : q.shape[-1]] # We could have padded the head dimension + dk = dk[..., : k.shape[-1]] dv = dv[..., : dout.shape[-1]] return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None diff --git a/setup.py b/setup.py index f80af1c32..7f6f30d4f 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,32 @@ # For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE" +list_headdim = [] +compile_list_headdim = [] +if not SKIP_CUDA_BUILD and not IS_ROCM: + list_headdim = [ + (32, 64), + (64, 128), + (96, 192), + (128, 256), + (192, 128) + ] + # "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu" + for ii in ["fwd", "bwd"]: + for jj in list_headdim: + for kk in ["fp16", "bf16"]: + for ll in ["", "_causal"]: + compile_list_headdim.append( + f"csrc/flash_attn/src/flash_{ii}_qkdim{jj[0]}_vdim{jj[1]}_{kk}{ll}_sm80.cu" + ) + + # "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu" + for jj in list_headdim: + for kk in ["fp16", "bf16"]: + for ll in ["", "_causal"]: + compile_list_headdim.append( + f"csrc/flash_attn/src/flash_fwd_split_qkdim{jj[0]}_vdim{jj[1]}_{kk}{ll}_sm80.cu" + ) def get_platform(): """ @@ -265,55 +291,7 @@ def validate_and_update_archs(archs): "csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu", "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim64_vdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim96_vdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_qkdim128_vdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim32_vdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim64_vdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim96_vdim192_bf16_causal_sm80.cu", - 
"csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_bwd_qkdim128_vdim256_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim32_vdim64_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim64_vdim128_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim96_vdim192_bf16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_fp16_causal_sm80.cu", - "csrc/flash_attn/src/flash_fwd_split_qkdim128_vdim256_bf16_causal_sm80.cu", - ], + ] + compile_list_headdim, extra_compile_args={ "cxx": ["-O3", "-std=c++17"] + generator_flag, "nvcc": append_nvcc_threads( From 255cd5a18a981bf428c9dd9fcbc5948eeee0312c Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Wed, 28 Aug 2024 20:18:11 +0800 Subject: [PATCH 24/46] add optional dim compile --- .../flash_attn/src/generate_switch_headdim.py | 66 +++++++++++++++++++ headdim.json | 1 + setup.py | 13 ++-- 3 files changed, 73 insertions(+), 7 deletions(-) create mode 100644 csrc/flash_attn/src/generate_switch_headdim.py create mode 100644 headdim.json diff --git a/csrc/flash_attn/src/generate_switch_headdim.py b/csrc/flash_attn/src/generate_switch_headdim.py new file mode 100644 index 000000000..f7def19d4 --- /dev/null +++ b/csrc/flash_attn/src/generate_switch_headdim.py @@ -0,0 +1,66 @@ +import json +from pathlib import Path + +def write_file(): + TEMPLATE_PRELUDE = """#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +""" + + with open('headdim.json', 'r') as file: + read_list = json.load(file) + + read_list += [ + [32,32], + [64,64], + [96,96], + [128,128], + [160,160], + [192,192], + [256,256], + ] + + read_list = sorted(read_list, key=lambda x: (x[0], x[1])) + + TEMPLATE_BEGIN = """ +#define QKHEADDIM_VHEADDIM_SWITCH(QKHEADDIM, VHEADDIM, ...) 
\\ + [&] { \\ +""" + + TEMPLATE_BODY = "" + + for qkhead_dim, vhead_dim in read_list[:-1]: + TEMPLATE_BODY += f"""if (QKHEADDIM <= {qkhead_dim} && VHEADDIM <= {vhead_dim}) {{ \\ + constexpr static int kQKHeadDim = {qkhead_dim}; \\ + constexpr static int kVHeadDim = {vhead_dim}; \\ + return __VA_ARGS__(); \\ + }} else """ + + qkhead_dim, vhead_dim = read_list[-1] + TEMPLATE_BODY += f"""if (QKHEADDIM <= {qkhead_dim} && VHEADDIM <= {vhead_dim}) {{ \\ + constexpr static int kQKHeadDim = {qkhead_dim}; \\ + constexpr static int kVHeadDim = {vhead_dim}; \\ + return __VA_ARGS__(); \\ + }} \\ +""" + + TEMPLATE_END = """}() +""" + + TEMPLATE = TEMPLATE_PRELUDE + TEMPLATE_BEGIN + TEMPLATE_BODY + TEMPLATE_END + + # print(TEMPLATE) + with open(Path(__file__).parent.joinpath('static_switch_headdim.h'), 'w') as file: + file.write(TEMPLATE) + +if __name__ == '__main__': + write_file() diff --git a/headdim.json b/headdim.json new file mode 100644 index 000000000..94aa8daf7 --- /dev/null +++ b/headdim.json @@ -0,0 +1 @@ +[[32, 64], [64, 128], [96, 192], [128, 256], [192, 128]] \ No newline at end of file diff --git a/setup.py b/setup.py index 7f6f30d4f..8c899d84b 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ from pathlib import Path from packaging.version import parse, Version import platform +import json from setuptools import setup, find_packages import subprocess @@ -65,13 +66,8 @@ list_headdim = [] compile_list_headdim = [] if not SKIP_CUDA_BUILD and not IS_ROCM: - list_headdim = [ - (32, 64), - (64, 128), - (96, 192), - (128, 256), - (192, 128) - ] + with open('headdim.json', 'r') as file: + list_headdim = json.load(file) # "csrc/flash_attn/src/flash_fwd_qkdim32_vdim64_fp16_sm80.cu" for ii in ["fwd", "bwd"]: for jj in list_headdim: @@ -89,6 +85,9 @@ f"csrc/flash_attn/src/flash_fwd_split_qkdim{jj[0]}_vdim{jj[1]}_{kk}{ll}_sm80.cu" ) + from csrc.flash_attn.src.generate_switch_headdim import write_file + write_file() + def get_platform(): """ Returns the platform name as used in wheel filenames. 
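
For reference, the dispatch performed by the generated QKHEADDIM_VHEADDIM_SWITCH macro is equivalent to picking the first compiled (QK, V) head-dim pair, in ascending order, that is large enough for the requested dimensions. Below is a minimal Python sketch of that selection, assuming the default headdim.json above plus the square dims that generate_switch_headdim.py appends; the helper name select_compiled_headdim is made up for illustration and is not part of the repo.

import json

# Hypothetical helper mirroring the if/else chain emitted by generate_switch_headdim.py:
# the compiled (qk, v) head-dim pairs are tried in ascending order and the first pair
# large enough for the requested dimensions is chosen.
def select_compiled_headdim(qk_dim, v_dim, headdim_json="headdim.json"):
    with open(headdim_json) as f:
        pairs = json.load(f)  # e.g. [[32, 64], [64, 128], [96, 192], [128, 256], [192, 128]]
    # generate_switch_headdim.py also appends the square head dims before sorting.
    pairs += [[32, 32], [64, 64], [96, 96], [128, 128], [160, 160], [192, 192], [256, 256]]
    for qk, v in sorted(pairs):
        if qk_dim <= qk and v_dim <= v:
            return qk, v  # corresponds to (kQKHeadDim, kVHeadDim) in the macro
    # The generated macro simply has no branch for unsupported sizes; this sketch raises instead.
    raise ValueError(f"no compiled kernel covers qk_dim={qk_dim}, v_dim={v_dim}")

# Example: with the default headdim.json, a (48, 100) request dispatches to the (64, 128) kernels.
# print(select_compiled_headdim(48, 100))
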
From 00979f57acf503b0b080ff08ada0007080074649 Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Wed, 4 Sep 2024 12:06:46 +0800 Subject: [PATCH 25/46] support different head kv --- csrc/flash_attn/flash_api.cpp | 20 +++++++++++++------- csrc/flash_attn/src/flash_bwd_kernel.h | 2 +- csrc/flash_attn/src/flash_fwd_kernel.h | 10 +++++----- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index 75ab69644..03bedcb7e 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -28,6 +28,7 @@ void set_params_fprop(Flash_fwd_params ¶ms, const size_t seqlen_k_rounded, const size_t h, const size_t h_k, + const size_t h_v, const size_t d, const size_t d_rounded, const size_t vd, @@ -95,7 +96,9 @@ void set_params_fprop(Flash_fwd_params ¶ms, params.b = b; params.h = h; params.h_k = h_k; + params.h_v = h_v; params.h_h_k_ratio = h / h_k; + params.h_h_v_ratio = h / h_v; params.seqlen_q = seqlen_q; params.seqlen_k = seqlen_k; params.seqlen_q_rounded = seqlen_q_rounded; @@ -167,6 +170,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, const size_t seqlen_k_rounded, const size_t h, const size_t h_k, + const size_t h_v, const size_t d, const size_t d_rounded, const size_t vd, @@ -195,7 +199,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, const bool unpadded_lse) { set_params_fprop(params, - b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded,vd, vd, + b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, h_v, d, d_rounded,vd, vd, q, k, v, out, cu_seqlens_q_d, cu_seqlens_k_d, @@ -396,10 +400,12 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const int head_size_og = sizes[3]; const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); + const int num_heads_v = v.size(2); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } @@ -423,7 +429,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); - CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_v, v_head_size_og); at::Tensor q_padded, k_padded, v_padded; if (head_size_og % 8 != 0) { @@ -489,7 +495,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size, v_head_size_rounded, q_padded, k_padded, v_padded, out, @@ -744,7 +750,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s batch_size, max_seqlen_q, max_seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size, v_head_size_rounded, q_padded, k_padded, v_padded, out, @@ -1013,7 +1019,7 @@ 
mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size_og, q, k, v, out, @@ -1273,7 +1279,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size batch_size, max_seqlen_q, max_seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size_og, q, k, v, out, @@ -1491,7 +1497,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size, v_head_size_rounded, q_padded, kcache_padded, vcache_padded, out, diff --git a/csrc/flash_attn/src/flash_bwd_kernel.h b/csrc/flash_attn/src/flash_bwd_kernel.h index 46634ceb5..1db7a6dc1 100644 --- a/csrc/flash_attn/src/flash_bwd_kernel.h +++ b/csrc/flash_attn/src/flash_bwd_kernel.h @@ -110,7 +110,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params ¶ms, const in const index_t row_offset_k = binfo.k_offset(params.k_batch_stride, params.k_row_stride, bidb) + n_block * kBlockN * params.k_row_stride + (bidh / params.h_h_k_ratio) * params.k_head_stride; const index_t row_offset_v = binfo.k_offset(params.v_batch_stride, params.v_row_stride, bidb) - + n_block * kBlockN * params.v_row_stride + (bidh / params.h_h_k_ratio) * params.v_head_stride; + + n_block * kBlockN * params.v_row_stride + (bidh / params.h_h_v_ratio) * params.v_head_stride; const index_t row_offset_do = binfo.q_offset(params.do_batch_stride, params.do_row_stride, bidb) + (m_block_max - 1) * kBlockM * params.do_row_stride + bidh * params.do_head_stride; const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb) diff --git a/csrc/flash_attn/src/flash_fwd_kernel.h b/csrc/flash_attn/src/flash_fwd_kernel.h index 74c2833a5..2153ba07b 100644 --- a/csrc/flash_attn/src/flash_fwd_kernel.h +++ b/csrc/flash_attn/src/flash_fwd_kernel.h @@ -147,9 +147,9 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi make_coord(_, 0)); // (kBlockN, kHeadDim, nblocksN) Tensor mV = make_tensor(make_gmem_ptr(reinterpret_cast(params.v_ptr) + binfo.k_offset(params.v_batch_stride, params.v_row_stride, bidb)), - make_shape(binfo.actual_seqlen_k, params.h_k, params.vd), + make_shape(binfo.actual_seqlen_k, params.h_v, params.vd), make_stride(params.v_row_stride, params.v_head_stride, _1{})); - Tensor gV = local_tile(mV(_, bidh / params.h_h_k_ratio, _), Shape, Int>{}, + Tensor gV = local_tile(mV(_, bidh / params.h_h_v_ratio, _), Shape, Int>{}, make_coord(_, 0)); // (kBlockN, kHeadDim, nblocksN) Tensor gP = make_tensor(make_gmem_ptr(reinterpret_cast(params.p_ptr) + row_offset_p), Shape, Int>{}, @@ -599,8 +599,8 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons : block_table[block_table_idx] * params.k_batch_stride + block_table_offset * params.k_row_stride + (bidh / params.h_h_k_ratio) * params.k_head_stride; const index_t row_offset_v = block_table == nullptr ? 
binfo.k_offset(params.v_batch_stride, params.v_row_stride, bidb_cache) - + (n_block_max - 1) * kBlockN * params.v_row_stride + (bidh / params.h_h_k_ratio) * params.v_head_stride - : block_table[block_table_idx] * params.v_batch_stride + block_table_offset * params.v_row_stride + (bidh / params.h_h_k_ratio) * params.v_head_stride; + + (n_block_max - 1) * kBlockN * params.v_row_stride + (bidh / params.h_h_v_ratio) * params.v_head_stride + : block_table[block_table_idx] * params.v_batch_stride + block_table_offset * params.v_row_stride + (bidh / params.h_h_v_ratio) * params.v_head_stride; Tensor mQ = make_tensor(make_gmem_ptr(reinterpret_cast(params.q_ptr) + binfo.q_offset(params.q_batch_stride, params.q_row_stride, bidb)), make_shape(binfo.actual_seqlen_q, params.h, params.d), @@ -724,7 +724,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params ¶ms, cons + ((n_block_max - 1) * kBlockN) * params.knew_row_stride + (bidh / params.h_h_k_ratio) * params.knew_head_stride; // const index_t row_offset_vnew = binfo.k_offset(params.vnew_batch_stride, params.vnew_row_stride, bidb) const index_t row_offset_vnew = bidb * params.vnew_batch_stride - + ((n_block_max - 1) * kBlockN) * params.vnew_row_stride + (bidh / params.h_h_k_ratio) * params.vnew_head_stride; + + ((n_block_max - 1) * kBlockN) * params.vnew_row_stride + (bidh / params.h_h_v_ratio) * params.vnew_head_stride; // Subtract seqlen_k_cache * row stride so that conceptually gK and gKnew "line up". When we access them, // e.g. if gK has 128 rows and gKnew has 64 rows, we access gK[:128] and gKNew[128:128 + 64]. // This maps to accessing the first 64 rows of knew_ptr. From feeab17f773c2aeac0454c79647f1f95e74cb220 Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Wed, 4 Sep 2024 12:26:52 +0800 Subject: [PATCH 26/46] add test_head --- tests/test_flash_attn_head.py | 1262 +++++++++++++++++++++++++++++++++ 1 file changed, 1262 insertions(+) create mode 100644 tests/test_flash_attn_head.py diff --git a/tests/test_flash_attn_head.py b/tests/test_flash_attn_head.py new file mode 100644 index 000000000..b2b55cc49 --- /dev/null +++ b/tests/test_flash_attn_head.py @@ -0,0 +1,1262 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn import ( + flash_attn_func, + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, + flash_attn_with_kvcache, +) +from flash_attn.bert_padding import pad_input, unpad_input +from flash_attn.flash_attn_interface import _get_block_size_n +from flash_attn.layers.rotary import apply_rotary_emb + +MAX_HEADDIM_SM8x = 192 + + +is_sm75 = torch.cuda.get_device_capability("cuda") == (7, 5) +is_sm8x = torch.cuda.get_device_capability("cuda")[0] == 8 +is_sm80 = torch.cuda.get_device_capability("cuda") == (8, 0) +is_sm90 = torch.cuda.get_device_capability("cuda") == (9, 0) + + +def attn_bias_from_alibi_slopes( + slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False, key_leftpad=None +): + batch, nheads = slopes.shape + device = slopes.device + slopes = rearrange(slopes, "b h -> b h 1 1") + if causal: + return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes + else: + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = 
rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + relative_pos = torch.abs(row_idx + sk - sq - col_idx) + return -slopes * relative_pos.to(dtype=slopes.dtype) + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint( + max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device + ) + elif mode == "third": + lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) + padding_mask = ( + repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + ) + return padding_mask + + +def generate_qkv( + q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: (batch_size, seqlen_k, nheads_k, d) + v: (batch_size, seqlen_k, nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + assert not (kvpacked and qkvpacked) + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + output_pad_fn = lambda output_unpad: pad_input( + output_unpad, indices_q, batch_size, seqlen_q + ) + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + output_pad_fn = lambda output_unpad: rearrange( + output_unpad, "(b s) h d -> b s h d", b=batch_size + ) + + if key_padding_mask is not None: + k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device + ) + max_seqlen_k = seqlen_k + + if qkvpacked: + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + else: + dqkv_pad_fn = lambda dqkv_unpad: rearrange( + dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + elif kvpacked: + kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) + kv = torch.stack([k, v], dim=2) + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) + 
else: + dkv_pad_fn = lambda dkv_unpad: rearrange( + dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + q_unpad.detach().requires_grad_(), + kv_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + kv.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) + else: + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k) + else: + dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) + return ( + q_unpad.detach().requires_grad_(), + k_unpad.detach().requires_grad_(), + v_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + k.detach().requires_grad_(), + v.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) + + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: (int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. 
+ Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores = scores / softcap + scores = scores.tanh() + scores = scores * softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) + + +def attention_kvpacked_ref( + q, + kv, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + return attention_ref( + q, + kv[:, :, 0], + kv[:, :, 1], + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + key_leftpad=key_leftpad, + ) + + +def attention_qkvpacked_ref( + qkv, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, +): + return attention_ref( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + ) + + +def generate_sparsity_mask(seqlen, sparsity=0.3): + repeats = seqlen // 16 // 2 + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([0, 1] * 
repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + nrow, ncol = seqlen // 16, seqlen // 256 + mask = torch.rand(nrow, ncol, device="cuda") < sparsity + return mask + + +def attention_blocksparse_ref(qkv, blockmask, attn_mask, dropout_p, dropout_mask): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + blockmask: (seqlen / 16, seqlen / 256) + attn_mask: (batch_size, seqlen) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen, seqlen) + Output: + output: (batch_size, seqlen, nheads, head_dim) + attention: softmax after dropout + """ + q, k, v = qkv.float().unbind(dim=2) + d = qkv.shape[-1] + seqlen = qkv.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + scores.masked_fill_(rearrange(~attn_mask, "b s -> b 1 1 s"), float("-inf")) + blockmask = repeat(blockmask, "s_16 s_256 -> (s_16 16) (s_256 256)") + blockmask = blockmask[:seqlen, :seqlen] + scores.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), float("-inf")) + attention = torch.softmax(scores, dim=-1) + attention = attention.masked_fill(rearrange(~attn_mask, "b s -> b 1 s 1"), 0.0) + attention = attention.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), 0.0) + attention_drop = attention.masked_fill(~dropout_mask, 0.0) / (1 - dropout_p) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + output.masked_fill_(rearrange(~attn_mask, "b s -> b s 1 1"), 0) + return output.to(dtype=qkv.dtype), attention.to(dtype=qkv.dtype) + + +def convert_flash_attn_S_to_softmax( + S, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + head_dim, + is_dropout, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """FlashAttention stores the S matrix in a different way. + Arguments: + S: (batch_size, nheads, seqlen_q_rounded, seqlen_k_rounded) + query_padding_mask: (batch_size, seqlen_q_rounded) + key_padding_mask: (batch_size, seqlen_k_rounded) + """ + if causal: + window_size = (window_size[0], 0) + seqlen_q_rounded, seqlen_k_rounded = S.shape[-2:] + S_converted = S + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + S.device, + ) + local_mask = F.pad( + local_mask, + (0, seqlen_k_rounded - seqlen_k, 0, seqlen_q_rounded - seqlen_q), + value=True, + ) + S_converted = S_converted.masked_fill(local_mask, 0.0) + + # Need to zero out things not in attention_mask in case S was initialized with random values + # and some of those values aren't overwritten. 
+ seqlen_q_og = ( + query_padding_mask.shape[-1] if query_padding_mask is not None else seqlen_q_rounded + ) + if query_padding_mask is not None: + query_padding_mask = F.pad(query_padding_mask, (0, seqlen_q_rounded - seqlen_q_og)) + S_converted = S_converted.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + seqlen_k_og = key_padding_mask.shape[-1] if key_padding_mask is not None else seqlen_k + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, seqlen_k_rounded - seqlen_k_og)) + S_converted = S_converted.masked_fill(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0) + S_converted = F.pad(S_converted, (0, 0, 0, seqlen_q_og - seqlen_q_rounded)) + S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k_rounded)) + return S_converted[:, :, :seqlen_q, :seqlen_k] + + +def normalize_flash_attn_S( + attn_unnorm, + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + is_dropout=False, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k, v: (batch_size, seqlen_k, nheads, head_dim) + key_padding_mask: (batch_size, seqlen_q) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + Output: + softmax_lse: (batch_size, nheads, seqlen_q) + softmax_max: (batch_size, nheads, seqlen_q) + """ + if causal: + window_size = (window_size[0], 0) + q, k, v = q.float(), k.float(), v.float() + _, seqlen_q, _, head_dim = q.shape + seqlen_k = k.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(head_dim), k) + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias.to(dtype=scores.dtype) + block_size_n = _get_block_size_n(scores.device, head_dim, is_dropout, causal) + scores_block = scores.split(block_size_n, dim=-1) + lse_block = torch.stack([torch.logsumexp(s, dim=-1) for s in scores_block], dim=-1) + lse = torch.logsumexp(lse_block, dim=-1) + # lse could be -inf (i.e. all values in scores are -inf), and we want to set those to inf + # so that when we do torch.exp(m - lse), we get 0.0 instead of NaN. + lse[lse == float("-inf")] = float("inf") + scores_max_block = torch.stack([torch.amax(s, dim=-1) for s in scores_block], dim=-1) + cummax_block = torch.cummax(scores_max_block.flip(-1), dim=-1).values.flip(-1).unbind(dim=-1) + attn_unnorm_block = attn_unnorm.split(block_size_n, dim=-1) + attn_norm = torch.cat( + [ + a * rearrange(torch.exp(m - lse), "b h s -> b h s 1") + for a, m in zip(attn_unnorm_block, cummax_block) + ], + dim=-1, + ) + if query_padding_mask is not None: + attn_norm.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + return attn_norm.to(dtype=attn_unnorm.dtype) + + +def get_dropout_fraction( + dropout_mask, + query_padding_mask=None, + key_padding_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k), bool. True means keep, False means drop. 
+ query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + """ + if causal: + window_size = (window_size[0], 0) + batch_size, nheads, seqlen_q, seqlen_k = dropout_mask.shape + dropped = ~dropout_mask + valid = torch.ones_like(dropout_mask) + if query_padding_mask is not None: + dropped.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + valid.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + if key_padding_mask is not None: + dropped.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + valid.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + dropout_mask.device, + ) + dropped.masked_fill_(local_mask, False) + valid.masked_fill_(local_mask, False) + dropped_total = dropped.sum() + return dropped.sum() / valid.sum() + + + +@pytest.mark.parametrize("kvpacked", [False]) +# @pytest.mark.parametrize("kvpacked", [False]) +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +# @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +@pytest.mark.parametrize("h,h_k,h_v",[ + (32,4,16), +]) +# @pytest.mark.parametrize("mha_type", ["mha"]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +@pytest.mark.parametrize("softcap", [0.0, 50.0]) +def test_flash_attn_output( + seqlen_q, seqlen_k, h, h_k, h_v, d, dropout_p, causal, local, alibi, deterministic, dtype, kvpacked, softcap +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if softcap > 0.0 and dropout_p > 0.0: + pytest.skip("Softcap and dropout not supported together") + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 1 # 4 + nheads = h # 6 if softcap == 0.0 else 4 # softcap reference impl takes more memory + nheads_k = h_k # nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2) + nheads_v = h_v + assert nheads % nheads_k == 0 + assert nheads % nheads_v == 0 + assert (not kvpacked) or (nheads_k == nheads_v) + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, 
dtype=dtype, requires_grad=True) + if softcap > 0: + # Ensure the values of qk are at least within softcap range. + q = q * softcap + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + out, lse, S_dmask = flash_attn_kvpacked_func( + q, + kv, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + out, lse, S_dmask = flash_attn_func( + q, + k, + v, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_v) + attn = normalize_flash_attn_S( + attn_unnorm, + q, + k_rep, + v_rep, + None, + None, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, None, None, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + if kvpacked: + ( + dq, + 
dkv, + ) = torch.autograd.grad(out, (q, kv), g) + dk, dv = dkv.unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + + + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("h,h_k,h_v",[ + (32,4,16), +]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [False]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (3, 1024), + (1, 339), + (64, 800), + (3, 799), + (64, 2048), + (16, 20000), + (16, 100000), + (128, 128), + (256, 256), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_splitkv( + seqlen_q, 
seqlen_k,h,h_k,h_v, swap_sq_sk, d, causal, local, alibi, deterministic, dtype +): + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 1 + nheads = h # 12 + nheads_k = h_k + nheads_v = h_v + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads_v, d, device=device, dtype=dtype, requires_grad=True) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, _ = flash_attn_func( + q, + k, + v, + 0.0, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out_ref, attn_ref = attention_ref( + q, k, v, None, None, attn_bias, 0.0, None, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + mult = 2 if not alibi else 8 + assert (dq - dq_ref).abs().max().item() <= mult * (dq_pt - dq_ref).abs().max().item() + 2e-4 + assert (dk - dk_ref).abs().max().item() <= mult * (dk_pt - dk_ref).abs().max().item() + 2e-4 + assert (dv - dv_ref).abs().max().item() <= mult * (dv_pt - dv_ref).abs().max().item() + 2e-4 + + +# # @pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +# @pytest.mark.parametrize("num_splits", [1, 0]) +# # @pytest.mark.parametrize("num_splits", [1]) +# @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# # @pytest.mark.parametrize("mha_type", ["mha"]) +# @pytest.mark.parametrize("new_kv", [False, True]) +# # @pytest.mark.parametrize("new_kv", [False]) +# @pytest.mark.parametrize("alibi", [False, True]) +# # @pytest.mark.parametrize("alibi", [False]) +# @pytest.mark.parametrize("local", [False, True]) +# # @pytest.mark.parametrize("local", [False]) +# @pytest.mark.parametrize("causal", [False, True]) +# # @pytest.mark.parametrize("causal", [False]) +# @pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True, False]) +# # @pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True]) +# @pytest.mark.parametrize("rotary_interleaved", [False, True]) +# # @pytest.mark.parametrize("rotary_interleaved", [False]) +# @pytest.mark.parametrize("rotary_fraction", [0.0, 0.5, 1.0]) +# # @pytest.mark.parametrize("rotary_fraction", [0.0]) +# @pytest.mark.parametrize("paged_kv_block_size", [None, 256]) +# # @pytest.mark.parametrize("paged_kv_block_size", [256, 512]) +# # @pytest.mark.parametrize("paged_kv_block_size", [None]) +# @pytest.mark.parametrize("has_leftpad", [False, True]) +# # @pytest.mark.parametrize("has_leftpad", [True]) +# # @pytest.mark.parametrize("has_batch_idx", [False, True]) +# @pytest.mark.parametrize("has_batch_idx", [False]) +# @pytest.mark.parametrize("d", [32, 59, 64, 80, 128, 256]) +# # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# # @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# # @pytest.mark.parametrize('d', [56, 80]) +# # @pytest.mark.parametrize("d", [128]) +# @pytest.mark.parametrize( +# "seqlen_q,seqlen_k", +# [ +# (1, 128), +# (1, 339), +# (3, 1024), +# (64, 800), +# (64, 256), +# (3, 799), +# (64, 2048), +# (16, 20000), +# (1, 128 * 1024), +# (16, 128 * 1024), +# (128, 128), +# ], +# ) +# # @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +# def test_flash_attn_kvcache( +# seqlen_q, +# seqlen_k, +# d, +# has_batch_idx, +# has_leftpad, +# paged_kv_block_size, +# rotary_fraction, +# rotary_interleaved, +# seqlen_new_eq_seqlen_q, +# causal, +# local, +# alibi, +# new_kv, +# mha_type, +# num_splits, +# dtype, +# ): +# if seqlen_q > seqlen_k and new_kv: +# pytest.skip() +# if not new_kv and rotary_fraction > 0.0: +# pytest.skip() +# if has_batch_idx and paged_kv_block_size is not None: +# pytest.skip() +# if has_leftpad and paged_kv_block_size is not None: +# pytest.skip() +# device = "cuda" +# # set seed +# torch.random.manual_seed(0) +# batch_size = 2 +# batch_size_cache = batch_size if not has_batch_idx else batch_size * 2 +# nheads = 6 +# # rotary_dim must be a multiple of 16, and must be <= d +# rotary_dim = math.floor(int(rotary_fraction * d) / 16) * 16 +# nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) +# assert nheads % nheads_k == 0 +# window_size = (-1, 
-1) if not local else torch.randint(0, seqlen_k, (2,)) +# q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype) +# seqlen_new = seqlen_q if seqlen_new_eq_seqlen_q else torch.randint(1, seqlen_q + 1, (1,)).item() +# if new_kv: +# k = torch.randn(batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype) +# v = torch.randn(batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype) +# else: +# k, v = None, None +# if paged_kv_block_size is None: +# k_cache = torch.randn(batch_size_cache, seqlen_k, nheads_k, d, device=device, dtype=dtype) +# v_cache = torch.randn(batch_size_cache, seqlen_k, nheads_k, d, device=device, dtype=dtype) +# block_table = None +# else: +# ( +# k_cache, +# v_cache, +# block_table, +# k_cache_paged, +# v_cache_paged, +# num_blocks, +# ) = _generate_block_kvcache( +# seqlen_k, paged_kv_block_size, batch_size, nheads_k, d, device, dtype +# ) +# cache_seqlens = torch.randint( +# 0 if new_kv else 1, +# # If we don't use seqlen_q in the case of causal and rotary, cos/sin won't be long enough +# ( +# (seqlen_k - (seqlen_q if (causal or local) and rotary_dim > 1 else seqlen_new) + 1) +# if new_kv +# else (seqlen_k + 1) +# ), +# (batch_size,), +# dtype=torch.int32, +# device=device, +# ) +# if has_leftpad: +# cache_leftpad = torch.cat([torch.randint(0, cache_seqlens[i].item(), (1,), dtype=torch.int32, device=device) +# if cache_seqlens[i].item() > 0 else torch.zeros(1, dtype=torch.int32, device=device) +# for i in range(batch_size)]) +# else: +# cache_leftpad = None +# arange = rearrange(torch.arange(seqlen_k, device=device), "s -> 1 s") +# cache_seqlens_expanded = rearrange(cache_seqlens, "b -> b 1") +# key_padding_mask = arange < cache_seqlens_expanded + (seqlen_new if new_kv else 0) +# if has_leftpad: +# key_padding_mask = torch.logical_and( +# key_padding_mask, arange >= cache_leftpad.unsqueeze(-1).expand(-1, seqlen_k) +# ) +# if has_batch_idx: +# cache_batch_idx = torch.randperm(batch_size_cache, dtype=torch.int32, device=device)[ +# :batch_size +# ] +# else: +# cache_batch_idx = None +# if alibi: +# alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 +# attn_bias = attn_bias_from_alibi_slopes( +# alibi_slopes, seqlen_q, seqlen_k, None, key_padding_mask, causal=causal, key_leftpad=cache_leftpad +# ) +# else: +# alibi_slopes, attn_bias = None, None +# # cache_seqlens = torch.tensor([64], dtype=torch.int32, device=device) +# if rotary_dim > 0: +# angle = ( +# torch.rand( +# seqlen_k if paged_kv_block_size is None else num_blocks * paged_kv_block_size, +# rotary_dim // 2, +# device=device, +# ) +# * 2 +# * math.pi +# ) +# cos = torch.cos(angle).to(dtype=dtype) +# sin = torch.sin(angle).to(dtype=dtype) +# if causal or local: +# q_ro = apply_rotary_emb( +# q, cos, sin, seqlen_offsets=cache_seqlens, interleaved=rotary_interleaved +# ) +# else: +# q_ro = rearrange( +# apply_rotary_emb( +# rearrange(q, "b s h d -> b 1 (s h) d"), +# cos, +# sin, +# seqlen_offsets=cache_seqlens, +# interleaved=rotary_interleaved, +# ), +# "b 1 (s h) d -> b s h d", +# s=seqlen_q, +# ) +# # q_ro = q +# k_ro = apply_rotary_emb( +# k, cos, sin, seqlen_offsets=cache_seqlens, interleaved=rotary_interleaved +# ) +# else: +# cos, sin = None, None +# q_ro, k_ro = q, k +# # k_cache[:, 64:] = -1 +# k_cache_ref = ( +# k_cache if not has_batch_idx else k_cache[cache_batch_idx.to(dtype=torch.long)] +# ).clone() +# v_cache_ref = ( +# v_cache if not has_batch_idx else v_cache[cache_batch_idx.to(dtype=torch.long)] +# ).clone() +# if new_kv: +# 
update_mask = torch.logical_and( +# cache_seqlens_expanded <= arange, arange < cache_seqlens_expanded + seqlen_new +# ) +# k_cache_ref[update_mask] = rearrange(k_ro, "b s ... -> (b s) ...") +# v_cache_ref[update_mask] = rearrange(v, "b s ... -> (b s) ...") +# k_cache_rep = repeat(k_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k) +# v_cache_rep = repeat(v_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k) +# out = flash_attn_with_kvcache( +# q, +# k_cache if paged_kv_block_size is None else k_cache_paged, +# v_cache if paged_kv_block_size is None else v_cache_paged, +# k, +# v, +# rotary_cos=cos, +# rotary_sin=sin, +# cache_seqlens=cache_seqlens, +# cache_batch_idx=cache_batch_idx, +# cache_leftpad=cache_leftpad, +# block_table=block_table, +# causal=causal, +# window_size=window_size, +# rotary_interleaved=rotary_interleaved, +# alibi_slopes=alibi_slopes, +# num_splits=num_splits, +# ) +# # out = flash_attn_with_kvcache( +# # q, k_cache, v_cache, cache_seqlens=cache_seqlens, causal=causal, window_size=window_size +# # ) +# # out = flash_attn_with_kvcache(q, k_cache, v_cache, causal=causal, window_size=window_size) +# # qk = torch.einsum("bqhd,bkhd->bhqk", q, k_cache_ref) +# # m = qk.amax(-1, keepdim=True) +# # s_tmp = torch.exp((qk - m) / math.sqrt(d)) +# # o1 = torch.einsum('bhst,bthd->bshd', s_tmp, v_cache_ref) +# # lse_ref = torch.logsumexp(qk / math.sqrt(d), -1) +# # probs = torch.softmax(qk, dim=-1) +# out_ref, _ = attention_ref( +# q_ro, +# k_cache_rep, +# v_cache_rep, +# None, +# key_padding_mask, +# attn_bias, +# 0.0, +# None, +# causal=causal, +# window_size=window_size, +# key_leftpad=cache_leftpad, +# ) +# out_pt, _ = attention_ref( +# q_ro, +# k_cache_rep, +# v_cache_rep, +# None, +# key_padding_mask, +# attn_bias, +# 0.0, +# None, +# causal=causal, +# window_size=window_size, +# upcast=False, +# reorder_ops=True, +# key_leftpad=cache_leftpad, +# ) +# print(f"Output max diff: {(out - out_ref).abs().max().item()}") +# print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") +# print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") +# print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + +# # Check that FlashAttention's numerical error is at most twice the numerical error +# # of a Pytorch implementation. +# if new_kv: +# if paged_kv_block_size is None: +# k_cache_select = ( +# k_cache if not has_batch_idx else k_cache[cache_batch_idx.to(dtype=torch.long)] +# ) +# v_cache_select = ( +# v_cache if not has_batch_idx else v_cache[cache_batch_idx.to(dtype=torch.long)] +# ) +# else: +# k_cache_select = rearrange( +# k_cache_paged[block_table.to(dtype=torch.long).flatten()], +# "(b nblocks) block_size ... -> b (nblocks block_size) ...", +# b=batch_size, +# )[:, :seqlen_k] +# v_cache_select = rearrange( +# v_cache_paged[block_table.to(dtype=torch.long).flatten()], +# "(b nblocks) block_size ... 
-> b (nblocks block_size) ...", +# b=batch_size, +# )[:, :seqlen_k] +# assert torch.allclose(k_cache_select, k_cache_ref, rtol=1e-3, atol=1e-3) +# assert torch.equal(v_cache_select, v_cache_ref) +# mult = 3 if not alibi else 5 +# assert (out - out_ref).abs().max().item() <= mult * (out_pt - out_ref).abs().max().item() + 1e-5 + + From 18b309d50acf7adf4b53ed1a90f75442f140d9c0 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Wed, 4 Sep 2024 14:56:10 +0800 Subject: [PATCH 27/46] update flash api head --- csrc/flash_attn/flash_api.cpp | 54 ++++++++++++++++++++++++----------- csrc/flash_attn/src/flash.h | 3 +- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index 03bedcb7e..6e9f60807 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -356,7 +356,7 @@ void set_params_alibi(Flash_fwd_params ¶ms, c10::optional &alibi std::vector mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size - const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_v x head_size c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, @@ -563,7 +563,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size std::vector mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. - const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_v x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_v x head_size if there's a block_table. c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 @@ -627,6 +627,8 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s int num_heads = sizes[1]; const int head_size_og = sizes[2]; const int num_heads_k = paged_KV ? k.size(2) : k.size(1); + // TODO: check here + const int num_heads_v = paged_KV ? 
v.size(2) : v.size(1); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } @@ -640,6 +642,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s void *cu_seqlens_q_d = cu_seqlens_q.data_ptr(); + // TODO: check here // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); @@ -657,6 +660,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); if (window_size_left >= max_seqlen_k) { window_size_left = -1; } if (window_size_right >= max_seqlen_k) { window_size_right = -1; } @@ -665,10 +669,10 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s if (!paged_KV) { const int total_k = k.size(0); CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); - CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, total_k, num_heads_v, v_head_size_og); } else { CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size_og); - CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_v, v_head_size_og); CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); } @@ -856,12 +860,12 @@ std::vector mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size - const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_v x head_size const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x seqlen_q c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_v x head_size c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, // probability to drop const float softmax_scale, @@ -918,6 +922,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const int head_size = sizes[3]; const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); + const int num_heads_v = v.size(2); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); @@ -927,6 +932,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); } 
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; @@ -941,7 +947,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); - CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_v, v_head_size_og); CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); @@ -969,7 +975,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); CHECK_DEVICE(dv); TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); - CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_v, v_head_size_og); } else { dv = torch::empty_like(v); } @@ -1007,9 +1013,12 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si at::Tensor dk_expanded, dv_expanded; if (num_heads_k != num_heads) { // MQA / GQA dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); - dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, v_head_size_og}, opts); } else { dk_expanded = dk; + } + if (num_heads_v != num_heads) { + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, v_head_size_og}, opts); + } else { dv_expanded = dv; } @@ -1075,7 +1084,9 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si // For MQA/GQA we need to sum dK and dV across the groups if (num_heads_k != num_heads) { at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); - at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {3}); + } + if (num_heads_v != num_heads) { + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_v, num_heads / num_heads_v, v_head_size_og}), {3}); } if (head_size_og % 8 != 0) { dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); @@ -1162,6 +1173,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size const int head_size = sizes[2]; const int total_k = k.size(0); const int num_heads_k = k.size(1); + const int num_heads_v = v.size(1); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); @@ -1172,6 +1184,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); } TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not 
support dropout for now"); } auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; @@ -1186,7 +1199,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(q, total_q, num_heads, head_size); CHECK_SHAPE(k, total_k, num_heads_k, head_size); - CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, total_k, num_heads_v, v_head_size_og); CHECK_SHAPE(out, total_q, num_heads, v_head_size_og); CHECK_SHAPE(dout, total_q, num_heads, head_size_og); CHECK_SHAPE(cu_seqlens_q, batch_size + 1); @@ -1216,7 +1229,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); CHECK_DEVICE(dv); TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); - CHECK_SHAPE(dv, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(dv, total_k, num_heads_v, v_head_size_og); } else { dv = torch::empty_like(v); } @@ -1260,9 +1273,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size at::Tensor dk_expanded, dv_expanded; if (num_heads_k != num_heads) { // MQA / GQA dk_expanded = torch::empty({total_k, num_heads, head_size}, opts); - dv_expanded = torch::empty({total_k, num_heads, v_head_size_og}, opts); } else { dk_expanded = dk; + } + if (num_heads_v != num_heads) { + dv_expanded = torch::empty({total_k, num_heads, v_head_size_og}, opts); + } else { dv_expanded = dv; } @@ -1334,7 +1350,9 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size // For MQA/GQA we need to sum dK and dV across the groups if (num_heads_k != num_heads) { at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); - at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {2}); + } + if (num_heads_v != num_heads) { + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_v, num_heads / num_heads_v, v_head_size_og}), {2}); } if (head_size_og % 8 != 0) { dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); @@ -1414,10 +1432,12 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256"); const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size; const int num_heads_k = kcache.size(2); + const int num_heads_v = vcache.size(2); const int batch_size_c = !paged_KV ? 
kcache.size(0) : batch_size; TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); // causal=true is the same as causal=false in this case @@ -1440,10 +1460,10 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); if (!paged_KV) { CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); - CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_v, v_head_size_og); } else { CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og); - CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, v_head_size_og); + CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_v, v_head_size_og); CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); } @@ -1527,7 +1547,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension"); int seqlen_knew = k.size(1); CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og); - CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_v, v_head_size_og); if (head_size_og % 8 != 0) { k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); diff --git a/csrc/flash_attn/src/flash.h b/csrc/flash_attn/src/flash.h index 75825b9ad..fd8dea467 100644 --- a/csrc/flash_attn/src/flash.h +++ b/csrc/flash_attn/src/flash.h @@ -40,10 +40,11 @@ struct Qkv_params { index_t v_head_stride; // The number of heads. - int h, h_k; + int h, h_k, h_v; // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be // different from nheads (query). 
     int h_h_k_ratio; // precompute h / h_k,
+    int h_h_v_ratio;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////

From 6909ab4dde0a14bd69e076e17261dbf671195941 Mon Sep 17 00:00:00 2001
From: chenfeiyang <2394209769@qq.com>
Date: Wed, 4 Sep 2024 14:57:07 +0800
Subject: [PATCH 28/46] fix interface bug

---
 flash_attn/flash_attn_interface.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/flash_attn/flash_attn_interface.py b/flash_attn/flash_attn_interface.py
index d9c05a026..fa86b8c26 100644
--- a/flash_attn/flash_attn_interface.py
+++ b/flash_attn/flash_attn_interface.py
@@ -543,6 +543,7 @@ def forward(
     ):
         if softmax_scale is None:
             softmax_scale = q.shape[-1] ** (-0.5)
+        ctx.headdim_qk = q.shape[-1]  # before padding
         out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward(
             q,
             k,
@@ -588,8 +589,8 @@ def backward(ctx, dout, *args):
             ctx.deterministic,
             rng_state=rng_state,
         )
-        dq = dq[..., : q.shape[-1]]  # We could have padded the head dimension
-        dk = dk[..., : k.shape[-1]]
+        dq = dq[..., : ctx.headdim_qk]  # We could have padded the head dimension
+        dk = dk[..., : ctx.headdim_qk]
         dv = dv[..., : dout.shape[-1]]
         return dq, dk, dv, None, None, None, None, None, None, None, None
 
@@ -617,6 +618,7 @@ def forward(
     ):
         if softmax_scale is None:
             softmax_scale = q.shape[-1] ** (-0.5)
+        ctx.headdim_qk = q.shape[-1]  # before padding
         out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
             q,
             k,
@@ -675,8 +677,8 @@ def backward(ctx, dout, *args):
             ctx.deterministic,
             rng_state=rng_state,
         )
-        dq = dq[..., : q.shape[-1]]  # We could have padded the head dimension
-        dk = dk[..., : k.shape[-1]]
+        dq = dq[..., : ctx.headdim_qk]  # We could have padded the head dimension
+        dk = dk[..., : ctx.headdim_qk]
         dv = dv[..., : dout.shape[-1]]
         return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None
 

From 5f26eb0b150a81acb924dd05d7a54239fa6b4a44 Mon Sep 17 00:00:00 2001
From: chenfeiyang
Date: Wed, 4 Sep 2024 15:32:56 +0800
Subject: [PATCH 29/46] update README

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index a6a863aa0..7f1804e9d 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,17 @@ we have supported:
 - FlashAttention-2 with QKHeadDim=64, VHeadDim=128
 - FlashAttention-2 with QKHeadDim=96, VHeadDim=192
 - FlashAttention-2 with QKHeadDim=128, VHeadDim=256
+- FlashAttention-2 with QKHeadDim=192, VHeadDim=128
 
 Feel free to tell us what else you need. We might support it soon. :)
 
 Currently, we do not provide a prebuilt library; you need to compile from source.
 
+## Usage
+
+Users can modify `headdim.json` before compiling from source to select the (dim_qk, dim_v) pairs they need.
+Or you can just leave `headdim.json` untouched and compile all the supported configs.
+
 ## Performance of Customized FlashAttention
 
 We test the performance speedup compared to padding the qk & v hidden_dim on an A100.
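For reference, here is a minimal usage sketch of the unequal QK/V head dimensions that the README above describes. It assumes this fork is installed and that `flash_attn_func` keeps the upstream signature; the shapes simply pick the supported QKHeadDim=64 / VHeadDim=128 configuration and are illustrative only, not part of the patch series.

```python
import torch
from flash_attn import flash_attn_func

# One of the supported configs listed in the README: QKHeadDim=64, VHeadDim=128.
batch, seqlen, nheads, qk_dim, v_dim = 2, 1024, 32, 64, 128
q = torch.randn(batch, seqlen, nheads, qk_dim, device="cuda", dtype=torch.float16)
k = torch.randn(batch, seqlen, nheads, qk_dim, device="cuda", dtype=torch.float16)
v = torch.randn(batch, seqlen, nheads, v_dim, device="cuda", dtype=torch.float16)

out = flash_attn_func(q, k, v, dropout_p=0.0, causal=True)
# The output inherits V's head dimension: (batch, seqlen, nheads, v_dim).
assert out.shape == (batch, seqlen, nheads, v_dim)
```

The benchmark added in the next patch exercises the same call path, additionally with different numbers of query, key, and value heads (nheads_q=32, nheads_k=4, nheads_v=16).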
From 536a8ccf1e90e40f2c8177bc6c8eba22e59a1a7a Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Wed, 4 Sep 2024 21:06:26 +0800 Subject: [PATCH 30/46] benchmark head_headdim --- benchmarks/benchmark_head_headdim.py | 208 +++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 benchmarks/benchmark_head_headdim.py diff --git a/benchmarks/benchmark_head_headdim.py b/benchmarks/benchmark_head_headdim.py new file mode 100644 index 000000000..f8b1a82ad --- /dev/null +++ b/benchmarks/benchmark_head_headdim.py @@ -0,0 +1,208 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import csv +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func, flash_attn_func + +try: + from triton.ops.flash_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def flops(batch, seqlen, headdim, v_headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 2 * batch * seqlen**2 * nheads * (headdim+v_headdim) // (2 if causal else 1) + b = 2 * batch * seqlen**2 * nheads * (3*headdim+2*v_headdim) // (2 if causal else 1) + return f if mode == "fwd" else (b if mode == "bwd" else f+b) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(q, k, v, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, nheads, d = q.shape + nheads_k = k.shape[2] + nheads_v = v.shape[2] + if nheads_k < nheads: + k = repeat(k, 'b s h d -> b s (h g) d', g=nheads//nheads_k) + elif nheads_v < nheads: + v = repeat(v, 'b s h d -> b s (h g) d', g=nheads//nheads_v) + v_d = v.shape[-1] + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=q.dtype) + + +def flash_attention_pad(q,k,v, dropout_p=0.0, causal=True): + batch_size, seqlen, nheads, d = q.shape + nheads_k = k.shape[2] + nheads_v = v.shape[2] + if nheads_k < nheads_v: + k = repeat(k, 'b s h d -> b s (h g) d', g=nheads_v//nheads_k) + elif nheads_k > nheads_v: + v = repeat(v, 'b s h d -> b s (h g) d', g=nheads_k//nheads_v) + v_d = v.shape[-1] + if d == v_d: + return 
flash_attn_func(q, k, v, dropout_p, causal) + if d < v_d: + q = F.pad(q, (0, v_d-d)) + k = F.pad(k, (0, v_d-d)) + return flash_attn_func(q, k, v, dropout_p, causal) + elif d > v_d: + v = F.pad(v, (0, d-v_d)) + o = flash_attn_func(q, k, v, dropout_p, causal) + return o[:,:,:,:v_d] + + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + +save_csv = True + +repeats = 30 +device = 'cuda' +dtype = torch.float16 +torch.cuda.set_device(2) + +bs_seqlen_vals = [(4, 512), (4, 1024), (4, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [ (32,64),(64,128)] +nheads_qkv = (32, 4, 16) +dropout_p = 0.0 + +methods = (["CustomFlash2", "Pytorch", "Flash2_Pad"]) + +if save_csv: + csvfile = open('flash2_attn_time.csv', 'w', newline='') + writer = csv.writer(csvfile) + writer.writerow([ + "causal", "qk_headdim", "v_headdim","nheads_q", "nheads_k", "nheads_v", "batch_size", "seqlen", + "time_fwd_CustomFlash2", "time_bwd_CustomFlash2", "time_fwd_bwd_CustomFlash2", + "time_fwd_Pytorch", "time_bwd_Pytorch", "time_fwd_bwd_Pytorch", + "time_fwd_Flash2_Pad", "time_bwd_Flash2_Pad", "time_fwd_bwd_Flash2_Pad", + "flops_fwd_CustomFlash2", "flops_bwd_CustomFlash2", "flops_fwd_bwd_CustomFlash2", + "flops_fwd_Pytorch", "flops_bwd_Pytorch", "flops_fwd_bwd_Pytorch", + "flops_fwd_Flash2_Pad", "flops_bwd_Flash2_Pad", "flops_fwd_bwd_Flash2_Pad", + ]) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim,v_headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads_q, nheads_k, nheads_v = nheads_qkv + q = torch.randn(batch_size, seqlen, nheads_q, headdim, device=device, dtype=dtype, + requires_grad=True) + k = torch.randn(batch_size, seqlen, nheads_k, headdim, device=device, dtype=dtype, + requires_grad=True) + v = torch.randn(batch_size, seqlen, nheads_v, v_headdim, device=device, dtype=dtype, + requires_grad=True) + f, b = time_fwd_bwd( + flash_attn_func, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, "CustomFlash2"] = f + time_b[config, "CustomFlash2"] = b + + try: + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "Pytorch"] = f + time_b[config, "Pytorch"] = b + + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + flash_attention_pad, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, "Flash2_Pad"] = f + time_b[config, "Flash2_Pad"] = b + + print(f"### causal={causal}, qk_headdim={headdim}, v_headdim={v_headdim}, batch_size={batch_size}, seqlen={seqlen}, head_qkv={nheads_qkv} ###") + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, v_headdim, nheads_q, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, v_headdim, nheads_q, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, 
v_headdim, nheads_q, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + if save_csv: + writer.writerow([ + causal, headdim, v_headdim, *nheads_qkv, batch_size, seqlen, + time_f[config, "CustomFlash2"], time_b[config, "CustomFlash2"], time_f_b[config, "CustomFlash2"], + time_f[config, "Pytorch"], time_b[config, "Pytorch"], time_f_b[config, "Pytorch"], + time_f[config, "Flash2_Pad"], time_b[config, "Flash2_Pad"], time_f_b[config, "Flash2_Pad"], + speed_f[config, "CustomFlash2"], speed_b[config, "CustomFlash2"], speed_f_b[config, "CustomFlash2"], + speed_f[config, "Pytorch"], speed_b[config, "Pytorch"], speed_f_b[config, "Pytorch"], + speed_f[config, "Flash2_Pad"], speed_b[config, "Flash2_Pad"], speed_f_b[config, "Flash2_Pad"], + ]) + +if save_csv: + csvfile.close() + + + +# with open('flash2_attn_time.plk', 'wb') as fp: +# pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) From ca6335deafa7380b8dc86c3a36db7347f5064926 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Wed, 4 Sep 2024 21:19:31 +0800 Subject: [PATCH 31/46] fix bench bug --- benchmarks/benchmark_head_headdim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_head_headdim.py b/benchmarks/benchmark_head_headdim.py index f8b1a82ad..b254aff26 100644 --- a/benchmarks/benchmark_head_headdim.py +++ b/benchmarks/benchmark_head_headdim.py @@ -48,7 +48,7 @@ def attention_pytorch(q, k, v, dropout_p=0.0, causal=True): nheads_v = v.shape[2] if nheads_k < nheads: k = repeat(k, 'b s h d -> b s (h g) d', g=nheads//nheads_k) - elif nheads_v < nheads: + if nheads_v < nheads: v = repeat(v, 'b s h d -> b s (h g) d', g=nheads//nheads_v) v_d = v.shape[-1] q = rearrange(q, 'b t h d -> (b h) t d') @@ -101,7 +101,7 @@ def time_fwd_bwd(func, *args, **kwargs): repeats = 30 device = 'cuda' dtype = torch.float16 -torch.cuda.set_device(2) +torch.cuda.set_device(0) bs_seqlen_vals = [(4, 512), (4, 1024), (4, 2048), (4, 4096), (2, 8192), (1, 16384)] causal_vals = [False, True] From def41c0cb3658b7d5111ff4a2f98982fccae0914 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Thu, 5 Sep 2024 14:15:16 +0800 Subject: [PATCH 32/46] fix bug for numhead --- csrc/flash_attn/flash_api.cpp | 53 ++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp index 6e9f60807..8c639a4c0 100644 --- a/csrc/flash_attn/flash_api.cpp +++ b/csrc/flash_attn/flash_api.cpp @@ -418,12 +418,13 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); - const int ngroups = num_heads / num_heads_k; + const int num_heads_maxkv = num_heads_k > num_heads_v ? 
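+                                       // K and V may carry different numbers of heads in this fork, so the
+                                       // seqlen_q == 1 "swap" optimization regroups the query heads by the
+                                       // larger of the two KV head counts (num_heads_maxkv); the swap is also
+                                       // gated on the V head dimension being a multiple of 8, mirroring the
+                                       // existing QK head-dimension check.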
num_heads_k : num_heads_v; + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_maxkv && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && v_head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_maxkv; if (seqlenq_ngroups_swapped) { - q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + q = q.reshape({batch_size, num_heads_maxkv, ngroups, head_size_og}).transpose(1, 2); seqlen_q = ngroups; - num_heads = num_heads_k; + num_heads = num_heads_maxkv; } @@ -455,7 +456,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], v_head_size_og); if (seqlenq_ngroups_swapped) { - out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2); + out = out.reshape({batch_size, num_heads_maxkv, ngroups, v_head_size_og}).transpose(1, 2); } if (v_head_size_og % 8 != 0) { out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); @@ -552,10 +553,10 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size } if (seqlenq_ngroups_swapped) { - out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); - out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); - q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); - softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, v_head_size_og}); + out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, v_head_size_og}); + q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_maxkv * seqlen_q, 1}); } return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; } @@ -642,15 +643,15 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s void *cu_seqlens_q_d = cu_seqlens_q.data_ptr(); - // TODO: check here // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); - const int ngroups = num_heads / num_heads_k; + const int num_heads_maxkv = num_heads_k > num_heads_v ? 
num_heads_k : num_heads_v; + const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_maxkv && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && v_head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_maxkv; if (seqlenq_ngroups_swapped) { - q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + q = q.reshape({batch_size, num_heads_maxkv, ngroups, head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_maxkv, head_size_og}); max_seqlen_q = ngroups; - num_heads = num_heads_k; + num_heads = num_heads_maxkv; cu_seqlens_q_d = nullptr; } @@ -709,7 +710,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); CHECK_SHAPE(out, sizes[0], sizes[1], v_head_size_og); if (seqlenq_ngroups_swapped) { - out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + out = out.reshape({batch_size, num_heads_maxkv, ngroups, v_head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_maxkv, head_size_og}); } if (v_head_size_og % 8 != 0) { out = torch::empty({total_q, num_heads, v_head_size_og}, q.options()); @@ -834,10 +835,10 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s } if (seqlenq_ngroups_swapped) { - int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_k, head_size_og}; - int64_t size_after[] = {batch_size, num_heads_k * max_seqlen_q, head_size_og}; - int64_t o_size_before[] = {batch_size, max_seqlen_q, num_heads_k, v_head_size_og}; - int64_t o_size_after[] = {batch_size, num_heads_k * max_seqlen_q, v_head_size_og}; + int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_maxkv, head_size_og}; + int64_t size_after[] = {batch_size, num_heads_maxkv * max_seqlen_q, head_size_og}; + int64_t o_size_before[] = {batch_size, max_seqlen_q, num_heads_maxkv, v_head_size_og}; + int64_t o_size_after[] = {batch_size, num_heads_maxkv * max_seqlen_q, v_head_size_og}; out = out.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); out_padded = out_padded.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); q_padded = q_padded.reshape(size_before).transpose(1, 2).reshape(size_after); @@ -1446,12 +1447,13 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int num_heads_maxkv = num_heads_k > num_heads_v ? 
num_heads_k : num_heads_v; + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_maxkv && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && v_head_size_og % 8 == 0 && !alibi_slopes_.has_value(); if (seqlenq_ngroups_swapped) { - const int ngroups = num_heads / num_heads_k; - q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + const int ngroups = num_heads / num_heads_maxkv; + q = q.reshape({batch_size, num_heads_maxkv, ngroups, head_size_og}).transpose(1, 2); seqlen_q = ngroups; - num_heads = num_heads_k; + num_heads = num_heads_maxkv; } if (window_size_left >= seqlen_k) { window_size_left = -1; } @@ -1484,6 +1486,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); CHECK_DEVICE(out); TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + // TODO: check here for seqlenq_ngroups_swapped CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); if (v_head_size_og % 8 != 0) { out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); @@ -1662,8 +1665,8 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he } if (seqlenq_ngroups_swapped) { - out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); - softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, v_head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_maxkv * seqlen_q, 1}); } return {out, softmax_lse}; } From 6e8d5375b42cf42c86610dfceee142f82b72e5dd Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Fri, 6 Sep 2024 21:32:54 +0800 Subject: [PATCH 33/46] add autotuner --- autotuner/base_tunner.py | 235 +++ autotuner/code_emitter.py | 75 + autotuner/configs/base_config.py | 34 + autotuner/configs/fwd_config.py | 18 + autotuner/profile_attn.py | 28 + .../template/flash_attn_profile_interface.py | 1352 ++++++++++++++ autotuner/template/flash_fwd.cu | 6 + autotuner/template/flash_fwd.h | 18 + autotuner/template/flash_profile_api.cpp | 1663 +++++++++++++++++ 9 files changed, 3429 insertions(+) create mode 100644 autotuner/base_tunner.py create mode 100644 autotuner/code_emitter.py create mode 100644 autotuner/configs/base_config.py create mode 100644 autotuner/configs/fwd_config.py create mode 100644 autotuner/profile_attn.py create mode 100644 autotuner/template/flash_attn_profile_interface.py create mode 100644 autotuner/template/flash_fwd.cu create mode 100644 autotuner/template/flash_fwd.h create mode 100644 autotuner/template/flash_profile_api.cpp diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py new file mode 100644 index 000000000..598a75da6 --- /dev/null +++ b/autotuner/base_tunner.py @@ -0,0 +1,235 @@ +import ctypes +import os +from concurrent.futures import ThreadPoolExecutor +import tempfile +import subprocess +import importlib.util + +import ctypes +import torch +from configs.base_config import BaseConfig + +import pprint +import json + +import time + +from code_emitter import CodeEmitter, ShapeConfig +from profile_attn import profile_fwd + + + + + +class CompileResult: + def __init__(self, config: BaseConfig, lib_name: str) -> None: + self.config = config + self.lib_name = lib_name + +def _create_code_for_profiling(config): + profile_code_path = os.path.join(config.template_dir , config.operation, 
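+                                         # Each operation subdirectory of template_dir is expected to provide a
+                                         # profile_code.py; its `profile_code` string is loaded dynamically below
+                                         # and filled in with the config fields via format_map(config.__dict__).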
"profile_code.py") + + spec = importlib.util.spec_from_file_location("ProfileCode", profile_code_path) + foo = importlib.util.module_from_spec(spec) + spec.loader.exec_module(foo) + # from template.flash_kernels.retnet.regfuse.profile_code import profile_code + # return profile_code.format(Br=config.Br, Bc=config.Bc, Kd=config.Kd, D=config.D, unrollLastIter=int(config.unrollLastIter), BlockKSmem=config.BlockKSmem, num_stages_qk=config.num_stages_qk, num_stages_mask=config.num_stages_mask, BlockKSmem2=config.BlockKSmem2, num_stages_v=config.num_stages_v, Nthreads=config.Nthreads) + # from template.flash_kernels.retnet.smemfuse.profile_code import profile_code + # return profile_code.format(Br=config.Br, Bc=config.Bc, Kd=config.Kd, D=config.D, unrollLastIter=int(config.unrollLastIter), BlockKSmem=config.BlockKSmem, num_stages_qk=config.num_stages_qk, num_stages_mask=config.num_stages_mask, BlockKSmem2=config.BlockKSmem2, num_stages_v=config.num_stages_v, Nthreads=config.Nthreads, warps_mma1_n=config.warps_mma1_n, warps_mma_n=config.warps_mma_n) + return foo.profile_code.format_map(config.__dict__) + +# def _compile(config, arch, temp_dir:str, timeout: float = None): +# ## compile + +# profiling_code = _create_code_for_profiling(config) +# src = tempfile.NamedTemporaryFile(mode="w",suffix=".cu", delete=True, dir=temp_dir) +# lib_name = src.name.replace(".cu", ".so") +# compute_version = arch.compute_capability +# cutlass_dir = os.path.join(os.path.dirname(__file__), "../../third_party/cutlass/include") +# csrc_dir = os.path.join(os.path.dirname(__file__), "../../csrc") +# if config.fuse_type == "register": +# template_dir = os.path.join(config.template_dir , "regfuse/") +# elif config.fuse_type == "shared": +# template_dir = os.path.join(config.template_dir , "smemfuse/") +# else: # bwd +# template_dir = config.template_dir +# command = ["nvcc","-std=c++17","-O3","--use_fast_math","--expt-relaxed-constexpr","--disable-warnings", "--compiler-options", "'-fPIC'", "--shared", src.name, "-lcuda", +# f"-gencode=arch=compute_{compute_version},code=sm_{compute_version}", +# f"-I{cutlass_dir}",f"-I{template_dir}",f"-I{csrc_dir}", "-o", lib_name] +# src.write(profiling_code) +# src.flush() +# try: +# ret = subprocess.run(command, timeout=timeout) +# except subprocess.TimeoutExpired: +# return None +# if ret.returncode != 0: +# return None +# return CompileResult(config,lib_name) + +class BaseTunner: + def __init__(self, arch, torch_array: list, op_name, tempdir): + self.arch = arch + self.torch_array = torch_array + self.Br_list = [32, 64, 128, 256] + self.Bc_list = [32, 64, 128, 256] + + self.template_dir = "template" + self.op_name = op_name + self.cache_path = os.path.join(os.path.dirname(__file__), "../../cache/") + self.problem_key = { + "dim_qk": torch_array[0].shape[-1], + "dim_v": torch_array[2].shape[-1] + } + self.shape_config = ShapeConfig(torch_array[0].shape[-1],torch_array[2].shape[-1]) + self.tempdir = tempdir + + def compile(self, configs:list, timeout: float = None): + temp_dir = self.tempdir + code_emitter = CodeEmitter(self.template_dir, temp_dir) + code_emitter.generate_code(self.shape_config, configs) + + + def compile_parallel(self, configs:list, temp_dir:str, timeout: float = None): + # ## compile + # arch = self.arch + # with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: + # libs = executor.map(_compile, configs,[arch for _ in configs],[temp_dir for _ in configs],[timeout for _ in configs]) + # return list(libs) + pass + + def profile(self, config:BaseConfig, 
device="cuda:0") -> float: + spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.temp_dir+"/flash_attn_profile_interface.py") + flash_attn_func = importlib.util.module_from_spec(spec) + spec.loader.exec_module(flash_attn_func) + latency = profile_fwd(flash_attn_func) + if latency < 0: + latency = 1e8 + # remove lib + # subprocess.run(["rm", libname], check=True) + return latency + + def get_tuned_configs(self): + dim_qk = self.problem_key["dim_qk"] + dim_v = self.problem_key["dim_v"] + configs = [] + for Br in self.Br_list: + for Bc in self.Bc_list: + cur_configs = self.generate_configs(Br,Bc,dim_qk,dim_v) + for cur_config in cur_configs: + if cur_config.fuse_type=="register" and self.validate_register_fuse(cur_config): + configs.append(cur_config) + elif cur_config.fuse_type=="shared" and self.validate_shared_fuse(cur_config): + configs.append(cur_config) + else: # BWD + if self.validate_kernel(cur_config): + configs.append(cur_config) + return configs + + def tune(self, log_path="../logs/"): + st = time.time() + + dim_qk = self.problem_key["dim_qk"] + dim_v = self.problem_key["dim_v"] + + best_config = self.check_cache() + if best_config is not None: + # print("Best config found in cache: ") + # pprint.pprint(best_config) + return best_config + + configs = self.get_tuned_configs() + + # print configs + print("Configs to be tuned: ") + for config in configs: + # print(config) + pprint.pprint(config) + + + # cresults = self.compile(configs,src_dir.name,timeout=1200) + # cresults = self.compile_parallel(configs,src_dir.name,timeout=120) + self.compile(configs,timeout=120) + profile_dict = {} + latency = 1e8 + best_config = None + for config in configs: + lib_latency = self.profile(config) + if lib_latency == 1e8: + # print(cresult.config) + pprint.pprint(config) + print("profile runtime error") + if lib_latency < latency: + latency = lib_latency + best_config = config + profile_dict[config] = lib_latency + + end = time.time() + + print("##########################################################") + print("Operation type: ", best_config.operation) + print("Best config: ")# , best_config) + pprint.pprint(best_config) + print("Latency: ", latency) + + file_name = "profile_result_{}_{}_{}.txt".format(best_config.operation,dim_qk, dim_v) + os.makedirs(log_path,exist_ok=True) + with open(os.path.join(log_path,file_name),"w") as f: + for config in profile_dict: + f.write(repr(config)+"\n") + f.write(str(profile_dict[config])+"\n") + f.write("\n") + f.write("best config: \n") + f.write(repr(best_config)+"\n") + f.write(str(latency)+"\n") + f.write("\nsearch time: "+str(end-st)+"s") + + cache_path = self.cache_path + os.makedirs(cache_path,exist_ok=True) + with open(os.path.join(cache_path,"best_config_{}_{}_{}.json".format(self.op_name,dim_qk, dim_v)),"w") as f: + json.dump(best_config.__dict__,f) + + return best_config + + def check_cache(self): + cache_path = self.cache_path + op_name = self.op_name + dim_qk = self.problem_key["dim_qk"] + dim_v = self.problem_key["dim_v"] + if os.path.exists(os.path.join(cache_path, "best_config_{}_{}_{}.json".format(op_name,dim_qk, dim_v))): + with open(os.path.join(cache_path,"best_config_{}_{}_{}.json".format(op_name,dim_qk, dim_v)),"r") as f: + best_config_dict = json.load(f) + best_config = supported_configs[best_config_dict["operation"]].from_dict(best_config_dict) + return best_config + + return None + + + def validate_shared_fuse(self, config): + return False + def validate_register_fuse(self, config): + return False + 
def validate_kernel(self, config): + return False + def generate_configs(self,Br:int,Bc:int,dim_qk:int,dim_v:int): + configs = [] + return configs + +if __name__=="__main__": + import torch + batch_size = 4 + seqlen = 2048 + nheads = 8 + headdim = 32 + v_headdim = 32 + device = 'cuda' + dtype = torch.bfloat16 + q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, + requires_grad=True) + base_tunner = BaseTunner(arch=None, torch_array=[q,k,v], op_name="flash_fwd", tempdir="autotuner/temp") + + config = BaseConfig(headdim,v_headdim,64,32) + base_tunner.compile([config]) \ No newline at end of file diff --git a/autotuner/code_emitter.py b/autotuner/code_emitter.py new file mode 100644 index 000000000..d9c42022c --- /dev/null +++ b/autotuner/code_emitter.py @@ -0,0 +1,75 @@ +from configs.base_config import BaseConfig +from pathlib import Path +import os +import tempfile + +class ShapeConfig: + def __init__(self, Kd, D, is_bf16: bool=False, is_causal: bool=False) -> None: + self.Kd = Kd + self.D = D + self.is_bf16 = is_bf16 + self.is_causal = is_causal + + +class CodeEmitter: + def __init__(self, template_dir, output_dir) -> None: + self.template_dir = template_dir + self.output_dir = output_dir + + self.profile_api_file_list = [ + "flash_fwd.cu", + "flash_profile_api.cpp", + ] + self.kernel_file_list = [ + "flash_fwd.h", + ] + + def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): + template_dir = self.template_dir + output_dir = self.output_dir + + if not Path(output_dir).exists(): + os.mkdir(output_dir) + + # generate api code + for file_name in self.profile_api_file_list: + with open(Path(template_dir) / Path(file_name)) as f: + code_template = f.read() + code_template = self.emit_code_profile_api(code_template, shape_config) + + with open(Path(output_dir) / Path(file_name), "w") as f: + f.write(code_template) + + # generate kernel code + # TODO: parallelize + for config in configs: + kernel_code_dir = Path(output_dir) / Path(str(config)) + if not kernel_code_dir.exists(): + os.mkdir(kernel_code_dir) + for file_name in self.kernel_file_list: + with open(Path(template_dir) / Path(file_name)) as f: + code_template = f.read() + code_template = self.emit_code_kernel(code_template, config) + + with open(kernel_code_dir / Path(file_name), "w") as f: + f.write(code_template) + + + def emit_code_kernel(self, code_template:str, config:BaseConfig): + kv = config.__dict__ + for k,v in kv.items(): + code_template = code_template.replace(f"/*{{{k}}}*/",str(v)) + return code_template + + def emit_code_profile_api(self, code_template:str, shape_config: ShapeConfig): + kv = shape_config.__dict__ + for k,v in kv.items(): + code_template = code_template.replace(f"/*{{{k}}}*/",str(v)) + return code_template + + +if __name__ == "__main__": + from configs.fwd_config import FlashFwdConfig + config = FlashFwdConfig(1,2,3,4) + ce = CodeEmitter("autotuner/template/", "autotuner/template/output/") + ce.generate_code(ShapeConfig(64,128), [config]) diff --git a/autotuner/configs/base_config.py b/autotuner/configs/base_config.py new file mode 100644 index 000000000..39eed0865 --- /dev/null +++ b/autotuner/configs/base_config.py @@ -0,0 +1,34 @@ +class BaseConfig: + def __init__(self, Kd, D, Br, Bc, Nwarps=8) -> None: + self.Br = Br + self.Bc = Bc + self.Kd = 
Kd + self.D = D + self.Nwarps = Nwarps + + self.operation = None + self.template_dir = None + self.output_dir = str(self) + + def __repr__(self) -> str: + return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={}".format(self.Kd, self.D, self.Br, self.Bc, self.Nwarps) + + def __str__(self) -> str: + return f"{self.Kd}_{self.D}_{self.Br}_{self.Bc}_{self.Nwarps}" + + @classmethod + def from_dict(cls, dd:dict): + cc = cls.__new__(cls) # cls: 子类 + cc.__dict__.update(dd) + return cc + +if __name__ == "__main__": + cc = BaseConfig(1,2,3,4) + print(cc) + print(repr(cc)) + print(cc.__dict__) + dd = cc.__dict__ + cc2 = BaseConfig.from_dict(dd) + print(cc2) + print(repr(cc2)) + print(cc2.__dict__) \ No newline at end of file diff --git a/autotuner/configs/fwd_config.py b/autotuner/configs/fwd_config.py new file mode 100644 index 000000000..70177d317 --- /dev/null +++ b/autotuner/configs/fwd_config.py @@ -0,0 +1,18 @@ +import os +from .base_config import BaseConfig + +class FlashFwdConfig(BaseConfig): + def __init__(self, Kd, D, Br, Bc, Nwarps=8, isQinRegs:bool = False, SharedQKSmem:bool = False) -> None: + super().__init__(Kd, D, Br, Bc, Nwarps) + + self.isQinRegs = isQinRegs or SharedQKSmem + self.SharedQKSmem = SharedQKSmem + + self.operation = "flash_fwd" + self.template_dir = os.path.join(os.path.dirname(__file__), "../../../csrc/kernels/attention") + + def __repr__(self) -> str: + return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={}, isQinRegs={}, SharedQKSmem={}".format(self.Kd, self.D, self.Br, self.Bc, self.Nwarps, self.isQinRegs, self.SharedQKSmem) + + def __str__(self) -> str: + return f"{self.Kd}_{self.D}_{self.Br}_{self.Bc}_{self.Nwarps}_{self.isQinRegs}_{self.SharedQKSmem}" \ No newline at end of file diff --git a/autotuner/profile_attn.py b/autotuner/profile_attn.py new file mode 100644 index 000000000..f7f80ec4c --- /dev/null +++ b/autotuner/profile_attn.py @@ -0,0 +1,28 @@ +import torch +from flash_attn.utils.benchmark import benchmark_forward + +# batch_size = 4 +# seqlen = 2048 +# nheads = 8 +# headdim = QKHeadDim +# v_headdim = VHeadDim +# device = 'cuda' +# dtype = torch.bfloat16 if is_bf16 else torch.float16 + +# dropout_p = 0.0 +# causal = is_causal +# repeats = 30 + + +def profile_fwd(fn,headdim, v_headdim, batch_size=4, seqlen=2048, nheads=8, device='cuda', is_bf16=False, causal=False, dropout_p=0.0, repeats=30): + dtype = torch.bfloat16 if is_bf16 else torch.float16 + q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, + requires_grad=True) + f = benchmark_forward(fn, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=False) + time_f = f[1].mean + # print(time_f) + return time_f \ No newline at end of file diff --git a/autotuner/template/flash_attn_profile_interface.py b/autotuner/template/flash_attn_profile_interface.py new file mode 100644 index 000000000..26273c293 --- /dev/null +++ b/autotuner/template/flash_attn_profile_interface.py @@ -0,0 +1,1352 @@ +from typing import Optional, Union + +import torch +import torch.nn as nn + +import torch.utils.cpp_extension + +import os +from pathlib import Path + +# Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h +# See https://github.com/pytorch/pytorch/pull/70650 +generator_flag = [] +torch_dir = torch.__path__[0] +if 
os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): + generator_flag = ["-DOLD_GENERATOR_PATH"] + + +include_path = [ + "csrc/flash_attn", + "csrc/flash_attn/src", + "csrc/cutlass/include", + OUTPUT_DIR, +] + +cc_flag = [] +# cc_flag.append("-gencode") +# cc_flag.append("arch=compute_75,code=sm_75") +cc_flag.append("-gencode") +cc_flag.append("arch=compute_80,code=sm_80") +# cc_flag.append("-gencode") +# cc_flag.append("arch=compute_90,code=sm_90") + + + +flash_attn_cuda = torch.utils.cpp_extension.load( + name="flash_attn_cuda", + sources=[ + OUTPUT_DIR + "/flash_profile_api.cpp", # "csrc/flash_attn/flash_api.cpp", + OUTPUT_DIR + "/flash_fwd.cu", + # "csrc/flash_attn/src/flash_fwd_qkdim192_vdim128_fp16_sm80.cu", + # "csrc/flash_attn/src/flash_bwd_qkdim192_vdim128_fp16_sm80.cu", + # "csrc/flash_attn/src/flash_fwd_split_qkdim192_vdim128_fp16_sm80.cu", + ], + extra_cflags=[ + "-O3", "-std=c++17" + ] + generator_flag, + extra_cuda_cflags=[ + "-O3", + "-std=c++17", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--use_fast_math", + # "--ptxas-options=-v", + # "--ptxas-options=-O2", + # "-lineinfo", + # "-DFLASHATTENTION_DISABLE_BACKWARD", + # "-DFLASHATTENTION_DISABLE_DROPOUT", + # "-DFLASHATTENTION_DISABLE_ALIBI", + # "-DFLASHATTENTION_DISABLE_SOFTCAP", + # "-DFLASHATTENTION_DISABLE_UNEVEN_K", + # "-DFLASHATTENTION_DISABLE_LOCAL", + ] + + generator_flag + + cc_flag, + extra_include_paths=include_path, + build_directory=f"build_autotuner", +) + +# isort: off +# We need to import the CUDA kernels after importing torch +# import flash_attn_2_cuda as flash_attn_cuda + +# isort: on + +def maybe_contiguous(x): + return x.contiguous() if x is not None and x.stride(-1) != 1 else x + +def _get_block_size_n(device, head_dim, is_dropout, is_causal): + # This should match the block sizes in the CUDA kernel + assert head_dim <= 256 + major, minor = torch.cuda.get_device_capability(device) + is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100) + is_sm80 = major == 8 and minor == 0 + is_sm90 = major == 9 and minor == 0 + if head_dim <= 32: + return 128 + if head_dim <= 64: + return 128 if not is_dropout else 64 + elif head_dim <= 96: + return 64 + elif head_dim <= 128: + if is_sm8x: + return 64 if (not is_dropout and is_causal) else 32 + else: + return 64 if not is_dropout else 32 + elif head_dim <= 160: + if is_sm8x: + return 64 + else: + return 32 + elif head_dim <= 192: + return 64 + elif head_dim <= 224: + return 64 + elif head_dim <= 256: + return 64 + + +def _flash_attn_forward( + q, k, v, dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes, return_softmax +): + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.fwd( + q, + k, + v, + None, + alibi_slopes, + dropout_p, + softmax_scale, + causal, + window_size[0], + window_size[1], + softcap, + return_softmax, + None, + ) + return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state + + +def _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size=(-1, -1), + softcap=0.0, + alibi_slopes=None, + return_softmax=False, + block_table=None, + leftpad_k=None, + seqused_k=None, +): + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, 
k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd( + q, + k, + v, + None, + cu_seqlens_q, + cu_seqlens_k, + seqused_k, + leftpad_k, + block_table, + alibi_slopes, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + window_size[0], + window_size[1], + softcap, + return_softmax, + None, + ) + # if out.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state + + +def _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + rng_state=None, +): + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + ( + dq, + dk, + dv, + softmax_d, + ) = flash_attn_cuda.bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + alibi_slopes, + dropout_p, + softmax_scale, + causal, + window_size[0], + window_size[1], + softcap, + deterministic, + None, + rng_state, + ) + return dq, dk, dv, softmax_d + + +def _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + rng_state=None, +): + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + ( + dq, + dk, + dv, + softmax_d, + ) = flash_attn_cuda.varlen_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + alibi_slopes, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + window_size[0], + window_size[1], + softcap, + deterministic, + None, + rng_state, + ) + # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dq, dk, dv, softmax_d + + +class FlashAttnQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + qkv, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + ): + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state) + ctx.dropout_p = dropout_p + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors + qkv_shape = q.shape[:-2] + (3, *q.shape[-2:]) + dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device) + _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dqkv[:, :, 0], + dqkv[:, :, 1], + dqkv[:, :, 2], + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dqkv = dqkv[..., : 
dout.shape[-1]] # We could have padded the head dimension + return dqkv, None, None, None, None, None, None, None, None + + +class FlashAttnVarlenQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + ): + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + qkv[:, 0], + qkv[:, 1], + qkv[:, 2], + cu_seqlens, + cu_seqlens, + max_seqlen, + max_seqlen, + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + block_table=None, + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens, rng_state) + ctx.dropout_p = dropout_p + ctx.max_seqlen = max_seqlen + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors + qkv_shape = q.shape[:-2] + (3, *q.shape[-2:]) + dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dqkv[:, 0], + dqkv[:, 1], + dqkv[:, 2], + cu_seqlens, + cu_seqlens, + ctx.max_seqlen, + ctx.max_seqlen, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension + return dqkv, None, None, None, None, None, None, None, None, None, None + + +class FlashAttnKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + kv, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward( + q, + kv[:, :, 0], + kv[:, :, 1], + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state) + ctx.dropout_p = dropout_p + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors + dq = torch.empty_like(q) + kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) + dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) + _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dkv[:, :, 0], + dkv[:, :, 1], + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dkv = dkv[..., : dout.shape[-1]] + return dq, dkv, None, None, None, None, 
None, None, None, None + + +class FlashAttnVarlenKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + q, + kv[:, 0], + kv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + block_table=None, + ) + ctx.save_for_backward( + q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors + dq = torch.empty_like(q) + kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) + dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dkv = dkv[..., : dout.shape[-1]] + return dq, dkv, None, None, None, None, None, None, None, None, None, None, None, None + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward( + q, + k, + v, + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state) + ctx.dropout_p = dropout_p + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dq = dq[..., : q.shape[-1]] # We could have padded the head dimension + dk = dk[..., : k.shape[-1]] + dv = dv[..., : dout.shape[-1]] + return dq, dk, dv, None, None, None, 
None, None, None, None, None + + +class FlashAttnVarlenFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + block_table, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + block_table=block_table, + ) + ctx.save_for_backward( + q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dq = dq[..., : q.shape[-1]] # We could have padded the head dimension + dk = dk[..., : k.shape[-1]] + dv = dv[..., : dout.shape[-1]] + return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None + + +def flash_attn_qkvpacked_func( + qkv, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # <=0.0 means deactivate + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + If Q, K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of Q, K, V. + For multi-query and grouped-query attention (MQA/GQA), please see + flash_attn_kvpacked_func and flash_attn_func. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive. + + Arguments: + qkv: (batch_size, seqlen, 3, nheads, headdim) + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to + the attention score of query i and key j. + deterministic: bool. 
Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnQKVPackedFunc.apply( + qkv, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + ) + + +def flash_attn_kvpacked_func( + q, + kv, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + If K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of K, V. + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (batch_size, seqlen, nheads, headdim) + kv: (batch_size, seqlen, 2, nheads_k, headdim) + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. 
The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnKVPackedFunc.apply( + q, + kv, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + ) + + +def flash_attn_func( + q, + k, + v, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (batch_size, seqlen, nheads, headdim) + k: (batch_size, seqlen, nheads_k, headdim) + v: (batch_size, seqlen, nheads_k, headdim) + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). 
It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnFunc.apply( + q, + k, + v, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + ) + + +def flash_attn_varlen_qkvpacked_func( + qkv, + cu_seqlens, + max_seqlen, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + If Q, K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of Q, K, V. + For multi-query and grouped-query attention (MQA/GQA), please see + flash_attn_varlen_kvpacked_func and flash_attn_varlen_func. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive. + + Arguments: + qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch. + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into qkv. + max_seqlen: int. Maximum sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). 
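+
+    A hypothetical usage example (shapes are illustrative and not taken from this patch):
+        # two sequences of lengths 3 and 5 packed into a single tensor of 8 tokens
+        qkv = torch.randn(8, 3, 4, 64, device="cuda", dtype=torch.float16, requires_grad=True)
+        cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda")
+        out = flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens, max_seqlen=5)  # out: (8, 4, 64)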
+ """ + return FlashAttnVarlenQKVPackedFunc.apply( + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + ) + + +def flash_attn_varlen_kvpacked_func( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + If K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of K, V. + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). 
The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnVarlenKVPackedFunc.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + ) + + +def flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, + block_table=None, +): + """dropout_p should be set to 0.0 during evaluation + Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. + v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. 
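        The mask alignment and head-grouping rules above can be reproduced with a short
        reference computation. The sketch below (plain PyTorch, illustrative sizes) mirrors
        only the semantics described here, not the CUDA kernel.

            import torch

            def bottom_right_causal_mask(seqlen_q, seqlen_k):
                # 1 = keep, 0 = masked out; query i may see key j iff j <= i + seqlen_k - seqlen_q.
                i = torch.arange(seqlen_q).unsqueeze(1)
                j = torch.arange(seqlen_k).unsqueeze(0)
                return (j <= i + seqlen_k - seqlen_q).int()

            print(bottom_right_causal_mask(2, 5))  # matches the 2 x 5 example above
            print(bottom_right_causal_mask(5, 2))  # matches the 5 x 2 example (first rows all zero)

            # MQA/GQA head mapping: query head h reads KV head h // (nheads // nheads_k).
            nheads, nheads_k = 6, 2
            print([h // (nheads // nheads_k) for h in range(nheads)])  # [0, 0, 0, 1, 1, 1]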
The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnVarlenFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + block_table, + ) + + +def flash_attn_with_kvcache( + q, + k_cache, + v_cache, + k=None, + v=None, + rotary_cos=None, + rotary_sin=None, + cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None, + cache_batch_idx: Optional[torch.Tensor] = None, + cache_leftpad: Optional[torch.Tensor] = None, + block_table: Optional[torch.Tensor] = None, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + rotary_interleaved=True, + alibi_slopes=None, + num_splits=0, + return_softmax_lse=False, +): + """ + If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from + k and v. This is useful for incremental decoding: you can pass in the cached keys/values from + the previous step, and update them with the new keys/values from the current step, and do + attention with the updated cache, all in 1 kernel. + + If you pass in k / v, you must make sure that the cache is large enough to hold the new values. + For example, the KV cache could be pre-allocated with the max sequence length, and you can use + cache_seqlens to keep track of the current sequence lengths of each sequence in the batch. + + Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be + rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc. + If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos + and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc. + If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at + indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens). + + See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function. + + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. 
Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Note: Does not support backward pass. + + Arguments: + q: (batch_size, seqlen, nheads, headdim) + k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table, + or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache) + page_block_size must be a multiple of 256. + v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table, + or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache) + k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate + k with k_cache, starting at the indices specified by cache_seqlens. + v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k. + rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding + to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16. + rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos. + cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the + KV cache. + cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache. + If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1]. + If the indices are not distinct, and k and v are provided, the values updated in the cache + might come from any of the duplicate indices. + cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0. + block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in. + If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False, + rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1 + (i.e. GPT-NeoX style). + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + num_splits: int. If > 1, split the key/value into this many chunks along the sequence. + If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic + to automatically determine the number of splits. + Don't change this unless you know what you are doing. + return_softmax_lse: bool. Whether to return the logsumexp of the attention scores. + + Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). 
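        A minimal decoding-step sketch, assuming a KV cache pre-allocated to the maximum
        sequence length and the public flash_attn import path; all sizes are illustrative.

            import torch
            from flash_attn import flash_attn_with_kvcache

            batch, nheads, nheads_k, headdim = 2, 8, 8, 64
            max_len, prompt_len = 1024, 16
            k_cache = torch.zeros(batch, max_len, nheads_k, headdim, dtype=torch.float16, device="cuda")
            v_cache = torch.zeros_like(k_cache)
            cache_seqlens = torch.full((batch,), prompt_len, dtype=torch.int32, device="cuda")

            # One decoding step: append the new K/V at position cache_seqlens and attend to
            # the updated cache, all in a single kernel call.
            q = torch.randn(batch, 1, nheads, headdim, dtype=torch.float16, device="cuda")
            k_new = torch.randn(batch, 1, nheads_k, headdim, dtype=torch.float16, device="cuda")
            v_new = torch.randn_like(k_new)
            out = flash_attn_with_kvcache(q, k_cache, v_cache, k=k_new, v=v_new,
                                          cache_seqlens=cache_seqlens, causal=True)
            cache_seqlens += 1  # advance the per-sequence lengths for the next step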
+ """ + assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension" + assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension" + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + if cache_seqlens is not None and isinstance(cache_seqlens, int): + cache_seqlens = torch.full( + (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device + ) + cache_seqlens = maybe_contiguous(cache_seqlens) + cache_batch_idx = maybe_contiguous(cache_batch_idx) + block_table = maybe_contiguous(block_table) + out, softmax_lse = flash_attn_cuda.fwd_kvcache( + q, + k_cache, + v_cache, + k, + v, + cache_seqlens, + rotary_cos, + rotary_sin, + cache_batch_idx, + cache_leftpad, + block_table, + alibi_slopes, + None, + softmax_scale, + causal, + window_size[0], + window_size[1], + softcap, + rotary_interleaved, + num_splits, + ) + return (out, softmax_lse) if return_softmax_lse else out diff --git a/autotuner/template/flash_fwd.cu b/autotuner/template/flash_fwd.cu new file mode 100644 index 000000000..a940d81fb --- /dev/null +++ b/autotuner/template/flash_fwd.cu @@ -0,0 +1,6 @@ +#include "flash_fwd.h" + +template<> +void run_mha_fwd_, /*{Kd}*/ , /*{D}*/, /*{is_causal}*/>(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_qkdim/*{Kd}*/_vdim/*{D}*/, /*{is_causal}*/>(params, stream); +} diff --git a/autotuner/template/flash_fwd.h b/autotuner/template/flash_fwd.h new file mode 100644 index 000000000..98ff86037 --- /dev/null +++ b/autotuner/template/flash_fwd.h @@ -0,0 +1,18 @@ +#include "flash_fwd_launch_template.h" + +#define False false +#define True true + +template +void run_mha_fwd_qkdim/*{Kd}*/_vdim/*{D}*/(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int QKHeaddim = /*{Kd}*/; + constexpr static int VHeaddim = /*{D}*/; + constexpr static int Br = /*{Br}*/; + constexpr static int Bc = /*{Bc}*/; + constexpr static int Nwarps = /*{Nwarps}*/; + constexpr static bool IsQinRegs = /*{isQinRegs}*/; + constexpr static bool SharedQKSmem = /*{SharedQKSmem}*/; + DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] { + run_flash_fwd, Is_dropout, Is_causal>(params, stream); + }); +} \ No newline at end of file diff --git a/autotuner/template/flash_profile_api.cpp b/autotuner/template/flash_profile_api.cpp new file mode 100644 index 000000000..0ee997cf8 --- /dev/null +++ b/autotuner/template/flash_profile_api.cpp @@ -0,0 +1,1663 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +// Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers. +#include +#include +#include +#include + +#include + +#include "flash.h" +// #include "static_switch.h" +// #include "static_switch_headdim.h" +#define False false +#define True true + +#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) 
TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + + +void set_params_fprop(Flash_fwd_params ¶ms, + // sizes + const size_t b, + const size_t seqlen_q, + const size_t seqlen_k, + const size_t seqlen_q_rounded, + const size_t seqlen_k_rounded, + const size_t h, + const size_t h_k, + const size_t d, + const size_t d_rounded, + const size_t vd, + const size_t vd_rounded, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + at::Tensor out, + void *cu_seqlens_q_d, + void *cu_seqlens_k_d, + void *seqused_k, + void *p_d, + void *softmax_lse_d, + float p_dropout, + float softmax_scale, + int window_size_left, + int window_size_right, + const float softcap, + bool seqlenq_ngroups_swapped=false, + const bool unpadded_lse=false) { + + // Reset the parameters + params = {}; + + params.is_bf16 = q.dtype() == torch::kBFloat16; + + // Set the pointers and strides. + params.q_ptr = q.data_ptr(); + params.k_ptr = k.data_ptr(); + params.v_ptr = v.data_ptr(); + // All stride are in elements, not bytes. + params.q_row_stride = q.stride(-3); + params.k_row_stride = k.stride(-3); + params.v_row_stride = v.stride(-3); + params.q_head_stride = q.stride(-2); + params.k_head_stride = k.stride(-2); + params.v_head_stride = v.stride(-2); + params.o_ptr = out.data_ptr(); + params.o_row_stride = out.stride(-3); + params.o_head_stride = out.stride(-2); + + if (cu_seqlens_q_d == nullptr) { + params.q_batch_stride = q.stride(0); + params.k_batch_stride = k.stride(0); + params.v_batch_stride = v.stride(0); + params.o_batch_stride = out.stride(0); + if (seqlenq_ngroups_swapped) { + params.q_batch_stride *= seqlen_q; + params.o_batch_stride *= seqlen_q; + } + } + + params.cu_seqlens_q = static_cast(cu_seqlens_q_d); + params.cu_seqlens_k = static_cast(cu_seqlens_k_d); + params.seqused_k = static_cast(seqused_k); + + // P = softmax(QK^T) + params.p_ptr = p_d; + + // Softmax sum + params.softmax_lse_ptr = softmax_lse_d; + + // Set the dimensions. + params.b = b; + params.h = h; + params.h_k = h_k; + params.h_h_k_ratio = h / h_k; + params.seqlen_q = seqlen_q; + params.seqlen_k = seqlen_k; + params.seqlen_q_rounded = seqlen_q_rounded; + params.seqlen_k_rounded = seqlen_k_rounded; + params.d = d; + params.d_rounded = d_rounded; + params.vd = vd; + params.vd_rounded = vd_rounded; + + // Set the different scale values. + #ifdef FLASHATTENTION_DISABLE_SOFTCAP + TORCH_CHECK(softcap <= 0.0, "This flash attention build does not support softcap."); + #endif + if (softcap > 0.0) { + params.softcap = softmax_scale / softcap; + params.scale_softmax = softcap; + params.scale_softmax_log2 = softcap * M_LOG2E; + } else{ + // Remove potential NaN + params.softcap = 0.0; + params.scale_softmax = softmax_scale; + params.scale_softmax_log2 = softmax_scale * M_LOG2E; + } + + // Set this to probability of keeping an element to simplify things. + params.p_dropout = 1.f - p_dropout; + // Convert p from float to int so we don't have to convert the random uint to float to compare. 
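As an aside, the parameter encoding above (softcap rescaling, keep probability, uint8 dropout threshold) can be summarized in a few lines of Python; the values below are assumed for illustration and simply mirror the arithmetic in set_params_fprop.

    import math

    softmax_scale, softcap, p_dropout = 0.125, 30.0, 0.1   # assumed example values

    if softcap > 0.0:
        # Effectively the kernel computes softcap * tanh(score * softmax_scale / softcap):
        # scores are pre-scaled by softmax_scale / softcap, and the softmax sees softcap.
        softcap_param = softmax_scale / softcap
        scale_softmax = softcap
    else:
        softcap_param = 0.0
        scale_softmax = softmax_scale
    scale_softmax_log2 = scale_softmax * math.log2(math.e)   # M_LOG2E in the C++

    keep_prob = 1.0 - p_dropout                               # params.p_dropout stores the keep probability
    p_dropout_in_uint8 = int(math.floor(keep_prob * 255.0))   # threshold compared against a random uint8
    rp_dropout = 1.0 / keep_prob                              # rescales the kept values
    print(softcap_param, scale_softmax_log2, p_dropout_in_uint8, rp_dropout)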
+ // [Minor] We want to round down since when we do the comparison we use <= instead of < + // params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0)); + // params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0)); + params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0)); + params.rp_dropout = 1.f / params.p_dropout; + params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax; + TORCH_CHECK(p_dropout < 1.f); + #ifdef FLASHATTENTION_DISABLE_DROPOUT + TORCH_CHECK(p_dropout == 0.0f, "This flash attention build does not support dropout."); + #endif + + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. + params.is_causal = window_size_left < 0 && window_size_right == 0; + + if (window_size_left < 0 && window_size_right >= 0) { window_size_left = seqlen_k; } + if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; } + params.window_size_left = window_size_left; + params.window_size_right = window_size_right; + + #ifdef FLASHATTENTION_DISABLE_LOCAL + TORCH_CHECK(params.is_causal || (window_size_left < 0 && window_size_right < 0), + "This flash attention build does not support local attention."); + #endif + + params.is_seqlens_k_cumulative = true; + + #ifdef FLASHATTENTION_DISABLE_UNEVEN_K + TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32."); + #endif + + params.unpadded_lse = unpadded_lse; + params.seqlenq_ngroups_swapped = seqlenq_ngroups_swapped; +} + +void set_params_dgrad(Flash_bwd_params ¶ms, + // sizes + const size_t b, + const size_t seqlen_q, + const size_t seqlen_k, + const size_t seqlen_q_rounded, + const size_t seqlen_k_rounded, + const size_t h, + const size_t h_k, + const size_t d, + const size_t d_rounded, + const size_t vd, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor out, + const at::Tensor dout, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + void *cu_seqlens_q_d, + void *cu_seqlens_k_d, + void *dq_accum_d, + void *dk_accum_d, + void *dv_accum_d, + void *softmax_lse_d, + void *dsoftmax_sum_d, + float p_dropout, + float softmax_scale, + int window_size_left, + int window_size_right, + const float softcap, + bool deterministic, + const bool unpadded_lse) { + + set_params_fprop(params, + b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded,vd, vd, + q, k, v, out, + cu_seqlens_q_d, + cu_seqlens_k_d, + nullptr, + nullptr, + softmax_lse_d, + p_dropout, + softmax_scale, + window_size_left, + window_size_right, + softcap, + false, // seqlenq_ngroups_swapped + unpadded_lse); + + // Set the pointers and strides. 
+ params.do_ptr = dout.data_ptr(); + params.do_row_stride = dout.stride(-3); + params.do_head_stride = dout.stride(-2); + params.dq_ptr = dq.data_ptr(); + params.dk_ptr = dk.data_ptr(); + params.dv_ptr = dv.data_ptr(); + params.dq_row_stride = dq.stride(-3); + params.dk_row_stride = dk.stride(-3); + params.dv_row_stride = dv.stride(-3); + params.dq_head_stride = dq.stride(-2); + params.dk_head_stride = dk.stride(-2); + params.dv_head_stride = dv.stride(-2); + + if (cu_seqlens_q_d == nullptr) { + params.do_batch_stride = dout.stride(0); + params.dq_batch_stride = dq.stride(0); + params.dk_batch_stride = dk.stride(0); + params.dv_batch_stride = dv.stride(0); + } + + params.dq_accum_ptr = dq_accum_d; + params.dk_accum_ptr = dk_accum_d; + params.dv_accum_ptr = dv_accum_d; + + // Softmax sum + params.dsoftmax_sum = dsoftmax_sum_d; + + params.deterministic = deterministic; +} + +void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split_kernel=false) { + constexpr bool is_bf16 = /*{is_bf16}*/; + constexpr bool is_causal = /*{is_causal}*/; + constexpr int kQKHeadDim = /*{Kd}*/; + constexpr int kVHeadDim = /*{D}*/; + assert(params.is_bf16 == is_bf16); + assert(params.is_causal == is_causal); + assert(params.d == kQKHeadDim); + assert(params.vd == kVHeadDim); + + if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0 + run_mha_fwd_, kQKHeadDim, kVHeadDim, is_causal>(params, stream); + } else { + run_mha_fwd_splitkv_dispatch, kQKHeadDim, kVHeadDim, is_causal>(params, stream); + } + +} + +// Find the number of splits that maximizes the occupancy. For example, if we have +// batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency = 0.89) is +// better than having 3 splits (efficiency = 0.67). However, we also don't want too many +// splits as that would incur more HBM reads/writes. +// So we find the best efficiency, then find the smallest number of splits that gets 85% +// of the best efficiency. +inline int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs, int num_n_blocks, int max_splits) { + // If we have enough to almost fill the SMs, then just use 1 split + if (batch_nheads_mblocks >= 0.8f * num_SMs) { return 1; } + max_splits = std::min({max_splits, num_SMs, num_n_blocks}); + float max_efficiency = 0.f; + std::vector efficiency; + efficiency.reserve(max_splits); + auto ceildiv = [](int a, int b) { return (a + b - 1) / b; }; + // Some splits are not eligible. For example, if we have 64 blocks and choose 11 splits, + // we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have 6 * 11 + (-2) blocks + // (i.e. it's 11 splits anyway). + // So we check if the number of blocks per split is the same as the previous num_splits. 
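A Python sketch of the same search, using the numbers from the comments above (48 head-blocks, 108 SMs, 64 KV blocks); it is a reference illustration of the heuristic, not part of the patch.

    from math import ceil

    def num_splits_heuristic(batch_nheads_mblocks, num_SMs, num_n_blocks, max_splits):
        # Among eligible split counts, find the best wave efficiency, then return the
        # smallest split count that reaches 85% of it.
        if batch_nheads_mblocks >= 0.8 * num_SMs:
            return 1
        max_splits = min(max_splits, num_SMs, num_n_blocks)
        ceildiv = lambda a, b: (a + b - 1) // b
        def eligible(s):
            return s == 1 or ceildiv(num_n_blocks, s) != ceildiv(num_n_blocks, s - 1)
        eff = {}
        for s in range(1, max_splits + 1):
            if eligible(s):
                waves = batch_nheads_mblocks * s / num_SMs
                eff[s] = waves / ceil(waves)
        best = max(eff.values())
        return min(s for s, e in eff.items() if e >= 0.85 * best)

    # 48 head-blocks on 108 SMs: 2 splits (eff ~ 0.89) win over 3 splits (eff ~ 0.67).
    print(num_splits_heuristic(48, 108, 64, 128))  # 2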
+ auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) { + return num_splits == 1 || ceildiv(num_n_blocks, num_splits) != ceildiv(num_n_blocks, num_splits - 1); + }; + for (int num_splits = 1; num_splits <= max_splits; num_splits++) { + if (!is_split_eligible(num_splits)) { + efficiency.push_back(0.f); + } else { + float n_waves = float(batch_nheads_mblocks * num_splits) / num_SMs; + float eff = n_waves / ceil(n_waves); + // printf("num_splits = %d, eff = %f\n", num_splits, eff); + if (eff > max_efficiency) { max_efficiency = eff; } + efficiency.push_back(eff); + } + } + for (int num_splits = 1; num_splits <= max_splits; num_splits++) { + if (!is_split_eligible(num_splits)) { continue; } + if (efficiency[num_splits - 1] >= 0.85 * max_efficiency) { + // printf("num_splits chosen = %d\n", num_splits); + return num_splits; + } + } + return 1; +} + +std::tuple set_params_splitkv(Flash_fwd_params ¶ms, const int batch_size, + const int num_heads, const int head_size, const int v_head_size, const int max_seqlen_k, const int max_seqlen_q, + const int head_size_rounded, const int v_head_size_rounded,const float p_dropout, + const int num_splits, cudaDeviceProp *dprops, struct c10::TensorOptions opts) { + + // This needs to match with run_mha_fwd_splitkv_dispatch + const int max_head_size = head_size > v_head_size ? head_size : v_head_size; + const int block_n = max_head_size <= 64 ? 256 : (max_head_size <= 128 ? 128 : 64); + const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n; + // Technically kBlockM = 64 only for the splitKV kernels, not the standard kernel. + // In any case we don't expect seqlen_q to be larger than 64 for inference. + const int num_m_blocks = (max_seqlen_q + 64 - 1) / 64; + params.num_splits = num_splits; + at::Tensor softmax_lse_accum; + at::Tensor out_accum; + + if (p_dropout == 0.0f) { // SplitKV is not implemented for dropout + if (num_splits < 1) { + // We multiply number of SMs by 2 to hard-code the fact that we're using 128 threads per block. + params.num_splits = num_splits_heuristic(batch_size * num_heads * num_m_blocks, dprops->multiProcessorCount * 2, num_n_blocks, 128); + } + if (params.num_splits > 1) { + softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); + out_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q, v_head_size_rounded}, opts.dtype(at::kFloat)); + params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr(); + params.oaccum_ptr = out_accum.data_ptr(); + } + TORCH_CHECK(params.num_splits <= 128, "num_splits > 128 not supported"); + } + + return std::make_tuple(softmax_lse_accum, out_accum); +} + +void set_params_alibi(Flash_fwd_params ¶ms, c10::optional &alibi_slopes_, int batch_size, int num_heads){ +#ifdef FLASHATTENTION_DISABLE_ALIBI + TORCH_CHECK(!alibi_slopes_.has_value(), "This flash attention build does not support alibi."); + params.alibi_slopes_ptr = nullptr; +#else + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32"); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads})); + params.alibi_slopes_ptr = alibi_slopes.data_ptr(); + params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? 
alibi_slopes.stride(0) : 0; + } else { + params.alibi_slopes_ptr = nullptr; + } +#endif +} + +std::vector +mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool return_softmax, + c10::optional gen_) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; + bool is_sm8x = dprops->major == 8 && dprops->minor >= 0; + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer."); + // We will support Turing in the near future + // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer."); + + auto q_dtype = q.dtype(); + + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + if (q_dtype == torch::kBFloat16) { + TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer"); + } + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + const int v_head_size_og = v.sizes()[3]; + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + // causal=true is the same as causal=false in this case + if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } + if (is_causal) { window_size_right = 0; } + + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_k; + if (seqlenq_ngroups_swapped) { + q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + seqlen_q = ngroups; + num_heads = num_heads_k; + } + + + 
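A shape-only sketch of the seqlenq_ngroups_swapped transpose used above, with assumed sizes; it only illustrates the bookkeeping, not the kernel itself.

    import torch

    batch, nheads_k, ngroups, d = 4, 2, 3, 128   # nheads = nheads_k * ngroups = 6, seqlen_q = 1
    q = torch.randn(batch, 1, nheads_k * ngroups, d)

    # Treat the query-head groups as the query "sequence": the kernel then runs with
    # seqlen_q = ngroups and num_heads = nheads_k, which tiles much better for decoding.
    q_swapped = q.reshape(batch, nheads_k, ngroups, d).transpose(1, 2)
    print(q_swapped.shape)   # torch.Size([4, 3, 2, 128])

    # The output (batch, ngroups, nheads_k, vd) is transposed back and flattened to
    # (batch, 1, nheads_k * ngroups, vd) before being returned.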
CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + // v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + k_padded = k; + // v_padded = v; + } + if (v_head_size_og % 8 != 0) { + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } else { + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], v_head_size_og); + if (seqlenq_ngroups_swapped) { + out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2); + } + if (v_head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } + } else { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); + if (v_head_size_og % 8 != 0) { + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; + const int v_head_size = round_multiple(v_head_size_og, 8); + const int v_head_size_rounded = v_head_size <= 192 ? round_multiple(v_head_size, 32) : 256; + const int seqlen_q_rounded = round_multiple(seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + + auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + at::Tensor p; + // Only return softmax if there's dropout to reduce compilation time + if (return_softmax) { + TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0"); + p = torch::empty({ batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded }, opts); + } + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + v_head_size, v_head_size_rounded, + q_padded, k_padded, v_padded, out, + /*cu_seqlens_q_d=*/nullptr, + /*cu_seqlens_k_d=*/nullptr, + /*seqused_k=*/nullptr, + return_softmax ? 
p.data_ptr() : nullptr, + softmax_lse.data_ptr(), + p_dropout, + softmax_scale, + window_size_left, + window_size_right, + softcap + ); + + // Keep references to these tensors to extend their lifetime + at::Tensor softmax_lse_accum, out_accum; + std::tie(softmax_lse_accum, out_accum) = set_params_splitkv( + params, batch_size, num_heads, head_size, v_head_size, seqlen_k, seqlen_q, + head_size_rounded, v_head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts); + + // number of times random will be generated per thread, to offset philox counter in thc random + // state + // We use a custom RNG that increases the offset by batch_size * nheads * 32. + int64_t counter_offset = params.b * params.h * 32; + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + // Forward kernel will populate memory with the seed and offset. + params.rng_state = reinterpret_cast(rng_state.data_ptr()); + + if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + } + + set_params_alibi(params, alibi_slopes_, batch_size, num_heads); + + if (seqlen_k > 0) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + run_mha_fwd(params, stream); + } else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. + out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (v_head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + if (seqlenq_ngroups_swapped) { + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); + out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); + q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + } + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; +} + +std::vector +mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. 
+ c10::optional &leftpad_k_, // batch_size + c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + c10::optional &alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool return_softmax, + c10::optional gen_) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; + bool is_sm8x = dprops->major == 8 && dprops->minor >= 0; + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer."); + // We will support Turing in the near future + // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer."); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + if (q_dtype == torch::kBFloat16) { + TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer"); + } + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(cu_seqlens_q); + CHECK_DEVICE(cu_seqlens_k); + + at::Tensor block_table; + const bool paged_KV = block_table_.has_value(); + if (paged_KV) { + block_table = block_table_.value(); + CHECK_DEVICE(block_table); + TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must have dtype torch.int32"); + TORCH_CHECK(block_table.stride(-1) == 1, "block_table must have contiguous last dimension"); + } + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + const int v_head_size_og = v.sizes()[2]; + const int batch_size = cu_seqlens_q.numel() - 1; + int num_heads = sizes[1]; + const int head_size_og = sizes[2]; + const int num_heads_k = paged_KV ? k.size(2) : k.size(1); + + if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } + + const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1); + const int num_blocks = !paged_KV ? 0 : k.size(0); + const int page_block_size = !paged_KV ? 
1 : k.size(1); + TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256"); + + if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } // causal=true is the same as causal=false in this case + if (is_causal) { window_size_right = 0; } + + void *cu_seqlens_q_d = cu_seqlens_q.data_ptr(); + + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_k; + if (seqlenq_ngroups_swapped) { + q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + max_seqlen_q = ngroups; + num_heads = num_heads_k; + cu_seqlens_q_d = nullptr; + } + + const int total_q = q.sizes()[0]; + + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + if (!paged_KV) { + const int total_k = k.size(0); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); + } else { + CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size_og); + CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, v_head_size_og); + CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); + } + + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + if (seqused_k.has_value()){ + auto seqused_k_ = seqused_k.value(); + TORCH_CHECK(seqused_k_.dtype() == torch::kInt32, "seqused_k must have dtype int32"); + TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device"); + TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous"); + CHECK_SHAPE(seqused_k_, batch_size); + } + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + // v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + k_padded = k; + // v_padded = v; + } + if (v_head_size_og % 8 != 0) { + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } else { + v_padded = v; + } + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, sizes[0], sizes[1], v_head_size_og); + if (seqlenq_ngroups_swapped) { + out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 
2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + } + if (v_head_size_og % 8 != 0) { + out = torch::empty({total_q, num_heads, v_head_size_og}, q.options()); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } + } else { + out = torch::empty({total_q, num_heads, v_head_size_og}, q.options()); + if (v_head_size_og % 8 != 0) { + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; + const int v_head_size = round_multiple(v_head_size_og, 8); + const int v_head_size_rounded = v_head_size <= 192 ? round_multiple(v_head_size, 32) : 256; + const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat)); + at::Tensor p; + // Only return softmax if there's dropout to reduce compilation time + if (return_softmax) { + TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0"); + p = torch::empty({ batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded }, opts); + } + + if (zero_tensors) { + out.zero_(); + softmax_lse.fill_(-std::numeric_limits::infinity()); + if (return_softmax) {p.zero_();} + } + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + max_seqlen_q, max_seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + v_head_size, v_head_size_rounded, + q_padded, k_padded, v_padded, out, + cu_seqlens_q_d, + cu_seqlens_k.data_ptr(), + seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr, + return_softmax ? 
p.data_ptr() : nullptr, + softmax_lse.data_ptr(), + p_dropout, + softmax_scale, + window_size_left, + window_size_right, + softcap, + seqlenq_ngroups_swapped, + /*unpadded_lse*/true); + params.total_q = total_q; + + if (paged_KV) { + params.block_table = block_table.data_ptr(); + params.block_table_batch_stride = block_table.stride(0); + params.k_batch_stride = k_padded.stride(0); + params.v_batch_stride = v_padded.stride(0); + } + params.page_block_size = page_block_size; + // Keep references to these tensors to extend their lifetime + at::Tensor softmax_lse_accum, out_accum; + if (seqlenq_ngroups_swapped) { + // Only apply split-k for decoding + std::tie(softmax_lse_accum, out_accum) = + set_params_splitkv(params, batch_size, num_heads, head_size, v_head_size, + max_seqlen_k, max_seqlen_q, head_size_rounded,v_head_size_rounded, + p_dropout, /*num_splits*/ 0, dprops, opts); + } + + if (leftpad_k_.has_value()) { + auto leftpad_k = leftpad_k_.value(); + TORCH_CHECK(!paged_KV, "We don't support Paged KV and leftpad_k running at the same time yet"); + TORCH_CHECK(leftpad_k.dtype() == torch::kInt32, "leftpad_k must have dtype int32"); + CHECK_DEVICE(leftpad_k); + CHECK_CONTIGUOUS(leftpad_k); + CHECK_SHAPE(leftpad_k, batch_size); + params.leftpad_k = static_cast(leftpad_k.data_ptr()); + } + + // number of times random will be generated per thread, to offset philox counter in thc random + // state + // We use a custom RNG that increases the offset by batch_size * nheads * 32. + int64_t counter_offset = params.b * params.h * 32; + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + // Forward kernel will populate memory with the seed and offset. + params.rng_state = reinterpret_cast(rng_state.data_ptr()); + + if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + } + + set_params_alibi(params, alibi_slopes_, batch_size, num_heads); + + if (max_seqlen_k > 0) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + run_mha_fwd(params, stream, paged_KV); + } else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. 
+ out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (v_head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + if (seqlenq_ngroups_swapped) { + int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_k, head_size_og}; + int64_t size_after[] = {batch_size, num_heads_k * max_seqlen_q, head_size_og}; + int64_t o_size_before[] = {batch_size, max_seqlen_q, num_heads_k, v_head_size_og}; + int64_t o_size_after[] = {batch_size, num_heads_k * max_seqlen_q, v_head_size_og}; + out = out.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); + out_padded = out_padded.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); + q_padded = q_padded.reshape(size_before).transpose(1, 2).reshape(size_after); + softmax_lse = softmax_lse.reshape({num_heads * max_seqlen_q, batch_size}); + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; +} + +void run_mha_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { + + constexpr bool is_bf16 = /*{is_bf16}*/; + constexpr bool is_causal = /*{is_causal}*/; + constexpr int kQKHeadDim = /*{Kd}*/; + constexpr int kVHeadDim = /*{D}*/; + + assert(params.is_bf16 == is_bf16); + assert(params.is_causal == is_causal); + assert(params.d == kQKHeadDim); + assert(params.vd == kVHeadDim); + + run_mha_bwd_, kQKHeadDim, kVHeadDim, is_causal>(params, stream); +} +std::vector +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool deterministic, + c10::optional gen_, + c10::optional &rng_state) { + + #ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); + #endif + if (is_causal) { window_size_right = 0; } + auto dprops = at::cuda::getCurrentDeviceProperties(); + // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; + bool is_sm8x = dprops->major == 8 && dprops->minor >= 0; + bool is_sm80 = dprops->major == 8 && dprops->minor == 0; + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer."); + // We will support Turing in the near future + // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer."); + + bool is_dropout = p_dropout > 0.0; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + if (q_dtype == torch::kBFloat16) { + TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 
is only supported on Ampere GPUs or newer"); + } + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + const int v_head_size_og = v.sizes()[3]; + const int batch_size = sizes[0]; + const int seqlen_q = sizes[1]; + const int num_heads = sizes[2]; + const int head_size_og = dout.size(3); + const int head_size = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); + TORCH_CHECK(v_head_size_og % 8 == 0, " v head_size should be a multiple of 8"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); + if ((head_size > 192 || v_head_size_og > 192) && is_dropout) { + TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); + } + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_rounded = head_size <= 192 ? 
round_multiple(head_size, 32) : 256; + const int seqlen_q_rounded = round_multiple(seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + // TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); + if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); + CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, v_head_size_og); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // bool loop = seqlen_k > blocksize_c; + // TODO: change later, for now set to true for simplicity + bool loop = true; + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat)); + at::Tensor dq_accum; + at::Tensor dk_accum, dv_accum; + if (loop) { + if (!deterministic) { + dq_accum = torch::empty({batch_size, seqlen_q_rounded, num_heads, head_size_rounded}, opts.dtype(at::kFloat)); + } else { + const int nsplits = (dprops->multiProcessorCount + batch_size * num_heads - 1) / (batch_size * num_heads); + dq_accum = torch::zeros({nsplits, batch_size, seqlen_q_rounded, num_heads, head_size_rounded}, opts.dtype(at::kFloat)); + } + // dk_accum = torch::empty({batch_size, num_heads_k, seqlen_k_rounded, head_size_rounded}, opts.dtype(at::kFloat)); + // dv_accum = torch::empty({batch_size, num_heads_k, seqlen_k_rounded, head_size_rounded}, opts.dtype(at::kFloat)); + } + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, v_head_size_og}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + Flash_bwd_params params; + + 
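A few lines of Python summarizing the workspace sizing implied by the rounding and dq_accum allocation above; all sizes are assumed for illustration.

    def round_multiple(x, m):
        return (x + m - 1) // m * m

    batch, nheads, seqlen_q, head_size, num_SMs = 2, 16, 1000, 128, 108   # assumed values

    head_size_rounded = round_multiple(head_size, 32) if head_size <= 192 else 256
    seqlen_q_rounded = round_multiple(seqlen_q, 128)

    # Non-deterministic backward: a single fp32 dQ accumulator.
    dq_accum_elems = batch * seqlen_q_rounded * nheads * head_size_rounded

    # Deterministic backward: one accumulator slice per "split" of SMs over (batch, heads),
    # reduced at the end so the accumulation order is fixed.
    nsplits = (num_SMs + batch * nheads - 1) // (batch * nheads)
    dq_accum_det_elems = nsplits * dq_accum_elems

    # MQA/GQA: dK/dV are produced per query head (nheads) and summed back to nheads_k afterwards.
    print(head_size_rounded, seqlen_q_rounded, nsplits)  # 128 1024 4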
set_params_dgrad(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + v_head_size_og, + q, k, v, out, + dout_padded, dq, dk_expanded, dv_expanded, + nullptr, + nullptr, + loop ? dq_accum.data_ptr() : nullptr, + // loop ? dk_accum.data_ptr() : nullptr, + // loop ? dv_accum.data_ptr() : nullptr, + nullptr, + nullptr, + softmax_lse.data_ptr(), + softmax_d.data_ptr(), + p_dropout, + softmax_scale, + window_size_left, + window_size_right, + softcap, + deterministic, + /*unpadded_lse*/false); + params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0); + + auto launch = &run_mha_bwd; + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + // We use a custom RNG that increases the offset by batch_size * nheads * 32. + int64_t counter_offset = params.b * params.h * 32; + + if ( rng_state.has_value() ) { + params.rng_state = reinterpret_cast(rng_state.value().data_ptr()); + } else if( is_dropout ) { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + auto seeds = at::cuda::philox::unpack(params.philox_args); + params.rng_state[0] = std::get<0>(seeds); + params.rng_state[1] = std::get<1>(seeds); + } + + set_params_alibi(params, alibi_slopes_, batch_size, num_heads); + + if (seqlen_q > 0) { + launch(params, stream); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {3}); + } + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + // dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d }; +} + +std::vector +mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size + const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &out, // total_q x num_heads x head_size + const at::Tensor &softmax_lse, // h x total_q, softmax logsumexp + c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional &alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool 
deterministic, + c10::optional gen_, + c10::optional &rng_state) { + + #ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); + #endif + + if (is_causal) { window_size_right = 0; } + auto dprops = at::cuda::getCurrentDeviceProperties(); + // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; + bool is_sm8x = dprops->major == 8 && dprops->minor >= 0; + bool is_sm80 = dprops->major == 8 && dprops->minor == 0; + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer."); + // We will support Turing in the near future + // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer."); + bool is_dropout = p_dropout > 0.0; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + if (q_dtype == torch::kBFloat16) { + TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer"); + } + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + const int v_head_size_og = v.sizes()[2]; + const int total_q = sizes[0]; + const int batch_size = cu_seqlens_q.numel() - 1; + const int num_heads = sizes[1]; + const int head_size_og = dout.size(2); + const int head_size = sizes[2]; + const int total_k = k.size(0); + const int num_heads_k = k.size(1); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); + TORCH_CHECK(v_head_size_og % 8 == 0, " v head_size should be a multiple of 8"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); + + if ((head_size > 192 || v_head_size_og > 192) && is_dropout) { + TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); + } + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } + + auto 
round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+ const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+ const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
+ const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
+
+ TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8");
+
+ if (window_size_left >= max_seqlen_k) { window_size_left = -1; }
+ if (window_size_right >= max_seqlen_k) { window_size_right = -1; }
+
+ CHECK_SHAPE(q, total_q, num_heads, head_size);
+ CHECK_SHAPE(k, total_k, num_heads_k, head_size);
+ CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og);
+ CHECK_SHAPE(out, total_q, num_heads, v_head_size_og);
+ CHECK_SHAPE(dout, total_q, num_heads, head_size_og);
+ CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
+ CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
+
+ at::Tensor dq, dk, dv;
+ if (dq_.has_value()) {
+ dq = dq_.value();
+ TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q");
+ CHECK_DEVICE(dq);
+ TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
+ CHECK_SHAPE(dq, total_q, num_heads, head_size);
+ } else {
+ dq = torch::empty_like(q);
+ }
+ if (dk_.has_value()) {
+ dk = dk_.value();
+ TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q");
+ CHECK_DEVICE(dk);
+ TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
+ CHECK_SHAPE(dk, total_k, num_heads_k, head_size);
+ } else {
+ dk = torch::empty_like(k);
+ }
+ if (dv_.has_value()) {
+ dv = dv_.value();
+ TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q");
+ CHECK_DEVICE(dv);
+ TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
+ CHECK_SHAPE(dv, total_k, num_heads_k, v_head_size_og);
+ } else {
+ dv = torch::empty_like(v);
+ }
+
+ at::Tensor dout_padded;
+ if (head_size_og % 8 != 0) {
+ dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+ } else {
+ dout_padded = dout;
+ }
+
+ // bool loop = max_seqlen_k > blocksize_c;
+ // TODO: change later, for now set to true for simplicity
+ bool loop = true;
+
+ // Otherwise the kernel will be launched from cuda:0 device
+ // Cast to char to avoid compiler warning about narrowing
+ at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+
+ auto opts = q.options();
+ auto softmax_d = torch::empty({num_heads, total_q + 128 * batch_size}, opts.dtype(at::kFloat));
+ at::Tensor dq_accum;
+ if (loop) {
+ // We don't want to allocate dq_accum of size (batch, seqlen_q_rounded, num_heads, head_size_rounded)
+ // because that would be too large if there is a very long sequence and the rest of the sequences are short.
+ // Instead, we allocate dq_accum of size (total_q + 128 * batch, num_heads, head_size_rounded).
+ // Note that 128 is the max block size on the seqlen_q dimension.
+ // For dQ, the i-th sequence is stored in indices from cu_seqlens[i] + 128 * i to
+ // cu_seqlens[i + 1] + 128 * i - 1. This ensures that the i-th sequence and (i + 1)-th sequence will
+ // be at least 128 apart. It's ok for us to do atomicAdds up to 128 rows beyond what we're normally
+ // allowed to do. So we won't have to do any bound checking, and performance should stay the same.
+ // Same holds for softmax_d, since LSE is stored in unpadded format. 
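A small Python sketch of that layout with made-up sequence lengths: sequence i starts at row cu_seqlens_q[i] + 128 * i of the (total_q + 128 * batch) buffer, so consecutive sequences always sit at least 128 rows apart and the extra atomicAdd rows never collide.

```python
# Offsets into the varlen dq_accum buffer of shape
# (total_q + 128 * batch, num_heads, head_size_rounded).
seqlens = [300, 7, 1025]                      # per-sequence q lengths (made up)
cu_seqlens = [0]
for s in seqlens:
    cu_seqlens.append(cu_seqlens[-1] + s)
total_q = cu_seqlens[-1]

starts = [cu_seqlens[i] + 128 * i for i in range(len(seqlens))]
ends = [cu_seqlens[i + 1] + 128 * i for i in range(len(seqlens))]  # exclusive

for i in range(len(seqlens) - 1):
    # gap between where sequence i may scribble (up to 128 extra rows)
    # and where sequence i + 1 begins
    assert starts[i + 1] - ends[i] == 128

# even the last sequence's overshoot stays inside the allocation
assert ends[-1] + 128 <= total_q + 128 * len(seqlens)
print(starts, ends, total_q + 128 * len(seqlens))
```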
+ if (!deterministic) { + dq_accum = torch::empty({total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat)); + } else { + const int nsplits = (dprops->multiProcessorCount + batch_size * num_heads - 1) / (batch_size * num_heads); + dq_accum = torch::zeros({nsplits, total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat)); + } + } + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({total_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({total_k, num_heads, v_head_size_og}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + if( zero_tensors ) { + dq.zero_(); + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + Flash_bwd_params params; + + set_params_dgrad(params, + batch_size, + max_seqlen_q, max_seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + v_head_size_og, + q, k, v, out, + dout_padded, dq, dk_expanded, dv_expanded, + cu_seqlens_q.data_ptr(), + cu_seqlens_k.data_ptr(), + loop ? dq_accum.data_ptr() : nullptr, + nullptr, + nullptr, + softmax_lse.data_ptr(), + softmax_d.data_ptr(), + p_dropout, + softmax_scale, + window_size_left, + window_size_right, + softcap, + deterministic, + /*unpadded_lse*/true); + params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0); + params.total_q = total_q; + + auto launch = &run_mha_bwd; + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + // We use a custom RNG that increases the offset by batch_size * nheads * 32. + int64_t counter_offset = params.b * params.h * 32; + + if ( rng_state.has_value() ) { + params.rng_state = reinterpret_cast(rng_state.value().data_ptr()); + } else if( is_dropout ) { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + auto seeds = at::cuda::philox::unpack(params.philox_args); + params.rng_state[0] = std::get<0>(seeds); + params.rng_state[1] = std::get<1>(seeds); + } + + set_params_alibi(params, alibi_slopes_, batch_size, num_heads); + + if (max_seqlen_q > 0) { + launch(params, stream); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {2}); + } + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + // dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d }; +} + +std::vector +mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. 
+ c10::optional &k_, // batch_size x seqlen_knew x num_heads_k x head_size + c10::optional &v_, // batch_size x seqlen_knew x num_heads_k x head_size + c10::optional &seqlens_k_, // batch_size + c10::optional &rotary_cos_, // seqlen_ro x (rotary_dim / 2) + c10::optional &rotary_sin_, // seqlen_ro x (rotary_dim / 2) + c10::optional &cache_batch_idx_, // indices to index into the KV cache + c10::optional &leftpad_k_, // batch_size + c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2 + int num_splits + ) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; + bool is_sm8x = dprops->major == 8 && dprops->minor >= 0; + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer."); + // We will support Turing in the near future + // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer."); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + if (q_dtype == torch::kBFloat16) { + TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer"); + } + TORCH_CHECK(kcache.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(vcache.dtype() == q_dtype, "query and value must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(kcache); CHECK_DEVICE(vcache); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(kcache.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(vcache.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + at::Tensor block_table; + const bool paged_KV = block_table_.has_value(); + if (paged_KV) { + TORCH_CHECK(!cache_batch_idx_.has_value(), "Paged KVcache does not support cache_batch_idx"); + block_table = block_table_.value(); + CHECK_DEVICE(block_table); + TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must have dtype torch.int32"); + TORCH_CHECK(block_table.stride(-1) == 1, "block_table must have contiguous last dimension"); + } + + const auto sizes = q.sizes(); + const int v_head_size_og = vcache.sizes()[3]; + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + + const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1); + const int num_blocks = !paged_KV ? 0 : kcache.size(0); + const int page_block_size = !paged_KV ? 1 : kcache.size(1); + TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256"); + const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size; + const int num_heads_k = kcache.size(2); + const int batch_size_c = !paged_KV ? 
kcache.size(0) : batch_size; + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); + + // causal=true is the same as causal=false in this case + if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } + if (is_causal) { window_size_right = 0; } + + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + if (seqlenq_ngroups_swapped) { + const int ngroups = num_heads / num_heads_k; + q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + seqlen_q = ngroups; + num_heads = num_heads_k; + } + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + if (!paged_KV) { + CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, v_head_size_og); + } else { + CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og); + CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, v_head_size_og); + CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); + } + + at::Tensor q_padded, kcache_padded, vcache_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + kcache_padded = torch::nn::functional::pad(kcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + vcache_padded = torch::nn::functional::pad(vcache, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } else { + q_padded = q; + kcache_padded = kcache; + vcache_padded = vcache; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); + if (v_head_size_og % 8 != 0) { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } + } else { + out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); + if (v_head_size_og % 8 != 0) { + out = torch::nn::functional::pad(out, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256; + const int v_head_size = round_multiple(v_head_size_og, 8); + const int v_head_size_rounded = v_head_size <= 192 ? 
round_multiple(v_head_size, 32) : 256; + const int seqlen_q_rounded = round_multiple(seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + + auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + v_head_size, v_head_size_rounded, + q_padded, kcache_padded, vcache_padded, out, + /*cu_seqlens_q_d=*/nullptr, + /*cu_seqlens_k_d=*/nullptr, + /*seqused_k=*/nullptr, + /*p_ptr=*/nullptr, + softmax_lse.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + window_size_left, + window_size_right, + softcap + ); + + at::Tensor k, v, k_padded, v_padded; + if (k_.has_value()) { + TORCH_CHECK(v_.has_value(), "If key is supplied, value must also be passed in"); + TORCH_CHECK(seqlens_k_.has_value(), "If key is supplied, seqlens_k must also be passed in"); + TORCH_CHECK(seqlen_q <= seqlen_k, "If key is supplied, it must have seqlen <= the seqlen of the KV cache"); + k = k_.value(); + v = v_.value(); + TORCH_CHECK(k.dtype() == q_dtype, "Key must have the same dtype as query"); + TORCH_CHECK(v.dtype() == q_dtype, "Value must have the same dtype as query"); + CHECK_DEVICE(k); CHECK_DEVICE(v); + TORCH_CHECK(k.stride(-1) == 1, "Key tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension"); + int seqlen_knew = k.size(1); + CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, v_head_size_og); + if (head_size_og % 8 != 0) { + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); + } else { + k_padded = k; + v_padded = v; + } + params.seqlen_knew = seqlen_knew; + params.knew_ptr = k_padded.data_ptr(); + params.vnew_ptr = v_padded.data_ptr(); + // All stride are in elements, not bytes. 
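The stride fields assigned next are read directly off the PyTorch tensors, so they count elements, not bytes. For a contiguous (batch, seqlen_new, nheads_k, headdim) tensor they reduce to products of the trailing dimensions; a quick check with illustrative sizes:

```python
import torch

# (batch, seqlen_new, nheads_k, headdim) -- sizes are made up for illustration
k_new = torch.empty(4, 16, 8, 128, dtype=torch.bfloat16)

batch_stride = k_new.stride(0)   # seqlen_new * nheads_k * headdim = 16384
row_stride = k_new.stride(-3)    # nheads_k * headdim = 1024
head_stride = k_new.stride(-2)   # headdim = 128

# element counts, not bytes: multiply by element_size() (2 for bf16) to get bytes
print(batch_stride, row_stride, head_stride, k_new.element_size())
```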
+ params.knew_batch_stride = k_padded.stride(0); + params.vnew_batch_stride = v_padded.stride(0); + params.knew_row_stride = k_padded.stride(-3); + params.vnew_row_stride = v_padded.stride(-3); + params.knew_head_stride = k_padded.stride(-2); + params.vnew_head_stride = v_padded.stride(-2); + } + + if (seqlens_k_.has_value()) { + auto seqlens_k = seqlens_k_.value(); + TORCH_CHECK(seqlens_k.dtype() == torch::kInt32, "seqlens_k must have dtype int32"); + CHECK_DEVICE(seqlens_k); + CHECK_CONTIGUOUS(seqlens_k); + CHECK_SHAPE(seqlens_k, batch_size); + params.cu_seqlens_k = static_cast(seqlens_k.data_ptr()); + } + params.is_seqlens_k_cumulative = !(seqlens_k_.has_value()); + if (leftpad_k_.has_value()) { + TORCH_CHECK(!paged_KV, "We don't support Paged KV and leftpad_k running at the same time yet"); + auto leftpad_k = leftpad_k_.value(); + TORCH_CHECK(leftpad_k.dtype() == torch::kInt32, "leftpad_k must have dtype int32"); + CHECK_DEVICE(leftpad_k); + CHECK_CONTIGUOUS(leftpad_k); + CHECK_SHAPE(leftpad_k, batch_size); + params.leftpad_k = static_cast(leftpad_k.data_ptr()); + } + + if (rotary_cos_.has_value()) { + TORCH_CHECK(k_.has_value(), "If rotary cos/sin are provided, new key / value to be appended to KV cache must also be provided"); + auto rotary_cos = rotary_cos_.value(); + CHECK_DEVICE(rotary_cos); + params.rotary_dim = rotary_cos.size(1) * 2; + TORCH_CHECK(params.rotary_dim <= head_size, "rotary_dim must be <= headdim"); + TORCH_CHECK(params.rotary_dim % 16 == 0, "Only rotary dimensions divisible by 16 are currently supported"); + const int seqlen_ro = rotary_cos.size(0); + TORCH_CHECK(seqlen_ro >= seqlen_k, "cos/sin seqlen must be at least the seqlen of KV cache"); + CHECK_SHAPE(rotary_cos, seqlen_ro, params.rotary_dim / 2); + CHECK_CONTIGUOUS(rotary_cos); + TORCH_CHECK(rotary_cos.scalar_type() == q_dtype, "rotary_cos must have the same dtype as query"); + + TORCH_CHECK(rotary_sin_.has_value(), "If rotary cos is provided, rotary sin must also be provided"); + auto rotary_sin = rotary_sin_.value(); + CHECK_DEVICE(rotary_sin); + CHECK_SHAPE(rotary_sin, seqlen_ro, params.rotary_dim / 2); + CHECK_CONTIGUOUS(rotary_sin); + TORCH_CHECK(rotary_sin.scalar_type() == q_dtype, "rotary_cos must have the same dtype as query"); + params.rotary_cos_ptr = rotary_cos.data_ptr(); + params.rotary_sin_ptr = rotary_sin.data_ptr(); + params.is_rotary_interleaved = is_rotary_interleaved; + } else { + params.rotary_dim = 0; + } + + if (cache_batch_idx_.has_value()) { + auto cache_batch_idx = cache_batch_idx_.value(); + CHECK_DEVICE(cache_batch_idx); + CHECK_CONTIGUOUS(cache_batch_idx); + TORCH_CHECK(cache_batch_idx.scalar_type() == torch::kInt32, "cache_batch_idx must have dtype int32"); + params.cache_batch_idx = reinterpret_cast(cache_batch_idx.data_ptr()); + } + + // Keep references to these tensors to extend their lifetime + at::Tensor softmax_lse_accum, out_accum; + std::tie(softmax_lse_accum, out_accum) = set_params_splitkv( + params, batch_size, num_heads, head_size, v_head_size, seqlen_k, seqlen_q, + head_size_rounded, v_head_size_rounded, /*dropout*/ 0.f, num_splits, dprops, opts); + + if (paged_KV) { + params.block_table = block_table.data_ptr(); + params.block_table_batch_stride = block_table.stride(0); + } + params.page_block_size = page_block_size; + + + set_params_alibi(params, alibi_slopes_, batch_size, num_heads); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + // Only split kernel supports appending to KV cache, or indexing to the cache with cache_batch_idx, + // or paged KV 
cache + run_mha_fwd(params, stream, /*force_split_kernel=*/k_.has_value() || cache_batch_idx_.has_value() || paged_KV); + + if (head_size_og % 8 != 0) { + // out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + // if (out_.has_value()) { out_.value().copy_(out); } + if (k_.has_value()) { + // It's expensive to copy the KV cache here for the case where head size not divisible by 8, + // but we don't expect to get this case in practice. This is just so that the code works for that case. + kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + // vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + } + } + if (v_head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + if (k_.has_value()) { + // It's expensive to copy the KV cache here for the case where head size not divisible by 8, + // but we don't expect to get this case in practice. This is just so that the code works for that case. + // kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, v_head_size_og)})); + } + } + + if (seqlenq_ngroups_swapped) { + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + } + return {out, softmax_lse}; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "FlashAttention"; + m.def("fwd", &mha_fwd, "Forward pass"); + m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)"); + m.def("bwd", &mha_bwd, "Backward pass"); + m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)"); + m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache"); +} From 83fd7a55f2ac1fa078c2cbb4faf1268385901d32 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Fri, 6 Sep 2024 21:32:54 +0800 Subject: [PATCH 34/46] basetuner fwd --- autotuner/base_tunner.py | 14 ++++++++------ autotuner/code_emitter.py | 10 +++++++++- autotuner/configs/base_config.py | 5 ++++- autotuner/template/flash_attn_profile_interface.py | 7 +++++-- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index 598a75da6..15eb8c166 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -73,7 +73,7 @@ def __init__(self, arch, torch_array: list, op_name, tempdir): self.Br_list = [32, 64, 128, 256] self.Bc_list = [32, 64, 128, 256] - self.template_dir = "template" + self.template_dir = "autotuner/template" self.op_name = op_name self.cache_path = os.path.join(os.path.dirname(__file__), "../../cache/") self.problem_key = { @@ -98,7 +98,7 @@ def compile_parallel(self, configs:list, temp_dir:str, timeout: float = None): pass def profile(self, config:BaseConfig, device="cuda:0") -> float: - spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.temp_dir+"/flash_attn_profile_interface.py") + spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.output_dir+"/flash_attn_profile_interface.py") flash_attn_func = importlib.util.module_from_spec(spec) spec.loader.exec_module(flash_attn_func) latency = profile_fwd(flash_attn_func) @@ -216,11 +216,12 @@ def 
generate_configs(self,Br:int,Bc:int,dim_qk:int,dim_v:int): if __name__=="__main__": import torch + from configs.fwd_config import FlashFwdConfig batch_size = 4 seqlen = 2048 nheads = 8 - headdim = 32 - v_headdim = 32 + headdim = 192 + v_headdim = 128 device = 'cuda' dtype = torch.bfloat16 q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, @@ -231,5 +232,6 @@ def generate_configs(self,Br:int,Bc:int,dim_qk:int,dim_v:int): requires_grad=True) base_tunner = BaseTunner(arch=None, torch_array=[q,k,v], op_name="flash_fwd", tempdir="autotuner/temp") - config = BaseConfig(headdim,v_headdim,64,32) - base_tunner.compile([config]) \ No newline at end of file + config = FlashFwdConfig(headdim,v_headdim,64,64) + base_tunner.compile([config]) + base_tunner.profile(config) \ No newline at end of file diff --git a/autotuner/code_emitter.py b/autotuner/code_emitter.py index d9c42022c..5278baae5 100644 --- a/autotuner/code_emitter.py +++ b/autotuner/code_emitter.py @@ -43,7 +43,7 @@ def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): # generate kernel code # TODO: parallelize for config in configs: - kernel_code_dir = Path(output_dir) / Path(str(config)) + kernel_code_dir = Path(output_dir) / Path(config.output_dir) if not kernel_code_dir.exists(): os.mkdir(kernel_code_dir) for file_name in self.kernel_file_list: @@ -54,6 +54,14 @@ def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): with open(kernel_code_dir / Path(file_name), "w") as f: f.write(code_template) + # flash_attn_profile_interface.py + with open(Path(template_dir) / Path("flash_attn_profile_interface.py")) as f: + code_template = f.read() + code_template = code_template.replace("OUTPUT_DIR", f"\"{str(output_dir)}\"") + code_template = code_template.replace("OUTPUT_KERNEL_DIR", f"\"{str(kernel_code_dir)}\"") + with open(Path(kernel_code_dir) / Path("flash_attn_profile_interface.py"), "w") as f: + f.write(code_template) + def emit_code_kernel(self, code_template:str, config:BaseConfig): kv = config.__dict__ diff --git a/autotuner/configs/base_config.py b/autotuner/configs/base_config.py index 39eed0865..80c3a9007 100644 --- a/autotuner/configs/base_config.py +++ b/autotuner/configs/base_config.py @@ -8,7 +8,6 @@ def __init__(self, Kd, D, Br, Bc, Nwarps=8) -> None: self.operation = None self.template_dir = None - self.output_dir = str(self) def __repr__(self) -> str: return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={}".format(self.Kd, self.D, self.Br, self.Bc, self.Nwarps) @@ -22,6 +21,10 @@ def from_dict(cls, dd:dict): cc.__dict__.update(dd) return cc + @property + def output_dir(self): + return str(self) + if __name__ == "__main__": cc = BaseConfig(1,2,3,4) print(cc) diff --git a/autotuner/template/flash_attn_profile_interface.py b/autotuner/template/flash_attn_profile_interface.py index 26273c293..c832161b0 100644 --- a/autotuner/template/flash_attn_profile_interface.py +++ b/autotuner/template/flash_attn_profile_interface.py @@ -20,7 +20,7 @@ "csrc/flash_attn", "csrc/flash_attn/src", "csrc/cutlass/include", - OUTPUT_DIR, + OUTPUT_KERNEL_DIR, ] cc_flag = [] @@ -31,6 +31,9 @@ # cc_flag.append("-gencode") # cc_flag.append("arch=compute_90,code=sm_90") +build_dir = OUTPUT_KERNEL_DIR + "/build" +if not os.path.exists(build_dir): + os.makedirs(build_dir) flash_attn_cuda = torch.utils.cpp_extension.load( @@ -68,7 +71,7 @@ + generator_flag + cc_flag, extra_include_paths=include_path, - build_directory=f"build_autotuner", + build_directory=build_dir, ) # isort: off From 
7cf48586bcab6f47a2cb1a31959d01f9c0585811 Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 10 Sep 2024 09:21:43 +0800 Subject: [PATCH 35/46] update autotuner FLashFwd --- autotuner/arch/A100.py | 15 +++++++++ autotuner/arch/RTX4090.py | 15 +++++++++ autotuner/arch/__init__.py | 3 ++ autotuner/arch/arch_base.py | 13 ++++++++ autotuner/base_tunner.py | 16 ++-------- autotuner/test_run_tunner.py | 20 ++++++++++++ autotuner/tunner.py | 61 ++++++++++++++++++++++++++++++++++++ 7 files changed, 130 insertions(+), 13 deletions(-) create mode 100644 autotuner/arch/A100.py create mode 100644 autotuner/arch/RTX4090.py create mode 100644 autotuner/arch/__init__.py create mode 100644 autotuner/arch/arch_base.py create mode 100644 autotuner/test_run_tunner.py create mode 100644 autotuner/tunner.py diff --git a/autotuner/arch/A100.py b/autotuner/arch/A100.py new file mode 100644 index 000000000..2c3a16839 --- /dev/null +++ b/autotuner/arch/A100.py @@ -0,0 +1,15 @@ +from .arch_base import Arch +class A100(Arch): + def __init__(self): + self.reg_cap = 65536 # 32768 + self.smem_cap = 163*1024 # 164*1024 + self.compute_max_core = 108 + self.warp_size = 32 + self.sm_partition = 4 + self.transaction_size = [32, 128] # in bytes + self.max_smem_usage = 164 * 1024 + self.bandwidth = [1319, 16308] + self.platform = "CUDA" + self.compute_capability = "80" + self.cutlass_mma = [16, 8, 16] + self.register_per_thread = 255 diff --git a/autotuner/arch/RTX4090.py b/autotuner/arch/RTX4090.py new file mode 100644 index 000000000..bae3c291e --- /dev/null +++ b/autotuner/arch/RTX4090.py @@ -0,0 +1,15 @@ +from .arch_base import Arch +class RTX4090(Arch): + def __init__(self): + self.reg_cap = 65536 # 32768 + self.smem_cap = 100*1024 # 164*1024 + self.compute_max_core = 128 + self.warp_size = 32 + self.sm_partition = 4 + self.transaction_size = [32, 128] # in bytes + self.max_smem_usage = 100 * 1024 + self.bandwidth = [1008, 0] # TODO: 1 + self.platform = "CUDA" + self.compute_capability = "89" + self.cutlass_mma = [16, 8, 16] + self.register_per_thread = 255 \ No newline at end of file diff --git a/autotuner/arch/__init__.py b/autotuner/arch/__init__.py new file mode 100644 index 000000000..9ba8ec3a7 --- /dev/null +++ b/autotuner/arch/__init__.py @@ -0,0 +1,3 @@ +from .arch_base import Arch +from .A100 import * +from .RTX4090 import * diff --git a/autotuner/arch/arch_base.py b/autotuner/arch/arch_base.py new file mode 100644 index 000000000..74d6144d0 --- /dev/null +++ b/autotuner/arch/arch_base.py @@ -0,0 +1,13 @@ +class Arch: + def __init__(self) -> None: + self.reg_cap = 0 + self.smem_cap = 0 + self.compute_max_core = 0 + self.warp_size = 0 + self.sm_partition = 0 + self.transaction_size = [0, 0] + self.max_smem_usage = 0 + self.bandwidth = [0, 0] + self.platform = "unknown" + self.compute_capability = "unknown" + self.register_per_thread = 0 diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index 15eb8c166..9fa17c18f 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -75,7 +75,7 @@ def __init__(self, arch, torch_array: list, op_name, tempdir): self.template_dir = "autotuner/template" self.op_name = op_name - self.cache_path = os.path.join(os.path.dirname(__file__), "../../cache/") + self.cache_path = os.path.join(os.path.dirname(__file__), "./cache/") self.problem_key = { "dim_qk": torch_array[0].shape[-1], "dim_v": torch_array[2].shape[-1] @@ -89,14 +89,6 @@ def compile(self, configs:list, timeout: float = None): code_emitter.generate_code(self.shape_config, configs) - def 
compile_parallel(self, configs:list, temp_dir:str, timeout: float = None): - # ## compile - # arch = self.arch - # with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: - # libs = executor.map(_compile, configs,[arch for _ in configs],[temp_dir for _ in configs],[timeout for _ in configs]) - # return list(libs) - pass - def profile(self, config:BaseConfig, device="cuda:0") -> float: spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.output_dir+"/flash_attn_profile_interface.py") flash_attn_func = importlib.util.module_from_spec(spec) @@ -116,16 +108,14 @@ def get_tuned_configs(self): for Bc in self.Bc_list: cur_configs = self.generate_configs(Br,Bc,dim_qk,dim_v) for cur_config in cur_configs: - if cur_config.fuse_type=="register" and self.validate_register_fuse(cur_config): - configs.append(cur_config) - elif cur_config.fuse_type=="shared" and self.validate_shared_fuse(cur_config): + if self.operation == "flash_fwd" and self.validate_register_fuse(cur_config): configs.append(cur_config) else: # BWD if self.validate_kernel(cur_config): configs.append(cur_config) return configs - def tune(self, log_path="../logs/"): + def tune(self, log_path="./logs/"): st = time.time() dim_qk = self.problem_key["dim_qk"] diff --git a/autotuner/test_run_tunner.py b/autotuner/test_run_tunner.py new file mode 100644 index 000000000..bebdec57b --- /dev/null +++ b/autotuner/test_run_tunner.py @@ -0,0 +1,20 @@ +import torch +from tunner import FlashFwdTunner +from arch import A100 + +batch_size = 4 +seqlen = 2048 +nheads = 8 +headdim = 192 +v_headdim = 128 +device = 'cuda' +dtype = torch.bfloat16 +q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) +k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) +v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, + requires_grad=True) + +tunner = FlashFwdTunner(A100(), [q,k,v], "autotuner/temp") +tunner.tune() diff --git a/autotuner/tunner.py b/autotuner/tunner.py new file mode 100644 index 000000000..2aba5fcdf --- /dev/null +++ b/autotuner/tunner.py @@ -0,0 +1,61 @@ + +import ctypes +import os +import torch + +from .base_tunner import BaseTunner +from .configs.fwd_config import FlashFwdConfig + +class FlashFwdTunner(BaseTunner): + def __init__(self, arch, torch_array: list, tempdir: str): + super().__init__(arch, torch_array, "flash_fwd", tempdir) + + def validate_register_fuse(self, config): + Br = config.Br + Bc = config.Bc + Kd = config.Kd + D = config.D + Nthreads = config.Nwarps * 32 + mmam, mman, mmak = self.arch.cutlass_mma + belem_per_thread = mman*mmak/self.arch.warp_size + + # check tile size + if Br % (mmam*Nthreads/self.arch.warp_size) != 0: + return False + # check shared memory + smem_size_q = config.Br * config.Kd * 2 + smem_size_k = config.Bc * config.Kd * 2 + smem_size_qk = smem_size_q + smem_size_k + smem_size_v = config.Bc * config.D * 2 + smem_out = config.Br * config.D * 2 + if config.SharedQKSmem: + smem_size = max(smem_size_q, smem_size_k+smem_size_v) + else: + smem_size = smem_size_qk + smem_size_v + smem_size = max(smem_size, smem_out) + if smem_size > self.arch.smem_cap: + return False + # check register + reg_used_accum = (Br * D * 4 + Br*Bc*4)/(Nthreads * 4) + reg_used_matmul2 = (Br * D * 4 + Br*Bc*2)/(Nthreads * 4) + (D/(mman*1) * belem_per_thread*2) / 4 + reg_used_matmul1 = (Br * D * 4 + Br * Bc * 4)/(Nthreads * 4) + (Bc/(mman*1) * belem_per_thread*2) / 4 + 
reg_used_qinregs = (Br * Kd * 2)/(Nthreads * 4) + if config.isQinRegs: + reg_used = reg_used_accum + reg_used_qinregs + else: + reg_used = reg_used_accum # max(reg_used_accum, reg_used_matmul2, reg_used_matmul1) + if reg_used > min(self.arch.register_per_thread, self.arch.reg_cap/Nthreads): + return False + return True + + def generate_configs(self,Br:int,Bc:int,dim_qk:int,dim_v:int): + configs = [] + # TODO: more general + for Nthreads in [128, 256]: + config1 = FlashFwdConfig(dim_qk,dim_v,Br,Bc,Nthreads//32,False,False) + config2 = FlashFwdConfig(dim_qk,dim_v,Br,Bc,Nthreads//32,True,False) + config3 = FlashFwdConfig(dim_qk,dim_v,Br,Bc,Nthreads//32,True,True) + configs.append(config1) + configs.append(config2) + configs.append(config3) + return configs From 1ca8397f63b8c5ac72156491ac7242b4670813ad Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Tue, 10 Sep 2024 15:23:31 +0800 Subject: [PATCH 36/46] autotuner fwd --- autotuner/base_tunner.py | 31 ++- autotuner/code_emitter.py | 4 +- autotuner/configs/__init__.py | 6 + autotuner/configs/base_config.py | 2 +- autotuner/configs/fwd_config.py | 2 +- .../template/flash_attn_profile_interface.py | 12 +- autotuner/template/flash_fwd.h | 2 +- .../flash_fwd_launch_template_profile.h | 168 +++++++++++++++ autotuner/template/flash_profile.h | 196 ++++++++++++++++++ autotuner/template/flash_profile_api.cpp | 139 ++++++++----- autotuner/tunner.py | 4 +- 11 files changed, 493 insertions(+), 73 deletions(-) create mode 100644 autotuner/configs/__init__.py create mode 100644 autotuner/template/flash_fwd_launch_template_profile.h create mode 100644 autotuner/template/flash_profile.h diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index 9fa17c18f..d712666b6 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -1,13 +1,15 @@ import ctypes import os from concurrent.futures import ThreadPoolExecutor +# import multiprocessing +# from functools import partial import tempfile import subprocess import importlib.util import ctypes import torch -from configs.base_config import BaseConfig +from configs import BaseConfig, supported_configs import pprint import json @@ -70,8 +72,8 @@ class BaseTunner: def __init__(self, arch, torch_array: list, op_name, tempdir): self.arch = arch self.torch_array = torch_array - self.Br_list = [32, 64, 128, 256] - self.Bc_list = [32, 64, 128, 256] + self.Br_list = [32, 64, 128] # [32, 64, 128, 256] + self.Bc_list = [32, 64, 128] # [32, 64, 128, 256] self.template_dir = "autotuner/template" self.op_name = op_name @@ -80,6 +82,7 @@ def __init__(self, arch, torch_array: list, op_name, tempdir): "dim_qk": torch_array[0].shape[-1], "dim_v": torch_array[2].shape[-1] } + # TODO: causal, dropout self.shape_config = ShapeConfig(torch_array[0].shape[-1],torch_array[2].shape[-1]) self.tempdir = tempdir @@ -89,11 +92,12 @@ def compile(self, configs:list, timeout: float = None): code_emitter.generate_code(self.shape_config, configs) - def profile(self, config:BaseConfig, device="cuda:0") -> float: + def profile(self, config:BaseConfig, device="cuda:0", repeat=30) -> float: spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.output_dir+"/flash_attn_profile_interface.py") - flash_attn_func = importlib.util.module_from_spec(spec) - spec.loader.exec_module(flash_attn_func) - latency = profile_fwd(flash_attn_func) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + flash_attn_func = mod.flash_attn_func + latency = 
profile_fwd(flash_attn_func, self.shape_config.Kd, self.shape_config.D, is_bf16=self.shape_config.is_bf16, causal=self.shape_config.is_causal, device=device, repeats=repeat) if latency < 0: latency = 1e8 # remove lib @@ -108,7 +112,7 @@ def get_tuned_configs(self): for Bc in self.Bc_list: cur_configs = self.generate_configs(Br,Bc,dim_qk,dim_v) for cur_config in cur_configs: - if self.operation == "flash_fwd" and self.validate_register_fuse(cur_config): + if self.op_name == "flash_fwd" and self.validate_register_fuse(cur_config): configs.append(cur_config) else: # BWD if self.validate_kernel(cur_config): @@ -139,6 +143,17 @@ def tune(self, log_path="./logs/"): # cresults = self.compile(configs,src_dir.name,timeout=1200) # cresults = self.compile_parallel(configs,src_dir.name,timeout=120) self.compile(configs,timeout=120) + + # warm up (parallel compile module) + # module name must be different in api.py + with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: + latencys = executor.map(self.profile, configs, ["cuda:0" for _ in range(len(configs))], [1 for _ in range(len(configs))]) + # with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: + # latencys = executor.map(_profile,[self.tempdir for _ in range(len(configs))],[self.shape_config for _ in range(len(configs))], configs, ["cuda:0" for _ in range(len(configs))], [1 for _ in range(len(configs))]) + # multiprocessing.set_start_method('spawn', force=True) + # pool = multiprocessing.Pool(os.cpu_count()) + # outs = pool.map(partial(self.profile, repeat=1), configs) + profile_dict = {} latency = 1e8 best_config = None diff --git a/autotuner/code_emitter.py b/autotuner/code_emitter.py index 5278baae5..2f973cafc 100644 --- a/autotuner/code_emitter.py +++ b/autotuner/code_emitter.py @@ -22,6 +22,8 @@ def __init__(self, template_dir, output_dir) -> None: ] self.kernel_file_list = [ "flash_fwd.h", + "flash_profile.h", + "flash_fwd_launch_template_profile.h" ] def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): @@ -41,7 +43,6 @@ def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): f.write(code_template) # generate kernel code - # TODO: parallelize for config in configs: kernel_code_dir = Path(output_dir) / Path(config.output_dir) if not kernel_code_dir.exists(): @@ -59,6 +60,7 @@ def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): code_template = f.read() code_template = code_template.replace("OUTPUT_DIR", f"\"{str(output_dir)}\"") code_template = code_template.replace("OUTPUT_KERNEL_DIR", f"\"{str(kernel_code_dir)}\"") + code_template = code_template.replace("CONFIG_NAME", f"\"{str(config)}\"") with open(Path(kernel_code_dir) / Path("flash_attn_profile_interface.py"), "w") as f: f.write(code_template) diff --git a/autotuner/configs/__init__.py b/autotuner/configs/__init__.py new file mode 100644 index 000000000..b5e57de67 --- /dev/null +++ b/autotuner/configs/__init__.py @@ -0,0 +1,6 @@ +from .base_config import BaseConfig +from .fwd_config import FlashFwdConfig + +supported_configs = { + "flash_fwd": FlashFwdConfig, +} \ No newline at end of file diff --git a/autotuner/configs/base_config.py b/autotuner/configs/base_config.py index 80c3a9007..4c2e6ed1b 100644 --- a/autotuner/configs/base_config.py +++ b/autotuner/configs/base_config.py @@ -10,7 +10,7 @@ def __init__(self, Kd, D, Br, Bc, Nwarps=8) -> None: self.template_dir = None def __repr__(self) -> str: - return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={}".format(self.Kd, self.D, self.Br, self.Bc, 
self.Nwarps) + return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={})".format(self.Kd, self.D, self.Br, self.Bc, self.Nwarps) def __str__(self) -> str: return f"{self.Kd}_{self.D}_{self.Br}_{self.Bc}_{self.Nwarps}" diff --git a/autotuner/configs/fwd_config.py b/autotuner/configs/fwd_config.py index 70177d317..b8f5b4d6e 100644 --- a/autotuner/configs/fwd_config.py +++ b/autotuner/configs/fwd_config.py @@ -12,7 +12,7 @@ def __init__(self, Kd, D, Br, Bc, Nwarps=8, isQinRegs:bool = False, SharedQKSmem self.template_dir = os.path.join(os.path.dirname(__file__), "../../../csrc/kernels/attention") def __repr__(self) -> str: - return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={}, isQinRegs={}, SharedQKSmem={}".format(self.Kd, self.D, self.Br, self.Bc, self.Nwarps, self.isQinRegs, self.SharedQKSmem) + return "Config(Kd={}, D={}, Br={}, Bc={}, Nwarps={}, isQinRegs={}, SharedQKSmem={})".format(self.Kd, self.D, self.Br, self.Bc, self.Nwarps, self.isQinRegs, self.SharedQKSmem) def __str__(self) -> str: return f"{self.Kd}_{self.D}_{self.Br}_{self.Bc}_{self.Nwarps}_{self.isQinRegs}_{self.SharedQKSmem}" \ No newline at end of file diff --git a/autotuner/template/flash_attn_profile_interface.py b/autotuner/template/flash_attn_profile_interface.py index c832161b0..5cf4fa300 100644 --- a/autotuner/template/flash_attn_profile_interface.py +++ b/autotuner/template/flash_attn_profile_interface.py @@ -37,7 +37,7 @@ flash_attn_cuda = torch.utils.cpp_extension.load( - name="flash_attn_cuda", + name="flash_attn_cuda"+CONFIG_NAME, sources=[ OUTPUT_DIR + "/flash_profile_api.cpp", # "csrc/flash_attn/flash_api.cpp", OUTPUT_DIR + "/flash_fwd.cu", @@ -612,6 +612,7 @@ def forward( ): if softmax_scale is None: softmax_scale = q.shape[-1] ** (-0.5) + ctx.headdim_qk = q.shape[-1] # before padding out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward( q, k, @@ -657,8 +658,8 @@ def backward(ctx, dout, *args): ctx.deterministic, rng_state=rng_state, ) - dq = dq[..., : q.shape[-1]] # We could have padded the head dimension - dk = dk[..., : k.shape[-1]] + dq = dq[..., : ctx.headdim_qk] # We could have padded the head dimension + dk = dk[..., : ctx.headdim_qk] dv = dv[..., : dout.shape[-1]] return dq, dk, dv, None, None, None, None, None, None, None, None @@ -686,6 +687,7 @@ def forward( ): if softmax_scale is None: softmax_scale = q.shape[-1] ** (-0.5) + ctx.headdim_qk = q.shape[-1] # before padding out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( q, k, @@ -744,8 +746,8 @@ def backward(ctx, dout, *args): ctx.deterministic, rng_state=rng_state, ) - dq = dq[..., : q.shape[-1]] # We could have padded the head dimension - dk = dk[..., : k.shape[-1]] + dq = dq[..., : ctx.headdim_qk] # We could have padded the head dimension + dk = dk[..., : ctx.headdim_qk] dv = dv[..., : dout.shape[-1]] return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None diff --git a/autotuner/template/flash_fwd.h b/autotuner/template/flash_fwd.h index 98ff86037..e7915205c 100644 --- a/autotuner/template/flash_fwd.h +++ b/autotuner/template/flash_fwd.h @@ -1,4 +1,4 @@ -#include "flash_fwd_launch_template.h" +#include "flash_fwd_launch_template_profile.h" #define False false #define True true diff --git a/autotuner/template/flash_fwd_launch_template_profile.h b/autotuner/template/flash_fwd_launch_template_profile.h new file mode 100644 index 000000000..1bb2fa8d0 --- /dev/null +++ b/autotuner/template/flash_fwd_launch_template_profile.h @@ -0,0 +1,168 @@ 
+/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include + +#include "static_switch.h" +#include "flash_profile.h" +#include "flash_fwd_kernel.h" + +// Determine if the architecture supports FLASH and define a macro to handle parameter modifiers +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#define ARCH_SUPPORTS_FLASH +#define KERNEL_PARAM_MODIFIER __grid_constant__ +#else +#define KERNEL_PARAM_MODIFIER +#endif + +// Define a macro for unsupported architecture handling to centralize the error message +#define FLASH_UNSUPPORTED_ARCH printf("FATAL: FlashAttention requires building with sm version sm80-sm90, but was built for < 8.0!"); + +// Use a macro to clean up kernel definitions +#define DEFINE_FLASH_FORWARD_KERNEL(kernelName, ...) \ +template \ +__global__ void kernelName(KERNEL_PARAM_MODIFIER const Flash_fwd_params params) + +DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Return_softmax) { + #if defined(ARCH_SUPPORTS_FLASH) + static_assert(!(Is_causal && Is_local)); // Enforce constraints + flash::compute_attn(params); + #else + FLASH_UNSUPPORTED_ARCH + #endif +} + +DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_splitkv_kernel, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Split, bool Append_KV) { + #if defined(ARCH_SUPPORTS_FLASH) + flash::compute_attn_splitkv(params); + #else + FLASH_UNSUPPORTED_ARCH + #endif +} + +DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_splitkv_combine_kernel, int kBlockM, int Log_max_splits, bool Is_even_K) { + static_assert(Log_max_splits >= 1); + flash::combine_attn_seqk_parallel(params); +} + +template +void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr size_t smem_size = Kernel_traits::kSmemSize; + // printf("smem_size = %d\n", smem_size); + + // Work-around for gcc 7. It doesn't like nested BOOL_SWITCH. + // https://github.com/kokkos/kokkos-kernels/issues/349 + // https://github.com/HazyResearch/flash-attention/issues/21 + + const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM; + dim3 grid(num_m_block, params.b, params.h); + const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0; + const bool is_even_K = params.d == Kernel_traits::kQKHeadDim; //TODO: Check if this is correct + const bool return_softmax = params.p_ptr != nullptr; + BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] { + EVENK_SWITCH(is_even_K, IsEvenKConst, [&] { + LOCAL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !Is_causal, Is_local, [&] { + BOOL_SWITCH(return_softmax, ReturnSoftmaxConst, [&] { + ALIBI_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] { + SOFTCAP_SWITCH(params.softcap > 0.0, Is_softcap, [&] { + // Will only return softmax if dropout, to reduce compilation time. + // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates. 
+ // If return_softmax, set IsEvenMNConst to false to reduce number of templates + // If head dim > 128, set IsEvenMNConst to false to reduce number of templates + // If Is_local, set Is_causal to false + auto kernel = &flash_fwd_kernel;// TODO: Check if this is correct + // auto kernel = &flash_fwd_kernel; + // printf("IsEvenMNConst = %d, IsEvenKConst = %d, Is_local = %d, Is_causal = %d, ReturnSoftmaxConst = %d, Is_dropout = %d\n", int(IsEvenMNConst), int(IsEvenKConst), int(Is_local), int(Is_causal), int(ReturnSoftmaxConst), int(Is_dropout)); + // auto kernel = &flash_fwd_kernel; + if (smem_size >= 48 * 1024) { + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + // int ctas_per_sm; + // cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + // &ctas_per_sm, kernel, Kernel_traits::kNThreads, smem_size); + // printf("smem_size = %d, CTAs per SM = %d\n", int(smem_size), ctas_per_sm); + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + }); + }); + }); + }); + }); +} + +template +void run_flash_splitkv_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { + static_assert(!Kernel_traits::Is_Q_in_regs, "SplitKV implementation does not support Is_Q_in_regs"); + static_assert(!Kernel_traits::Share_Q_K_smem, "SplitKV implementation does not support Share_Q_K_smem"); + constexpr size_t smem_size = Kernel_traits::kSmemSize; + const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM; + dim3 grid(num_m_block, params.num_splits > 1 ? params.num_splits : params.b, params.num_splits > 1 ? params.b * params.h : params.h); + const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0; + const bool is_even_K = params.d == Kernel_traits::kQKHeadDim; //TODO: Check if this is correct + BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] { + EVENK_SWITCH(is_even_K, IsEvenKConst, [&] { + LOCAL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !Is_causal, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Split, [&] { + BOOL_SWITCH(params.knew_ptr != nullptr, Append_KV, [&] { + ALIBI_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] { + SOFTCAP_SWITCH(params.softcap > 0.0, Is_softcap, [&] { + // If Append_KV, then we must have seqlen_offsets, which means cu_seqlens_k != nullptr. + // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates. + // If Is_local, set Is_causal to false + auto kernel = &flash_fwd_splitkv_kernel; // TODO: Check if this is correct + // auto kernel = &flash_fwd_splitkv_kernel; + // auto kernel = &flash_fwd_splitkv_kernel; + if (smem_size >= 48 * 1024) { + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + }); + }); + }); + }); + }); + }); + if (params.num_splits > 1) { + // We want kBlockM to be as small as possible for more parallelism. + // With 128 threads we can load 512 elements at a time, so if headdim is divisible by 128, kBlockM = 4. + // If headdim is divisible by 64, then we set kBlockM = 8, etc. + constexpr static int kBlockM = Kernel_traits::kQKHeadDim % 128 == 0 ? 4 : (Kernel_traits::kQKHeadDim % 64 == 0 ? 
8 : 16); // TODO: Check if this is correct + dim3 grid_combine((params.b * params.h * params.seqlen_q + kBlockM - 1) / kBlockM); + EVENK_SWITCH(is_even_K, IsEvenKConst, [&] { + if (params.num_splits <= 2) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } else if (params.num_splits <= 4) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } else if (params.num_splits <= 8) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } else if (params.num_splits <= 16) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } else if (params.num_splits <= 32) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } else if (params.num_splits <= 64) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } else if (params.num_splits <= 128) { + flash_fwd_splitkv_combine_kernel<<>>(params); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } +} + +template +void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int kBlockM = 64; // Fixed for all head dimensions + // TD [2023-08-28]: nvcc segfaults for headdim 96 with block size 64 x 256, + // and for headdim 192 with block size 64 x 128. + // Also for headdim 160 with block size 64 x 128 after the rotary addition. + constexpr static int kBlockN = QKHeaddim <= 64 ? 256 : (QKHeaddim <= 128 ? 128 : 64); + run_flash_splitkv_fwd, Is_causal>(params, stream); +} diff --git a/autotuner/template/flash_profile.h b/autotuner/template/flash_profile.h new file mode 100644 index 000000000..51e04ced5 --- /dev/null +++ b/autotuner/template/flash_profile.h @@ -0,0 +1,196 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +#include // For at::cuda::philox::unpack + +constexpr int TOTAL_DIM = 0; +constexpr int H_DIM = 1; +constexpr int D_DIM = 2; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Qkv_params { + using index_t = int64_t; + // The QKV matrices. + void *__restrict__ q_ptr; + void *__restrict__ k_ptr; + void *__restrict__ v_ptr; + + // The stride between rows of the Q, K and V matrices. + index_t q_batch_stride; + index_t k_batch_stride; + index_t v_batch_stride; + index_t q_row_stride; + index_t k_row_stride; + index_t v_row_stride; + index_t q_head_stride; + index_t k_head_stride; + index_t v_head_stride; + + // The number of heads. + int h, h_k, h_v; + // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be + // different from nheads (query). + int h_h_k_ratio; // precompute h / h_k, + int h_h_v_ratio; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Flash_fwd_params : public Qkv_params { + + // The O matrix (output). + void * __restrict__ o_ptr; + void * __restrict__ oaccum_ptr; + + // The stride between rows of O. + index_t o_batch_stride; + index_t o_row_stride; + index_t o_head_stride; + + // The pointer to the P matrix. + void * __restrict__ p_ptr; + + // The pointer to the softmax sum. + void * __restrict__ softmax_lse_ptr; + void * __restrict__ softmax_lseaccum_ptr; + + // The dimensions. + int b, seqlen_q, seqlen_k, seqlen_knew, d, vd, seqlen_q_rounded, seqlen_k_rounded, d_rounded, vd_rounded, rotary_dim, total_q; + + // The scaling factors for the kernel. 
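The scaling factors declared next keep the same softmax scale twice, once in natural-exponent form and once pre-multiplied by log2(e); upstream FlashAttention fills them as softmax_scale and softmax_scale * M_LOG2E so the kernel can use exp2 instead of exp. The assignment itself is not shown in this patch, so treat the exact expression as an assumption; a one-line check of the equivalence:

```python
import math

softmax_scale = 1.0 / math.sqrt(192)                 # 1/sqrt(QK head dim), illustrative
scale_softmax_log2 = softmax_scale * math.log2(math.e)

x = 3.7  # an arbitrary attention logit
assert abs(math.exp(softmax_scale * x) - 2.0 ** (scale_softmax_log2 * x)) < 1e-12
```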
+ float scale_softmax; + float scale_softmax_log2; + + // array of length b+1 holding starting offset of each sequence. + int * __restrict__ cu_seqlens_q; + int * __restrict__ cu_seqlens_k; + int * __restrict__ leftpad_k; + + // If provided, the actual length of each k sequence. + int * __restrict__ seqused_k; + + int *__restrict__ blockmask; + + // The K_new and V_new matrices. + void * __restrict__ knew_ptr; + void * __restrict__ vnew_ptr; + + // The stride between rows of the Q, K and V matrices. + index_t knew_batch_stride; + index_t vnew_batch_stride; + index_t knew_row_stride; + index_t vnew_row_stride; + index_t knew_head_stride; + index_t vnew_head_stride; + + // The cos and sin matrices for rotary embedding. + void * __restrict__ rotary_cos_ptr; + void * __restrict__ rotary_sin_ptr; + + // The indices to index into the KV cache. + int * __restrict__ cache_batch_idx; + + // Paged KV cache + int * __restrict__ block_table; + index_t block_table_batch_stride; + int page_block_size; + + // The dropout probability (probability of keeping an activation). + float p_dropout; + // uint32_t p_dropout_in_uint; + // uint16_t p_dropout_in_uint16_t; + uint8_t p_dropout_in_uint8_t; + + // Scale factor of 1 / (1 - p_dropout). + float rp_dropout; + float scale_softmax_rp_dropout; + + // Local window size + int window_size_left, window_size_right; + float softcap; + + // Random state. + at::PhiloxCudaState philox_args; + + // Pointer to the RNG seed (idx 0) and offset (idx 1). + uint64_t * rng_state; + + bool is_bf16; + bool is_causal; + + // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. + // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. + bool is_seqlens_k_cumulative; + + bool is_rotary_interleaved; + + int num_splits; // For split-KV version + + void * __restrict__ alibi_slopes_ptr; + index_t alibi_slopes_batch_stride; + + bool unpadded_lse; // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q]. + bool seqlenq_ngroups_swapped; // q has been transposed from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d). +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Flash_bwd_params : public Flash_fwd_params { + + // The dO and dQKV matrices. + void *__restrict__ do_ptr; + void *__restrict__ dq_ptr; + void *__restrict__ dk_ptr; + void *__restrict__ dv_ptr; + + // To accumulate dQ + void *__restrict__ dq_accum_ptr; + void *__restrict__ dk_accum_ptr; + void *__restrict__ dv_accum_ptr; + + // // To accumulate dK and dV in case we're splitting the bwd along seqlen_q + // dimension void *__restrict__ dk_accum_ptr; void *__restrict__ + // dv_accum_ptr; + + // The stride between rows of the dO, dQ, dK and dV matrices. + // TD [2022-04-16]: We're using 32-bit indexing to save registers. + // The code probably won't work for arrays larger than 2GB. + index_t do_batch_stride; + index_t do_row_stride; + index_t do_head_stride; + index_t dq_batch_stride; + index_t dk_batch_stride; + index_t dv_batch_stride; + index_t dq_row_stride; + index_t dk_row_stride; + index_t dv_row_stride; + index_t dq_head_stride; + index_t dk_head_stride; + index_t dv_head_stride; + + // The pointer to the softmax d sum. 
+ void *__restrict__ dsoftmax_sum; + + bool deterministic; + index_t dq_accum_split_stride; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream); +// template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +// template void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream); diff --git a/autotuner/template/flash_profile_api.cpp b/autotuner/template/flash_profile_api.cpp index 0ee997cf8..48c84a5d6 100644 --- a/autotuner/template/flash_profile_api.cpp +++ b/autotuner/template/flash_profile_api.cpp @@ -10,7 +10,7 @@ #include -#include "flash.h" +#include "flash_profile.h" // #include "static_switch.h" // #include "static_switch_headdim.h" #define False false @@ -30,6 +30,7 @@ void set_params_fprop(Flash_fwd_params ¶ms, const size_t seqlen_k_rounded, const size_t h, const size_t h_k, + const size_t h_v, const size_t d, const size_t d_rounded, const size_t vd, @@ -97,7 +98,9 @@ void set_params_fprop(Flash_fwd_params ¶ms, params.b = b; params.h = h; params.h_k = h_k; + params.h_v = h_v; params.h_h_k_ratio = h / h_k; + params.h_h_v_ratio = h / h_v; params.seqlen_q = seqlen_q; params.seqlen_k = seqlen_k; params.seqlen_q_rounded = seqlen_q_rounded; @@ -169,6 +172,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, const size_t seqlen_k_rounded, const size_t h, const size_t h_k, + const size_t h_v, const size_t d, const size_t d_rounded, const size_t vd, @@ -197,7 +201,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, const bool unpadded_lse) { set_params_fprop(params, - b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded,vd, vd, + b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, h_v, d, d_rounded,vd, vd, q, k, v, out, cu_seqlens_q_d, cu_seqlens_k_d, @@ -256,7 +260,8 @@ void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0 run_mha_fwd_, kQKHeadDim, kVHeadDim, is_causal>(params, stream); } else { - run_mha_fwd_splitkv_dispatch, kQKHeadDim, kVHeadDim, is_causal>(params, stream); + // TODO: temporary workaround + // run_mha_fwd_splitkv_dispatch, kQKHeadDim, kVHeadDim, is_causal>(params, stream); } } @@ -358,7 +363,7 @@ void set_params_alibi(Flash_fwd_params ¶ms, c10::optional &alibi std::vector mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size - const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_v x head_size c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, @@ -402,10 +407,12 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const int head_size_og = sizes[3]; const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); + const int num_heads_v = v.size(2); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, 
"Number of heads in value must divide number of heads in query"); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } @@ -418,18 +425,19 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); - const int ngroups = num_heads / num_heads_k; + const int num_heads_maxkv = num_heads_k > num_heads_v ? num_heads_k : num_heads_v; + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_maxkv && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && v_head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_maxkv; if (seqlenq_ngroups_swapped) { - q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + q = q.reshape({batch_size, num_heads_maxkv, ngroups, head_size_og}).transpose(1, 2); seqlen_q = ngroups; - num_heads = num_heads_k; + num_heads = num_heads_maxkv; } CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); - CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_v, v_head_size_og); at::Tensor q_padded, k_padded, v_padded; if (head_size_og % 8 != 0) { @@ -455,7 +463,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], v_head_size_og); if (seqlenq_ngroups_swapped) { - out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2); + out = out.reshape({batch_size, num_heads_maxkv, ngroups, v_head_size_og}).transpose(1, 2); } if (v_head_size_og % 8 != 0) { out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); @@ -495,7 +503,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size, v_head_size_rounded, q_padded, k_padded, v_padded, out, @@ -552,10 +560,10 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size } if (seqlenq_ngroups_swapped) { - out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); - out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); - q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); - softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, v_head_size_og}); + out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, v_head_size_og}); + q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_maxkv * seqlen_q, 1}); } return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; } @@ -563,7 +571,7 @@ 
mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size std::vector mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. - const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_v x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_v x head_size if there's a block_table. c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 @@ -627,6 +635,8 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s int num_heads = sizes[1]; const int head_size_og = sizes[2]; const int num_heads_k = paged_KV ? k.size(2) : k.size(1); + // TODO: check here + const int num_heads_v = paged_KV ? v.size(2) : v.size(1); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } @@ -642,12 +652,13 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); - const int ngroups = num_heads / num_heads_k; + const int num_heads_maxkv = num_heads_k > num_heads_v ? 
num_heads_k : num_heads_v; + const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_maxkv && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && v_head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_maxkv; if (seqlenq_ngroups_swapped) { - q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + q = q.reshape({batch_size, num_heads_maxkv, ngroups, head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_maxkv, head_size_og}); max_seqlen_q = ngroups; - num_heads = num_heads_k; + num_heads = num_heads_maxkv; cu_seqlens_q_d = nullptr; } @@ -657,6 +668,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(v_head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); if (window_size_left >= max_seqlen_k) { window_size_left = -1; } if (window_size_right >= max_seqlen_k) { window_size_right = -1; } @@ -665,10 +677,10 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s if (!paged_KV) { const int total_k = k.size(0); CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); - CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, total_k, num_heads_v, v_head_size_og); } else { CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size_og); - CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_v, v_head_size_og); CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); } @@ -705,7 +717,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); CHECK_SHAPE(out, sizes[0], sizes[1], v_head_size_og); if (seqlenq_ngroups_swapped) { - out = out.reshape({batch_size, num_heads_k, ngroups, v_head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size_og}); + out = out.reshape({batch_size, num_heads_maxkv, ngroups, v_head_size_og}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_maxkv, head_size_og}); } if (v_head_size_og % 8 != 0) { out = torch::empty({total_q, num_heads, v_head_size_og}, q.options()); @@ -750,7 +762,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s batch_size, max_seqlen_q, max_seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size, v_head_size_rounded, q_padded, k_padded, v_padded, out, @@ -830,10 +842,10 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s } if (seqlenq_ngroups_swapped) { - int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_k, head_size_og}; - int64_t size_after[] = {batch_size, num_heads_k * max_seqlen_q, head_size_og}; - int64_t o_size_before[] = {batch_size, max_seqlen_q, num_heads_k, v_head_size_og}; - int64_t o_size_after[] = {batch_size, num_heads_k * max_seqlen_q, v_head_size_og}; + int64_t size_before[] = {batch_size, 
max_seqlen_q, num_heads_maxkv, head_size_og}; + int64_t size_after[] = {batch_size, num_heads_maxkv * max_seqlen_q, head_size_og}; + int64_t o_size_before[] = {batch_size, max_seqlen_q, num_heads_maxkv, v_head_size_og}; + int64_t o_size_after[] = {batch_size, num_heads_maxkv * max_seqlen_q, v_head_size_og}; out = out.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); out_padded = out_padded.reshape(o_size_before).transpose(1, 2).reshape(o_size_after); q_padded = q_padded.reshape(size_before).transpose(1, 2).reshape(size_after); @@ -855,18 +867,19 @@ void run_mha_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { assert(params.d == kQKHeadDim); assert(params.vd == kVHeadDim); - run_mha_bwd_, kQKHeadDim, kVHeadDim, is_causal>(params, stream); + // TODO: temporary workaround + // run_mha_bwd_, kQKHeadDim, kVHeadDim, is_causal>(params, stream); } std::vector mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size - const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_v x head_size const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x seqlen_q c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_v x head_size c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, // probability to drop const float softmax_scale, @@ -923,6 +936,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const int head_size = sizes[3]; const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); + const int num_heads_v = v.size(2); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); @@ -932,6 +946,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); } TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size_rounded = head_size <= 192 ? 
round_multiple(head_size, 32) : 256; @@ -946,7 +961,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); - CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_v, v_head_size_og); CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); @@ -974,7 +989,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); CHECK_DEVICE(dv); TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); - CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_v, v_head_size_og); } else { dv = torch::empty_like(v); } @@ -1012,9 +1027,12 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si at::Tensor dk_expanded, dv_expanded; if (num_heads_k != num_heads) { // MQA / GQA dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); - dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, v_head_size_og}, opts); } else { dk_expanded = dk; + } + if (num_heads_v != num_heads) { + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, v_head_size_og}, opts); + } else { dv_expanded = dv; } @@ -1024,7 +1042,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size_og, q, k, v, out, @@ -1080,7 +1098,9 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si // For MQA/GQA we need to sum dK and dV across the groups if (num_heads_k != num_heads) { at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); - at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {3}); + } + if (num_heads_v != num_heads) { + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_v, num_heads / num_heads_v, v_head_size_og}), {3}); } if (head_size_og % 8 != 0) { dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); @@ -1167,6 +1187,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size const int head_size = sizes[2]; const int total_k = k.size(0); const int num_heads_k = k.size(1); + const int num_heads_v = v.size(1); TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256"); @@ -1177,6 +1198,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 with dropout requires A100/A800 or H100/H800"); } TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); } auto 
round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; @@ -1191,7 +1213,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(q, total_q, num_heads, head_size); CHECK_SHAPE(k, total_k, num_heads_k, head_size); - CHECK_SHAPE(v, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, total_k, num_heads_v, v_head_size_og); CHECK_SHAPE(out, total_q, num_heads, v_head_size_og); CHECK_SHAPE(dout, total_q, num_heads, head_size_og); CHECK_SHAPE(cu_seqlens_q, batch_size + 1); @@ -1221,7 +1243,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); CHECK_DEVICE(dv); TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); - CHECK_SHAPE(dv, total_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(dv, total_k, num_heads_v, v_head_size_og); } else { dv = torch::empty_like(v); } @@ -1265,9 +1287,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size at::Tensor dk_expanded, dv_expanded; if (num_heads_k != num_heads) { // MQA / GQA dk_expanded = torch::empty({total_k, num_heads, head_size}, opts); - dv_expanded = torch::empty({total_k, num_heads, v_head_size_og}, opts); } else { dk_expanded = dk; + } + if (num_heads_v != num_heads) { + dv_expanded = torch::empty({total_k, num_heads, v_head_size_og}, opts); + } else { dv_expanded = dv; } @@ -1284,7 +1309,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size batch_size, max_seqlen_q, max_seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size_og, q, k, v, out, @@ -1339,7 +1364,9 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size // For MQA/GQA we need to sum dK and dV across the groups if (num_heads_k != num_heads) { at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); - at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, v_head_size_og}), {2}); + } + if (num_heads_v != num_heads) { + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_v, num_heads / num_heads_v, v_head_size_og}), {2}); } if (head_size_og % 8 != 0) { dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); @@ -1419,10 +1446,12 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256"); const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size; const int num_heads_k = kcache.size(2); + const int num_heads_v = vcache.size(2); const int batch_size_c = !paged_KV ? 
kcache.size(0) : batch_size; TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + TORCH_CHECK(num_heads % num_heads_v == 0, "Number of heads in value must divide number of heads in query"); TORCH_CHECK(v_head_size_og <= 256, "FlashAttention backward only supports head dimension at most 256"); // causal=true is the same as causal=false in this case @@ -1431,12 +1460,13 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int num_heads_maxkv = num_heads_k > num_heads_v ? num_heads_k : num_heads_v; + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_maxkv && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && v_head_size_og % 8 == 0 && !alibi_slopes_.has_value(); if (seqlenq_ngroups_swapped) { - const int ngroups = num_heads / num_heads_k; - q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + const int ngroups = num_heads / num_heads_maxkv; + q = q.reshape({batch_size, num_heads_maxkv, ngroups, head_size_og}).transpose(1, 2); seqlen_q = ngroups; - num_heads = num_heads_k; + num_heads = num_heads_maxkv; } if (window_size_left >= seqlen_k) { window_size_left = -1; } @@ -1445,10 +1475,10 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); if (!paged_KV) { CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); - CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, v_head_size_og); + CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_v, v_head_size_og); } else { CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og); - CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, v_head_size_og); + CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_v, v_head_size_og); CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); } @@ -1469,6 +1499,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); CHECK_DEVICE(out); TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + // TODO: check here for seqlenq_ngroups_swapped CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, v_head_size_og); if (v_head_size_og % 8 != 0) { out = torch::empty({batch_size, seqlen_q, num_heads, v_head_size_og}, q.options()); @@ -1502,7 +1533,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, - num_heads, num_heads_k, + num_heads, num_heads_k, num_heads_v, head_size, head_size_rounded, v_head_size, v_head_size_rounded, q_padded, kcache_padded, vcache_padded, out, @@ -1532,7 +1563,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension"); int seqlen_knew = k.size(1); CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og); - CHECK_SHAPE(v, batch_size, 
seqlen_knew, num_heads_k, v_head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_v, v_head_size_og); if (head_size_og % 8 != 0) { k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - v_head_size_og % 8})); @@ -1647,8 +1678,8 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he } if (seqlenq_ngroups_swapped) { - out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, v_head_size_og}); - softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_maxkv * seqlen_q, v_head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_maxkv * seqlen_q, 1}); } return {out, softmax_lse}; } @@ -1657,7 +1688,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.doc() = "FlashAttention"; m.def("fwd", &mha_fwd, "Forward pass"); m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)"); - m.def("bwd", &mha_bwd, "Backward pass"); - m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)"); - m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache"); + // m.def("bwd", &mha_bwd, "Backward pass"); + // m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)"); + // m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache"); } diff --git a/autotuner/tunner.py b/autotuner/tunner.py index 2aba5fcdf..30d246fb1 100644 --- a/autotuner/tunner.py +++ b/autotuner/tunner.py @@ -3,8 +3,8 @@ import os import torch -from .base_tunner import BaseTunner -from .configs.fwd_config import FlashFwdConfig +from base_tunner import BaseTunner +from configs.fwd_config import FlashFwdConfig class FlashFwdTunner(BaseTunner): def __init__(self, arch, torch_array: list, tempdir: str): From 1e5c49d3ae91a292e0af9e6a1a26bb2bbda07a2f Mon Sep 17 00:00:00 2001 From: chenfeiyang Date: Tue, 10 Sep 2024 21:35:08 +0800 Subject: [PATCH 37/46] update code --- autotuner/base_tunner.py | 6 ++++-- autotuner/test_run_tunner.py | 3 ++- autotuner/tunner.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index d712666b6..7e357d703 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -69,7 +69,7 @@ def _create_code_for_profiling(config): # return CompileResult(config,lib_name) class BaseTunner: - def __init__(self, arch, torch_array: list, op_name, tempdir): + def __init__(self, arch, torch_array: list, op_name, shape_config: ShapeConfig, tempdir): self.arch = arch self.torch_array = torch_array self.Br_list = [32, 64, 128] # [32, 64, 128, 256] @@ -82,8 +82,10 @@ def __init__(self, arch, torch_array: list, op_name, tempdir): "dim_qk": torch_array[0].shape[-1], "dim_v": torch_array[2].shape[-1] } + assert torch_array[0].shape[-1] == shape_config.Kd + assert torch_array[2].shape[-1] == shape_config.D # TODO: causal, dropout - self.shape_config = ShapeConfig(torch_array[0].shape[-1],torch_array[2].shape[-1]) + self.shape_config = shape_config self.tempdir = tempdir def compile(self, configs:list, timeout: float = None): diff --git a/autotuner/test_run_tunner.py b/autotuner/test_run_tunner.py index bebdec57b..7d719e4ac 100644 --- a/autotuner/test_run_tunner.py +++ b/autotuner/test_run_tunner.py @@ -1,6 +1,7 @@ import torch from tunner import FlashFwdTunner from arch import A100 +from code_emitter import ShapeConfig batch_size = 4 
seqlen = 2048 @@ -16,5 +17,5 @@ v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, requires_grad=True) -tunner = FlashFwdTunner(A100(), [q,k,v], "autotuner/temp") +tunner = FlashFwdTunner(A100(), [q,k,v], ShapeConfig(headdim,v_headdim), "autotuner/temp") tunner.tune() diff --git a/autotuner/tunner.py b/autotuner/tunner.py index 30d246fb1..458789550 100644 --- a/autotuner/tunner.py +++ b/autotuner/tunner.py @@ -7,8 +7,8 @@ from configs.fwd_config import FlashFwdConfig class FlashFwdTunner(BaseTunner): - def __init__(self, arch, torch_array: list, tempdir: str): - super().__init__(arch, torch_array, "flash_fwd", tempdir) + def __init__(self, arch, torch_array: list, shape_config, tempdir: str): + super().__init__(arch, torch_array, "flash_fwd", shape_config, tempdir) def validate_register_fuse(self, config): Br = config.Br From 409bddea9b83213395f771ad95d5df5ee9bf64a6 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Thu, 12 Sep 2024 16:25:14 +0800 Subject: [PATCH 38/46] update autotuner log --- autotuner/base_tunner.py | 23 ++++++++++++----------- autotuner/code_emitter.py | 31 +++++++++++++++++++++++++------ autotuner/test_run_tunner.py | 8 +++++--- autotuner/tunner.py | 4 ++-- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index 7e357d703..4fe894b8e 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -16,7 +16,7 @@ import time -from code_emitter import CodeEmitter, ShapeConfig +from code_emitter import CodeEmitter, ShapeConfig, ProfileConfig from profile_attn import profile_fwd @@ -69,7 +69,7 @@ def _create_code_for_profiling(config): # return CompileResult(config,lib_name) class BaseTunner: - def __init__(self, arch, torch_array: list, op_name, shape_config: ShapeConfig, tempdir): + def __init__(self, arch, torch_array: list, op_name, shape_config: ShapeConfig, profile_config: ProfileConfig, tempdir): self.arch = arch self.torch_array = torch_array self.Br_list = [32, 64, 128] # [32, 64, 128, 256] @@ -77,15 +77,16 @@ def __init__(self, arch, torch_array: list, op_name, shape_config: ShapeConfig, self.template_dir = "autotuner/template" self.op_name = op_name - self.cache_path = os.path.join(os.path.dirname(__file__), "./cache/") + # TODO: workaround for dropout_p + self.cache_path = os.path.join(os.path.dirname(__file__), "./cache/", str(profile_config.dropout_p!=0)) self.problem_key = { "dim_qk": torch_array[0].shape[-1], "dim_v": torch_array[2].shape[-1] } assert torch_array[0].shape[-1] == shape_config.Kd assert torch_array[2].shape[-1] == shape_config.D - # TODO: causal, dropout self.shape_config = shape_config + self.profile_config = profile_config self.tempdir = tempdir def compile(self, configs:list, timeout: float = None): @@ -94,12 +95,12 @@ def compile(self, configs:list, timeout: float = None): code_emitter.generate_code(self.shape_config, configs) - def profile(self, config:BaseConfig, device="cuda:0", repeat=30) -> float: + def profile(self, config:BaseConfig, repeat=30) -> float: spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.output_dir+"/flash_attn_profile_interface.py") mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) flash_attn_func = mod.flash_attn_func - latency = profile_fwd(flash_attn_func, self.shape_config.Kd, self.shape_config.D, is_bf16=self.shape_config.is_bf16, causal=self.shape_config.is_causal, device=device, repeats=repeat) + latency = 
profile_fwd(flash_attn_func, self.shape_config.Kd, self.shape_config.D, batch_size=self.profile_config.batch_size, seqlen=self.profile_config.seqlen_q, nheads=self.profile_config.nheads, dropout_p=self.profile_config.dropout_p,is_bf16=self.shape_config.is_bf16, causal=self.shape_config.is_causal, device=self.profile_config.device, repeats=repeat) if latency < 0: latency = 1e8 # remove lib @@ -149,7 +150,7 @@ def tune(self, log_path="./logs/"): # warm up (parallel compile module) # module name must be different in api.py with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: - latencys = executor.map(self.profile, configs, ["cuda:0" for _ in range(len(configs))], [1 for _ in range(len(configs))]) + latencys = executor.map(self.profile, configs, [1 for _ in range(len(configs))]) # with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # latencys = executor.map(_profile,[self.tempdir for _ in range(len(configs))],[self.shape_config for _ in range(len(configs))], configs, ["cuda:0" for _ in range(len(configs))], [1 for _ in range(len(configs))]) # multiprocessing.set_start_method('spawn', force=True) @@ -178,9 +179,9 @@ def tune(self, log_path="./logs/"): pprint.pprint(best_config) print("Latency: ", latency) - file_name = "profile_result_{}_{}_{}.txt".format(best_config.operation,dim_qk, dim_v) + file_name = "profile_result_{}_{}_{}_p{}_{}_{}_{}_c{}.txt".format(best_config.operation,dim_qk, dim_v, self.profile_config.batch_size, self.profile_config.seqlen_q, self.profile_config.nheads, self.profile_config.dropout_p,self.shape_config.is_causal) os.makedirs(log_path,exist_ok=True) - with open(os.path.join(log_path,file_name),"w") as f: + with open(os.path.join(log_path,file_name),"a") as f: for config in profile_dict: f.write(repr(config)+"\n") f.write(str(profile_dict[config])+"\n") @@ -188,7 +189,7 @@ def tune(self, log_path="./logs/"): f.write("best config: \n") f.write(repr(best_config)+"\n") f.write(str(latency)+"\n") - f.write("\nsearch time: "+str(end-st)+"s") + f.write("\nsearch time: "+str(end-st)+"s" + "\n\n") cache_path = self.cache_path os.makedirs(cache_path,exist_ok=True) @@ -237,7 +238,7 @@ def generate_configs(self,Br:int,Bc:int,dim_qk:int,dim_v:int): requires_grad=True) v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, requires_grad=True) - base_tunner = BaseTunner(arch=None, torch_array=[q,k,v], op_name="flash_fwd", tempdir="autotuner/temp") + base_tunner = BaseTunner(arch=None, torch_array=[q,k,v], op_name="flash_fwd", shape_config=ShapeConfig(headdim,v_headdim), profle_config=ProfileConfig(batch_size,seqlen,seqlen,nheads,nheads,nheads,device,dtype,0), tempdir="autotuner/temp") config = FlashFwdConfig(headdim,v_headdim,64,64) base_tunner.compile([config]) diff --git a/autotuner/code_emitter.py b/autotuner/code_emitter.py index 2f973cafc..d595d51c8 100644 --- a/autotuner/code_emitter.py +++ b/autotuner/code_emitter.py @@ -10,6 +10,18 @@ def __init__(self, Kd, D, is_bf16: bool=False, is_causal: bool=False) -> None: self.is_bf16 = is_bf16 self.is_causal = is_causal +class ProfileConfig: + def __init__(self, batch_size, seqlen_q, seqlen_kv, nheads, nheads_k, nheads_v, device, dtype, dropout_p) -> None: + self.batch_size = batch_size + self.seqlen_q = seqlen_q + self.seqlen_kv = seqlen_kv + self.nheads = nheads + self.nheads_k = nheads_k + self.nheads_v = nheads_v + self.device = device + self.dtype = dtype + self.dropout_p = dropout_p + class CodeEmitter: def __init__(self, template_dir, output_dir) -> None: @@ -30,23 +42,30 @@ 
def generate_code(self, shape_config:ShapeConfig, configs:list[BaseConfig]): template_dir = self.template_dir output_dir = self.output_dir + skip_api_code = False if not Path(output_dir).exists(): os.mkdir(output_dir) + else: + skip_api_code = True # generate api code - for file_name in self.profile_api_file_list: - with open(Path(template_dir) / Path(file_name)) as f: - code_template = f.read() - code_template = self.emit_code_profile_api(code_template, shape_config) + if not skip_api_code: + for file_name in self.profile_api_file_list: + with open(Path(template_dir) / Path(file_name)) as f: + code_template = f.read() + code_template = self.emit_code_profile_api(code_template, shape_config) - with open(Path(output_dir) / Path(file_name), "w") as f: - f.write(code_template) + with open(Path(output_dir) / Path(file_name), "w") as f: + f.write(code_template) # generate kernel code for config in configs: kernel_code_dir = Path(output_dir) / Path(config.output_dir) if not kernel_code_dir.exists(): os.mkdir(kernel_code_dir) + else: + continue + for file_name in self.kernel_file_list: with open(Path(template_dir) / Path(file_name)) as f: code_template = f.read() diff --git a/autotuner/test_run_tunner.py b/autotuner/test_run_tunner.py index 7d719e4ac..87ce9e325 100644 --- a/autotuner/test_run_tunner.py +++ b/autotuner/test_run_tunner.py @@ -1,15 +1,17 @@ import torch from tunner import FlashFwdTunner from arch import A100 -from code_emitter import ShapeConfig +from code_emitter import ShapeConfig,ProfileConfig batch_size = 4 seqlen = 2048 nheads = 8 headdim = 192 v_headdim = 128 -device = 'cuda' +device = 'cuda:0' dtype = torch.bfloat16 +dropout_p = 0.1 # 0.0 + q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, @@ -17,5 +19,5 @@ v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, requires_grad=True) -tunner = FlashFwdTunner(A100(), [q,k,v], ShapeConfig(headdim,v_headdim), "autotuner/temp") +tunner = FlashFwdTunner(A100(), [q,k,v], ShapeConfig(headdim,v_headdim), ProfileConfig(batch_size,seqlen,seqlen,nheads,nheads,nheads,device,dtype,dropout_p), "autotuner/temp") tunner.tune() diff --git a/autotuner/tunner.py b/autotuner/tunner.py index 458789550..18e9cb15c 100644 --- a/autotuner/tunner.py +++ b/autotuner/tunner.py @@ -7,8 +7,8 @@ from configs.fwd_config import FlashFwdConfig class FlashFwdTunner(BaseTunner): - def __init__(self, arch, torch_array: list, shape_config, tempdir: str): - super().__init__(arch, torch_array, "flash_fwd", shape_config, tempdir) + def __init__(self, arch, torch_array: list, shape_config, profile_config, tempdir: str): + super().__init__(arch, torch_array, "flash_fwd", shape_config, profile_config, tempdir) def validate_register_fuse(self, config): Br = config.Br From d4b620acb5ae72ce0bae1cdb845b3944b815ee6e Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Thu, 12 Sep 2024 17:25:39 +0800 Subject: [PATCH 39/46] update tunner --- autotuner/base_tunner.py | 6 ++++-- autotuner/profile_attn.py | 1 + autotuner/test_run_tunner.py | 8 ++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index 4fe894b8e..36096bc1a 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -95,11 +95,13 @@ def compile(self, configs:list, timeout: float = None): code_emitter.generate_code(self.shape_config, configs) - def profile(self, 
config:BaseConfig, repeat=30) -> float: + def profile(self, config:BaseConfig, repeat=30, load_only=False) -> float: spec = importlib.util.spec_from_file_location("flash_attn_func", self.tempdir+"/"+config.output_dir+"/flash_attn_profile_interface.py") mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) flash_attn_func = mod.flash_attn_func + if load_only: + return None latency = profile_fwd(flash_attn_func, self.shape_config.Kd, self.shape_config.D, batch_size=self.profile_config.batch_size, seqlen=self.profile_config.seqlen_q, nheads=self.profile_config.nheads, dropout_p=self.profile_config.dropout_p,is_bf16=self.shape_config.is_bf16, causal=self.shape_config.is_causal, device=self.profile_config.device, repeats=repeat) if latency < 0: latency = 1e8 @@ -150,7 +152,7 @@ def tune(self, log_path="./logs/"): # warm up (parallel compile module) # module name must be different in api.py with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: - latencys = executor.map(self.profile, configs, [1 for _ in range(len(configs))]) + latencys = executor.map(self.profile, configs, [1 for _ in range(len(configs))], [True for _ in range(len(configs))]) # with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # latencys = executor.map(_profile,[self.tempdir for _ in range(len(configs))],[self.shape_config for _ in range(len(configs))], configs, ["cuda:0" for _ in range(len(configs))], [1 for _ in range(len(configs))]) # multiprocessing.set_start_method('spawn', force=True) diff --git a/autotuner/profile_attn.py b/autotuner/profile_attn.py index f7f80ec4c..f967e675c 100644 --- a/autotuner/profile_attn.py +++ b/autotuner/profile_attn.py @@ -16,6 +16,7 @@ def profile_fwd(fn,headdim, v_headdim, batch_size=4, seqlen=2048, nheads=8, device='cuda', is_bf16=False, causal=False, dropout_p=0.0, repeats=30): dtype = torch.bfloat16 if is_bf16 else torch.float16 + # print(batch_size, seqlen, nheads, headdim, v_headdim, device, dtype, dropout_p, causal, repeats) q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, diff --git a/autotuner/test_run_tunner.py b/autotuner/test_run_tunner.py index 87ce9e325..61af4456d 100644 --- a/autotuner/test_run_tunner.py +++ b/autotuner/test_run_tunner.py @@ -6,11 +6,11 @@ batch_size = 4 seqlen = 2048 nheads = 8 -headdim = 192 -v_headdim = 128 +headdim = 128# 192 +v_headdim = 256# 128 device = 'cuda:0' dtype = torch.bfloat16 -dropout_p = 0.1 # 0.0 +dropout_p = 0.0 # 0.0 q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) @@ -19,5 +19,5 @@ v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, requires_grad=True) -tunner = FlashFwdTunner(A100(), [q,k,v], ShapeConfig(headdim,v_headdim), ProfileConfig(batch_size,seqlen,seqlen,nheads,nheads,nheads,device,dtype,dropout_p), "autotuner/temp") +tunner = FlashFwdTunner(A100(), [q,k,v], ShapeConfig(headdim,v_headdim), ProfileConfig(batch_size,seqlen,seqlen,nheads,nheads,nheads,device,dtype,dropout_p), "autotuner/temp128_256") # "autotuner/temp192_128" tunner.tune() From be21a0ada968c7903bebab2834e008449d993f88 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Thu, 12 Sep 2024 17:26:24 +0800 Subject: [PATCH 40/46] fix bug kernel launch --- csrc/flash_attn/src/kernel_traits.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/kernel_traits.h 
b/csrc/flash_attn/src/kernel_traits.h index c0fe049e2..0e1df5eb6 100644 --- a/csrc/flash_attn/src/kernel_traits.h +++ b/csrc/flash_attn/src/kernel_traits.h @@ -112,7 +112,9 @@ struct Flash_fwd_kernel_traits : public Base { static constexpr int kSmemQSize = size(SmemLayoutQ{}) * sizeof(Element); static constexpr int kSmemKSize = size(SmemLayoutK{}) * sizeof(Element); static constexpr int kSmemVSize = size(SmemLayoutV{}) * sizeof(Element); - static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKSize + kSmemVSize) : kSmemQSize + kSmemKSize + kSmemVSize; + static constexpr int kSmemOSize = size(SmemLayoutO{}) * sizeof(Element); + static constexpr int kSmemSizeQKV = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKSize + kSmemVSize) : kSmemQSize + kSmemKSize + kSmemVSize; + static constexpr int kSmemSize = std::max(kSmemSizeQKV, kSmemOSize); static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); static_assert(kQKHeadDim % kGmemElemsPerLoad == 0, "kQKHeadDim must be a multiple of kGmemElemsPerLoad"); From 90fa651c121aa2daabc1c5719a5be9814cc24cd9 Mon Sep 17 00:00:00 2001 From: chenfeiyang <2394209769@qq.com> Date: Wed, 18 Sep 2024 20:59:10 +0800 Subject: [PATCH 41/46] update autotuner tile space --- autotuner/base_tunner.py | 4 ++-- autotuner/tunner.py | 7 ++++++- csrc/flash_attn/src/flash_fwd_kernel.h | 6 ++++-- csrc/flash_attn/src/kernel_traits.h | 7 ++++++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/autotuner/base_tunner.py b/autotuner/base_tunner.py index 36096bc1a..c2111dfef 100644 --- a/autotuner/base_tunner.py +++ b/autotuner/base_tunner.py @@ -72,8 +72,8 @@ class BaseTunner: def __init__(self, arch, torch_array: list, op_name, shape_config: ShapeConfig, profile_config: ProfileConfig, tempdir): self.arch = arch self.torch_array = torch_array - self.Br_list = [32, 64, 128] # [32, 64, 128, 256] - self.Bc_list = [32, 64, 128] # [32, 64, 128, 256] + self.Br_list = [32, 64, 96, 128, 160, 192, 224, 256] # [32, 64, 128, 256] + self.Bc_list = [32, 64, 96, 128, 160, 192, 224, 256] # [32, 64, 128, 256] self.template_dir = "autotuner/template" self.op_name = op_name diff --git a/autotuner/tunner.py b/autotuner/tunner.py index 18e9cb15c..4b701996a 100644 --- a/autotuner/tunner.py +++ b/autotuner/tunner.py @@ -50,8 +50,13 @@ def validate_register_fuse(self, config): def generate_configs(self,Br:int,Bc:int,dim_qk:int,dim_v:int): configs = [] - # TODO: more general for Nthreads in [128, 256]: + # TODO: more general + # global load atom + load_atom = 64 if (dim_qk % 64 == 0 and dim_v % 64 == 0 ) else 32 + NthreadsPerRow = load_atom / (128/16) + if Br % (Nthreads / NthreadsPerRow) != 0 or Bc % (Nthreads / NthreadsPerRow) != 0: + continue config1 = FlashFwdConfig(dim_qk,dim_v,Br,Bc,Nthreads//32,False,False) config2 = FlashFwdConfig(dim_qk,dim_v,Br,Bc,Nthreads//32,True,False) config3 = FlashFwdConfig(dim_qk,dim_v,Br,Bc,Nthreads//32,True,True) diff --git a/csrc/flash_attn/src/flash_fwd_kernel.h b/csrc/flash_attn/src/flash_fwd_kernel.h index 2153ba07b..82b617f2f 100644 --- a/csrc/flash_attn/src/flash_fwd_kernel.h +++ b/csrc/flash_attn/src/flash_fwd_kernel.h @@ -356,7 +356,8 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi int block_col_idx = n_block * (kBlockN / 32); if (Return_softmax) { Tensor rP_drop = make_fragment_like(rP); - cute::copy(rP, rP_drop); + // cutlass'bug on vectorization for tile (192,64) + cute::copy(cute::coalesce(rP), cute::coalesce(rP_drop)); dropout.template apply_dropout( rP_drop, 
block_row_idx, block_col_idx, kNWarps ); @@ -418,7 +419,8 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, const int bi int block_col_idx = n_block * (kBlockN / 32); if (Return_softmax) { Tensor rP_drop = make_fragment_like(rP); - cute::copy(rP, rP_drop); + // cutlass'bug on vectorization for tile (192,64) + cute::copy(cute::coalesce(rP), cute::coalesce(rP_drop)); dropout.template apply_dropout( rP_drop, block_row_idx, block_col_idx, kNWarps ); diff --git a/csrc/flash_attn/src/kernel_traits.h b/csrc/flash_attn/src/kernel_traits.h index 0e1df5eb6..3daa46b47 100644 --- a/csrc/flash_attn/src/kernel_traits.h +++ b/csrc/flash_attn/src/kernel_traits.h @@ -69,7 +69,8 @@ struct Flash_fwd_kernel_traits : public Base { static constexpr int kVHeadDim = kVHeadDim_; static_assert(kQKHeadDim % 32 == 0); static_assert(kVHeadDim % 32 == 0); - static constexpr int kBlockKSmem = kQKHeadDim % 64 == 0 ? 64 : 32; + // TODO: split QK & V + static constexpr int kBlockKSmem = (kQKHeadDim % 64 == 0 && kVHeadDim % 64 == 0) ? 64 : 32; static constexpr int kBlockKGmem = kQKHeadDim % 128 == 0 ? 128 : (kQKHeadDim % 64 == 0 ? 64 : 32); static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3; @@ -118,6 +119,7 @@ struct Flash_fwd_kernel_traits : public Base { static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); static_assert(kQKHeadDim % kGmemElemsPerLoad == 0, "kQKHeadDim must be a multiple of kGmemElemsPerLoad"); + static_assert(kVHeadDim % kGmemElemsPerLoad == 0, "kVHeadDim must be a multiple of kGmemElemsPerLoad"); // Using kBlockKSmem here is 6-10% faster than kBlockKGmem for d=128 because of bank conflicts. // For example, for d=128, smem is split into 2 "pages", each page takes care of columns // 0-63 and 64-127. If we have 16 threads per row for gmem read, when we write to smem, @@ -127,6 +129,9 @@ struct Flash_fwd_kernel_traits : public Base { static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow"); using GmemLayoutAtom = Layout, Int>, Stride, _1>>; + // for global load thread mapping + static_assert(kBlockN % (kNThreads / kGmemThreadsPerRow) == 0, "kBlockN must be a multiple of kNThreads / kGmemThreadsPerRow"); + static_assert(kBlockM % (kNThreads / kGmemThreadsPerRow) == 0, "kBlockM must be a multiple of kNThreads / kGmemThreadsPerRow"); // We use CACHEGLOBAL instead of CACHEALWAYS for both Q and K/V, since we won't be reading // from the same address by the same threadblock. This is slightly faster. 
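The tile-space constraint that this commit enforces — kBlockM and kBlockN must be multiples of kNThreads / kGmemThreadsPerRow — can be summarized in a short standalone sketch. This is only an illustration under the assumption of 2-byte elements (fp16/bf16) and 128-bit vectorized global loads; the helper name `valid_tile` and the example head dimensions are not part of the patch.

```python
# A minimal, standalone sketch (not part of the patch) of the tile-space
# pruning added in this commit: a (Br, Bc, Nthreads) candidate survives only
# if the global-load thread mapping tiles the block evenly, mirroring the
# static_asserts added to kernel_traits.h. Assumes 2-byte elements (fp16/bf16)
# and 128-bit vectorized loads.
def valid_tile(Br, Bc, nthreads, dim_qk, dim_v):
    elems_per_load = 128 // 16                         # kGmemElemsPerLoad
    block_k_smem = 64 if (dim_qk % 64 == 0 and dim_v % 64 == 0) else 32
    threads_per_row = block_k_smem // elems_per_load   # kGmemThreadsPerRow
    rows_per_copy = nthreads // threads_per_row        # kNThreads / kGmemThreadsPerRow
    return Br % rows_per_copy == 0 and Bc % rows_per_copy == 0

# Example: dim_qk = 96 gives kBlockKSmem = 32, so 256-thread configs need
# Br and Bc to be multiples of 64; candidates like Br = 96 are pruned.
tiles = [(Br, Bc, n)
         for Br in [32, 64, 96, 128, 160, 192, 224, 256]
         for Bc in [32, 64, 96, 128, 160, 192, 224, 256]
         for n in (128, 256)
         if valid_tile(Br, Bc, n, dim_qk=96, dim_v=128)]
print(f"{len(tiles)} of {8 * 8 * 2} candidate tiles survive")
```

In practice the constraint only bites when the head dimensions are not multiples of 64 (so kBlockKSmem falls back to 32) and 256 threads are used, in which case Br and Bc must be multiples of 64.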
From 1ba39ebba1e004bd90194f8f37dfc15227c0df74 Mon Sep 17 00:00:00 2001
From: chenfeiyang <2394209769@qq.com>
Date: Wed, 18 Sep 2024 22:14:34 +0800
Subject: [PATCH 42/46] update cutlass bugfix

---
 csrc/cutlass | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/cutlass b/csrc/cutlass
index 756c351b4..f7b19de32 160000
--- a/csrc/cutlass
+++ b/csrc/cutlass
@@ -1 +1 @@
-Subproject commit 756c351b4994854b2f8c6dded3821ebbb580876b
+Subproject commit f7b19de32c5d1f3cedfc735c2849f12b537522ee

From c5fa3c92e81e22848c676c5a2116ca104ce334f3 Mon Sep 17 00:00:00 2001
From: chenfeiyang <2394209769@qq.com>
Date: Wed, 18 Sep 2024 22:40:21 +0800
Subject: [PATCH 43/46] add autotuner doc

---
 autotunner.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 autotunner.md

diff --git a/autotunner.md b/autotunner.md
new file mode 100644
index 000000000..d480effbd
--- /dev/null
+++ b/autotunner.md
@@ -0,0 +1,20 @@
+# Autotuner
+
+The autotuner can automatically generate the best config for a flash-attention kernel with a not-yet-implemented QK/V head dimension, or for different hardware such as NVIDIA Ampere and Ada Lovelace.
+
+Currently, the autotuner only supports the flash-attention forward pass. We plan to support backward and forward_split soon.
+
+## Usage
+
+First, install flashattn from source. Then run the autotuner with the QK and V head dimensions you want to tune. After that, modify or create `csrc/flash_attn/src/flash_fwd_qkdim*_vdim*_sm80.h` with the tuned config. Finally, rebuild flashattn from source.
+
+
+
+The detailed steps are as follows:
+
+- Install flashattn from source.
+- Run ```python autotuner/test_run_tunner.py``` with the problem size you want to tune.
+- If the headdim already exists in `csrc/flash_attn/src`, modify `csrc/flash_attn/src/flash_fwd_qkdim*_vdim*_sm80.h` with the tuned best config. If the headdim does not exist, create `csrc/flash_attn/src/flash_fwd_qkdim*_vdim*_sm80.h` and `csrc/flash_attn/src/flash_bwd_qkdim*_vdim*_sm80.h` with the tuned best config and the corresponding `.cu` files; after that, add the headdim to `headdim.json`.
+- Rebuild flashattn from source.
+
+

From b09eaeea95f0d078c722d89676bda88eb77bbfc2 Mon Sep 17 00:00:00 2001
From: chenfeiyang <2394209769@qq.com>
Date: Wed, 18 Sep 2024 23:04:35 +0800
Subject: [PATCH 44/46] update readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 7f1804e9d..87520750e 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ we have supported:
 - FlashAttention-2 with QKHeadDim=128, VHeadDim=256
 - FlashAttention-2 with QKHeadDim=192, VHeadDim=128
 
+For head dimensions that are not yet supported, you can use the autotuner to generate an implementation. Details are in `autotunner.md`.
+
 Feel free to tell us what else you need. We might support it soon. :)
 
 Currently, we do not provide prebuilt library, you need to compile from source.

From cd9fee42a4de0d5170ec55883e1f6aba9da1a99d Mon Sep 17 00:00:00 2001
From: chenfeiyang
Date: Thu, 19 Sep 2024 16:04:53 +0800
Subject: [PATCH 45/46] update autotuner

---
 autotunner.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotunner.md b/autotunner.md
index d480effbd..165b16d99 100644
--- a/autotunner.md
+++ b/autotunner.md
@@ -1,6 +1,6 @@
 # Autotuner
 
-The autotuner can automatically generate the best config for a flash-attention kernel with a not-yet-implemented QK/V head dimension, or for different hardware such as NVIDIA Ampere and Ada Lovelace.
+The autotuner can automatically generate the best config for a flash-attention kernel with a not-yet-implemented QK/V head dimension, or for an existing head dimension on different hardware such as NVIDIA Ampere and Ada Lovelace.
 
 Currently, the autotuner only supports the flash-attention forward pass. We plan to support backward and forward_split soon.
 

From 014c349eb1c523f9a97d51ea1880486aeeee4a2e Mon Sep 17 00:00:00 2001
From: chenfeiyang
Date: Thu, 19 Sep 2024 16:13:43 +0800
Subject: [PATCH 46/46] update readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 87520750e..8872c0948 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,10 @@ We display CustomFlashAttention speedup using these parameters:
 ### Speedup
 ![Custom-flash-attn](assets/Customflash2_a100_fwd_bwd_benchmark.png)
+# FlashAttention
+This repository provides the official implementation of FlashAttention and
+FlashAttention-2 from the
+following papers.
 
 **FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness**
 Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
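As a concrete companion to the `autotunner.md` usage steps above, the driver below mirrors `autotuner/test_run_tunner.py` as it stands after the autotuner patches in this series. The problem size, dtype, dropout setting, and temp directory are illustrative choices, not requirements; `FlashFwdTunner`, `ShapeConfig`, and `ProfileConfig` are the classes introduced or updated in the patches above.

```python
# Minimal sketch of driving the autotuner, mirroring autotuner/test_run_tunner.py.
# The problem size and output directory below are illustrative, not prescriptive.
import torch
from tunner import FlashFwdTunner
from arch import A100
from code_emitter import ShapeConfig, ProfileConfig

batch_size, seqlen, nheads = 4, 2048, 8
headdim, v_headdim = 128, 256            # QK / V head dimensions to tune
device, dtype, dropout_p = 'cuda:0', torch.bfloat16, 0.0

q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
v = torch.randn(batch_size, seqlen, nheads, v_headdim, device=device, dtype=dtype, requires_grad=True)

tunner = FlashFwdTunner(A100(), [q, k, v],
                        ShapeConfig(headdim, v_headdim),
                        ProfileConfig(batch_size, seqlen, seqlen, nheads, nheads, nheads,
                                      device, dtype, dropout_p),
                        "autotuner/temp128_256")
tunner.tune()   # compiles and profiles candidate configs; results land under ./logs/
```

Per `base_tunner.py` above, the tuner generates and compiles candidate kernels into the temp directory, profiles each one, and writes the best config and its latency to a log file under `./logs/` as well as to the autotuner cache.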