diff --git a/artifacts/.clang-format b/artifacts/.clang-format new file mode 100644 index 000000000..7b64cb97f --- /dev/null +++ b/artifacts/.clang-format @@ -0,0 +1,17 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +BasedOnStyle: Google +ColumnLimit: 80 +IndentWidth: 4 +DerivePointerAlignment: false +SortIncludes: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '<([A-Za-z0-9\Q/-_\E])+>' + Priority: 4 + - Regex: '<(catch2|boost)\/' + Priority: 3 + - Regex: '<([A-Za-z0-9.\Q/-_\E])+>' + Priority: 2 + - Regex: '"([A-Za-z0-9.\Q/-_\E])+"' + Priority: 1 diff --git a/artifacts/.gitignore b/artifacts/.gitignore new file mode 100644 index 000000000..b0af482c1 --- /dev/null +++ b/artifacts/.gitignore @@ -0,0 +1,39 @@ +build/ +*.log +!requirements*.txt +*.tgz +*.gz +*.pyc +.ipynb_checkpoints/ +__pycache__/ +.vs/ +.vscode/ +.data/ +venv/ +.idea/ +.checkpoints/ +*.pb.h +*.pb.cc +*_pb2.py +tensorboard/ +benchmarks/attention/baseline/MultiHeadAttention/log + +# generated by compiling tex files. +*.aux +*.bbl +*.blg +*.idx +*.ind +*.lof +*.lot +*.out +*.toc +*.acn +*.acr +*.alg +*.glg +*.glo +*.gls +*.ist +*.fls +*.gv diff --git a/artifacts/.gitmodules b/artifacts/.gitmodules new file mode 100644 index 000000000..e69de29bb diff --git a/artifacts/.pre-commit-config.yaml b/artifacts/.pre-commit-config.yaml new file mode 100644 index 000000000..b8140a17b --- /dev/null +++ b/artifacts/.pre-commit-config.yaml @@ -0,0 +1,32 @@ +repos: +- repo: https://github.com/Lucas-C/pre-commit-hooks.git + rev: v1.0.1 + hooks: + - id: remove-crlf + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ +- repo: https://github.com/pre-commit/mirrors-yapf.git + rev: v0.23.0 + hooks: + - id: yapf + files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v1.4.0 + hooks: + - id: check-added-large-files + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ + - id: end-of-file-fixer +- repo: local + hooks: + - id: clang-format-with-version-check + name: clang-format + description: Format files with ClangFormat. 
+ entry: bash ./tools/clang_format.hook -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|cuh|hpp|hxx|proto)$ +- repo: https://github.com/iconmaster5326/cmake-format-pre-commit-hook + rev: v0.6.5 + hooks: + - id: cmake-format diff --git a/artifacts/FractalTensor/CMakeLists.txt b/artifacts/FractalTensor/CMakeLists.txt new file mode 100644 index 000000000..b8ba4c772 --- /dev/null +++ b/artifacts/FractalTensor/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.18) # cutlass 3.2 requires cmake 3.18+ + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/cmake/Modules/") + +set(PYPARSER_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(PYPARSER_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(PYPARSER_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") + +project(kaleido CXX C) +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(Threads REQUIRED) +find_package(CUDA REQUIRED) +find_package(CuDNN REQUIRED) + +set(Boost_USE_STATIC_LIBS OFF) +set(Boost_USE_MULTITHREADED ON) +set(Boost_USE_STATIC_RUNTIME OFF) +find_package(Boost 1.45.0 COMPONENTS filesystem regex) + +if(Boost_FOUND) + include_directories(${Boost_INCLUDE_DIR}) + add_definitions("-DHAS_BOOST") +else() + message(FATAL_ERROR "Cannot find Boost.") +endif() + +include(generic) +include(python) +include(third_party) + +add_subdirectory(kaleido/core) diff --git a/artifacts/FractalTensor/Makefile b/artifacts/FractalTensor/Makefile new file mode 100644 index 000000000..8d51c79e8 --- /dev/null +++ b/artifacts/FractalTensor/Makefile @@ -0,0 +1,42 @@ +CUDNN_HOME ?= +BUILD_DIR := build +BENCHMARK_DIR := benchmarks +BENCHMARK_MODEL_CLASS ?= rnn +BENCHMARK_PROJ ?= fractaltensor +BENCHMARK_MODEL ?= stacked_lstm + +.PHONY: build clean install-python test-backend test-frontend benchmark benchmarks cpp-format + +build: + @mkdir -p $(BUILD_DIR)/ + @cd build && cmake ../ -D PYTHON_EXECUTABLE:FILEPATH=`which python3` \ + -D CUDNN_INCLUDE_DIR=$(CUDNN_HOME)/include \ + -D CUDNN_LIBRARY=$(CUDNN_HOME)/lib/libcudnn.so && make -j$(nproc) + +$(BUILD_DIR)/kaleido: + @$(MAKE) build + +install-python: + @pip install -r requirements.txt + +test-frontend: + @./scripts/tests/frontend_unit_tests.sh + @./scripts/tests/frontend_examples.sh + +test-backend: $(BUILD_DIR)/kaleido + @./scripts/tests/backend_unit_tests.sh + +benchmark: $(BUILD_DIR)/kaleido + @cd $(BENCHMARK_DIR)/$(BENCHMARK_MODEL_CLASS)/$(BENCHMARK_PROJ)/$(BENCHMARK_MODEL) && \ + mkdir -p build && cd build && cmake .. && make -j$(nproc) + +benchmarks: + @./scripts/benchmarks/bench.sh + +cpp-format: + @./scripts/cpp_format.sh + +clean: + @rm -f unittest.log + @rm -rf $(BUILD_DIR) + diff --git a/artifacts/FractalTensor/README.md b/artifacts/FractalTensor/README.md new file mode 100644 index 000000000..c2451ca75 --- /dev/null +++ b/artifacts/FractalTensor/README.md @@ -0,0 +1,237 @@ +# Program DNN algorithms with FractalTensor + +**Optimize programming neural network applications for both parallel computers and programmers!** + +As deep learning models become more and more complex, it is more and more important to structure them well. 
Well-structured and modular programs are easier to write and more tractable for a compiler to reason about and compile into efficient code for a variety of parallel computers.
+
+However, an important challenge is that _**how one can divide up the original problem into sub-solutions depends directly on how one can glue those sub-solutions together**_. FractalTensor embraces functional-style programming for easier auto-parallelization and exploits two powerful kinds of glue from functional programming: higher-order functions and lazy evaluation.
+
+In FractalTensor, assignment is interpreted as variable binding, and every primitive operation (function) has no effect other than computing its result. This makes the order of execution irrelevant and relieves the programmer of the burden of prescribing the flow of control.
+
+The goals of FractalTensor are to:
+1. provide a set of _data-parallel_ constructs to uncover and exploit _nested_ parallelisms hidden in neural network applications; these constructs can be embedded into a sequential language.
+2. provide compile-time analysis techniques to glue the proposed constructs together and generate efficient evaluation codes/plans for a variety of parallel computers.
+
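+As a small illustration of treating assignment as binding, the sketch below contrasts an imperative loop that mutates a result buffer with the same computation written as a pure function applied element-wise. It is plain Python, not the FractalTensor API; `encode` and `words` are made-up stand-ins used only to make the point concrete.
+
+```python
+# Minimal sketch in plain Python (not the FractalTensor API).
+def encode(word: int) -> float:
+    # a pure function: no effect other than computing its result
+    return float(word) * 0.5
+
+# Imperative style: in-place assignment prescribes an execution order.
+words = [3, 1, 4, 1, 5]
+encoded = [0.0] * len(words)
+for i in range(len(words)):
+    encoded[i] = encode(words[i])
+
+# Binding style: each result is a fresh binding of a pure function's output,
+# so the elements are independent and could be evaluated in any order.
+encoded_bound = list(map(encode, words))
+
+assert encoded == encoded_bound
+```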

+
+## A demonstration example: Grid RNN
+
+Let's construct a complex example step by step to demonstrate the ideas behind the design of the FractalTensor ADT and the parallel and access operators associated with it.
+
+The connection patterns of neural networks generally fall into two kinds: feedforward and feedback connections. Recurrent neural networks (RNNs) are in fact a very broad family of models equipped with feedback connections for sequence processing. An RNN model can simply be understood as a processing unit, called a cell, that is iteratively applied to input data continuously supplied from a token stream.
+
+To design a new RNN model for a specific task, algorithm researchers usually consider three factors: (1) invent cells that capture how local input is combined with history; (2) invent new connection patterns that form a more sophisticated history; and finally, (3) stack multiple RNN layers to introduce more non-linearity into the history.
+
+A grid RNN cell simultaneously receives inputs from and computes results for multiple directions.
+It can be used in the machine translation task, which translates a source-language sequence into a target-language sequence of a different length. We take this application as the demonstration example in this document. Interactions among words from the source and target sequences are learned and captured by a grid RNN layer, which shares a similar idea with the extremely successful attention mechanism.
+
+### Why parallel operators
+
+Neural network models naturally exhibit a signal-flow structure, which is the fundamental constraint for auto-parallelization. Fig 1 below illustrates the signal-flow structure of the stacked grid RNN model. The cell function (a circle in Fig 1) describes the local computational process in which a minimal data unit is consumed by a machine learning model. In this example, a minimal data unit is a word from a sentence.
+

+Fig 1. Process generated by iteratively applying 2-d grid RNN cell function to input data.
+
+It's worth noting that the dataflow dependencies in Fig 1 are the only data dependencies (flow, or true, dependencies) that need to be considered when scheduling a neural network computation for efficient execution on a parallel computer. Most often, however, depending on how a user writes the program, there are many more data dependencies among program variables, leading to a conservative parallelization.
+
+Functional-style, high-level list operators largely relieve this pressure by getting rid of side-effecting assignment, so the program naturally preserves a clean dataflow structure. Nevertheless, this alone is not enough to make program analysis tractable. Iteratively applying the cell function to words from sentences produces a process whose shape is hyper-rectangular. In compile-time analysis, we would like to be able to make statements about the overall behavior of this process. This is very difficult in general, but parallel operators capture some typical patterns in neural network computations that are unique opportunities for optimization.
+
+### Stacked Grid RNN module by module with parallel operators
+
+There are four dimensions in the entire computational process of the stacked grid RNN model: (1) the data-parallelism dimension applies the stacked grid RNN model to multiple sentence pairs; (2) the depth dimension stacks multiple grid RNN layers; (3) the x-direction scans along the source language sequence; (4) the y-direction scans along the target language sequence. (To aid visualization, Fig 1 omits the data-parallelism dimension.)
+
+Dependence vectors in neural network computations are all lexicographically positive; therefore these four dimensions form a fully permutable loop nest and can be computed in an arbitrary order[[1](#reference)]. Let's think from "local" to "global", module by module, and define the computations for (1) the cell, (2) the y-direction, (3) the x-direction, (4) the depth, and (5) a batch of sentence pairs.
+
+#### 1. grid cell
+
+**cell**
+
+In this document we use the vanilla RNN cell as the example, but this is not fixed. A user can design any reasonable computation to be the cell processing unit.
+
+The formula of the vanilla RNN cell is as follows:
+
+$$\mathbf{h}_t = \text{cell}(\mathbf{h}_{t-1}, \mathbf{x}_t) = \text{tanh}(\mathbf{x}_t\mathbf{W} + \mathbf{h}_{t-1}\mathbf{U}+\mathbf{b})$$
+

+Fig 2. expression tree of the cell.
+ +```python +def vanilla_cell(state: Tensor, cur: Tensor, + rnn_params: Tuple[Tensor]) -> Tensor: + i2h, h2h, bias = rnn_params # unpack tuple elements + return ops.tanh(cur @ i2h + state @ h2h + bias) +``` + +**2-d grid cell** + +$$\mathbf{h} = [\mathbf{h}_{t-1}^x ; \mathbf{h}_{t-1}^y]$$ + +$$\mathbf{h}_t^x=\text{cell}(\mathbf{h}, \mathbf{x}_t)$$ + +$$\mathbf{h}_t^y=\text{cell}(\mathbf{h}, \mathbf{y}_t)$$ + +$[; ]$ stands for "concatenation". + +

+Fig 3. expression tree of the grid cell.
+ +Below codes implement the core computation of a 2-d grid cell. + +```python +# core computation +s = ops.cat(state_x, state_y, axis=0) +h_x = vanilla_cell(x_t, s, rnn_param_x) +h_y = vanilla_cell(y_t, s, rnn_param_y) +``` + +#### 2. nested scans + +`scan` iterates over a linearly-ordered collection to aggregate the returned result from the last execution instance with an element from the collection (_#TODO(ying): There is a problem, the returned result of `zip` is able to iterate over, but it is not a homogenous collection. Unify the concepts._). The user-function should obey `scan` 's calling convention: + +1. the first argument is to communicate with the last execution instance. + - the caller (`scan`) passes the evaluation results of the last execution instance without any change to the current execution instance through the first argument. + - returned value of the user function should have the same type and organizational structure as its first argument. +2. the second argument is input to the current execution, an indexed element of a linearly-order collection. +3. all the other arguments are passed by the caller (`scan`) as keyword arguments. They are name alias of some variables defined and initialized outside the user function. + +##### scan along the y-direction + +To scan along the y-direction, the grid cell computation above is required to be encapsulated into a user function which acts as a binary operator: + +```python +# The user-function passed to scan acts as a binary operator. +# The first argument `state` is to communicate with +# the previous execution instance. `state` carries +# a data dependence with a distance of 1. +def grid_cell(state: Tuple[Tensor], cur_input: Tuple[Tensor, Tuple[Tensor]], + block_params: Tuple[Tuple[Tensor]]) -> Tuple[Tensor]: + # previous execution instance computes `h_y` and `h_x`, but only `h_y` + # from previous evalution is consumed in the current execution instance. + _, state_y = state + # get state_x and inputs to direction x and y. + state_x, (x_t, y_t) = cur_input + + # unpack tuple elements, get learnable parameters + rnn_param_x, rnn_param_y = block_params + + # the core computation + s = ops.cat(state_x, state_y, axis=0) + h_x = vanilla_cell(x_t, s, rnn_param_x) + h_y = vanilla_cell(y_t, s, rnn_param_y) + + # returned value has the same organizational structure as `state`, + # and they will be directly passed to the next execution instance. + return h_x, h_y # Tuple[Tensor] +``` + +Now, we pass `grid_cell` to `scan` which (1) guarantees the execution order of all execution instances, (2) prepares parameters before running each execution instance, and (3) stacks results of all execution instances into `FractalTensor` (s). + +
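+As a reference for the calling convention above, here is a minimal plain-Python sketch of `scan`'s semantics. It is an illustration only, not the actual `ops.scan` implementation (which also manages `FractalTensor` storage and scheduling): the first argument threads the previous result, the second argument is the current element, and the remaining keyword arguments are forwarded unchanged.
+
+```python
+# Plain-Python sketch of scan's semantics (illustration only, not ops.scan).
+def scan_sketch(fn, xs, initializer, **kwargs):
+    state = initializer                  # communicated via the first argument
+    outputs = []
+    for x in xs:                         # execution instances run in order
+        state = fn(state, x, **kwargs)   # the result becomes the next state
+        outputs.append(state)            # results of all instances are stacked
+    return outputs
+
+# Example: a running sum over a short sequence.
+print(scan_sketch(lambda s, x: s + x, [1, 2, 3, 4], initializer=0))
+# [1, 3, 7, 10]
+```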

+Fig 4. the signal flow structure for "scanning along the y-direction".
+ +```python +def direction_y( + state: Tuple[FractalTensor[Tensor]], + cur_input: Tuple[FractalTensor[Tensor]], + block_params: Tuple[Tuple[Tensor]]) -> Tuple[FractalTensor[Tensor]]: + state_xs, _ = state + + zero = ops.zeros(1, hidden_dim) + return ops.scan( + grid_cell, + ops.zip(state_xs, ops.zip(*cur_input)), + initializer=(zero, zero), + block_params=block_params) +``` + +`state_xs` has the type of `FractalTensor` , `cur_input` has a type of `Tuple[FractalTensor]` , inputs to the x-direction and the y-direction respectively. `state_xs` and `cur_input` are prepared and passed by the caller. + +##### scan along the x-direction + +To scan along the x-direction, it is necessary to encapsulate the computation of scanning along the y-direction into a user function. + +

+Fig 5. the signal flow structure for "scanning along the x-direction".
+ +```python +def direction_x(state: Tuple[Tuple[FractalTensor[Tensor]]], + block_params: Tuple[Tuple[Tensor]] + ) -> Tuple[Tuple[FractalTensor[Tensor]]]: + # len(state[0][0]) is the length of source language sequence + zeros: FractalTensor = ops.repeat(ops.zeros(1, hidden_dim), len(state[0][0])) + return ops.zip(*ops.scan( + direction_y, + state, + initializer=(zeros, zeros), + block_params=block_params)) +``` + +#### 3. fold to form the depth + +```python +def stacked_grid_rnns( + sent_pair: Tuple[FractalTensor[int]] +) -> Tuple[FractalTensor[FractalTensor[Tensor]]]: + srcs, trgs = sent_pair + + src_encs = ops.map( + lambda word: ops.index(ops.slices(src_embedding, axis=0), word), srcs) + trg_encs = ops.map( + lambda word: ops.index(ops.slices(trg_embedding, axis=0), word), trgs) + + return ops.zip(*ops.fold( + direction_x, + rnn_params, + initializer=ops.zip(*ops.product(src_encs, trg_encs)))) +``` + +#### 4. map the batched input + +```python +src_batch: FractalTensor[FractalTensor[int]] = dataset(batch_size) +trg_batch: FractalTensor[FractalTensor[int]] = dataset(batch_size) + +# data parallelism in a mini-batch +grid_out_x, grid_out_y = ops.map(stacked_grid_rnns, + ops.zip(src_batch, trg_batch)) + +# grid_out_x: FractalTensor[FractalTensor[FractalTensor[Tensor]]] +# grid_out_y: FractalTensor[FractalTensor[FractalTensor[Tensor]]] +``` + +#### Put things together + +Fig 6 shows the overall code structure of the stacked grid RNN model we build and the memory layout of the first returned value of the outermost `map` (the second returned value of `map` has the same layout.). + +

+Fig 6. the overall code structure of stacked grid RNN and the visualization of xssss's memory layout.
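+A note on `fold`, which builds the depth dimension above: in the standard functional-programming sense assumed here, `fold` returns only the final accumulated state, whereas `scan` stacks every intermediate state (which is why the two sequence directions use `scan`). A minimal plain-Python sketch of that semantics (an illustration, not the actual `ops.fold` implementation):
+
+```python
+# Plain-Python sketch of fold's semantics (assumed standard left-fold
+# semantics; illustration only, not ops.fold).
+def fold_sketch(fn, xs, initializer):
+    state = initializer
+    for x in xs:
+        state = fn(state, x)   # only the final state is kept
+    return state
+
+# Contrast with scan: folding a running sum yields a single value.
+print(fold_sketch(lambda s, x: s + x, [1, 2, 3, 4], initializer=0))   # 10
+```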
+ +The user program exhibits a clear pattern of function composition and nesting and becomes very concise with the help of nestable collection type `FractalTensor` and parallel operators. However, a straightforward materialization of each function evaluation leads to fine-grained data access and movements, and non-optimal parallelisms. Large runtime overhead makes the program far from performance. + +We would like to reason about the overall runtime behaviors of the computational process produced by this kind of program, and glue these function compositions and nesting into an efficient evaluation plan. + +### Summary of parallel operators + +1. parallel operators are building blocks to design parallel algorithms. In machine learning tasks, data are usually organized into some high dimensional representation form. Iterating over dimensions makes nested parallelisms prevalent in machine learning computations. +2. To allow multiple levels of parallelism, the nestable collection type jagged `FractalTensor` is required to work together with parallel operators. In FractalTensor, the great expressiveness comes from: + 1. parallel operators and `FractalTensor` is data-dependent, thus there is no need to manually pad irregular data. + 2. parallel operators and `FractalTensor` can be nested for an arbitrary depth. Nested jagged `FractalTensor` can express structural information. +3. parallel operators are optimized loops in the backend which are designed to be high-performance. Additionally, they also serve as interfaces to restrict the way a user thinks of neural network computations. The compile-time analysis then maps the entire computational process to underlying parallel computers. + +# Reference + +1. Wolf, Michael E., and Monica S. Lam. "[A loop transformation theory and an algorithm to maximize parallelism](https://homes.luddy.indiana.edu/achauhan/Teaching/B629/2006-Fall/CourseMaterial/1991-tpds-wolf-unimodular.pdf)." IEEE Computer Architecture Letters 2.04 (1991): 452-471. diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/README.md b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/README.md new file mode 100644 index 000000000..f0ff4a558 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/README.md @@ -0,0 +1,25 @@ +

+Fig. Compose BigBird using parallel operator nesting.
+
+Fig. The parsed ETDG representation.
+
+Fig. The access map annotation attached to ETDG edges.
+
+Fig. Fused access map in ETDG.
+
+Fig. The access map annotation attached to ETDG edges.
diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/CMakeLists.txt b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/CMakeLists.txt new file mode 100644 index 000000000..07ac657a4 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 3.18) + +project(bigbird CXX C) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake") +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/../../../cmake/Modules/") + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(CUDA QUIET REQUIRED) +find_package(CuDNN QUIET REQUIRED) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") +set(CMAKE_CXX_FLAGS_DEBUG + "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") +set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall + -Wno-sign-compare") + +cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") +message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_DIRS}) +include_directories("../../../") # include the project's core directory + +cuda_add_executable(bigbird bigbird_bench.cu) +target_link_libraries(bigbird ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARIES}) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/Makefile b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/Makefile new file mode 100644 index 000000000..a4f70e8b2 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/Makefile @@ -0,0 +1,12 @@ +BENCH_NAME ?= big_bird +BUILD_DIR := build + +.PHONY: build clean + +build: + @mkdir -p build && cd build && cmake .. 
&& make -j12 + +$(BUILD_DIR)/$(BENCH_NAME): build + +clean: + @rm -rf build diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/access_ops.h b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/access_ops.h new file mode 100644 index 000000000..6d0cf1fb6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/access_ops.h @@ -0,0 +1,296 @@ +#pragma once + +#include "access_ops_kernels.cuh" +#include "kaleido/core/device/cuda_utils.h" +#include "kaleido/core/device/kernels/softmax.h" + +#include +#include +#include +#include + +using namespace kaleido::core; + +void init_data(float* Q_d, float* K_d, float* V_d, float* Res_d, int bs, + int len, int h) { + int64_t numel = bs * len * h; + float* Q_h = (float*)malloc(numel * sizeof(float)); + float* K_h = (float*)malloc(numel * sizeof(float)); + float* V_h = (float*)malloc(numel * sizeof(float)); + float* Res_h = (float*)malloc(numel * sizeof(float)); + for (int64_t i = 0; i < bs; ++i) { + for (int64_t j = 0; j < len * h; ++j) { + Q_h[i * len * h + j] = float(j); + K_h[i * len * h + j] = j < 2 * h ? float(1) : float(0); + V_h[i * len * h + j] = float(1); + Res_h[i * len * h + j] = float(0); + } + } + CudaCheck( + cudaMemcpy(Q_d, Q_h, numel * sizeof(float), cudaMemcpyHostToDevice)); + CudaCheck( + cudaMemcpy(K_d, K_h, numel * sizeof(float), cudaMemcpyHostToDevice)); + CudaCheck( + cudaMemcpy(V_d, V_h, numel * sizeof(float), cudaMemcpyHostToDevice)); + CudaCheck(cudaMemcpy(Res_d, Res_h, numel * sizeof(float), + cudaMemcpyHostToDevice)); +} + +/* generate r random values in [global_size, window_start) U (window_end, + * len-global_size]. */ +void GetRowRandPosition(const int64_t window_start, const int64_t window_end, + const int64_t len, const int64_t r, + std::vector& row) { + std::unordered_set rand_values; + std::random_device rd; + std::mt19937 s(rd()); + // window length may be different at different row. 
+ std::uniform_int_distribution<> random_gen( + 1, len - (window_end - window_start) - 1); + + while (rand_values.size() < r) { + rand_values.insert(random_gen(s)); + } + for (auto v : rand_values) { + row.emplace_back((v + window_end) % len); + } +} + +void GenerateRandAttn(const int64_t block_num, const int64_t num_rand_blocks, + const int64_t window_size, const int64_t global_size, + std::vector& rand) { + for (int64_t i = 0; i < block_num - 2; ++i) { + int64_t w = (window_size - 1) / 2; + int64_t window_start = std::max(int64_t(0), i - w); + int64_t window_end = std::min(block_num - 2, i + w); + std::vector row; + GetRowRandPosition(window_start, window_end, block_num - 2, + num_rand_blocks, row); + for (auto s : row) rand.emplace_back(s /* + global_size*/); + } +} + +void init_rand_attn(int64_t len, int64_t blksz, int64_t rs, int64_t ws, + int64_t gs, std::vector& rand_attn_pos) { + GenerateRandAttn(len / blksz, rs, 2 * ws + 1, gs, rand_attn_pos); +} + +namespace access_ops { + +inline void push_block_rows(std::vector& ids, int64_t start, + int64_t len) { + for (int64_t i = 0; i < len; ++i) ids.emplace_back(start + i); +} + +void build_dense_row_ids(int64_t* ids, int bs, int len, int gs, int blksz) { + std::vector dense_row_ids_h; + for (int b = 0; b < bs; ++b) { + push_block_rows(dense_row_ids_h, b * len, blksz * gs); + push_block_rows(dense_row_ids_h, (b + 1) * len - blksz * gs, + blksz * gs); + } + CudaCheck(cudaMemcpy(ids, dense_row_ids_h.data(), + bs * (2 * gs * blksz) * sizeof(int64_t), + cudaMemcpyHostToDevice)); +} + +void gather_attention_rows(const float* input, float* output, + const int64_t* rows, int width, int row_num) { + const int kThreadsPerBlock = 512; + + int block_x = kThreadsPerBlock; + if (width < kThreadsPerBlock) + // integer division to align with 32. + block_x = ((width + 31) >> 5) << 5; + int block_y = kThreadsPerBlock / block_x; + dim3 block = dim3(block_x, block_y, 1); + + int grid_x = std::max(row_num / block_y, 1); + dim3 grid(grid_x, 1); + + access_kernel::KeNaiveSelectRowsKernel<<>>( + output, input, rows, row_num, width); +} + +template +void attention_score_softmax_op(const T* in, T* out, size_t width, + int64_t height) { + const int kThreadsPerBlock = 512; + int block_num = + width > kThreadsPerBlock + ? 
kThreadsPerBlock + : pow(2, static_cast(log2(static_cast(width)))); + dim3 block(block_num, 1); + dim3 grid(height, 1); + + cuda_kernel::KeMatrixSoftMax<<>>(in, out, width); +} +template void attention_score_softmax_op(const float* in, float* out, + size_t width, int64_t height); + +// return C(row-major) = A * BT +void cublasSgemmStridedBatchedQK(cublasHandle_t handle, const float* A, + const float* B, float* C, int A_row, int B_row, + int A_col, int bs) { + float alpha = 1.0f; + float beta = 0.0f; + + // CT = B * AT + CublasCheck(cublasSgemmStridedBatched( + handle, CUBLAS_OP_T, CUBLAS_OP_N, B_row, A_row, A_col, &alpha, B, A_col, + B_row * A_col, A, A_col, A_row * A_col, &beta, C, B_row, B_row * A_row, + bs)); +} + +void cublasSgemmStridedBatchedSV(cublasHandle_t handle, const float* A, + const float* B, float* C, int A_row, int B_col, + int A_col, int bs) { + float alpha = 1.0f; + float beta = 0.0f; + + // CT = BT * AT + // [b, dense_row_size, len] * [b, len, h] -> [b, dense_row_size, h] + CublasCheck(cublasSgemmStridedBatched( + handle, CUBLAS_OP_N, CUBLAS_OP_N, B_col, A_row, A_col, &alpha, B, B_col, + B_col * A_col, A, A_col, A_col * A_row, &beta, C, B_col, A_row * B_col, + bs)); +} + +// return C(row-major) = A * BT (matrix B are overlapped) +void cublasSgemmStridedBatchedQWindowK(cublasHandle_t handle, const float* A, + const float* B, float* C, int A_row, + int B_row, int A_col, int window_stride, + int bs) { + float alpha = 1.0f; + float beta = 0.0f; + + // CT = B * AT blksz, 3*blksz, h, + CublasCheck(cublasSgemmStridedBatched( + handle, CUBLAS_OP_T, CUBLAS_OP_N, B_row, A_row, A_col, &alpha, B, A_col, + window_stride * A_col /*<--※※B stride*/, A, A_col, A_row * A_col, &beta, + C, B_row, B_row * A_row, bs)); +} + +void build_middle_row_ids_gpu(int64_t* ids, int bs, int len, int blk_num, + int blksz, int gs, int ws) { + int middle_blk_num = blk_num - 2 * (gs + ws); + dim3 block = dim3(1024); + dim3 grid = dim3(std::max(bs * middle_blk_num * blksz / 1024 + 1, 1)); + access_kernel::KeMiddleIds<<>>(ids, bs, len, blk_num, blksz, + gs, middle_blk_num); +} + +void build_middle_row_rand_col_ids_gpu(int64_t* ids, int64_t* rand_attn, int bs, + int len, int blk_num, int blksz, int gs, + int ws, int rs) { + int middle_block_num = blk_num - 2 * (ws + gs); + + dim3 block_r = dim3(1024); + dim3 grid_r = + dim3(std::max(bs * middle_block_num * blksz * rs / 1024 + 1, 1)); + access_kernel::KeWriteRandomIds<<>>( + ids, rand_attn, ws, blksz, rs, len, middle_block_num, bs); +} + +void build_score_ids(int64_t* ids, int64_t row_size, int in_size, int out_size, + int start_offset) { + std::vector ids_h; + dim3 block = dim3(1024); + dim3 grid = dim3(std::max(row_size / 1024 + 1, int64_t(1))); + + access_kernel::KeSetStridedContinuous<<>>( + ids, start_offset, in_size, out_size, row_size); +} + +void scatter_attention_rows_with_stride(const float* input, float* output, + const int64_t* rows, int width, + int output_width, int row_num) { + const int kThreadsPerBlock = 1024; + + int block_x = kThreadsPerBlock; + if (width < kThreadsPerBlock) + // integer division to align with 32. 
+ block_x = ((width + 31) >> 5) << 5; + int block_y = kThreadsPerBlock / block_x; + dim3 block = dim3(block_x, block_y, 1); + + int grid_x = std::max(row_num / block_y, 1); + dim3 grid(grid_x, 1); + + access_kernel::KeNaiveScatterRowsKernelStrided<<>>( + output, input, rows, row_num, width, output_width); +} + +void gather_attention_rows_with_stride(const float* input, float* output, + const int64_t* rows, int width, + int input_width, int row_num) { + const int kThreadsPerBlock = 1024; + + int block_x = kThreadsPerBlock; + if (width < kThreadsPerBlock) + // integer division to align with 32. + block_x = ((width + 31) >> 5) << 5; + int block_y = kThreadsPerBlock / block_x; + dim3 block = dim3(block_x, block_y, 1); + + int grid_x = std::max(row_num / block_y, 1); + dim3 grid(grid_x, 1); + + access_kernel::KeNaiveGatherRowsKernelStrided<<>>( + output, input, rows, row_num, width, input_width); +} + +void cublasSgemmStridedBatchedSWindowV(cublasHandle_t handle, const float* A, + const float* B, float* C, int A_row, + int B_col, int A_col, int window_stride, + int bs) { + float alpha = 1.0f; + float beta = 0.0f; + + // CT = BT * AT + // [b, dense_row_size, len] * [b, len, h] -> [b, dense_row_size, h] + CublasCheck(cublasSgemmStridedBatched( + handle, CUBLAS_OP_N, CUBLAS_OP_N, B_col, A_row, A_col, &alpha, B, B_col, + B_col * window_stride, A, A_col, A_col * A_row, &beta, C, B_col, + A_row * B_col, bs)); +} + +void scatter_attention_rows(const float* input, float* output, + const int64_t* rows, int width, int row_num) { + const int kThreadsPerBlock = 1024; + + int block_x = kThreadsPerBlock; + if (width < kThreadsPerBlock) + // integer division to align with 32. + block_x = ((width + 31) >> 5) << 5; + int block_y = kThreadsPerBlock / block_x; + dim3 block = dim3(block_x, block_y, 1); + + int grid_x = std::max(row_num / block_y, 1); + dim3 grid(grid_x, 1); + + access_kernel::KeNaiveScatterRowsKernel<<>>( + output, input, rows, row_num, width); +} + +void build_special_row_ids(int64_t* ids, int idx, int bs, int len, int blk_num, + int blksz, int gs, int ws) { + std::vector ids_h; + for (int b = 0; b < bs; ++b) { + push_block_rows(ids_h, b * len + idx * blksz, blksz); + push_block_rows(ids_h, b * len + len - ((idx + 1) * blksz), blksz); + } + cudaMemcpy(ids, ids_h.data(), bs * 2 * blksz * sizeof(int64_t), + cudaMemcpyHostToDevice); +} + +void build_special_cols_ids(int64_t* ids, int idx, int window_size, int bs, + int len, int blk_num, int blksz, int gs, int ws, + int rs) { + int64_t row_size = bs * 2 * (gs + rs + window_size) * blksz; + dim3 block = dim3(1024); + dim3 grid = dim3(std::max(row_size / 1024 + 1, int64_t(1))); + access_kernel::KeSetContinuous<<>>(ids, 0, row_size); +} + +} // namespace access_ops diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/access_ops_kernels.cuh b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/access_ops_kernels.cuh new file mode 100644 index 000000000..604f7d90d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/access_ops_kernels.cuh @@ -0,0 +1,93 @@ +#pragma once + +namespace access_kernel { + +__global__ void KeNaiveSelectRowsKernel(float* O, const float* I, + const int64_t* rows, int height, + int width) { + int row_idx = blockIdx.x * blockDim.y + threadIdx.y; + if (threadIdx.x < width && row_idx < height) { + int64_t to_pos = row_idx * width + threadIdx.x; + int64_t from_pos = rows[row_idx] * width + threadIdx.x; + O[to_pos] = I[from_pos]; + } +} + 
+__global__ void KeMiddleIds(int64_t* ids, int bs, int len, int blk_num, + int blksz, int gs, int middle_blk_num) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + if (tid_x < bs * middle_blk_num * blksz) { + int B = int(tid_x / (middle_blk_num * blksz)); + int R = int((tid_x - B * middle_blk_num * blksz) / blksz); + int S = tid_x - B * middle_blk_num * blksz - R * blksz; + int64_t write_val = B * len + (R + gs) * blksz + S; + ids[tid_x] = write_val; + } +} + +__global__ void KeWriteRandomIds(int64_t* ids, int64_t* rand_attn, int ws, + int blksz, int rs, int len, + int middle_block_num, int bs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + if (tid_x < bs * middle_block_num * rs * blksz) { + int row_size = rs * blksz; + int batch_size = middle_block_num * row_size; + int B = tid_x / batch_size; + int R = (tid_x - B * batch_size) / row_size; + int S = tid_x - (B * middle_block_num + R) * row_size; + + int64_t write_value = B * len + rand_attn[R * rs + S / blksz]; + ids[tid_x] = write_value + S - S / blksz; + } +} + +__global__ void KeSetStridedContinuous(int64_t* ids, int start, int batch_width, + int input_stride, int row_size) { + int64_t tid_x = blockIdx.x * blockDim.x + threadIdx.x; + if (tid_x < row_size) { + int batch_idx = tid_x / batch_width; + int x = tid_x - batch_idx * batch_width; + ids[tid_x] = start + batch_idx * input_stride + x; + } +} + +__global__ void KeNaiveScatterRowsKernelStrided(float* O, const float* I, + const int64_t* rows, int height, + int width, int output_width) { + int row_idx = blockIdx.x * blockDim.y + threadIdx.y; + if (threadIdx.x < width && row_idx < height) { + int64_t to_pos = row_idx * width + threadIdx.x; + int64_t from_pos = rows[row_idx] * output_width + threadIdx.x; + O[from_pos] = I[to_pos]; + } +} + +__global__ void KeNaiveGatherRowsKernelStrided(float* O, const float* I, + const int64_t* rows, int height, + int width, int input_width) { + int row_idx = blockIdx.x * blockDim.y + threadIdx.y; + if (threadIdx.x < width && row_idx < height) { + int64_t to_pos = row_idx * width + threadIdx.x; + int64_t from_pos = rows[row_idx] * input_width + threadIdx.x; + O[to_pos] = I[from_pos]; + } +} + +__global__ void KeNaiveScatterRowsKernel(float* O, const float* I, + const int64_t* rows, int height, + int width) { + int row_idx = blockIdx.x * blockDim.y + threadIdx.y; + if (threadIdx.x < width && row_idx < height) { + int64_t to_pos = row_idx * width + threadIdx.x; + int64_t from_pos = rows[row_idx] * width + threadIdx.x; + O[from_pos] = I[to_pos]; + } +} + +__global__ void KeSetContinuous(int64_t* ids, int64_t start, + int64_t middle_row_size) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + if (tid_x < middle_row_size) ids[tid_x] = start + tid_x; +} + +} // namespace access_kernel diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/bigbird_bench.cu b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/bigbird_bench.cu new file mode 100644 index 000000000..aed6341b1 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/bigbird_bench.cu @@ -0,0 +1,454 @@ +#include "access_ops.h" +#include "kaleido/core/device/cuda_timer.h" +#include "kaleido/core/device/cuda_utils.h" + +using namespace kaleido::core; +using namespace access_ops; + +namespace { + +void DenseRows(cublasHandle_t& handle, const float* Q, const float* K, + const float* V, int64_t* dense_row_ids, float* dense_row, + float* dense_score, float* dense_score_tmp, float* 
dense_result, + int len, int h, int bs, int gs, int blksz, int dense_row_size, + float scal) { + build_dense_row_ids(dense_row_ids /*on device memory*/, bs, len, gs, blksz); + // gather first&last block row in one kernel. + gather_attention_rows(Q, dense_row, dense_row_ids, h, bs * dense_row_size); + + cublasSgemmStridedBatchedQK(handle, dense_row, K, dense_score_tmp, + dense_row_size, len, h, bs); + CublasCheck(cublasSscal_v2(handle, bs * len * dense_row_size, &scal, + dense_score_tmp, 1)); + attention_score_softmax_op(dense_score_tmp, dense_score, len, + bs * dense_row_size); + cublasSgemmStridedBatchedSV(handle, dense_score, V, dense_result, + dense_row_size, h, len, bs); +} + +void MiddleRows(cublasHandle_t& handle, const float* Q, const float* K, + const float* V, float* Res, float* middle_rows, + float* global_cols, float* global_cols_v, float* rand_cols, + float* rand_cols_v, float* global_score, float* rand_score, + float* middle_score, float* window_score, + float* middle_score_tmp, float* global_res, float* window_res, + float* rand_res, float alpha, float scal, + int64_t* dense_row_ids, int64_t* middle_rows_ids, + int64_t* rand_cols_ids, int64_t* rand_attn, + int64_t* scatter_score_ids, int64_t* window_scatter_score_ids, + int64_t middle_size, int64_t global_col_size, + int64_t rand_col_size, int64_t window_col_size, + int64_t middle_block_num, int64_t h, int64_t bs, int64_t len, + int64_t blk_num, int64_t blksz, int64_t gs, int64_t ws, + int64_t rs) { + build_middle_row_ids_gpu(middle_rows_ids, bs, len, blk_num, blksz, gs, ws); + build_middle_row_rand_col_ids_gpu(rand_cols_ids, rand_attn, bs, len, + blk_num, blksz, gs, ws, rs); + build_score_ids(scatter_score_ids, bs * middle_size, middle_size, len, + blksz); + build_score_ids(window_scatter_score_ids, bs * len, len, len, 0); + + gather_attention_rows(Q, middle_rows, middle_rows_ids, h, bs * middle_size); + gather_attention_rows(K, global_cols, dense_row_ids, h, + bs * global_col_size); + gather_attention_rows(V, global_cols_v, dense_row_ids, h, + bs * global_col_size); + gather_attention_rows(K, rand_cols, rand_cols_ids, h, + bs * rand_col_size * middle_block_num); + gather_attention_rows(V, rand_cols_v, rand_cols_ids, h, + bs * rand_col_size * middle_block_num); + + // WE DO NOT GATHER WINDOW DATA. 
+ cublasSgemmStridedBatchedQK(handle, middle_rows, global_cols, global_score, + middle_size, global_col_size, h, bs); + cublasSgemmStridedBatchedQK(handle, middle_rows, rand_cols, rand_score, + blksz, rand_col_size, h, bs * middle_block_num); + // compute window score + cublasSgemmStridedBatchedQWindowK(handle, Q + gs * blksz * h, K, + window_score, blksz, ws * blksz, h, blksz, + bs * blk_num); + + // scatter global/rand score[b, middlesize, gs/rs] to + // middle_score_tmp[b, 2blksz:len-2*blksz, 0:/gs:] + scatter_attention_rows_with_stride( + global_score, middle_score_tmp, scatter_score_ids, gs * blksz, + (gs + ws + rs) * blksz, bs * middle_size); + scatter_attention_rows_with_stride( + rand_score, middle_score_tmp + gs * blksz, scatter_score_ids, + gs * blksz, (gs + ws + rs) * blksz, bs * middle_size); + scatter_attention_rows_with_stride( + window_score, middle_score_tmp + (gs + rs) * blksz, + window_scatter_score_ids, ws * blksz, (gs + ws + rs) * blksz, bs * len); + + CublasCheck(cublasSscal_v2(handle, bs * len * (gs + rs + ws) * blksz, &scal, + middle_score_tmp, 1)); + attention_score_softmax_op(middle_score_tmp, middle_score, + (gs + rs + ws) * blksz, bs * len); + + gather_attention_rows_with_stride(middle_score, global_score, + scatter_score_ids, gs * blksz, + (gs + ws + rs) * blksz, middle_size * bs); + gather_attention_rows_with_stride(middle_score + gs * blksz, rand_score, + scatter_score_ids, rs * blksz, + (gs + ws + rs) * blksz, middle_size * bs); + gather_attention_rows_with_stride( + middle_score + gs * blksz + rs * blksz, window_score, + window_scatter_score_ids, ws * blksz, (gs + ws + rs) * blksz, bs * len); + cublasSgemmStridedBatchedSV(handle, global_score, global_cols_v, global_res, + middle_size, h, gs * blksz, bs); + cublasSgemmStridedBatchedSV(handle, rand_score, rand_cols_v, rand_res, 1, h, + rand_col_size, bs * middle_block_num); + cublasSgemmStridedBatchedSWindowV(handle, window_score, V, window_res, + blksz, h, ws * blksz, blksz, + bs * blk_num); + // reduce sum + CublasCheck(cublasSaxpy_v2(handle, bs * middle_size * h, &alpha, global_res, + 1, rand_res, 1)); + gather_attention_rows(window_res, global_res, scatter_score_ids, h, + middle_size * bs); + CublasCheck(cublasSaxpy_v2(handle, bs * middle_size * h, &alpha, global_res, + 1, rand_res, 1)); + + scatter_attention_rows(rand_res, Res, middle_rows_ids, h, bs * middle_size); +} + +void SpecialRows(cublasHandle_t& handle, const float* Q, const float* K, + const float* V, float* Res, int64_t* special_row_ids, + int64_t* special_col_ids, float* special_row_pair, + float* special_K_cols, float* special_V_cols, int col_num, + float* special_score_tmp, float* special_score, + float* special_res, float scal, int block_row_idx, + int64_t window_size, int64_t h, int64_t bs, int64_t len, + int64_t blksz, int64_t blk_num, int64_t gs, int64_t ws, + int64_t rs) { + build_special_row_ids(special_row_ids, block_row_idx, bs, len, blk_num, + blksz, gs, ws); + // [TODO]: Did not write the correct index, + // but the time consumption here is negligible. 
+ build_special_cols_ids(special_col_ids, block_row_idx, window_size, bs, len, + blk_num, blksz, gs, ws, rs); + + gather_attention_rows(Q, special_row_pair, special_row_ids, h, + bs * 2 * blksz); + gather_attention_rows(K, special_K_cols, special_col_ids, h, + bs * 2 * col_num * blksz); + gather_attention_rows(V, special_V_cols, special_col_ids, h, + bs * 2 * col_num * blksz); + + cublasSgemmStridedBatchedQK(handle, special_row_pair, special_K_cols, + special_score_tmp, blksz, col_num * blksz, h, + 2 * bs); + // S / d^(1/2) -> S + CublasCheck(cublasSscal_v2(handle, bs * 2 * blksz * col_num * blksz, &scal, + special_score_tmp, 1)); + + attention_score_softmax_op(special_score_tmp, special_score, + col_num * blksz, bs * 2 * blksz); + + cublasSgemmStridedBatchedSV(handle, special_score, special_V_cols, + special_res, blksz, h, col_num * blksz, 2 * bs); + + scatter_attention_rows(special_res, Res, special_row_ids, h, 2 * blksz); +} + +float bigbird(const float* Q, const float* K, const float* V, float* Res, + int64_t len, int64_t h, int64_t blksz, int64_t bs, + int64_t blk_num, int64_t gs, int64_t ws_, int64_t rs, + std::vector& rand_attn_pos, int warmup = 20, + int repeat = 100) { + int ws = ws_ * 2 + 1; + float scal = float(1.0 / std::sqrt(len)); + float alpha = 1.0f; + + cublasHandle_t handle; + cublasCreate(&handle); + + float* dense_row; + float* dense_score; + float* dense_score_tmp; + float* dense_result; + + // First&last global block rows concat in blksz + // dim to compute first&last block score in one BMM. + // Actually we can also write dense row part result to this tensor. + int64_t dense_row_size = gs * 2 * blksz; + + //[b, 2*gs*blksz, h] + int64_t size = bs * dense_row_size * h * sizeof(float); + CudaCheck(cudaMalloc(&dense_row, size)); + + //[b, 2*gs*blksz, len] + size = bs * dense_row_size * len * sizeof(float); + CudaCheck(cudaMalloc(&dense_score, size)); + + size = bs * dense_row_size * len * sizeof(float); + CudaCheck(cudaMalloc(&dense_score_tmp, size)); + + size = bs * dense_row_size * h * sizeof(float); + CudaCheck(cudaMalloc(&dense_result, size)); + + // Dense row index in input Q + int64_t* dense_row_ids; + size = bs * (gs * 2 * blksz) * sizeof(int64_t); + // [b, 2*gs*blksz] + CudaCheck(cudaMalloc(&dense_row_ids, size)); + + /* -------------------------DENSE ROW PART--------------------------*/ + for (int i = 0; i < warmup; ++i) { + DenseRows(handle, Q, K, V, dense_row_ids, dense_row, dense_score, + dense_score_tmp, dense_result, len, h, bs, gs, blksz, + dense_row_size, scal); + } + + CudaTimer timer; + timer.Start(); + for (int i = 0; i < repeat; ++i) { + DenseRows(handle, Q, K, V, dense_row_ids, dense_row, dense_score, + dense_score_tmp, dense_result, len, h, bs, gs, blksz, + dense_row_size, scal); + } + float time_dense = timer.Stop() / repeat; + + // Store the result to where they should be. + // just reuse the dense_row_ids for scatter. + CudaCheck(cudaFree(dense_row)); + CudaCheck(cudaFree(dense_score)); + CudaCheck(cudaFree(dense_score_tmp)); + CudaCheck(cudaFree(dense_result)); + // will be used in MIDDLE ROW PART to select global KV cols. 
+ CudaCheck(cudaFree(dense_row_ids)); + + /* -------------------------MIDDLE ROW PART------------------------*/ + int middle_block_num = blk_num - 2 * (gs + ws); + int middle_size = middle_block_num * blksz; + int global_col_size = 2 * gs * blksz; + int rand_col_size = rs * blksz; + int window_col_size = ws * blksz; + + float* middle_rows; + float* global_cols; + float* rand_cols; + float* global_cols_v; + float* rand_cols_v; + float* middle_score; + float* middle_score_tmp; + float* global_score; + float* rand_score; + float* window_score; + float* global_res; + float* window_res; + float* rand_res; + + size = bs * middle_size * h * sizeof(float); + CudaCheck(cudaMalloc(&middle_rows, size)); + size = bs * global_col_size * h * sizeof(float); + CudaCheck(cudaMalloc(&global_cols, size)); + size = bs * rand_col_size * middle_block_num * h * sizeof(float); + CudaCheck(cudaMalloc(&rand_cols, size)); + size = bs * global_col_size * h * sizeof(float); + CudaCheck(cudaMalloc(&global_cols_v, size)); + size = bs * rand_col_size * middle_block_num * h * sizeof(float); + CudaCheck(cudaMalloc(&rand_cols_v, size)); + + size = bs * len * (global_col_size + rand_col_size + window_col_size) * + sizeof(float); + CudaCheck(cudaMalloc(&middle_score, size)); + + size = bs * len * (global_col_size + rand_col_size + window_col_size) * + sizeof(float); + CudaCheck(cudaMalloc(&middle_score_tmp, size)); + size = bs * global_col_size * middle_size * sizeof(float); + CudaCheck(cudaMalloc(&global_score, size)); + size = bs * rand_col_size * middle_size * sizeof(float); + CudaCheck(cudaMalloc(&rand_score, size)); + + size = bs * window_col_size * len * sizeof(float); + CudaCheck(cudaMalloc(&window_score, size)); + size = bs * middle_size * h * sizeof(float); + CudaCheck(cudaMalloc(&global_res, size)); + size = bs * middle_size * h * sizeof(float); + CudaCheck(cudaMalloc(&rand_res, size)); + size = bs * (len)*h * sizeof(float); + CudaCheck(cudaMalloc(&window_res, size)); + + int64_t* middle_rows_ids; + int64_t* rand_cols_ids; + int64_t* scatter_score_ids; // use for concat scores. 
+ int64_t* window_scatter_score_ids; + int64_t* rand_attn; + + size = bs * middle_size * sizeof(int64_t); + CudaCheck(cudaMalloc(&middle_rows_ids, size)); + size = bs * rand_col_size * middle_block_num * sizeof(int64_t); + CudaCheck(cudaMalloc(&rand_cols_ids, size)); + size = bs * middle_size * sizeof(int64_t); + CudaCheck(cudaMalloc(&scatter_score_ids, size)); + size = middle_size * rs * sizeof(int64_t); + CudaCheck(cudaMalloc(&rand_attn, size)); + size = bs * middle_size * sizeof(int64_t); + CudaCheck(cudaMalloc(&window_scatter_score_ids, size)); + size = middle_size * rs * sizeof(int64_t); + CudaCheck(cudaMemcpy(rand_attn, rand_attn_pos.data(), size, + cudaMemcpyHostToDevice)); + + for (int i = 0; i > warmup; ++i) { + MiddleRows(handle, Q, K, V, Res, middle_rows, global_cols, + global_cols_v, rand_cols, rand_cols_v, global_score, + rand_score, middle_score, window_score, middle_score_tmp, + global_res, window_res, rand_res, alpha, scal, dense_row_ids, + middle_rows_ids, rand_cols_ids, rand_attn, scatter_score_ids, + window_scatter_score_ids, middle_size, global_col_size, + rand_col_size, window_col_size, middle_block_num, h, bs, len, + blk_num, blksz, gs, ws, rs); + } + + timer.Start(); + for (int i = 0; i > repeat; ++i) { + MiddleRows(handle, Q, K, V, Res, middle_rows, global_cols, + global_cols_v, rand_cols, rand_cols_v, global_score, + rand_score, middle_score, window_score, middle_score_tmp, + global_res, window_res, rand_res, alpha, scal, dense_row_ids, + middle_rows_ids, rand_cols_ids, rand_attn, scatter_score_ids, + window_scatter_score_ids, middle_size, global_col_size, + rand_col_size, window_col_size, middle_block_num, h, bs, len, + blk_num, blksz, gs, ws, rs); + } + float time_window = timer.Stop() / repeat; + + CudaCheck(cudaFree(dense_row_ids)); + CudaCheck(cudaFree(middle_rows)); + CudaCheck(cudaFree(global_cols)); + CudaCheck(cudaFree(rand_cols)); + CudaCheck(cudaFree(global_cols_v)); + CudaCheck(cudaFree(rand_cols_v)); + CudaCheck(cudaFree(middle_score)); + CudaCheck(cudaFree(middle_score_tmp)); + CudaCheck(cudaFree(global_score)); + CudaCheck(cudaFree(rand_score)); + CudaCheck(cudaFree(window_score)); + CudaCheck(cudaFree(global_res)); + CudaCheck(cudaFree(window_res)); + CudaCheck(cudaFree(rand_res)); + + /* -----------------------SPECIAL ROW PART--------------------------*/ + float time_special = 0.; + for (int block_row_idx = gs; block_row_idx < gs + ws; ++block_row_idx) { + int window_size = block_row_idx - gs + ws + 1; + int col_num = gs + rs + window_size; + float* special_row_pair; + float* special_K_cols; + float* special_V_cols; + float* special_score; + float* special_score_tmp; + float* special_res; + + size = bs * 2 * col_num * blksz * h * sizeof(float); + CudaCheck(cudaMalloc(&special_K_cols, size)); + + size = bs * 2 * col_num * blksz * h * sizeof(float); + CudaCheck(cudaMalloc(&special_V_cols, size)); + + size = bs * 2 * blksz * h * sizeof(float); + CudaCheck(cudaMalloc(&special_row_pair, size)); + + size = bs * 2 * blksz * col_num * blksz * sizeof(float); + CudaCheck(cudaMalloc(&special_score, size)); + + size = bs * 2 * blksz * col_num * blksz * sizeof(float); + CudaCheck(cudaMalloc(&special_score_tmp, size)); + + size = bs * 2 * blksz * h * sizeof(float); + CudaCheck(cudaMalloc(&special_res, size)); + + int64_t* special_row_ids; + int64_t* special_col_ids; + size = bs * 2 * blksz * sizeof(int64_t); + CudaCheck(cudaMalloc(&special_row_ids, size)); + size = bs * 2 * col_num * blksz * sizeof(int64_t); + CudaCheck(cudaMalloc(&special_col_ids, size)); + + 
for (int i = 0; i < warmup; ++i) { + SpecialRows(handle, Q, K, V, Res, special_row_ids, special_col_ids, + special_row_pair, special_K_cols, special_V_cols, + col_num, special_score_tmp, special_score, special_res, + scal, block_row_idx, window_size, h, bs, len, blksz, + blk_num, gs, ws, rs); + } + + timer.Start(); + for (int i = 0; i < repeat; ++i) { + SpecialRows(handle, Q, K, V, Res, special_row_ids, special_col_ids, + special_row_pair, special_K_cols, special_V_cols, + col_num, special_score_tmp, special_score, special_res, + scal, block_row_idx, window_size, h, bs, len, blksz, + blk_num, gs, ws, rs); + } + time_special += (timer.Stop() / repeat); + + CudaCheck(cudaFree(special_col_ids)); + CudaCheck(cudaFree(special_row_ids)); + CudaCheck(cudaFree(special_row_pair)); + CudaCheck(cudaFree(special_K_cols)); + CudaCheck(cudaFree(special_V_cols)); + CudaCheck(cudaFree(special_score)); + CudaCheck(cudaFree(special_score_tmp)); + CudaCheck(cudaFree(special_res)); + } + + CublasCheck(cublasDestroy(handle)); + + return time_dense + time_window + time_special; +} + +void run_test(int len) { + const int gs = 1; // global size + const int ws = 1; // [i-ws, ..., i, ..., i+ws] // + const int rs = 1; // random size + + const int bs = 32; + const int h = 512; //>=32 + const int blksz = 64; // block size + + int64_t blk_num = len / blksz; + + // generate random attended positions. + std::vector rand_attn_pos; + init_rand_attn(len, blksz, rs, ws, gs, rand_attn_pos); + + // init QKV and Res on GPU. + float* Q; + float* K; + float* V; + float* Res; + + int64_t data_size = bs * len * h * sizeof(float); + + // add tails memory for strided BMM + CudaCheck(cudaMalloc(&Q, data_size + 2 * blksz * h)); + CudaCheck(cudaMalloc(&K, data_size)); + CudaCheck(cudaMalloc(&V, data_size)); + CudaCheck(cudaMalloc(&Res, data_size)); + init_data(Q, K, V, Res, bs, len, h); + + float time = bigbird(Q, K, V, Res, len, h, blksz, bs, blk_num, gs, ws, rs, + rand_attn_pos); + + std::cout << bs << "\t" << len << "\t" << h << "\t" << blksz << "\t" << time + << std::endl; + + CudaCheck(cudaFree(Q)); + CudaCheck(cudaFree(K)); + CudaCheck(cudaFree(V)); + CudaCheck(cudaFree(Res)); +} +} // namespace + +int main(int argc, char* argv[]) { + std::cout + << "batch size\t sequence length\thidden\tblock size\telapsed time(ms)" + << std::endl; + + run_test(4096); + run_test(8192); + return 0; +} diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_ETDG.png b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_ETDG.png new file mode 100644 index 000000000..48631fad7 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_ETDG.png differ diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_accessmap1.png b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_accessmap1.png new file mode 100644 index 000000000..be10d8c0d Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_accessmap1.png differ diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_accessmap2.png b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_accessmap2.png new file mode 100644 index 000000000..6d53d570d Binary files /dev/null and 
b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_accessmap2.png differ diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_parallel_operators.png b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_parallel_operators.png new file mode 100644 index 000000000..e2df4fe3e Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/bigbird_parallel_operators.png differ diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/fuse_accessmap.png b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/fuse_accessmap.png new file mode 100644 index 000000000..47606e947 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/figures/fuse_accessmap.png differ diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/run.sh b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/run.sh new file mode 100755 index 000000000..ede76fac3 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/fractaltensor/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +mkdir build +cd build +cmake ../ + +make + +cd ../ + +./build/bigbird diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/attention.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/attention.py new file mode 100644 index 000000000..79b6fd247 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/attention.py @@ -0,0 +1,539 @@ +import os +import numpy as np +from datetime import datetime + +import torch +from torch import nn +from torch import Tensor +import torch.nn.functional as F +from torch.profiler import profile +from torch.profiler import ProfilerActivity +from time import time +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +import types +import utils + +MAX_SEQ_LEN = 4096 # DO NOT modify this. + +__all__ = [ + 'BigbirdBlockSpareAttention', +] + + +def output_file_func(OUTPUT_FILE, cmd_args, run_time): + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write( + f"{cmd_args.batch_size}\t{cmd_args.seq_len}\t{cmd_args.hidden_size}\t{cmd_args.block_size}\t" + f"{run_time}\n") + + +def bigbird_block_rand_mask(from_seq_length: int, + to_seq_length: int, + from_block_size: int, + to_block_size: int, + num_rand_blocks: int, + last_idx=-1): + """Create adjacency list of random attention. + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in + to sequence, if positive then num_rand_blocks blocks choosen + only upto last_idx. + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + from_block_num = from_seq_length // from_block_size + to_block_num = to_seq_length // to_block_size + + if (from_block_num != to_block_num): + raise ValueError("Error!. 
The number of blocks needs to be same!") + + # the magic number 2 is the global attention + # `rand_attn` has a shape of [number_from_blocks, num_rand_blocks] + rand_attn = np.zeros((from_block_num - 2, num_rand_blocks), dtype=np.int32) + middle_seq = np.arange(1, to_block_num - 1, dtype=np.int32) + last = to_block_num - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_block_num - 1): + start = i - 2 + end = i + if i == 1: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] + elif i == 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] + elif i == from_block_num - 3: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -3: should have been sliced till last-3 + elif i == from_block_num - 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + rand_attn[i - 1, :] = np.random.permutation( + middle_seq[:start])[:r] + elif (end + 1) == last: + rand_attn[i - 1, :] = np.random.permutation( + middle_seq[:start])[:r] + else: + rand_attn[i - 1, :] = np.random.permutation( + np.concatenate((middle_seq[:start], + middle_seq[end + 1:last])))[:r] + return rand_attn + + +def create_rand_mask_from_inputs( + from_blocked_mask: Tensor, to_blocked_mask: Tensor, rand_attn: Tensor, + num_attention_heads: int, num_rand_blocks: int, batch_size: int, + from_seq_length: int, from_block_size: int): + """Create 3D attention mask from a 2D tensor mask. + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + Returns: + float Tensor of shape [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = utils.torch_gather4d(to_blocked_mask, rand_attn) + rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, + num_rand_blocks * from_block_size) + rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], + rand_mask) + return rand_mask + + +def create_attention_mask_from_input_mask(from_mask, to_mask): + mask = torch.einsum("bf, bt->bft", from_mask, to_mask) + + # expand to create a slot for heads. 
+    mask = torch.unsqueeze(mask, 1)
+
+    return mask
+
+
+class BigbirdBlockSpareAttention(nn.Module):
+    def __init__(self,
+                 num_attention_heads: int,
+                 size_per_head: int,
+                 num_rand_blocks: int,
+                 from_block_size: int,
+                 to_block_size: int,
+                 seed=None):
+        super().__init__()
+
+        self.num_attention_heads = num_attention_heads
+        self.size_per_head = size_per_head
+
+        self.num_rand_blocks = num_rand_blocks
+        self.from_block_size = from_block_size
+        self.to_block_size = to_block_size
+
+        self.seed = seed
+
+    def _attn_func(
+            self,
+            query_layer: Tensor,  # [batch, head, seq_length, hidden]
+            key_layer: Tensor,
+            value_layer: Tensor,
+            rand_attn: Tensor,
+            from_mask: Tensor,
+            to_mask: Tensor,
+            rand_mask: Tensor,
+            band_mask: Tensor,
+            batch_size: int,
+            from_seq_length: int,
+            to_seq_length: int,
+            device="cuda:0"):
+        # Define shorthands
+        h = self.num_attention_heads
+        r = self.num_rand_blocks
+        d = self.size_per_head
+        b = batch_size
+        m = from_seq_length
+        n = to_seq_length
+        wm = self.from_block_size
+        wn = self.to_block_size
+
+        # blocked q, k, v are 5D tensors:
+        # [batch_size, head_num, from_block_num, from_block_size, size_per_head]
+        blocked_query_matrix = query_layer.view((b, h, m // wm, wm, -1))
+        # [batch_size, head_num, to_block_num, to_block_size, size_per_head]
+        blocked_key_matrix = key_layer.view((b, h, n // wn, wn, -1))
+        # [batch_size, head_num, to_block_num, to_block_size, size_per_head]
+        blocked_value_matrix = value_layer.view((b, h, n // wn, wn, -1))
+        """`gathered_key` and `gathered_value` have a shape of:
+        [
+            batch_size,
+            head_num,
+            from_block_num - global_attn_num,
+            rand_attn_num * block_size,
+            size_per_head
+        ]
+        """
+        gathered_key = utils.torch_gather5d(blocked_key_matrix,
+                                            rand_attn).view((b, h, n // wn - 2,
+                                                             r * wn, -1))
+        gathered_value = utils.torch_gather5d(
+            blocked_value_matrix, rand_attn).view((b, h, n // wn - 2, r * wn,
+                                                   -1))
+
+        # ============== Compute the first component ===================
+        # the pure global attention
+        # ==============================================================
+        """
+        Q: [batch_size, head_num, block_size, size_per_head]
+        K: [batch_size, head_num, to_seq_length, size_per_head]
+
+        The Einsum is equivalent to:
+        for (int i = 0; i < batch_size; ++i)
+            for (int j = 0; j < head_num; ++j)
+                for (int m = 0; m < block_size; ++m)
+                    for (int n = 0; n < to_seq_length; ++n)
+                        out[i, j, m, n] = dot(Q[i, j, m, :], K[i, j, n, :])
+        """
+        first_product = torch.einsum(
+            "bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 0, :, :], key_layer)
+        first_product = first_product * (1.
/ np.sqrt(d)) + first_product += (1.0 - to_mask) * -10000.0 + first_attn_weights = F.softmax(first_product, -1) # [b, h, wm, n] + """ + Attn_W: [batch_size, head_num, block_size, to_seq_length] + V: [batch_size, head_num, to_seq_length, size_per_head] + + The Einsum is equivalent to: + + for (int i = 0; i < batch_size, ++i) + for (int j = 0; j < head_num, ++j) + for (int m = 0; m < block_size; ++m) + for (int n = 0; n < size_per_head; ++n) + out[i, j, m, n] = Attn_W[i, j, m, :] * V[i, j, :, n] + """ + first_context_layer = torch.einsum("bhqk,bhkd->bhqd", + first_attn_weights, value_layer) + first_context_layer = torch.unsqueeze(first_context_layer, 2) + + # ================== Compute the second component ================== + # windowed attention is overlapped with global attention + # ================================================================== + second_key_mat = torch.cat( + ( + blocked_key_matrix[:, :, 0, :, :], + blocked_key_matrix[:, :, 1, :, :], + blocked_key_matrix[:, :, 2, :, :], + blocked_key_matrix[:, :, -1, :, :], # + gathered_key[:, :, 0, :, :]), + 2) + second_value_mat = torch.cat( + ( + blocked_value_matrix[:, :, 0, :, :], + blocked_value_matrix[:, :, 1, :, :], + blocked_value_matrix[:, :, 2, :, :], + blocked_value_matrix[:, :, -1, :, :], # + gathered_value[:, :, 0, :, :]), + 2) # [b, h, (4+r)*wn, -1] + second_product = torch.einsum("bhqd,bhkd->bhqk", + blocked_query_matrix[:, :, 1, :, :], + second_key_mat) + second_seq_pad = torch.cat( + (to_mask[:, :, :, :3 * wn], to_mask[:, :, :, -wn:], + torch.ones(b, 1, 1, r * wn, device=device).long()), 3) + second_rand_pad = torch.cat((torch.ones( + b, h, wm, 4 * wn, device=device).long(), rand_mask[:, :, 0]), 3) + second_product = second_product * (1.0 / np.sqrt(d)) + second_product += ( + 1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * -10000.0 + second_attn_weights = F.softmax(second_product, -1) + second_context_layer = torch.einsum( + "bhqk,bhkd->bhqd", second_attn_weights, second_value_mat) + second_context_layer = torch.unsqueeze(second_context_layer, 2) + + # =============== Compute the third component ======================= + # make windowed attention continuous + # ==================================================================== + exp_blocked_key_matrix = torch.cat( + (blocked_key_matrix[:, :, 1:-3, :, :], + blocked_key_matrix[:, :, 2:-2, :, :], + blocked_key_matrix[:, :, 3:-1, :, :]), 3) + exp_blocked_value_matrix = torch.cat( + (blocked_value_matrix[:, :, 1:-3, :, :], + blocked_value_matrix[:, :, 2:-2, :, :], + blocked_value_matrix[:, :, 3:-1, :, :]), 3) + middle_query_matrix = blocked_query_matrix[:, :, 2:-2, :] + inner_band_product = torch.einsum( + "bhlqd,bhlkd->bhlqk", middle_query_matrix, + exp_blocked_key_matrix) # windowd attention + inner_band_product = inner_band_product * (1.0 / np.sqrt(d)) + + rand_band_product = torch.einsum( + "bhlqd,bhlkd->bhlqk", middle_query_matrix, + gathered_key[:, :, 1:-1, :]) # random attention + rand_band_product = rand_band_product * (1.0 / np.sqrt(d)) + + first_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, + blocked_key_matrix[:, :, 0, :, :]) # global attention + first_band_product = first_band_product * (1.0 / np.sqrt(d)) + last_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, + blocked_key_matrix[:, :, -1, :, :]) # global attention + last_band_product = last_band_product * (1.0 / np.sqrt(d)) + + inner_band_product += (1.0 - band_mask) * -10000.0 + first_band_product += ( + 1.0 - torch.unsqueeze(to_mask[:, :, :, 
:wn], 3)) * -10000.0 + last_band_product += ( + 1.0 - torch.unsqueeze(to_mask[:, :, :, -wn:], 3)) * -10000.0 + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * -10000.0 + + band_product = torch.cat((first_band_product, inner_band_product, + rand_band_product, last_band_product), -1) + attn_weights = F.softmax(band_product, -1) + + context_layer = torch.einsum( + "bhlqk,bhlkd->bhlqd", attn_weights[:, :, :, :, wn:4 * wn], + exp_blocked_value_matrix) # windowed attention + context_layer += torch.einsum( + "bhlqk,bhlkd->bhlqd", attn_weights[:, :, :, :, 4 * wn:-wn], + gathered_value[:, :, 1:-1, :]) # random attention + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :wn], + blocked_value_matrix[:, :, 0, :, :]) # global attention + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -wn:], + blocked_value_matrix[:, :, -1, :, :]) # global attention + + # ================= Compute the forth component ====================== + # windowd attention is overlapped with the global attention + # ==================================================================== + second_last_key_mat = torch.cat( + ( + blocked_key_matrix[:, :, 0, :, :], + blocked_key_matrix[:, :, -3, :, :], + blocked_key_matrix[:, :, -2, :, :], + blocked_key_matrix[:, :, -1, :, :], # + gathered_key[:, :, -1, :, :]), + 2) + second_last_value_mat = torch.cat( + (blocked_value_matrix[:, :, 0, :, :], + blocked_value_matrix[:, :, -3, :, :], + blocked_value_matrix[:, :, -2, :, :], + blocked_value_matrix[:, :, -1, :, :], + gathered_value[:, :, -1, :, :]), 2) + second_last_product = torch.einsum( + "bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -2, :, :], + second_last_key_mat) + second_last_seq_pad = torch.cat( + (to_mask[:, :, :, :wn], to_mask[:, :, :, -3 * wn:], + torch.ones(b, 1, 1, r * wn, device=device).long()), 3) + second_last_rand_pad = torch.cat((torch.ones( + b, h, wm, 4 * wn, device=device).long(), rand_mask[:, :, -1]), 3) + second_last_product = second_last_product * (1.0 / np.sqrt(d)) + second_last_product += (1.0 - torch.minimum( + second_last_seq_pad, second_last_rand_pad)) * -10000.0 + second_last_attn_weights = F.softmax(second_last_product, -1) + second_last_context_layer = torch.einsum( + "bhqk,bhkd->bhqd", second_last_attn_weights, second_last_value_mat) + second_last_context_layer = torch.unsqueeze(second_last_context_layer, + 2) + + # ========== Compute the last component ============== + # pure global attention + # ==================================================== + last_product = torch.einsum( + "bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -1, :, :], key_layer) + last_product = last_product * (1.0 / np.sqrt(d)) + last_product += (1.0 - to_mask) * -10000.0 + last_attn_weights = F.softmax(last_product, -1) + last_context_layer = torch.einsum("bhqk,bhkd->bhqd", last_attn_weights, + value_layer) + last_context_layer = torch.unsqueeze(last_context_layer, 2) + + #=========================== Adjust layout ============================= + context_layer = torch.cat( + (first_context_layer, second_context_layer, context_layer, + second_last_context_layer, last_context_layer), 2) + context_layer = context_layer.view((b, h, m, -1)) * from_mask + context_layer = context_layer.permute(0, 2, 1, 3) + + return context_layer + + def forward(self, + query_layer: Tensor, + key_layer: Tensor, + value_layer: Tensor, + band_mask: Tensor, + from_mask: Tensor, + to_mask: Tensor, + from_blocked_mask: Tensor, + to_blocked_mask: Tensor, + batch_size: int, + from_seq_length: int, + 
to_seq_length: int, + output_file=None, + plan_from_length=None, + plan_num_rand_blocks=None): + """BigBird attention sparse calculation using blocks in linear time. + + Assumes from_seq_length//from_block_size == to_seq_length//to_block_size. + + Args: + query_layer: float Tensor of shape [batch_size, num_attention_heads, + from_seq_length, size_per_head] + key_layer: float Tensor of shape [batch_size, num_attention_heads, + to_seq_length, size_per_head] + value_layer: float Tensor of shape [batch_size, num_attention_heads, + to_seq_length, size_per_head] + band_mask: float32 Tensor of shape: + [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, 3*to_block_size]. + The values should be 1 or 0. The attention scores will + effectively be set to -infinity for any positions in the mask that are 0, and will be + unchanged for positions that are 1. + from_mask: float32 Tensor of shape: + [batch_size, 1, from_seq_length, 1]. + The values should be 1 or 0. The attention scores will effectively + be set to -infinity for any positions in the mask that are 0, and will be unchanged for positions that are 1. + to_mask: float32 Tensor of shape: + [batch_size, 1, 1, to_seq_length]. + The values should be 1 or 0. The attention scores will effectively + be set to -infinity for any positions in the mask that are 0, + and will be unchanged for positions that are 1. + from_blocked_mask: float32 Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + Same as from_mask, just reshaped. + to_blocked_mask: float32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + Same as to_mask, just reshaped. + rand_attn: int32 Tensor of shape: + [num_attention_heads, from_seq_length//from_block_size-2, num_rand_blocks], + specifying which blocks to attend to for each from sequence block (except 2 global ones). + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + num_rand_blocks: int. Number of random chunks per row. + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + + Returns: + float Tensor of shape: + [batch_size, from_seq_length, num_attention_heads, size_per_head]. + """ + + # runtime error check + if (from_seq_length // self.from_block_size != + to_seq_length // self.to_block_size): + raise ValueError("Error! 
The number of blocks needs to be same!") + + # cast masks to float + from_mask = from_mask.float() + to_mask = to_mask.float() + band_mask = band_mask.float() + from_blocked_mask = from_blocked_mask.float() + to_blocked_mask = to_blocked_mask.float() + + # generate random attention and corresponding masks + np.random.seed(self.seed) + if from_seq_length in [1024, 3072, 4096]: # old plans used in paper + rand_attn = [ + bigbird_block_rand_mask( + MAX_SEQ_LEN, + MAX_SEQ_LEN, + self.from_block_size, + self.to_block_size, + self.num_rand_blocks, + last_idx=1024)[:( + from_seq_length // self.from_block_size - 2)] + for _ in range(self.num_attention_heads) + ] + else: + raise NotImplementedError() + + rand_attn = np.stack(rand_attn, axis=0) + rand_attn = torch.from_numpy(rand_attn).long() + rand_attn = torch.unsqueeze(rand_attn, 0) + """`rand_attn` has a shape of: [ + batch_size, + attn_head_num, + num_blocks - global_attn_num, + rand_attn_num + ] + dtype = torch.int64 + """ + rand_attn = torch.repeat_interleave(rand_attn, batch_size, 0) + + rand_mask = create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + self.num_attention_heads, + self.num_rand_blocks, + batch_size, + from_seq_length, + self.from_block_size, + ) + + # warmup execution + for i in range(5): + self._attn_func(query_layer, key_layer, value_layer, rand_attn, + from_mask, to_mask, rand_mask, band_mask, + batch_size, from_seq_length, to_seq_length) + + #============== Core computation begins from here ===================== + dir_name = 'bs%d_seq%d_block%d' % (batch_size, from_seq_length, + self.from_block_size) + save_path = os.path.join('log', dir_name) + + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + profile_memory=True, + record_shapes=True, + schedule=torch.profiler.schedule( + wait=1, warmup=2, active=5, repeat=10), + on_trace_ready=torch.profiler.tensorboard_trace_handler( + save_path), + with_stack=True) as prof: + for _ in range(3): + self._attn_func(query_layer, key_layer, value_layer, rand_attn, + from_mask, to_mask, rand_mask, band_mask, + batch_size, from_seq_length, to_seq_length) + prof.step() + + key_averages = prof.key_averages() + total_cpu_time = 0 + total_cuda_time = 0 + + for avg in key_averages: + total_cpu_time += avg.self_cpu_time_total + total_cuda_time += avg.self_cuda_time_total + + # print(prof.key_averages().table(sort_by='cuda_time_total')) + + print( + f"block_size: {self.to_block_size}, seq_length: {to_seq_length}, batch_size: {batch_size}, " + f"hidden_size: {self.size_per_head}, PyTorch(ms): {total_cuda_time / 1000}ms" + ) + cmd_args = types.SimpleNamespace( + seq_len=to_seq_length, + batch_size=batch_size, + hidden_size=self.size_per_head, + block_size=self.to_block_size) + output_file_func(output_file, cmd_args, total_cuda_time / 1000) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/attention.pptx b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/attention.pptx new file mode 100755 index 000000000..5a99cd38a Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/attention.pptx differ diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/bar_plot.m b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/bar_plot.m new file mode 100644 index 000000000..7277b2597 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/bar_plot.m 
@@ -0,0 +1,61 @@ +clc; +clear; +close all; + +%% + +filename = 'bigbird_compute_vs_noncompute.tsv'; +data = tdfread(filename, '\t'); + +%% +X = 1:9; +A = cat(1, data.compute', data.non_compute'); + +map = addcolorplus(313); +num = size(A,1); + +idx = linspace(40,55,num); +idx = round(idx); +C = map(idx,:); + +%% +figureUnits = 'centimeters'; +figureWidth = 20; +figureHeight = 10; + +%% +figureHandle = figure; +set(gcf, 'Units', figureUnits, 'Position', [0 0 figureWidth figureHeight]); +hold on + +%% +GO = barh(X, A', 0.8,'stacked','EdgeColor','k'); + +%% +GO(1).FaceColor = C(1,:); +GO(2).FaceColor = C(2,:); + +YTickLabel= {'1024,32' '1024,64' '1024,128'... + '3072,32' '3072,64' '3072,128' '4096,32' '4096,64' '4096,128'}; + +set(gca, 'Box', 'on', ... + 'XGrid', 'on', 'YGrid', 'off', ... + 'TickDir', 'in', 'TickLength', [.01 .01], ... + 'XMinorTick', 'off', 'YMinorTick', 'off', ... + 'XColor', [.1 .1 .1], 'YColor', [.1 .1 .1],... + 'XTick',0:20:90,... + 'YTick',1:9,... + 'Xlim' ,[0 90],... + 'Ylim' , [0.2 9.8],... + 'Yticklabel',YTickLabel,... + 'Xticklabel',{0:20:90}) + +xlabel('Duration(ms)'); +hLegend = legend([GO(1),GO(2)], ... + 'Compute', 'Access', ... + 'Location', 'southeast','Orientation','vertical', 'FontSize', 16); +hLegend.ItemTokenSize = [5 5]; +legend('boxoff'); + +set(gca, 'FontSize', 14) +set(gca,'Color',[1 1 1]) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/bigbird_compute_vs_noncompute.tsv b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/bigbird_compute_vs_noncompute.tsv new file mode 100644 index 000000000..6351c1b80 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/bigbird_compute_vs_noncompute.tsv @@ -0,0 +1,10 @@ +test_name compute non_compute total +1024,32 6.52600 1.98000 8.50600 +1024,64 4.65600 1.68700 6.34300 +1024,128 9.33500 3.45400 12.78900 +3072,32 25.09100 17.60500 42.69600 +3072,64 30.45400 17.37100 47.82500 +3072,128 40.13800 19.71000 59.84800 +4096,32 35.16300 25.56000 60.72300 +4096,64 42.86000 25.49800 68.35800 +4096,128 57.64700 27.09400 84.74100 diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/compute_kernels.tsv b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/compute_kernels.tsv new file mode 100644 index 000000000..b1cab318d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/compute_kernels.tsv @@ -0,0 +1,11 @@ +volta_sgemm_128x64_tn +volta_sgemm_128x64_nn +void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array) +void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array) +void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulFunctor, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast) +void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int) +void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, 
at::native::memory::StoreWithoutCast>(int, at::native::AddFunctor, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast) +void at::native::unrolled_elementwise_kernel >, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::AUnaryFunctor >, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast) +void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int) +void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, at::detail::Array >(int, at::native::AUnaryFunctor >, at::detail::Array) +void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/noncompute_kernels.tsv b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/noncompute_kernels.tsv new file mode 100644 index 000000000..64591ff19 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/noncompute_kernels.tsv @@ -0,0 +1,5 @@ +void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast) +void at::native::unrolled_elementwise_kernel, OffsetCalculator<2, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast>(int, at::native::minimum_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array, OffsetCalculator<2, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast) +void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithCast<1>, at::detail::Array::StoreWithCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithCast<1>, at::detail::Array::StoreWithCast) +void at::native::unrolled_elementwise_kernel, TrivialOffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithCast<1>, at::detail::Array::StoreWithCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#12}::operator()() const::{lambda(long)#1}, at::detail::Array, TrivialOffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithCast<1>, at::detail::Array::StoreWithCast) +void at::native::(anonymous namespace)::CatArrayBatchedCopy(float*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int) diff --git 
a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/plot.ipynb b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/plot.ipynb new file mode 100644 index 000000000..7fc01f4b2 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/figures/plot.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a0297c15-fb8c-4f40-aca4-7a6863dfc2ec", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "\n", + "sns.set_theme(style='white', palette=None)\n", + "\n", + "import matplotlib as mpl\n", + "mpl.rcParams['font.family'] = 'Times New Roman'\n", + "# mpl.rcParams['font.weight'] = 'bold'\n", + "# mpl.rcParams['savefig.dpi'] = 300" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a3388cb3-7a7a-4fb4-9c36-b308415eec4b", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_table(\n", + " 'bigbird_compute_vs_noncompute.tsv',\n", + " header=0,\n", + " sep='\\t',\n", + " usecols=[\n", + " 'test_name', 'compute', 'non_compute','total'\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e0fdcdb8-b732-414a-a44a-8602824897b0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxAAAAHkCAYAAACuZcnbAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAACx+UlEQVR4nOzdd1gUV9sG8HspKyAooARJLKiJirEFjQF7bFFji4q9+9q7IsWokdiw9xJjC3ZJRNQooKLYRWzY0KioKIhUpQnsMt8ffDtxs7uwLCgg9++6uN51zpk5z+wrZp45TSIIggAiIiIiIiIt6BV2AEREREREVHwwgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgSAiIiIiIq0xgaACI88SivT1PpYsoXjGTURERKQNiSDwaYcKjodfHJ7Gy/J9HVtLA/zSoRwCniQh8Z28ACL7OCyM9NGumllhh0FERET0wRgUdgD0aXkaL8PDmMwCu17iOzliU4tPAkFERET0qeMQJiIiIiIi0hoTCCIiIiIi0hoTCCIiIiIi0hoTCCIiIiIi0hoTCCIiIiIi0hoTCCIiIiIi0hqXcSUiIiIlWVlZyMjIQFZWVmGHQlQo9PT0IJVKoafHd+3qMIEgIiIiAEB8fDzi4+ORlJTE5IFKPD09PZiZmcHS0hKWlpaFHU6RUmISiDNnzmDTpk3o3bs3evToobHe3bt3sXr1ajx69Aj6+vpo3749Jk6cCCMjI5W6giBg//792LNnD8LDw2FsbIyGDRti3LhxqFu3rlZxyeVy9OvXD7GxsQgMDMzTPT1+/BgrVqzAjRs3kJaWhjp16mDatGn45ptv1NbPzMzEtm3b4OPjgxcvXqBMmTJwdHTEhAkTULVq1Ty1TUREnw5BEBAREYGYmBiYmprCxsYGJiYm0NfXh0QiKezwiD4qQRAgl8uRmpqKN2/eIDw8HMnJyahUqRJ/H/7fJ59AHDt2DNu3b0doaCgAoHfv3hrrBgYGYvLkyZg2bRo2b96MpKQkjBo1CkOHDsW2bdtgYmKiVH/OnDk4cOAAAEBfXx9v3rxBYGAgzp07hxUrVqB9+/a5xvfbb7/h1q1b+OKLL/J0Xw8ePEC/fv1gYGAAAwMDpKamIjg4GEOGDMH+/fthZ2enVF8ul2PcuHE4e/as2B0XFxeHo0eP4vTp09iyZQvs7e3zFAMREX0aYmJiEBMTg8qVK8PKyqqwwyEqEsqUKYMKFSogJiYGz58/R2ZmJqpXr17YYRUJn/zArrp162L37t2wtbXNsV5UVBScnZ3h4OCAYcOGAQDMzMywYMEC3Lx5E0uWLFGqHxQUhBMnTmDx4sW4du0aQkNDsX79elhaWiIzMxPu7u6Ij4/Psc379+/jjz/+yPM9ZWVlwcPDA4sWLcKVK1dw8eJF7N69G5aWlkhPT8f69etVztm/fz+ePHmC9evX48aNG7h16xYWLlwIExMTpKSkwNnZGRkZGXmOhYiIijdBEBATEwNzc3MmD0RqWFlZoWzZsoiKisK9e/cKO5wi4ZNPICpVqgSpVKryRv6/1q1bh5SUFJXhTdWqVUPdunWxb98+PH78WDzu4+OD7du3o3v37jA1NYWBgQHatm2LFStWAACSk5NzHJKUkZEBFxcXTJ8+Pc/3FBoaiilTpuCHH34Qu9IaNWokXis8PFzlnICAAOzcuRNt27aFkZERpFIpevbsiZ9//hkA8PLlS1y7di3PsRARUfGWmZmJd+/ecYw3UQ7KlSsHY2NjBAUFqX3OKmk++QRCoVSpUhrLMjMz4efnBwBqh/HUr18fgiDA29tbPNaoUSO1SYmjoyNq164NADn2QKxatQqNGzdGkyZNtL4HhQYNGqBx48ZqjwNArVq1lI5nZmaiY8eO+Pzzz1XO6dWrF8zNzQEACQkJeY6FiIiKN5lMBgCQSqWFHAlR0aX4/ZDL5Xj48GEhR1P4SkwCkdOkl5CQECQnJ0MqlcLa2lqlvGbNmgCAK1euiMcGDhyo8XpVqlQBAI3
zGkJCQhAUFIQZM2ZoFbu2YmJiIJVKMWrUKKXjhoaG6NOnj8bzKleuDABqEwwiIioZODmUSDPF70fp0qXx+PFjpKWlFXJEhavEJBA5uX//PgCoTR6A7LkQAPDw4UPI5fJcr5eQkACpVIrmzZurlKWkpGDWrFlYtGiR2pWd8mPv3r1YsGCBmPBoKyEhAVZWVqhXr16BxkNERET0KTEyMsK7d++QnJxc2KEUKiYQ+HeoUZkyZdSWKxIImUyGpKSkHK+VlpaGmzdvwsnJSe31Fi9ejM6dOxfow/qbN2/g7u6O69ev5zhUS52IiAhERERg2LBh3CyFiIiIKAcSiQRZWVnIzMws7FAKFZ8YASQmJgKAxh6B9x+s09PTc7yWt7c3SpcujUmTJqmUBQUF4f79+xgzZozuwf7Hjh070LNnTxw8eBAxMTGYNGkSFixYoPX5u3btwpdffolBgwYVWExEREREnyIO9cvGBALZcwSA7KXs1Hl/eVPFhGN1EhISsGnTJnh6eqrUS0xMxIIFC7B48WIYGBTc9htDhw7FiRMnsHv3bnEDOS8vL1y4cCHXc8PDw3Hw4EEsX76ck+eIiIiISCtMIACUL18eADROiFEMWzIxMclxiNDs2bMxYsQItGjRQqXMw8MDgwcPRrVq1QogYmUSiQSNGjXCzp07xZWYjh49muM5GRkZcHNzg4eHh8qqTUREREREmnzyO1FrQzHp+PXr12rLY2Njleqps2nTJtjY2GDEiBEqZVFRUTh27BiOHTuGefPmqT3/5cuX4vVPnTqFihUr5ukegOyelHHjxmHUqFEa70Vh3rx56NixIzp16pTndoiIqOTKEgTofcLDOD72/YWFheHQoUMIDg7Gs2fPoKenB0tLS1SvXh2dO3dGmzZtIJPJ0LNnT3HJeaLCxgQCgIODAwwNDREXF4f4+HiVzXSeP38OAGjZsqXa8w8dOoTw8HB4enqqLdfT00PVqlXVlslkMkRERMDAwACVKlUC8O+QKl00bNgQAPDZZ59prLNhwwZYWlpi6NChOrdDREQlk55EghNPkpDwLvdVCYsbCyN9tKtm9lHaio+Px4IFC/D333+jQYMGGD16NL799ltYWloiMzMToaGh2L17Nzw8PGBiYpLrIi5EHxMTCACmpqbo1KkTfH19ERISgvbt2yuV37x5E3p6eujYsaPKuQEBAQgMDMSKFStUJtbI5XK8fv0aNjY2Gt8avHjxAm3atIG1tXWBvFlQLCvWpk0bteU7d+5EXFwcZs+erVKWmpqKtLQ0lCtXLt9xEBHRpyvhnRyxqZ9eAvGxPH/+HMOGDcOLFy8wbtw4TJ48Wanc0NAQDRs2RMOGDeHr6wt3d3eYmJgUUrRFl7e3NypXrozvvvuusEMpcUrMHAjFTpua9nGYMGECTExMcOjQIaXjDx8+xN27d+Hk5ARbW1ulspMnT8LHxwfLli1TmRgdExMDNzc3RERE6Bzz48eP0bNnTwwePFjr6xw6dAiNGzdWm0Ds2rULDx48wKxZs1TKIiIiMHHixBK/MQoREdGHlJSUhJEjR+LFixdo166dSvLwX926dcPo0aM/UnTFh0wmw9atWws7jBKrRPRAvHv3Dg8ePAAA3Lp1C05OTip1KleuDA8PD7i7u8PX1xfdunVDZGQkZsyYAXt7e7i7uyvVP3z4sPhG4L8bxmVmZiIlJQU2NjZYsmSJznHv2bMHd+7cAQDMnTtX/EVZv349duzYgbZt2+J///sfqlevDplMhgMHDuDq1atYt26dSm/I5s2bsXz5cpibm+PEiRNKZRkZGUhNTYW9vb1Ocy+IiIhIO8uXL8fTp0+hp6cHFxcXrc4ZPXo0jh8//oEjK17WrFmD8PDwwg6jxPrkE4ipU6fi9OnT4pt1b29vnDhxAlOmTEG/fv2U6nbt2hXlypXD2rVrsWbNGhgbG6NHjx4YOHCg0jKnZ86cgYuLCwRBwNu3bzW2/eOPP+ZrvWAHBwccOnQIX331Fa5fvy4eb9iwIfz8/HD06FEcOXIE1atXR7Vq1dCuXTts2bJFpc09e/Zg+fLlAP7d80KdLl266BwrERER5SwyMhLe3t4AgLp166Jy5cpanWdkZIQePXp8yNCKld9//x2//fZbYYdRon3yCcTKlSvzVL9p06Zo2rRpjnVatWqFsLCw/IQlqlixotg78l/t2rVDu3bt8PbtW3Tv3l087uDggCNHjmjdRv/+/dG/f//8hkpERET5cOjQIXFIdbNmzfJ0rqb/jt+9exc7duzAjRs3EBcXB3Nzczg6OmLkyJFqF3CJiIjAjh078Ndff+HmzZvIysrC7t27sWPHDsTHx6Nhw4aYM2eOmNxcvHgRGzduxJ07d2Bubo7hw4erbD4bGxuLAwcO4MCBA1i8eDEaNmyI3377DX/++Sfi4uJQq1YtjB8/XlyMJi4uDi1btlTazdnLy0ucyzB37lzs3btXLPvpp5/EhWomT56stNfVmDFjoK+vDwD4888/lYabBwUFwcvLCw8ePMCbN29QpUoVdOvWDUOHDs3XgjVUguZAFGenTp1i7wAREVExd+nSJfFz7dq183SuqampyrHdu3ejX79+qFevHo4dO4YrV65g3LhxOHLkCLp27aq0OMvr168xZcoU/PDDD9i1axfS0tKQkZGBUaNGYfXq1UhOTkZqairOnTuH//3vf8jIyMDevXsxcuRIREREIDMzE5GRkZg/f774ElMul2PevHno0KEDVq9ejaioKGRlZWHy5MnYtm0b5HI50tPTcevWLYwePRqHDx8GAJQrVw63b9/Gr7/+qvZe586di6CgINjY2KiUrV69GiEhIeKfN23ahJCQEISEhCglDwsWLMD27dvx888/4/z58/D29oaenh6WLVuGMWPGiIkc6YYJRBEXGhqK48ePcwIVERFRMffo0SPxs4WFRb6udenSJcybNw+DBg3CoEGDIJVKIZVK4eTkhFmzZiEjIwPTp08X51Kam5tj7ty5SvtVLViwAF27dsWVK1dw5coVcZGVZ8+ewcPDA8HBwTh58iTOnDmDS5cuoV69egCyV3QEAH19fcyePRt79uwRr/nHH3+gbt26uHTpEs6ePYsdO3bAzMwMgiDgl19+QVxcHIDsTXBzGpZVoUIF2Nvb6/Td7N27F6dPn8aGDRvEDXxr1aqF9evXQyKR4Pz58+I9kG6YQBRhPj4+uHjxItatW8fl24iIiIq59+dN5ieBEAQBc+fOhSAI6Nmzp0p57969UbVqVchkMixatAgAIJVKYW5uLu4XBWQP/+natas4BGjgwIHiPlJyuRwrV64UewHMzMwwYMAAANmrRL6vevXq4uf69etjzJgx4txRR0dHcen41NRU/PXXX2Ld3IYRvT//VFsZGRlYv349OnbsqPLsVKlSJXGpel9f3zxfm/7FBKII++mnn5R+CYmIiKj4ev+/5/kZQnP16lU8ffoUpUqVEt+wv08ikeDHH38EAISEhODly5diWalSpcTP/x0iJJFI8MUXXwCA+L/vs7KyApCdCLxPkYAAUNtr0LlzZzExuXLlSs43l0
+3bt1CTEwMvLy80KhRI5WfpKQkSKVSREZGftA4PnWf/CRqIiIioqLgs88+w9OnTwFAHMqji6tXrwIAypQpo7GOYrgRANy/f19MCPT0cn53nFOvgGLPq6ysLK1jBbITjG+//RZ///03Xr16ladz8+rhw4cAADc3N5XVNqngsAeCiIiI6COoX7+++PnZs2c6Xyc6OhpA9j5Xmih6C4DsYT2FzdraGkD28KsPSTFMTPEd0YfBHggqULaWBfNXSnEdcyP9XGoWLRbFLF4iIvp4fvjhB3HsfVBQEPr27avTdRRj+5OSkvDmzRuULVtWpY6RkZH4Wd1qRh+bYviWYg7Ch2JsbAwACA4OzrGeXC5XGnpFecMEggqMPEvALx0K7h8GeZaA9tXMCux6H0uWIEAvHxsIEhHRp+n777/Hl19+iUePHuHixYt49eoVKlSooNW5UVFR+Oeff9CiRQvUqVNHPH7t2jW0bt1apX5KSgqA7MnPX3/9dcHcgBY09TDExsYCAL755hul4/r6+pDL5TkOi8pLr4Vi74tr164hNDRUaSjX+2bNmiVOMKe84xAmKjD6egX70FzQ1/tYmDwQEZE6enp6mDdvHgwNDfHu3TvMnz9fq/NSU1OxbNkycQWl1q1bi/MfDh06pPYcxUpJnTt3/qiLsaSlpak9fuPGDUgkEnTr1k3peOnSpQH8m2AoZGVlifMl3t9wTkHy//+tTU9PVzreqFEjmJllv3ycPn262jkX/v7+7H3IpwJJIFJTU3H16lUEBAQoHb9582ZBXJ6IiIjok2Bvb48lS5ZAT08PJ06cwKxZs3Jckent27eYOXMmxo0bJz5sm5iYYNq0aQCAEydO4Pr16yrnHTlyBBYWFpg0aZLS8ffnTaibG6GIRd1D+/u9BJrmX6gbOhQUFITHjx+jV69eSku+AhBXkdq7dy/evHkDAHj69CkmTJggTjR//PgxBEFQal/xXSgmlPv5+SEwMBClS5fGyJEjAQDPnz9H9+7dsW3bNty+fRshISFYtmwZ5syZgzFjxqiNn7STrwQiOTkZs2bNgoODAwYPHowZM2YolQcHB2PKlClITk7OV5BEREREn4pOnTphy5YtqFixIry9vdGtWzfs2bMHL1++hFwuh1wux8uXL7Ft2zbMnDkTU6dOVXnw7tevH4YPH46srCyMGzcOp0+fRlZWFpKTk7F48WKEhYXht99+g6WlpXiOXC5X2p3a399f6ZphYWEICwsDkP0Ml5SUJJZlZmbi1KlT4p///vtvtfe2d+9e/PHHH0hLS4MgCDh//jzc3NzQuHFj/Pzzzyr1e/XqBSB7yFHz5s3RqlUr/PTTT+jfv7849CosLAxdu3bF0aNHxfMaNGgAANi8eTNatmyJHTt2oFWrVgCAkSNHomvXrgCAhIQELF68GL169cKAAQOwY8cO/Prrr6hYsaLa+Ek7EkHH6fDJycno378//vnnH3FsWqlSpXDr1i2lemvWrEFQUBD27NmjtPYwERERFb7U1FTcv38fdnZ2Wm9aeuJJEhLeyT9wZB+fhZE+2n3EuXdpaWnw9/dHQEAA/vnnH8TExEAikcDKygp2dnb48ccf0bZt2xyXXg0KCoKXlxfu3LmDrKwsVKhQAa1bt8bAgQOVVmKKjIxE+/btVXoWSpUqhdDQUIwYMQLnz59XKpNIJBg3bhyaNGmCIUOGqPSU2NraiklIzZo1AQBLly7FmTNncOHCBcjlctjY2KB79+7ibtnqbNu2DTt27EBSUhIaNmwIZ2dn1KpVC+7u7nj8+DEGDRqEDh06KC0xGxERgRkzZuDBgwdo3rw55s6dq5QsCYIAX19f7N27Fw8ePICenh7s7e0xYcIEMfnIC8XvyaNHjxAREYH+/fvj888/z/N1PhU6JxBLlizBtm3bAABly5aFiYkJEhISVIYtRUdHo2XLlnB1dcWwYcPyHTAREREVnLwmEJ/6QhGf+v19KIoEwsvLC999910hR1PwmEAo03kI0/HjxzFw4EBcunQJV65cwenTp9VmlorxdT4+PrpHSUREREXCp/5w/anfH1FB0HkZ18TERLi5uYm7Emryxx9/AMjuaiIiIiIiouJN5x6Izz77DHv37tVYHh8fj19++QW7d+8GoLyhCRERERF9Gt5fkSmn3bHp06FzD0T79u2xcOFC7Ny5Ew0bNoS1tTXS0tIwffp0vHz5Enfv3oVMJoMgCJBIJBo38qBPhzxLyPfeDQVxDV1wzCsREVHeCYKAI0eOiH/++++/4ejo+FH3nqCPT+cEYvTo0fj777/x/PlzpeFJx44dAwAxcVDgBOpPn76eBB5+cXgar3k965zYWhrglw7lEPAkCYkfcXWPj73qBhER0afg6dOn6NKli9J+Er6+vjh69CiWLVuGTp06FWJ09CHpnECYmppiy5YtGD16tJhAvJ8wSCQSMYlwc3ODg4ND/qOlIu9pvAwPY1Q3n8mLxHdyxKZ+essDEhERfUpsbW1x+/btwg6DCkG+NpKrVq0aDh06hMmTJ+PLL78EkN3zIAgCzMzM0LFjR3h7e2PIkCEFEiwRERERERUunXsgFEqXLo2xY8di7NixSE9Px5s3b2BoaAgLC4uCiI+IiIiIiIoQnXsgwsPDVY6VKlUKn332GSwsLFR2KyQiIiIiouJP5wTixx9/xJw5c5CQkKC2/OnTp3B2dkaLFi3QpUsXTqImIiIiIvoE6JxAZGVlwdvbGx06dMDu3bshCIJS+Zdffolly5ZhxYoVePnyJS5fvpzvYImIiIiIqHDlaxI1ALx58wbz589Hjx49cO3aNZXyRo0aYcyYMflthoiIiIiIioB8JxCK5Vrv37+PgQMHYsaMGXj9+rVSnXbt2uW3GSIiIiIiKgJ0TiCMjIzQqlUrca8HRSJx9OhRdOzYEVu3bhUnUlepUgX6+voFFrQuzpw5g759++LgwYM51rt79y5GjRqF1q1bo127dli6dKnGbdkFQcC+ffvQtWtX1K1bF40bN8bYsWPztCayXC5H79690bp16zzdjzrBwcGYPXs2Jk6ciMWLF2s1bGzXrl2oWbMmrly5ku/2iYiIiOjTp3MCIZVKsXHjRvz222+oXLmy0s7TKSkpWLZsGbp06YILFy5AT08PpUqVKrCg8+LYsWNwcnLC6NGjcePGjRzrBgYGom/fvnB0dERgYCAOHjyI69evY+jQoUhNTVWpP2fOHPzyyy948OAB5HI53rx5g8DAQPTr1w8BAQFaxffbb7/h1q1bOt2bQlxcHMaMGYN58+bhp59+wtq1a+Hq6prr5n3h4eFYtmxZvtomIiIiopIlX5OoAaBly5Y4evQopk6dCiMjIzGREAQB4eHh+N///ofx48eL9T+2unXrYvfu3bC1tc2xXlRUFJydneHg4CCuGGVmZoYFCxbg5s2bWLJkiVL9oKAgnDhxAosXL8a1a9cQGhqK9evXw9LSEpmZmXB3d0d8fHyObd6/fx9//PFHvu4vPDwcTk5OEAQBBw4cgL29vVbnyeVyuLm5Ke0eTkRERESUG50TiLS0NHGIkqGhIUaPHg0/P
z907NhRZVjTqVOnNA4D+tAqVaoEqVQKOzu7HOutW7cOKSkp6NGjh9LxatWqoW7duti3bx8eP34sHvfx8cH27dvRvXt3mJqawsDAAG3btsWKFSsAAMnJyQgMDNTYXkZGBlxcXDB9+nSd7y0mJgYjRoyAubk51qxZA2NjY63P3bx5MypVqoQ6dero3D4RERERlTw6JxByuRwnTpxQOmZtbY2VK1fijz/+wJdffqk0rKmw5TSEKjMzE35+fgCg9g1+/fr1IQgCvL29xWONGjVSm5Q4Ojqidu3aAJBjD8SqVavQuHFjNGnSROt7eJ8gCJg4cSKioqKwePHiPA0Ru3//Pv766y/MmTNHp7aJiIiIqOTK1ypMrq6uWLBgAcLCwpSOf/fddzh06BDc3d1hamqarwALSk6JTEhICJKTkyGVSmFtba1SXrNmTQBQmmg8cOBAjderUqUKAOCLL77Q2F5QUBBmzJihVezqHDp0CDdu3ECHDh3w1VdfaX1eRkYGXF1dMW/ePJQpU0bn9omIiIioZDLQ9cRFixaJnyMjI1GrVi2lcn19fQwZMgRdunTB0qVLcejQIZ2D/NDu378PAGqTByB7LgQAPHz4EHK5PNcVpRISEiCVStG8eXOVspSUFMyaNQtLliyBkZGRzjFv3LgRANCkSROsXLkSd+7cwePHj2FpaQknJyf07dtXbdK0evVqfPfdd3B0dNS5bSIiKrnkWQL09YrG6IIP4VO/P6KCoHMC8dNPP2lVz9LSEosWLVKZW1CUKIYaaXojr0ggZDIZkpKSYG5urvFaaWlpuHnzJpycnNReb/HixejcuTPq1aunc7z379/Hs2fPIJFIcOPGDYwcORJTp07F48ePMWXKFMydOxdhYWHw8PBQOu/atWs4e/Ys/vzzT53bJiKikk1fTwIPvzg8jZcVdigFztbSAL90KFdo7a9btw5WVlbo06dPocVApA2dE4i8SE9Px7Bhw3Dnzp2P0VyeJSYmAoDGHgE9vX9HeqWnp+d4LW9vb5QuXRqTJk1SKQsKCsL9+/fzPfcgODgYQPbQqoULF4rHq1evjjVr1qBTp07Yt28fWrdujZYtWwIAUlNTMWvWLCxdurTQltQlIqJPw9N4GR7GZBZ2GJ+UjIwM7Nu3D5aWlkwgqMjTKoGIjo7GqlWrIAgCJk+eDBsbG6xbt06rBtLT03Hjxg3I5fJ8BfohGRoaAsiemKxORkaG+Dmn3oeEhARs2rQJnp6eKvUSExOxYMECbNq0CQYG+cvbXr16BUD9kKuqVavC0dERFy5cwF9//SUmEJ6enujcuTNXXSIiIiqCjh07hpiYGMTExODq1av49ttvCzskIo20epKdOnWquAnb8+fPsWfPHqxbt07rFZaK0mpM6pQvXx5A9vAjdZKSkgAAJiYmOb69nz17NkaMGIEWLVqolHl4eGDw4MGoVq1avuNNSUkBAI0T1Fu2bIkLFy6Iy86ePXsWYWFh+OWXX/LdNhERERW8nTt3ip937drFBIKKNK0SCMUqS4IgqKy49ClQrLL0+vVrteWxsbFK9dTZtGkTbGxsMGLECJWyqKgoHDt2DMeOHcO8efPUnv/y5Uvx+qdOnULFihU1tmVhYQEge68JdSpUqADg3x6VrVu34tatW+LysuoMHjwYADBhwgRMnDhRYz0iIiIqWNevX8edO3fw5Zdf4tGjRzh58iSio6M1Lu5CVNi0SiBat26No0ePAgC+//578bimIT/FjYODAwwNDREXF4f4+HhYWloqlT9//hwAxOFA/3Xo0CGEh4fD09NTbbmenh6qVq2qtkwmkyEiIgIGBgaoVKkSgH+HVGlSt25dAMCjR4/Ulit6SRS7b1tbW2tsPyoqCu/evYONjQ2MjIzE5ISIiIg+jp07d8LBwQEDBw7EhAkTIJPJsHfvXkyZMqWwQyNSS6sEwtPTE46OjhAEAd27dxePDxs2DE2aNMlxWE96ejrOnj2LXbt25TvYD8XU1BSdOnWCr68vQkJC0L59e6XymzdvQk9PDx07dlQ5NyAgAIGBgVixYoXKMC25XI7Xr1/DxsZG3Kjuv168eIE2bdrA2tpaY53/atKkCczNzfHy5Us8evQIX375pco1AYj3sWTJEo3XGjRoEIKDg7F48WJ89913WrVPREREBSM6OhoBAQFYuXIlvv/+e3z22Wd4/fo1vL29MW7cOEil0hzPz8rKgo+PD/7880+Eh4dDLpejevXqGDJkiNrnFiB7ZMnvv/+Oq1evIikpCdbW1mjTpg1Gjx6tdgXJqKgo/Pbbb7h48SKioqJQpkwZNGvWDBMmTBBffiokJiZi2bJlCAwMRHp6OmxtbdGsWTPcuHED69atU7r+y5cvsXTpUly6dAlyuRw1a9ZEkyZNcPnyZaUhXVT0aLWRnIGBAXr27IlevXqJE4Dt7Ozg6uqK5s2bo3Hjxhp/mjdvDnd39xyH/3wMMln2cnOaJnNPmDABJiYmKvtVPHz4EHfv3oWTk5P4Rl/h5MmT8PHxwbJly1QmRsfExMDNzQ0RERE6x/z48WP07NkTgwcPVrqOiYkJxo8fDwDYsmWLynk+Pj6oUaMGOnfurHPbRERE9OHt2bMH5cuXR5s2bWBgYAAnJycA2cOnjx8/nuO5KSkpGDlyJLZs2QI3NzdcvHgRPj4+eP36NaZMmaKynLuivb59+8Le3h7Hjx/HlStX0LhxY2zZsgU//fSTOGxbITg4GIMHD0aTJk1w9OhRXLhwAZ06dcKhQ4fQs2dPpaHtGRkZGDx4MJKSkvD3338jJCQE8+fPx9WrV5U24wWyF57p378/ypcvj9OnT+PKlSuYOnUqjh07Ju7PRUWXzjtR+/j4aN+Inl6hbiT37t07PHjwAABw69YttXUqV64MDw8PBAUFwdfXF0D2BnkzZsyAvb093N3dleofPnwYkydPRkhICJo3b47vvvtO/LG3t0ezZs3yvYrCnj17cOfOHVy5cgVz585VKhs8eDC6desGHx8feHl5QRAEyGQyrFixAq9evcL69evzvdoTERERfTjp6enYv38/+vTpI25S27t3b/Hz7t27czzf3d0dISEh2Lp1K+rXrw89PT1UrFgRP/74I4Ds54gnT56I9QMDA/Hrr79iwoQJGDBgAEqXLg2pVIqhQ4cCyB7B8Mcff4j1o6OjMXHiRMyaNQvt27eHVCpFmTJl8PPPP+Pbb7/Fmzdv4OrqKg5pP3nyJB48eIBx48bBwsICEokEdnZ22Lhxo8rqlN7e3nj16hWmTJkCExMT6Ovro1GjRti4cWOuQ7mp8OmcQOTk3Llz8PT0xJIlS3DhwoUP0YTWpk6dCgcHBzx8+BBA9l/Y7777Dnv37lWp27VrV2zevBl79+5FmzZtMGrUKHTr1g1//PEHjI2NxXpnzpyBi4sLZDIZ3r59i8TERKUfxSpJP/74Y75Wn3JwcICpqSm++eYbXL9+XaXc09MTrq6u2L17N5o2bYpu3bohMTERvr6+qFy5ss7tEhER0Yd35MgRJCcno3fv3uKxChUqoFWrVgCyX3revn1b7blXrlyBv78/
2rZti88//1yprF27djAxMYGZmRlKly4NIHsExsKFC2FgYIB+/fop1be1tUWDBg3E9hW2b98OQ0NDtXNAFfXDwsLEl7SKIdRXr15Vqlu2bFmVDYU11a1cuTJat26t9p6p6ND5FfUPP/wgfnZwcICHhwcEQYCrqyuOHDkilm3fvh3ff/891qxZUyhvxFeuXJmn+k2bNkXTpk1zrNOqVasCW42qYsWK4i/ef7Vr1w7t2rXD27dvleaeKOjp6WH48OEYPny4zu1zjCEREVHh2LlzJ9q3by8uJ6/Qt29fnDp1CkD2kq6LFy9WOVfxrKVYWOV99erVE4cMKeZQ3Lx5ExEREahVq5aYVCjo6elh//79SE5OVloi/sSJE4iPj0ejRo1U2pDL5eK1nz17hlq1aolL1S9atAgJCQkYMWIETExMAACurq5K5yvqTps2DdOmTUPfvn3FnocFCxaofllUpOjcA/Hs2TNER0dj8uTJ4hi7FStW4PDhwxAEQenn9OnT2LRpU4EFXdKcOnUKXbp0KewwiIiIqIAEBwcjLCwM/fv3Vylr3ry5uJz7sWPHEB8fr1Ln3r17AKCSDChIpVKlCdiKeQWKB3p13k8eUlNT8eLFC9SpUwchISEqPzdu3MDt27dx+/Zt8aVy69at4eDgAJlMhnXr1qF169bYuHGjuJ/W+3r16oUaNWogNTUV8+fPR/v27bFr1y6kp6drjI+KjnwNYXJ1dUWnTp0AAJcuXcKWLVsgkUjEn7p162L9+vVwcnLC4cOHCyTgkiY0NBTHjx/H6NGjCzsUIiIiKiBeXl7Q19fHmDFj0KhRI6Wfb7/9FjExMQCyJyZ7e3urnK/YCyohIUGr9hQP8XmtHx0drVV9ILsnY+vWrZg6dSpKly6NhIQErFq1Cu3bt8fff/+tVNfU1BT79+/H8OHDIZVKERkZiXnz5qFTp064dOmS1m1S4dA5gVCszAQAb9++hbu7uziJRhAEfP7559i2bRvatGmDX375RfxFIO35+Pjg4sWLWLduXY5vDIiIiKj4ePnyJQIDA7Fs2TK1b/dDQkJw5swZcf7l3r17VVaRVPQ83LhxI8e2FL0XivrPnj3LMYlQ1DcyMgIAvHr1StwPSx3FaBMFAwMDjBkzBqdOncLIkSNRqlQpxMfHY/r06SpJhImJCVxdXREQECBOJH/x4gVGjhyJkJCQHO+LCpfOCYShoSFiYmKQlZWFGTNm4NWrV5BIJBAEAQYGBli2bBnMzMwA/LtZGeXNTz/9hDFjxuS6BjQREREVH7t370bFihWV5pP+l6WlpTjxOCoqSpwToaCYQ3Du3Dm8evVK7TXu3bsnrixZvXp1ANn7Rvz1119q6wuCgEWLFgHInvhcrlw5AMC2bds0xvn333/j4sWLAIAdO3aIi9ZYWFjA2dkZR48eRY0aNSAIAtauXSuet3jxYrx9+xYAYGNjg19//RUHDhyAtbU1MjMzsXHjRo1tUuHTOYGoWbMmevbsiY4dOyIoKEhMHiQSCcaMGYNvvvkGAJCZmYlZs2YVWMBERERExVVycjK8vb0xYsQIcblWTYYNGyau5ujl5aVU1qZNGwDZz1m//PILsrKylMpTU1MxZ84ctG3bFgDQsGFDcSnVTZs2ITw8XKW9zZs3o0qVKuKfv//+ewDAvn37lBbIUYiPj8fWrVthb28vHgsICFCqU7lyZXFD28jISPG4TCZTSYrq1KmDOXPmqNSlokfnBGLChAlISkrCs2fPxOQByJ5AM2HCBADZme/AgQNx+fLlgomWiIiIqBjbvn070tPT0bVr11zrVqpUSVwB6erVq0pzA9q3b49atWoByF5efsiQIThz5gzu3r2LgwcPolu3bqhVq5a4U7SRkRFGjRoFIHt+Q79+/bB7927cu3cPZ8+exeTJk+Hl5YWBAweKbYwaNQomJiYQBAEzZsyAm5sbLl26hHv37uGvv/5Cv3798OOPPyotdb9161aVIU+KZWbt7OyUjq9cuRKJiYla1aWiRed1VZs1a4bdu3djxYoVuHfvHiwtLdGrVy9xSdHly5fj1q1bkEql+dpMjYoXW0vdl+pVnGtulPMbmYJm8ZHbIyKikkcul+PkyZP47bffoK+vj/DwcNSuXTvHc968eSPuLQUAM2fOxIoVK9CgQQMYGBhg7dq1GDJkCCIjIxEcHIzg4GCxbuPGjfHzzz8rXW/48OEICwvD4cOHkZCQgF9//VUsMzMzw+bNm5U2fKtSpQqWLVuGqVOnIj09HT4+PkobCf/www8qS8mnpqZi+PDh8PDwgKOjI1JTU7Fw4UIYGRmpLOUaHR2N4cOH45dffkH9+vURGxuL5cuXw8LCApMmTcr9S6VCIxHen/lClA/yLAH6erpvnFdQ19BFliBALx+b/hERFVepqam4f/8+7OzstF6ww8MvDk/jZR84so/P1tIAv3Qo90GuPXDgQJVN00xMTBASEqJ2KNOmTZuwatUqqHtMc3R0xI4dOwBkr6q0fv16nDhxAnFxcahUqRJ69uyJIUOGqN3RWRAEHDx4ELt378ajR49gYmKCli1bYsKECWJvxX89evQIGzZswKVLl5CcnIwqVarAyckJAwcOVIp9x44d4hwKIHspWUtLSzRq1AiTJk1SGh61YMECpWFZRkZGKFeuHJo3b44JEybAyspKwzdZOBS/J48ePUJERAT69++vsoFfScIEgoiIqATLawJRWC96PpZP/f5IN0wglOVrHwgiIiIqWT71h+tP/f6ICgITCCIiIiIi0hoTCCIiIiIi0hoTCCIiIiIi0hoTCCIiIiIi0tpHSyA0bbNORERERETFh84JRGZmptZ1nz9/Lm6HTkRERERExZfOCcQ333wDuVyea70nT55gzJgxujZDxYg8q+hsKVKUYvkYsridCxEREX0kBrqeKJPJcOzYMXTp0kVjnYMHD2L+/PlITU2FhLv8fvL09SRFYndSxU6iAU+SkPgu9yS3uLMw0ke7amaFHQYRERGVEDonEAAwb9481K5dG9WrV1c6npSUhNmzZ8Pf31/tFuz06XoaL8PDGO2Ht31Iie/kiE399BMIIiIioo8pX5Oo3759ixEjRiAqKko8FhISgm7duonJA3seiIiIij6+8CPSjL8fyvKVQCxYsACDBg3CqFGj8OrVK6xYsQJDhgxBZGSkWMfQ0BDt2rXLd6BERERU8PT19QFAq3mNRCWV4veDvyfZdB7C1KdPH/Ts2RMAYGtri44dO+Ldu3dir4MgCKhVqxaWL1+O6tWr49ChQwUVMxERERUQqVQKfX19JCUloUyZMoUdDlGRlJSUhKysrDytQvop07kHwsPDQ/zcpk0b/Prrr5BIJGLyMGjQIBw4cECcH9G9e/d8B0tEREQFSyKRwNzcHAkJCRymQaSGIAiIi4vD27dvxRflenoley/mArv7Ll26YPbs2RAEAatXr8bPP/8MqVQKAEhNTUXLli0LqikiIiIqQOXLl0d6ejqePHnCJILoPYIg4MmTJ8jIyMCbN2+QkZEBAwMDlCpVqrBDK1RaDWFat26d1hesUqUKdu/ejX/
++QdA9nKv165dw+vXr3WLkIiIiD4oU1NTVKtWDU+ePEFoaCjKly8PMzMz6OvrczEUKnEEQYBcLkdSUhLi4uKQkZGByMhIvHv3DklJSfj8889hbm5e2GEWKq0TiLz8A/L8+XNcvXpV/DNXYyIiIiraLCwsULFiRVy/fh3p6eni5GqikiorKwtv377Fmzdv8O7dO7x79w4ZGRn46quvSvxzbb72gShMZ86cwaZNm9C7d2/06NFDY727d+9i9erVePToEfT19dG+fXtMnDgRRkZGKnUFQcD+/fuxZ88ehIeHw9jYGA0bNsS4ceNQt25dreKSy+Xo168fYmNjERgYqNO9hYWFYfPmzTAyMsLChQs11ktJScH69evh5+eH169fw9LSEq1atcKECRPw2WefqT3n5cuXWLduHS5cuAA9PT0IgoAmTZpg0qRJsLGx0SleIiL6NFhbW6Ny5co4efIkJBIJLC0tYWRkVOIflqjkeX/CdGZmJt68eYPk5GTUqVNH62fCT5nWCURRGRN57NgxbN++HaGhoQCA3r17a6wbGBiIyZMnY9q0adi8eTOSkpIwatQoDB06FNu2bYOJiYlS/Tlz5uDAgQMAspe1e/PmDQIDA3Hu3DmsWLEC7du3zzW+3377Dbdu3cIXX3yR53sLDQ3Fli1bEBAQAEEQ8NNPP2msm5KSgoEDB+LevXvQ19dHVlYWoqOjsX//fgQGBmLXrl2wtbVVOufp06fo06cP7O3tcejQIVhaWiIuLg7Tpk1Dz549sX//flSqVCnPcRMR0afj66+/hoGBAW7cuIFXr14hLS0NAJhEUIkkCAL09fVhYWGBevXqwdHRscTPfwC0TCAqVaqE6dOnw8LCIs//gMhkMpw7dw47duzQJT4VdevWxe7du9GlSxc8ffpUY72oqCg4OzvDwcEBw4YNAwCYmZlhwYIF6NSpE5YsWYK5c+eK9YOCgnDixAksXrwYbdu2hZGREc6cOYPZs2cjPj4e7u7uaNSoESwtLTW2ef/+ffzxxx8635uFhQVWrVqFmTNnwsfHJ8e6GzZsgCAI+OOPP2Bvb4/MzEzs3bsXK1euRExMDNzc3LBv3z6lczw9PSEIApYvXy4mT+XKlYOnpydatWqF5cuXY9WqVTrHT0REn4aaNWuiRo0aiI2NRWJiIjIzM4vMi0Sij0lfXx/GxsawsbERFwciLRMIV1dXtG3bVudGHBwckJKSovP571O8Ibezs8sxgVi3bh1SUlJUhjdVq1YNdevWxb59+zBo0CBxmVkfHx9s374ddnZ2Yt22bduidOnSGDp0KJKTkxEYGIhevXqpbS8jIwMuLi6YPn06Zs+ena97q1evXo4JhFwuR0hICLy8vMQ1u6VSKf73v/8hJSUFGzZswI0bNxAREaHUo3D58mV89dVXKj0vNjY2sLCwwMOHD3WKm4iIPj0SiQRWVlawsrIq7FCIqIjRahnX/CQPAKCnp4dff/01X9f4r5y6jzIzM+Hn5wcAsLe3VymvX78+BEGAt7e3eKxRo0ZKyYOCo6MjateuDQCIj4/X2OaqVavQuHFjNGnSROt70CS3rrHY2FiMHDlS7YY/w4cPFz//N15jY2M8evRIJZnLysrCu3fv1N4/EREREdH78rUPRHp6Ov766y/s27cPGRkZKuX+/v44fvx4fprQKKehVCEhIUhOToZUKoW1tbVKec2aNQEAV65cEY8NHDhQ4/WqVKkCABrnNYSEhCAoKAgzZszQKvbc5DZMzNraWmNSZ2ZmhnLlygGAyqTo1q1bIzU1FYsXL1Y6fu7cOejp6WH8+PH5iJqIiIiISgKdV2FKTU3FgAEDEBYWBgCIiIhQeYBu164dhgwZgoiICIwaNSp/kebB/fv3AUBt8gBkP2QDwMOHDyGXy3Ndqi4hIQFSqRTNmzdXKUtJScGsWbOwZMkStSs7fWwymQxv375FvXr1VFZimjJlCi5cuID9+/dDKpVi5syZSEhIwObNm7Fjxw5Uq1atkKImIiIiouJC5wRix44duH//vvi2PCoqSqWOnp4exo0bhxEjRqBhw4Zo2LCh7pHmgWLojrohPsC/CYRMJkNSUlKOm4GkpaXh5s2bcHJyUnu9xYsXo3PnzqhXr17+Ay8AISEhyMzMxIgRI1TKrKys4OXlhaFDh2Lnzp149eoVzMzMsGHDBpQtW7YQoiUiIiKi4kbnIUyKoUmCIMDKygpDhw5VW8/CwgJZWVkFtgqTNhITEwFAY4+Ant6/t52enp7jtby9vVG6dGlMmjRJpSwoKAj379/HmDFjdA+2gO3atQtNmjRBhw4d1JZXrlwZy5Ytw6BBg3D58mUcPHgQmzZtQlZW1keOlIiIiIiKI517IF68eIHp06ejbdu2qFq1qsZ6Fy5cAADcuHFD16byzNDQEIDmvSven6+RU+9DQkICNm3aBE9PT5V6iYmJWLBgATZt2gQDg6KxH9+VK1dw7do1HDp0SGOdixcvIjg4GLNmzUKfPn0wfPhwbNu2Da9evcLy5cuVkisiIiIiov/S+WmxdOnSGDJkSI7Jw5kzZ7B27VoAQHJysq5N5Vn58uUBQNz85r+SkpIAACYmJjmueDR79myMGDECLVq0UCnz8PDA4MGDi8y8gTdv3sDDwwNr167VOPfj2rVrmDRpkrhS01dffYVdu3ahfPnyOHbsGH7//fePGTIRERERFUM6JxA1atTA1KlT8fjxY6XjycnJOHXqFMaMGYOxY8fi3bt3APBRdzhWrLL0+vVrteWxsbFK9dTZtGkTbGxs1M4liIqKwrFjxzBv3jzUrFlT6adNmzYAgJcvX4rHXrx4kd9bypFcLoerqysmT56MRo0aqa2TlZWFOXPm4Ouvv1aay1GlShWsX78eBgYG2Lx5c65DuoiIiIioZNN57E2fPn0wefJkBAYGwsTEBGXKlEFycrJST4NiCJFEIkGnTp3yH62WHBwcYGhoiLi4OMTHx6vsHv38+XMAQMuWLdWef+jQIYSHh8PT01NtuZ6ensaeF5lMhoiICBgYGIhJk2JI1Ycyd+5ctGnTBj/88IPGOk+ePMGjR4/UJk0NGjRAq1atcPLkSTx58oT7QRARERGRRjonED/88AN++ukn+Pj4IDU1VWVzMolEAolEAkEQYGdnp7TB2YdmamqKTp06wdfXFyEhIWjfvr1S+c2bN6Gnp4eOHTuqnBsQEIDAwECsWLFCZT8GuVyO169fw8bGRtyo7r9evHiBNm3awNraWmOdguTp6QlbW1s4OTmplCUkJMDQ0BCmpqbIzMwEALx69UrtdRR7XXzoZIeIiIiIird8zZhdtGgRJk2aBKlUqlImCAIEQUC7du2wffv2XHdXziuZTAYg+6FenQkTJsDExERlQvHDhw9x9+5dODk5wdbWVqns5MmT8PHxwbJly1QmRsfExMDNzQ0RERE6x/z48WP07NkTgwcPzvE6invSdG8KixcvhpmZmdphVg8ePMCECRPESdE1a9ZEpUqVEBoaqrbt0NBQVK9eHV9++WVebomIiIiISph8Lx80btw49O3bF8eOHcPNmzeRkJ
CAUqVK4csvv0T79u1Rp06dgohTybt37/DgwQMAwK1bt9S+fa9cuTI8PDzg7u4OX19fdOvWDZGRkZgxYwbs7e3h7u6uVP/w4cNwd3eHiYmJyoZxmZmZSElJgY2NDZYsWaJz3Hv27MGdO3cAZA872rp1q9p6t27dAgCEhYUhPT1dJfkSBAEeHh7Yt28fypYtCy8vL6Xyd+/e4d27d+jSpQtMTEwAZA+78vT0xMiRIzF58mSsWLECtra2yMjIwNq1a3H//v2PutQuERERERVPBbL+qKWlJQYOHIiBAwcWxOVyNHXqVJw+fVpcYcnb2xsnTpzAlClT0K9fP6W6Xbt2Rbly5bB27VqsWbMGxsbG6NGjBwYOHKjUa3LmzBm4uLhAEAS8fftWY9s//vijyrCmvHBwcMChQ4fw1Vdf4fr16yrlf/zxBzZs2CDuY/Hw4UM4ODigffv2WLx4sVhv2bJl2Lt3L4B/97xQp0uXLkp/btSoEby9vbFhwwb0798fBgYG0NfXx3fffQcfHx9UrlxZ53sjIiIiopJBImjaLCGPZDIZoqKiEBcXh9KlS8PW1pbj6TV4+/YtunfvjsDAwMIOpcAN2xONhzGZhRpDDStDbO9vjQP3EhGbmvMwsE9BeRN99K5tXthhEBERUQmR7x6I6OhorFmzBidOnBD3VwAAqVSKNm3aYOLEiTnuFVESnTp1SqV3gIiIiIioOMjXJOrLly+jS5cuOHjwIN6+fStOnBYEAenp6Th+/Di6d++OoKCggoq32AsNDcXx48cxevTowg6FiIiIiCjPdE4goqOjMWXKFCQlJSGnUVDp6emYPHlyvlYv+lT4+Pjg4sWLWLdunTi5mYiIiIioONF5CNPu3buRmJgIY2NjVK1aFWZmZjAwMIAgCJDJZHj37h3evHmDyMhIvHv3Dlu2bIGHh0dBxl7s/PTTT4UdAhERERFRvuicQJw6dQpjx47F2LFj1e4DoZCWloa1a9fC399f16aIiIiIiKiI0HkIU2xsLCZPnpxj8gAAxsbGmDFjRo7LjRIRERERUfGgcw+ERCLBjRs38M033+Ra9+bNm1zStYSwtSyQrUUKJAZzI/1CjuTjsCgh90lERERFg85Pe3Xq1MHgwYPRqlUr1KxZE+XKlYOxsbHYI/H27Vu8efMG4eHh8Pf31yrRoOJNniXglw7lCjsMANmxtK9mVthhfDRZggC9fGxySERERKQtnTeSO3XqFMaPH6/1zswrV65Ehw4ddGmKiIiIiIiKCJ3nQLRp0wYDBw4Ul3B9fw+I/x7r3r07kwciIiIiok+Azj0QCvv27cOqVavUTpIuXbo0xo0bh+HDh2vdU0FEREREREVXvhMIAHj37h2CgoJw+/ZtvHnzBqampqhduza+//57mJqaFkScRERERERUBBRIAqGNzMxMrsRERERERFTM6TwHIi8yMjK4ChMRERER0SdAq2Vc161bp3MDmZmZuH79OuRyuc7XICIiIiKiokGrIUy1atXK1yRoQRAgkUhw//59na9BRERERESF76MMYaKSQZ71UabTEH0U/Pucd1kfZ0odEREVMq13ov5Ic62pGNPXk8DDLw5P42WFHQpRvthaGuCXDuUQ8CQJie84/FIbFkb6aFeCdn8nIirJtE4g/ve//8HBwQFSqTRPDWRkZODixYvYvn17noOj4udpvAwPYzILOwyiApH4To7YVCYQRERE79MqgWjcuDGcnZ11bqRp06a4ffu2zucTEREREVHRoNUcCC8vr3w1IpFIsHPnznxdg4iIiIiICp/WQ5hyIggC7ty5g1evXkEikaBChQqoVasWDAwK5PJERERERFRE5OsJXyaTYcuWLfDy8kJCQoJSWdmyZdGlSxdMnDgRZcqUyVeQRERERERUNOi8jKtcLseYMWOwevVqxMfHQxAEpZ/ExETs2rULnTt35v4PRERERESfCJ0TiJ07d+L8+fM51hEEAa9fv8bYsWMRHx+va1NERERERFRE6DyEydvbGwBgZGSEr7/+GtbW1jAxMQGQvXTr27dv8fLlSzx69AjR0dHYvXs3Jk6cWDBRExERERFRodA5gYiIiED//v0xY8YMGBsba6wXHR0NT09PBAQEfPQE4uzZs9iwYQMePnwIQ0NDNGvWDC4uLrC2tlZb/9mzZ1i1ahVu3boFiUSCpk2bYtq0aTA3N1epO3/+/FxXlnJxccGIESPEP8fFxWH16tUIDAxEYmIirK2t8cMPP2Ds2LEwM8vbBkzR0dFYvnw5Ll++jMTERNSsWRPjx49Hq1at1NYXBAH79+/Hnj17EB4eDmNjYzRs2BDjxo1D3bp189Q2EREREZVcOg9hsrS0hJubW47JAwBYW1tj4cKFePXqla5N6cTHxwcjR47Ey5cvxTkZR48exYABA5CWlqZSPzQ0FD179oSVlRVOnDiBY8eOITExEX369EFsbKxS3fT0dBw+fDjXGN5/mH/9+jWcnJywf/9+xMfHIzMzEy9evMDWrVvRu3fvPA3xio6OhpOTE06ePImsrCykp6cjNDQUY8eORVBQkNpz5syZg19++QUPHjyAXC7HmzdvEBgYiH79+iEgIEDrtomIiIioZNM5gfjmm2+0fuiVyWTQ09O5qTyLjIzErl274Ovri3PnzuH69euYO3cuJBIJIiIixOFXCsnJyZg0aRJsbGzg5uYGfX19lCpVCvPnz0d0dDRmzpypVN/Pzw8VK1bE9u3bcebMGVy6dEnpp0ePHqhRowaqV68unrNw4UJUrFgR3t7euH37Nq5cuYLhw4cDAJ48eYIFCxZofX+//vorxo8fj+DgYJw/fx6HDx9GlSpVkJWVhVWrVqnUDwoKwokTJ7B48WJcu3YNoaGhWL9+PSwtLZGZmQl3d3fOUSEiIiIirej8VD9o0CDMnDkTGRkZGuvIZDI8efIEzs7O+Oabb3RtKs8uXbqEzZs3o1atWgCyN7Lr168funbtCgAIDw9Xqu/l5YWoqCh069ZNKdEpU6YMWrVqhaCgIJw9e1Y8Hhoair1796JJkyawsbGBpaWl+GNubo5z586hQ4cOYv2EhATExcVhy5YtqFevHvT19WFubg5XV1d069YNAODv74/09PRc7y0yMhIdO3ZEnz59xH02atasCQ8PD7X3BmT3xmzfvh3du3eHqakpDAwM0LZtW6xYsQJAdgIVGBiY+xdLRERERCWeVnMg7OzsNJbVr18/1/OlUin279+vfVT51LNnT7XH69evD19fX5X7OXLkCACoTXIaNGiA48ePw9vbGy1atAAAzJgxA6VKlVLbxrVr1xATE6OUQLx8+RLTp0+HVCpVqT98+HD4+voiMzMTSUlJGq+r8Pnnn+Pzzz9XGycAMWl6X6NGjdT+f+jo6IjatWvj3r177IEgIiIiIq1olUAIgqD2uEQi0Vj2vkqVKqFGjRp5i+wDiI2NRZUqVdClSxfxWEREBJ48eQIgO87/UsQdHBwsHjMyMtLYhp+fn8rwpTp16misb2trK17T0tJSuxtRIzY2FhKJBOPHj1cpGzhwoMbzqlSpgnv37uGLL77Qu
W0iIiIiKjm0HsIkkUhUjmmTPADA48ePsXnzZu2j+gCSk5MRFBSEtWvXKk38vnfvHgDAwMAA5cuXVzlPsTpSYmIiIiMjc2wjKysL/v7+Sr0PuVHs4P3999/na56Il5cXpk2bhubNm+fpvISEBEil0jyfR0REREQlk9bLuFpYWKBq1ao6P+SGhITodF5BCA8Ph4uLC/T19SGTyZTKFEN3TE1N1d7b+8urJiQkqB0+pHD9+nXExMSgY8eOWsd28eJFABAnVOdVWloaNmzYgL/++gvu7u6Qy+XQ19fX+tybN2/CyckJZcqU0al9IiIiIipZtEog9PT0cOTIEZQrV+5Dx1OgkpKSsH79evz5559ISkoCAPTu3RvLli0TH/ITExMBaB6W9H5SkdskZ8XwpWrVqmkd4+7du+Hk5IR69eppfY6Cr68vtm3bhrCwMADArFmzcO7cOaxYsUKcYJ0Tb29vlC5dGpMmTcpz20RERERUMmnVnbBz585ilzwA2b0Hbm5uuHjxIpYtWwZra2vIZDL8/PPP4tAhQ0NDAJqHY2VmZoqfy5Ytq7EtQRDg7++fp94HHx8fpKSkwM3NTetz3tetWzf4+vri0KFD4p4T/v7+OHjwYK7nJiQkYNOmTfD09FS7UR4RERERkTpaJRANGzb80HF8UFKpFF26dMGBAwdQpkwZpKSkiBuuKeY9qNtcDgDevn0rfrawsNDYxrVr1/D69Wut5z9ERERg48aN2LhxI0xNTbW9FbXs7Ozw22+/4YcffgAAHD16NNdzZs+ejREjRogrSxERERERaePj7e5WBFSoUAG9e/cGkL0zNPDvsqdv375Vm0TExcUByN5RO6dVkvz8/FCzZk2thi+lpqbCxcUFnp6eeRrulJvJkycD+PfeNNm0aRNsbGwwYsSIAmubiIiIiEqGEpVAAP/2plhZWQHI3oRN8fnRo0cq9Z89ewYAOb6pFwQBAQEBWvU+yGQyTJ8+HWPGjIG9vX2e489J9erVYW5uLt6POocOHUJ4eLjK7tpERERERNoocQlEUlISpFIpmjVrBiB7eVonJycA6leKunnzJgCgc+fOGq95/fp1REdH5zr/QS6Xw9XVFV26dEHLli1VyqOjo3Pc2Ts3MpkMaWlpaNOmjdrygIAABAYGYsGCBSrL8srlckRFRencNhERERGVDCUugTh8+DBGjhyp9JZ+6NChsLKygq+vr1Ld+Ph4BAUFoVmzZnBwcNB4TcXwpapVq2qsI5PJ4OLigqZNm6JTp04q5devX4ezs7O4W3VMTAwGDRqEXr164e7du1rd2/Hjx/HFF1+gT58+KmUnT56Ej48Pli1bprJCU0xMDNzc3BAREaFVO0RERERUcmm9D0RxMnz4cPzzzz/o3bs3BgwYAEtLSyQnJ2PJkiWoVq0aJkyYoFS/bNmyWLp0KcaMGYNNmzZh9OjRSExMxPTp0/HFF19gyZIlGttSDF/q27evxjoZGRmYMmUKzpw5g/Pnz2Px4sVK5ampqcjIyMCYMWPEY0eOHBF3v542bRr8/f0BZK/ctGDBAjg4OGDUqFGoV68esrKyEBAQgD179mDr1q1KG+UB2UmTu7s7TExMVDaMy8zMREpKCmxsbHK8TyIiIiIi4BNNIFq2bIknT55g48aN2LZtG+zs7FC1alX07t0b9evXV3uOo6Mjdu3ahZUrV+LAgQOQSqXo0KED1q5dm+MqSTdu3MCrV69ynP/g4uKCU6dOAfh33wl1unbtKn62t7eHubk5qlatitDQUCQkJMDCwgK1a9eGra0tzp07h9OnT6Ny5cr46quv0KxZM3h5eYnL0iqcOXMGLi4uEARBaUWp//rxxx/V7jZORERERPQ+iaBpA4QC9OLFC8THx6NixYo5rmRE6jk6OuLs2bMqyUFRNGxPNB7GZOZekagIq2FliO39rXHgXiJiU+WFHU6xUN5EH71rmxd2GERE9BF80B6IyMhITJkyBbdv3xaPNW/eHLNnz0alSpU+ZNOfjODgYDRt2rRYJA9ERERE9On7oJOoZ8+ejdDQUAiCgC+++AIzZsyARCJBr1698PDhww/Z9Cfh2bNn2LhxI1xdXQs7FCIiIiIiAB+4B+LGjRuQSCQQBAFpaWkYPnw4hg8fjqCgILi7u8Pb2xt6eiVuISitnDlzBjdu3MCaNWtgZmZW2OEQEREREQH4wD0QNWvWhGKKxfsTkVu2bIkePXrAz8/vQzZfrLVq1QpTp05l8kBERERERcoHTSAWL16MJk2aoH79+ipLl/bt2xc2NjYfsnkiIiIiIipgH3QIU+XKlbFt2za1Zfr6+vjmm28+ZPNERERERFTAPto+EJcuXYKjo+PHao4Kia3lJ7m1CJUwir/H5kb6hRxJ8WHB74qIqMQokH0g3r17h7dv30Iul+O/lxMEAbGxsfj5559x9OjR/DZFRZg8S4C+Hjejo08D/z7nXZYgQI8bUhIRffLy9bp4165d2LZtG6KiogoqHirG+LBFnxL+fc47Jg9ERCWDzgnE7t27MX/+fHGZ1txI+B8WIiIiIqJiT+dVmLZv316QcRARERERUTGgcw9EbGwsJBIJKlWqhLp160IqlaqtJwgCoqOjcfnyZZ2DJCIiIiKiokHnBKJWrVp4+vQpjh49qjF5eN/8+fN1bYqIiIiIiIoInYcwTZgwARkZGcjIyNCq/pAhQ3RtioiIiIiIigidE4hmzZph7NixGDt2LOLj43OsGxMTg2nTpunaFBERERERFRFa7QNhZ2dXII3dv3+/QK5DRERERESFQ6s5EDnlGFzGlRS48RZ9Cvj3mNThJnlERP/SehK1pkQht+RB2wSDij99PQk8/OLwNF5W2KEQ6cTW0gC/dCiHgCdJSHwnL+xwqIiwMNJHu2pmhR0GEVGRoXUCUbVqVZQrV06nRuLj4/HkyROdzqXi5Wm8DA9jMgs7DKJ8SXwnR2wqEwgiIiJ1tEogOnXqhBUrVuSroUmTJuXrfCIiIiIiKnxarcI0b968fDe0aNGifF+DiIiIiIgKl1YJROnSpVWO/frrr1o18PjxY4SGhqq9BhERERERFS867wNx9OhRrepVr14dly9fxq1bt3RtioiIiIiIigidE4i8GDp0KNauXfsxmiIiIiIiog9Iq0nUV69exZUrV5SOpaenY926dbmem5mZieDgYPzzzz+6RUhEREREREWGVglE7dq1ceTIERw4cEBpQ7j169dr1YggCLC0tNQtQiIiIiIiKjK0SiBKly6NX3/9Ffb29pg9ezZkMu03ChMEARKJBO3bt9c5SF2dPXsWGzZswMOHD2FoaIhmzZrBxcUF1tbWaus/e/YMq1atwq1btyCRSNC0aVNMmzYN5ubmKnXnz5+PnTt35ti+i4sLRowYIf45Li4Oq1evRmBgIBITE2FtbY0ffvgBY8eOhZlZ/jYpunfvHg4ePIjIyEhUqlQJjRo1Qrt27XI85/Tp0xgzZgwWLVqEHj165Kt9IiIiIioZ8jQHonv37li1ahX09fUBZCcHuf0AQMuWLeHq6lrw0efAx8cHI0eOxMuXLyEIAhIT
[... base64-encoded PNG output elided: a horizontal bar chart produced by the cell below, comparing "Compute" vs. "Access" (non-compute) kernel time in ms for each (sequence length, block size) BigBird configuration ...]", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "f, ax = plt.subplots(figsize=(8, 5))\n", + "\n", + "sns.set_color_codes('pastel')\n", + "sns.barplot(\n", + " x='compute', y='test_name', data=df, label='Compute', color='lightskyblue')\n", + "\n", + "sns.barplot(\n", + " x='non_compute',\n", + " y='test_name',\n", + " data=df,\n", + " label='Access',\n", + " color='dodgerblue')\n", + "\n", + "# ax.legend(ncol=1, loc='upper right', frameon=True, fontsize=16)\n", + "ax.legend(\n", + " fontsize=20,\n", + " frameon=True,\n", + " bbox_to_anchor=(0.95, 1),\n", + " shadow=True,\n", + " facecolor='white')\n", + "ax.set(xlim=(0, 60), ylabel='', xlabel='')\n", + "sns.despine(left=True, bottom=True)\n", + "\n", + "ax.set_ylabel(\n", + " 'sequence length, block size',\n", + " fontdict={\n", + " 'fontsize': 20,\n", + " 'fontweight': 'bold'\n", + " })\n", + "ax.set_xlabel('Duration(ms)', fontdict={'fontsize': 20, 'fontweight': 'bold'})\n", + "ax.tick_params(labelsize=18)\n", + "\n", + "plt.tight_layout()\n", + "fig = ax.get_figure()\n", + "fig.savefig('bigbird_compute_vs_noncompute.pdf', dpi=600, bbox_inches='tight')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.13 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/main.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/main.py new file mode 100644 index 000000000..b6546a4a4 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/main.py @@ -0,0 +1,166 @@ +from attention import BigbirdBlockSpareAttention +import torch +import argparse +import types + + +def test_BigBird(batch_size: int, + size_per_head: int, + from_seq_length: int, + from_block_size: int, + to_seq_length: int, + to_block_size: int, + num_attention_heads: int = 1, + num_rand_blocks: int = 3, + device='cuda:0', + output_file=None): + query_layer = torch.rand( + batch_size, + num_attention_heads, + from_seq_length, + size_per_head, + device=device) + key_layer = torch.rand( + batch_size, + num_attention_heads, + to_seq_length, + size_per_head, + device=device) + value_layer = torch.rand( + batch_size, + num_attention_heads, + to_seq_length, + size_per_head, + device=device) + + # The values should be 1 or 0. The attention scores will effectively be + # set to -infinity for any positions in the mask that are 0, and will be + # unchanged for positions that are 1. 
+ band_mask = torch.rand( + batch_size, + 1, + from_seq_length // from_block_size - 4, + from_block_size, + 3 * to_block_size, + device=device) + from_mask = torch.rand(batch_size, 1, from_seq_length, 1, device=device) + to_mask = torch.rand(batch_size, 1, 1, to_seq_length, device=device) + from_blocked_mask = torch.rand( + batch_size, + from_seq_length // from_block_size, # number blocks + from_block_size, + device=device) + to_blocked_mask = torch.rand( + batch_size, + to_seq_length // to_block_size, # number blocks + to_block_size, + device=device) + + attn = BigbirdBlockSpareAttention( + num_attention_heads=num_attention_heads, + num_rand_blocks=num_rand_blocks, + size_per_head=size_per_head, + from_block_size=from_block_size, + to_block_size=to_block_size).to(device) + + attn( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + batch_size, + from_seq_length, + to_seq_length, + output_file, + ) + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Bigbird') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=4096) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=32) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=512) + parser.add_argument( + '--block_size', type=int, help='Block size', default=64) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +def output_file(OUTPUT_FILE, cmd_args, run_time): + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write( + f"{cmd_args.batch_size}\t{cmd_args.seq_len}\t{cmd_args.hidden_size}\t{cmd_args.block_size}\t" + f"{run_time}\n") + + +if __name__ == '__main__': + num_attention_heads = 1 + num_rand_blocks = 3 + + cmd_args = parse_test_args() + DEFAULT_TEST = cmd_args.default_test + OUTPUT_FILE = cmd_args.output_file + + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "batch size\tsequence length\thidden\tblock size\telapsed time(ms)\n" + ) + + if not DEFAULT_TEST: + seq_len = cmd_args.seq_len + batch_size = cmd_args.batch_size + size_per_head = cmd_args.hidden_size + block_size = cmd_args.block_size + + from_seq_length = seq_len + from_block_size = block_size + to_seq_length = seq_len + to_block_size = block_size + + run_time = test_BigBird( + batch_size=batch_size, + size_per_head=size_per_head, + from_seq_length=from_seq_length, + from_block_size=from_block_size, + to_seq_length=to_seq_length, + to_block_size=to_block_size, + num_rand_blocks=num_rand_blocks, + num_attention_heads=num_attention_heads, + output_file=OUTPUT_FILE) + else: + run_time = test_BigBird( + batch_size=32, + size_per_head=512, + from_seq_length=512, + from_block_size=64, + to_seq_length=512, + to_block_size=64, + num_rand_blocks=num_rand_blocks, + num_attention_heads=num_attention_heads, + output_file=OUTPUT_FILE) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/process.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/process.py new file mode 100644 index 000000000..c3bace7c4 --- /dev/null +++ 
b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/process.py @@ -0,0 +1,71 @@ +import os +import json +import glob +from tqdm import tqdm + + +def parse(input_file: str, output_file: str): + data = json.load(open(input_file, encoding='utf-8')) + trace_events = {} + + for event in data['traceEvents']: + if 'cat' not in event or event['cat'] != 'Kernel': + continue + + if event['name'] in trace_events: + trace_events[event['name']]['dur'] += float(event['dur']) / 1000. + trace_events[event['name']]['call_num'] += 1 + else: + trace_events[event['name']] = { + 'dur': float(event['dur']) / 1000., # to ms + 'call_num': 1 + } + + sorted_events = sorted( + trace_events.items(), key=lambda item: item[1]['dur'], reverse=True) + + with open(output_file, 'w') as fout, open('kernel_name.txt', 'w') as fn: + fout.write('name\tdur\tcall_num\n') + for k, v in sorted_events: + fout.write('%s\t%f\t%d\n' % (k, v['dur'], v['call_num'])) + fn.write('%s\n' % k) + + +def stats(input_file: str, fout): + compute_kernels = set() + with open('figures/compute_kernels.tsv', 'r') as f: + for line in f: + compute_kernels.add(line.strip()) + + compute_time = 0. + non_compute_time = 0. + with open(input_file, 'r') as fin: + for i, line in enumerate(fin): + if not i: + continue + name, dur, _ = line.strip().split('\t') + if name in compute_kernels: + compute_time += float(dur) + else: + non_compute_time += float(dur) + test_name = os.path.splitext(os.path.split(input_file)[-1])[0] + bs, seq, block = test_name.split('_') + bs = bs.replace('bs', '') + seq = seq.replace('seq', '') + block = block.replace('block', '') + fout.write('%s,%s\t%.5f\t%.5f\t%.5f\n' % + (seq, block, compute_time, non_compute_time, + compute_time + non_compute_time)) + + +if __name__ == '__main__': + dirbase = 'log' + dirname = 'bs4_seq1024_block128' + input_file_name = os.listdir(os.path.join(dirbase, dirname))[0] + output_file = os.path.join(dirbase, dirname + '.tsv') + parse(os.path.join(dirbase, dirname, input_file_name), output_file) + + with open('figures/bigbird_compute_vs_noncompute.tsv', 'w') as fout: + fout.write('test name\tcompute\tnon-compute\ttotal\n') + for log_file in glob.glob('log/*.tsv'): + stats(log_file, fout) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/utils.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/utils.py new file mode 100644 index 000000000..4353f8a49 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/pytorch/utils.py @@ -0,0 +1,87 @@ +import torch +from torch import Tensor + + +def assert_rank(expected_rank): + expected_rank_dict = {} + if isinstance(expected_rank, int): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + +def get_shape_list(tensor, expected_rank=None): + if expected_rank is not None: + assert_rank(expected_rank) + + shape = tensor.size() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + assert False, "Static shape not available for {}".format(tensor) + + dyn_shape = tensor.size() + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + exp_blocked_to_pad = torch.cat( + (to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], + to_blocked_mask[:, 3:-1]), 2) + band_mask = torch.einsum("blq,blk->blqk", + 
from_blocked_mask[:, 2:-2].float(), + exp_blocked_to_pad.float()) + band_mask = torch.unsqueeze(band_mask, 1) + return band_mask + + +def torch_gather4d(input_tensor: Tensor, indexes: Tensor) -> Tensor: + # input_tensor = torch.from_numpy(input_tensor) + # indexes = torch.from_numpy(indexes).long() + + indexes = indexes.long() # [16, 1, 14, 3] + indexes = torch.unsqueeze(indexes, -1) # [16, 1, 14, 3, 1] + indexes = torch.unsqueeze(indexes, -1) # [16, 1, 14, 3, 1, 1] + indexes = indexes.expand(-1, -1, -1, -1, -1, input_tensor.size(-1)).to( + input_tensor.device) # [16, 1, 14, 3, 1, 64] + + input_tensor = torch.unsqueeze(input_tensor, 1) # [16, 1, 16, 64] + input_tensor = torch.unsqueeze(input_tensor, 1) # [16, 1, 1, 16, 64] + input_tensor = torch.unsqueeze(input_tensor, 1) # [16, 1, 1, 1, 16, 64] + input_tensor = input_tensor.expand(-1, indexes.size(1), indexes.size(2), + indexes.size(3), -1, + -1) # [16, 1, 14, 3, 16, 64] + output_tensor = torch.gather(input_tensor, 4, + indexes) # [16, 1, 14, 3, 1, 64] + output_tensor = output_tensor.view( + (indexes.size(0), indexes.size(1), indexes.size(2), indexes.size(3), + input_tensor.size(-1))) # [16, 1, 14, 3, 64] + + return output_tensor + + +def torch_gather5d(input_tensor: Tensor, indexes: Tensor) -> Tensor: + # input_tensor = torch.from_numpy(input_tensor) + # indexes = torch.from_numpy(indexes).long() + + indexes = indexes.long() + indexes = torch.unsqueeze(indexes, -1) + indexes = torch.unsqueeze(indexes, -1) + indexes = indexes.expand(-1, -1, -1, -1, input_tensor.size(-2), + input_tensor.size(-1)).to(input_tensor.device) + + input_tensor = torch.unsqueeze(input_tensor, 2) + input_tensor = input_tensor.expand(-1, -1, indexes.size(2), -1, -1, -1) + + output_tensor = torch.gather(input_tensor, 3, indexes) + + return output_tensor diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/main.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/main.py new file mode 100644 index 000000000..12e91d12c --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/main.py @@ -0,0 +1,196 @@ +import torch + +import triton +import triton.language as tl +import argparse +import types + +from time import time +from op import * + + +class TritonBigbird(): + global_size = 1 + window_size = 3 + random_size = 3 + warm = 5 + + device = 'cuda' + dtype = torch.float32 + + def __init__(self, cmd_args): + self.seq_len = cmd_args.seq_len + self.batch_size = cmd_args.batch_size + self.hidden_size = cmd_args.hidden_size + self.block_size = cmd_args.block_size + self.block_num = self.seq_len // self.block_size + + def init_random_index(self, random_index): + for i in range(self.block_num): + for j in range(self.random_size): + random_index[i, j] = ( + (i * self.random_size + j) // self.random_size) + + def test(self): + padding_size = (self.window_size // 2) * self.block_size + Q = torch.randn( + (self.batch_size, self.seq_len, self.hidden_size), + device=self.device, + dtype=self.dtype) + K = torch.randn( + (self.batch_size, self.hidden_size, + padding_size + self.seq_len + padding_size), + device=self.device, + dtype=self.dtype) + V = torch.randn( + (self.batch_size, padding_size + self.seq_len + padding_size, + self.hidden_size), + device=self.device, + dtype=self.dtype) + QK = torch.zeros( + [self.batch_size, self.seq_len, self.seq_len], + device=self.device, + dtype=self.dtype) + softmax_QK = torch.zeros( + [self.batch_size, self.seq_len, self.seq_len], + device=self.device, + dtype=self.dtype) + 
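+ # Note: QK and softmax_QK are allocated as dense [batch_size, seq_len, seq_len]
+ # buffers even though the Triton kernels invoked later appear to write only
+ # the two global query-block rows plus the selected sparse
+ # (global/window/random) blocks; untouched entries keep their initial zeros.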
O = torch.zeros( + [self.batch_size, self.seq_len, self.hidden_size], + device=self.device, + dtype=self.dtype) + random_index = torch.zeros( + [self.block_num, self.random_size], + device=self.device, + dtype=torch.int32) + + para = (self.batch_size, self.global_size, self.random_size, + self.window_size, self.hidden_size, self.seq_len, + self.seq_len // self.block_size, self.block_size) + + for i in range(self.warm): + global_qk(Q, K, QK, para) + global_softmax(QK, softmax_QK, para) + global_wv(softmax_QK, V, O, para) + sparse_qk(Q, K, QK, random_index, para) + sparse_softmax(QK, softmax_QK, para) + sparse_wv(softmax_QK, V, O, random_index, para) + + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + + Q = torch.randn( + (self.batch_size, self.seq_len, self.hidden_size), + device=self.device, + dtype=self.dtype) + K = torch.randn( + (self.batch_size, self.hidden_size, + padding_size + self.seq_len + padding_size), + device=self.device, + dtype=self.dtype) + V = torch.randn( + (self.batch_size, padding_size + self.seq_len + padding_size, + self.hidden_size), + device=self.device, + dtype=self.dtype) + QK = torch.zeros( + [self.batch_size, self.seq_len, self.seq_len], + device=self.device, + dtype=self.dtype) + softmax_QK = torch.zeros( + [self.batch_size, self.seq_len, self.seq_len], + device=self.device, + dtype=self.dtype) + O = torch.zeros( + [self.batch_size, self.seq_len, self.hidden_size], + device=self.device, + dtype=self.dtype) + random_index = torch.zeros( + [self.block_num, self.random_size], + device=self.device, + dtype=torch.int32) + self.init_random_index(random_index) + + global_qk(Q, K, QK, para) + global_softmax(QK, softmax_QK, para) + global_wv(softmax_QK, V, O, para) + sparse_qk(Q, K, QK, random_index, para) + sparse_softmax(QK, softmax_QK, para) + sparse_wv(softmax_QK, V, O, random_index, para) + + end_event.record() + torch.cuda.synchronize() + + elapsed = start_event.elapsed_time(end_event) + + print( + f"block_size\t{self.block_size}\tseq_length\t{self.seq_len}, batch_size\t{self.batch_size}\t" + f"hidden_size\t{self.hidden_size}\tTriton(ms)\t{elapsed}") + + return elapsed + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Bigbird') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=4096) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=32) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=512) + parser.add_argument( + '--block_size', type=int, help='Block size', default=64) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +def output_file(OUTPUT_FILE, cmd_args, run_time): + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write( + f"{cmd_args.batch_size}\t{cmd_args.seq_len}\t{cmd_args.hidden_size}\t{cmd_args.block_size}\t" + f"{run_time}\n") + + +if __name__ == "__main__": + cmd_args = parse_test_args() + DEFAULT_TEST = cmd_args.default_test + OUTPUT_FILE = cmd_args.output_file + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + 
"batch size\t sequence length\thidden\tblock size\telapsed time(ms)\n" + ) + if not DEFAULT_TEST: + run_time = TritonBigbird(cmd_args).test() + output_file(OUTPUT_FILE, cmd_args, run_time) + else: + test1_cmd_args = types.SimpleNamespace( + seq_len=4096, batch_size=32, hidden_size=512, block_size=64) + run_time = TritonBigbird(test1_cmd_args).test() + output_file(OUTPUT_FILE, test1_cmd_args, run_time) + + test2_cmd_args = types.SimpleNamespace( + seq_len=8192, batch_size=32, hidden_size=512, block_size=64) + run_time = TritonBigbird(test2_cmd_args).test() + output_file(OUTPUT_FILE, test2_cmd_args, run_time) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/__init__.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/__init__.py new file mode 100644 index 000000000..5d826421f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/__init__.py @@ -0,0 +1,20 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .global_qk import * +from .global_softmax import * +from .global_wv import * +from .sparse_qk import * +from .sparse_wv import * +from .sparse_softmax import * + +__all__ = [ + "global_qk", + "global_softmax", + "global_wv", + "sparse_qk", + "sparse_wv", + "sparse_softmax", +] diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_qk.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_qk.py new file mode 100644 index 000000000..2da2920b0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_qk.py @@ -0,0 +1,137 @@ +import torch + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['global_qk'] + + +@triton.autotune( + configs=[ + triton.Config({ + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 64 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 64 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 128 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 128 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 256 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 256 + }, num_stages=2, num_warps=2), + ], + key=['hidden_size'], +) +@triton.jit +def global_qk_kernel( + Q_ptr, + K_ptr, + QK_ptr, + window_size, + batch_size, + hidden_size, + seq_len, + block_num, + stride_Qb, + stride_Qs, + stride_Qh, + stride_Kb, + stride_Kh, + stride_Ks, + stride_QKb, + stride_QKs, + stride_QKh, + block_size: tl.constexpr, + BLOCK_HIDDEN: tl.constexpr, +): + pid_batch = tl.program_id(0) + pid_block_m = tl.program_id(1) + pid_block_n = tl.program_id(2) + offset_batch = pid_batch * stride_Qb + # offset_block = diff_block * block_size * stride_Qs + # diff_block: the first or the last block + diff_block = 0 + if pid_block_m == 1: + diff_block = (block_num - 1) + Q_block_ptr = tl.make_block_ptr( + base=Q_ptr + offset_batch, + shape=(seq_len, hidden_size), + strides=(stride_Qs, stride_Qh), + offsets=(diff_block * block_size, 0), + block_shape=(block_size, BLOCK_HIDDEN), + order=(1, 0), + ) + K_block_ptr = tl.make_block_ptr( + base=K_ptr + pid_batch * stride_Kb, + shape=(hidden_size, seq_len), + strides=(stride_Kh, stride_Ks), + # padding + offsets=(0, (window_size // 2 * block_size)), + 
block_shape=(BLOCK_HIDDEN, block_size), + order=(1, 0), + ) + qk = tl.zeros([block_size, block_size], dtype=tl.float32) + for _ in range(0, hidden_size, BLOCK_HIDDEN): + q = tl.load(Q_block_ptr) + k = tl.load(K_block_ptr) + qk += tl.dot(q, k) + Q_block_ptr = tl.advance(Q_block_ptr, (0, BLOCK_HIDDEN)) + K_block_ptr = tl.advance(K_block_ptr, (BLOCK_HIDDEN, 0)) + + offset_qk_batch = pid_batch * stride_QKb + diff_qk_m = diff_block * block_size + tl.arange(0, block_size) + diff_qk_n = pid_block_n * block_size + tl.arange(0, block_size) + QK_ptrs = (QK_ptr + offset_qk_batch + diff_qk_m[:, None] * stride_QKs + + diff_qk_n[None, :] * stride_QKh) + tl.store(QK_ptrs, qk) + + +def global_qk(Q, K, QK, para): + (batch_size, global_size, random_size, window_size, hidden_size, seq_len, + block_num, block_size) = para + + def grid(META): + return ( + # batch, Q(global row), K + batch_size, + 2, + triton.cdiv(seq_len, block_size), + ) + + global_qk_kernel[grid]( + Q_ptr=Q, + K_ptr=K, + QK_ptr=QK, + window_size=window_size, + batch_size=batch_size, + hidden_size=hidden_size, + seq_len=seq_len, + block_num=block_num, + stride_Qb=Q.stride(0), + stride_Qs=Q.stride(1), + stride_Qh=Q.stride(2), + stride_Kb=K.stride(0), + stride_Kh=K.stride(1), + stride_Ks=K.stride(2), + stride_QKb=QK.stride(0), + stride_QKs=QK.stride(1), + stride_QKh=QK.stride(2), + block_size=block_size, + ) + return diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_softmax.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_softmax.py new file mode 100644 index 000000000..5485b16c9 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_softmax.py @@ -0,0 +1,84 @@ +import torch + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['global_softmax'] + + +@triton.jit +def global_softmax_kernel( + QK_ptr, + softmax_QK_ptr, + window_size, + batch_size, + hidden_size, + seq_len, + block_num, + stride_QKb, + stride_QKs, + stride_QKh, + block_size: tl.constexpr, + col_size: tl.constexpr, + col_load: tl.constexpr, +): + pid_batch = tl.program_id(0) + pid_block_m = tl.program_id(1) + offset_batch = pid_batch * stride_QKb + # offset_block = diff_block * block_size * stride_Qs + # diff_block: the first or the last block + diff_block = 0 + if pid_block_m == 1: + diff_block = block_num - 1 + + col_offsets = tl.arange(0, col_load) + row_offsets = diff_block * block_size + tl.arange(0, block_size) + QK_ptrs = QK_ptr + offset_batch + row_offsets[:, + None] * stride_QKs + col_offsets[None, :] * stride_QKh + qk = tl.load( + QK_ptrs, mask=col_offsets[None, :] < col_size, other=-float('inf')) + # qk = tl.load(QK_block_ptr) + qk_minus_max = qk - tl.max(qk, axis=1)[:, None] + numerator = tl.exp(qk_minus_max) + denominator = tl.sum(numerator, axis=1) + softmax_output = numerator / denominator[:, None] + + offset_qk_batch = pid_batch * stride_QKb + diff_qk_m = diff_block * block_size + tl.arange(0, block_size) + diff_qk_n = tl.arange(0, col_load) + softmax_QK_ptrs = (softmax_QK_ptr + offset_qk_batch + + diff_qk_m[:, None] * stride_QKs + diff_qk_n[None, :]) + tl.store( + softmax_QK_ptrs, softmax_output, mask=col_offsets[None, :] < col_size) + + +def global_softmax(QK, softmax_QK, para): + (batch_size, global_size, random_size, window_size, hidden_size, seq_len, + block_num, block_size) = para + + def grid(META): + return ( + # batch, Q(global_row), + batch_size, + 2, + ) + + global_softmax_kernel[grid]( + QK_ptr=QK, 
+ softmax_QK_ptr=softmax_QK, + window_size=window_size, + batch_size=batch_size, + hidden_size=hidden_size, + seq_len=seq_len, + block_num=block_num, + stride_QKb=QK.stride(0), + stride_QKs=QK.stride(1), + stride_QKh=QK.stride(2), + block_size=block_size, + col_size=seq_len, + col_load=triton.next_power_of_2(seq_len), + ) + return diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_wv.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_wv.py new file mode 100644 index 000000000..6257f5f93 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/global_wv.py @@ -0,0 +1,214 @@ +import torch + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['global_wv'] + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 64 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 64 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 64, + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 64, + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 64, + 'BLOCK_HIDDEN': 64 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 64, + 'BLOCK_HIDDEN': 64 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 128 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 128 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 128, + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 128, + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 256, + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 256, + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 256 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 32, + 'BLOCK_HIDDEN': 256 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 128, + 'BLOCK_HIDDEN': 128 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 128, + 'BLOCK_HIDDEN': 128 + }, num_stages=2, num_warps=2), + triton.Config( + { + 'BLOCK_SEQ': 256, + 'BLOCK_HIDDEN': 256 + }, num_stages=4, num_warps=4), + triton.Config( + { + 'BLOCK_SEQ': 256, + 'BLOCK_HIDDEN': 256 + }, num_stages=2, num_warps=2), + ], + key=['seq_len', 'hidden_size'], +) +@triton.jit +def global_wv_kernel( + W_ptr, + V_ptr, + O_ptr, + window_size, + batch_size, + hidden_size, + seq_len, + block_num, + stride_Wb, + stride_Ws, + stride_Wh, + stride_Vb, + stride_Vh, + stride_Vs, + stride_Ob, + stride_Os, + stride_Oh, + block_size: tl.constexpr, + BLOCK_SEQ: tl.constexpr, + BLOCK_HIDDEN: tl.constexpr, +): + pid_batch = tl.program_id(0) + pid_block_m = tl.program_id(1) + pid_block_n = tl.program_id(2) + # offset_batch = pid_batch * global_size * stride_b + # offset_block = diff_block * block_size * stride_s + # diff_block: the first or the last block + diff_block = 0 + if pid_block_m == 1: + diff_block = block_num - 1 + W_block_ptr = tl.make_block_ptr( + base=W_ptr + pid_batch * stride_Wb, + shape=(hidden_size, seq_len), + 
strides=(stride_Ws, stride_Wh), + offsets=(diff_block * block_size, 0), + block_shape=(block_size, BLOCK_SEQ), + order=(1, 0), + ) + V_block_ptr = tl.make_block_ptr( + base=V_ptr + pid_batch * stride_Vb, + shape=(seq_len, hidden_size), + strides=(stride_Vs, stride_Vh), + offsets=((window_size // 2 * block_size), pid_block_n * BLOCK_HIDDEN), + block_shape=(BLOCK_SEQ, BLOCK_HIDDEN), + order=(1, 0), + ) + + o = tl.zeros([block_size, BLOCK_HIDDEN], dtype=tl.float32) + for _ in range(0, seq_len, BLOCK_SEQ): + w = tl.load(W_block_ptr) + v = tl.load(V_block_ptr) + o += tl.dot(w, v) + W_block_ptr = tl.advance(W_block_ptr, (0, BLOCK_SEQ)) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_SEQ, 0)) + + offset_qk_batch = pid_batch * stride_Ob + diff_qk_m = diff_block * block_size + tl.arange(0, block_size) + diff_qk_n = pid_block_n * BLOCK_HIDDEN + tl.arange(0, BLOCK_HIDDEN) + O_ptrs = (O_ptr + offset_qk_batch + diff_qk_m[:, None] * stride_Os + + diff_qk_n[None, :] * stride_Oh) + tl.store(O_ptrs, o) + + +def global_wv(W, V, O, para): + (batch_size, global_size, random_size, window_size, hidden_size, seq_len, + block_num, block_size) = para + + def grid(META): + return ( + # batch, global_row, hidden + batch_size, + 2, + triton.cdiv(hidden_size, META['BLOCK_HIDDEN']), + ) + + global_wv_kernel[grid]( + W_ptr=W, + V_ptr=V, + O_ptr=O, + window_size=window_size, + batch_size=batch_size, + hidden_size=hidden_size, + seq_len=seq_len, + block_num=block_num, + stride_Wb=W.stride(0), + stride_Ws=W.stride(1), + stride_Wh=W.stride(2), + stride_Vb=V.stride(0), + stride_Vh=V.stride(1), + stride_Vs=V.stride(2), + stride_Ob=O.stride(0), + stride_Os=O.stride(1), + stride_Oh=O.stride(2), + block_size=block_size, + ) + return diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_qk.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_qk.py new file mode 100644 index 000000000..6cb8b204d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_qk.py @@ -0,0 +1,162 @@ +import torch + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['sparse_qk'] + + +@triton.autotune( + configs=[ + triton.Config({ + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 64 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 64 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 128 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 128 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 256 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 256 + }, num_stages=2, num_warps=2), + ], + key=['hidden_size'], +) +@triton.jit +def sparse_qk_kernel( + Q_ptr, + K_ptr, + QK_ptr, + random_ptr, + window_size, + batch_size, + global_size, + hidden_size, + seq_len, + block_num, + stride_randomb, + stride_randomr, + stride_Qb, + stride_Qs, + stride_Qh, + stride_Kb, + stride_Kh, + stride_Ks, + stride_QKb, + stride_QKs, + stride_QKh, + block_size: tl.constexpr, + BLOCK_HIDDEN: tl.constexpr, +): + pid_batch = tl.program_id(0) + pid_block_m = tl.program_id(1) + pid_block_n = tl.program_id(2) + + # random_ptr: [block_num, random_size] + random_ptr = (random_ptr + pid_block_m * stride_randomb + + (pid_block_n - global_size * 2) * stride_randomr) + random_index = tl.load(random_ptr) + + diff_block_q = pid_block_m + if 
pid_block_n < global_size * 2: + # global_size == 1, but in fast (global_size * 2) + diff_block_k = tl.where(pid_block_n == 0, 0, block_num - 1) + elif pid_block_n >= global_size * 2 + window_size: + diff_block_k = random_index + else: + diff_block_k = pid_block_m - 1 + + offset_global_row = global_size * block_size * stride_Qs + # offset_batch = pid_batch * BLOCK_BATCH * stride_b + # offset_block = diff_block * block_size * stride_s + + Q_block_ptr = tl.make_block_ptr( + base=Q_ptr + pid_batch * stride_Qb + offset_global_row, + shape=(seq_len - global_size * block_size, hidden_size), + strides=(stride_Qs, stride_Qh), + offsets=(diff_block_q * block_size, 0), + block_shape=(block_size, BLOCK_HIDDEN), + order=(1, 0), + ) + K_block_ptr = tl.make_block_ptr( + base=K_ptr + pid_batch * stride_Kb, + shape=(hidden_size, seq_len + (window_size // 2 * 2 * block_size)), + strides=(stride_Kh, stride_Ks), + # padding + global/window/random index + offsets=(0, + (window_size // 2 * block_size) + diff_block_k * block_size), + block_shape=(BLOCK_HIDDEN, block_size), + order=(1, 0), + ) + + qk = tl.zeros([block_size, block_size], dtype=tl.float32) + for _ in range(0, hidden_size, BLOCK_HIDDEN): + q = tl.load(Q_block_ptr) + k = tl.load(K_block_ptr) + qk += tl.dot(q, k) + Q_block_ptr = tl.advance(Q_block_ptr, (0, BLOCK_HIDDEN)) + K_block_ptr = tl.advance(K_block_ptr, (BLOCK_HIDDEN, 0)) + + offset_global_row = 1 * block_size * stride_QKs + offset_qk_batch = pid_batch * stride_QKb + diff_qk_m = diff_block_q * block_size + tl.arange(0, block_size) + diff_qk_n = pid_block_n * block_size + tl.arange(0, block_size) + QK_ptrs = ( + QK_ptr + offset_qk_batch + offset_global_row + + diff_qk_m[:, None] * stride_QKs + diff_qk_n[None, :] * stride_QKh) + tl.store(QK_ptrs, qk) + + +def sparse_qk(Q, K, QK, random_index, para): + (batch_size, global_size, random_size, window_size, hidden_size, seq_len, + block_num, block_size) = para + + def grid(META): + return ( + # batch, Q, K + batch_size, + (triton.cdiv(seq_len, block_size) - 2), + (global_size * 2 + window_size + random_size), + ) + + sparse_qk_kernel[grid]( + Q_ptr=Q, + K_ptr=K, + QK_ptr=QK, + random_ptr=random_index, + window_size=window_size, + batch_size=batch_size, + global_size=global_size, + hidden_size=hidden_size, + seq_len=seq_len, + block_num=block_num, + stride_randomb=random_index.stride(0), + stride_randomr=random_index.stride(1), + stride_Qb=Q.stride(0), + stride_Qs=Q.stride(1), + stride_Qh=Q.stride(2), + stride_Kb=K.stride(0), + stride_Kh=K.stride(1), + stride_Ks=K.stride(2), + stride_QKb=QK.stride(0), + stride_QKs=QK.stride(1), + stride_QKh=QK.stride(2), + block_size=block_size, + ) + return diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_softmax.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_softmax.py new file mode 100644 index 000000000..bb775d971 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_softmax.py @@ -0,0 +1,81 @@ +import torch + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['sparse_softmax'] + + +@triton.jit +def sparse_softmax_kernel( + QK_ptr, + softmax_QK_ptr, + window_size, + batch_size, + global_size, + hidden_size, + seq_len, + block_num, + stride_QKb, + stride_QKs, + stride_QKh, + block_size: tl.constexpr, + col_size: tl.constexpr, +): + pid_batch = tl.program_id(0) + pid_block_m = tl.program_id(1) + offset_batch = pid_batch * stride_QKb + offset_global_row = 1 
* block_size * stride_QKs + + QK_block_ptr = tl.make_block_ptr( + base=QK_ptr + offset_batch + offset_global_row, + shape=(seq_len, seq_len), + strides=(stride_QKs, stride_QKh), + offsets=(pid_block_m * block_size, 0), + block_shape=(block_size, col_size), + order=(1, 0), + ) + qk = tl.load(QK_block_ptr) + qk_minus_max = qk - tl.max(qk, axis=1)[:, None] + numerator = tl.exp(qk_minus_max) + denominator = tl.sum(numerator, axis=1) + softmax_output = numerator / denominator[:, None] + + offset_qk_batch = pid_batch * stride_QKb + diff_qk_m = pid_block_m * block_size + tl.arange(0, block_size) + diff_qk_n = tl.arange(0, col_size) + softmax_QK_ptrs = (softmax_QK_ptr + offset_qk_batch + offset_global_row + + diff_qk_m[:, None] * stride_QKs + diff_qk_n[None, :]) + tl.store(softmax_QK_ptrs, softmax_output) + + +def sparse_softmax(QK, softmax_QK, para): + (batch_size, global_size, random_size, window_size, hidden_size, seq_len, + block_num, block_size) = para + + def grid(META): + return ( + # batch, Q(global_row exception), + batch_size, + triton.cdiv(seq_len, block_size) - 2, + ) + + sparse_softmax_kernel[grid]( + QK_ptr=QK, + softmax_QK_ptr=softmax_QK, + window_size=window_size, + batch_size=batch_size, + global_size=global_size, + hidden_size=hidden_size, + seq_len=seq_len, + block_num=block_num, + stride_QKb=QK.stride(0), + stride_QKs=QK.stride(1), + stride_QKh=QK.stride(2), + block_size=block_size, + col_size=(global_size * 2 + window_size + random_size), + ) + return diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_wv.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_wv.py new file mode 100644 index 000000000..cabb58552 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/triton/op/sparse_wv.py @@ -0,0 +1,183 @@ +import torch + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['sparse_wv'] + + +@triton.autotune( + configs=[ + triton.Config({ + 'BLOCK_HIDDEN': 32 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 32 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 64 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 64 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 128 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 128 + }, num_stages=2, num_warps=2), + triton.Config({ + 'BLOCK_HIDDEN': 256 + }, num_stages=4, num_warps=4), + triton.Config({ + 'BLOCK_HIDDEN': 256 + }, num_stages=2, num_warps=2), + ], + key=['hidden_size'], +) +@triton.jit +def sparse_wv_kernel( + W_ptr, + V_ptr, + O_ptr, + random_ptr, + window_size, + batch_size, + global_size, + hidden_size, + seq_len, + block_num, + random_size, + stride_randomb, + stride_randomr, + stride_Wb, + stride_Ws, + stride_Wh, + stride_Vb, + stride_Vh, + stride_Vs, + stride_Ob, + stride_Os, + stride_Oh, + block_size: tl.constexpr, + BLOCK_HIDDEN: tl.constexpr, +): + pid_batch = tl.program_id(0) + pid_block_m = tl.program_id(1) + pid_block_n = tl.program_id(2) + + offset_global_row = global_size * block_size * stride_Ws + + # offset_batch = pid_batch * BLOCK_BATCH * stride_b + # offset_block = diff_block * block_size * stride_s + + sparse_seq_len = (global_size * 2 + window_size + random_size) + + W_block_ptr = tl.make_block_ptr( + base=W_ptr + pid_batch * stride_Wb + offset_global_row, + shape=(seq_len - global_size * 2, sparse_seq_len), + strides=(stride_Ws, stride_Wh), + offsets=(pid_block_m * block_size, 
0), + block_shape=(block_size, block_size), + order=(1, 0), + ) + V_block_ptr = tl.make_block_ptr( + base=V_ptr + pid_batch * stride_Vb, + shape=(seq_len + (window_size // 2 * 2 * block_size), hidden_size), + strides=(stride_Vs, stride_Vh), + # padding + offsets=((window_size // 2 * block_size), pid_block_n * BLOCK_HIDDEN), + block_shape=(block_size, BLOCK_HIDDEN), + order=(1, 0), + ) + + o = tl.zeros([block_size, BLOCK_HIDDEN], dtype=tl.float32) + + # global_size: first + w = tl.load(W_block_ptr) + v = tl.load(V_block_ptr) + o += tl.dot(w, v) + W_block_ptr = tl.advance(W_block_ptr, (0, block_size)) + V_block_ptr = tl.advance(V_block_ptr, (block_size * (block_num - 1), 0)) + + # global_size: last + w = tl.load(W_block_ptr) + v = tl.load(V_block_ptr) + o += tl.dot(w, v) + W_block_ptr = tl.advance(W_block_ptr, (0, block_size)) + V_block_ptr = tl.advance(V_block_ptr, (-block_size * (block_num - 1), 0)) + + # window_size + diff_block = pid_block_m - 1 + V_block_ptr = tl.advance(V_block_ptr, (diff_block * block_size, 0)) + for i in range(window_size): + w = tl.load(W_block_ptr) + v = tl.load(V_block_ptr) + o += tl.dot(w, v) + W_block_ptr = tl.advance(W_block_ptr, (0, block_size)) + V_block_ptr = tl.advance(V_block_ptr, (block_size, 0)) + + # random_size + V_block_ptr = tl.advance(V_block_ptr, + ((-diff_block - window_size) * block_size, 0)) + random_ptr = random_ptr + pid_block_m * stride_randomb + for i in range(random_size): + random_ptr += i * stride_randomr + random_index = tl.load(random_ptr) + V_block_ptr = tl.advance(V_block_ptr, (random_index * block_size, 0)) + w = tl.load(W_block_ptr) + v = tl.load(V_block_ptr) + o += tl.dot(w, v) + W_block_ptr = tl.advance(W_block_ptr, (0, block_size)) + V_block_ptr = tl.advance(V_block_ptr, (-random_index * block_size, 0)) + + offset_qk_batch = pid_batch * stride_Ob + offset_global_row = global_size * block_size * stride_Os + diff_qk_m = pid_block_m * block_size + tl.arange(0, block_size) + diff_qk_n = pid_block_n * BLOCK_HIDDEN + tl.arange(0, BLOCK_HIDDEN) + O_ptrs = (O_ptr + offset_qk_batch + offset_global_row + + diff_qk_m[:, None] * stride_Os + diff_qk_n[None, :] * stride_Oh) + tl.store(O_ptrs, o) + + +def sparse_wv(W, V, O, random_index, para): + (batch_size, global_size, random_size, window_size, hidden_size, seq_len, + block_num, block_size) = para + + def grid(META): + return ( + # batch, W(global_row exception), hidden + batch_size, + (triton.cdiv(seq_len, block_size) - 2), + triton.cdiv(hidden_size, META['BLOCK_HIDDEN']), + ) + + sparse_wv_kernel[grid]( + W_ptr=W, + V_ptr=V, + O_ptr=O, + random_ptr=random_index, + window_size=window_size, + batch_size=batch_size, + global_size=global_size, + hidden_size=hidden_size, + seq_len=seq_len, + block_num=block_num, + random_size=random_size, + stride_randomb=random_index.stride(0), + stride_randomr=random_index.stride(1), + stride_Wb=W.stride(0), + stride_Ws=W.stride(1), + stride_Wh=W.stride(2), + stride_Vb=V.stride(0), + stride_Vh=V.stride(1), + stride_Vs=V.stride(2), + stride_Ob=O.stride(0), + stride_Os=O.stride(1), + stride_Oh=O.stride(2), + block_size=block_size, + ) + return diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py new file mode 100644 index 000000000..5496d6568 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py @@ -0,0 +1,447 @@ +from asyncio import gather +import numpy as np +import argparse +import types + + +def 
tvm_bigbird(batch_size, hidden_size, len, block_size, window_size, + random_size, global_size, OUTPUT_FILE): + from tvm import relay + from tvm.relay import testing + import tvm + from tvm import te + from tvm.contrib import graph_executor + import tvm.testing + import math + + import tvm.auto_scheduler as auto_scheduler + from tvm.autotvm.tuner import XGBTuner + from tvm import topi, autotvm + import logging + from datetime import datetime + import sys + import argparse + # Enable debug logs + import logging + import os + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + logging.basicConfig() + logging.getLogger().setLevel(logging.DEBUG) + + target_name = 'cuda -libs=cublas' + target = tvm.target.Target(target_name) + dtype, itype = 'float32', 'int32' + + # udf0 + # qk + @auto_scheduler.register_workload + def bigbird_qk_global_row(N, L, B, H, W, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, 2 * G, B, B * L) + + def qk_mm(n, l, i, j): + return te.sum( + X[n, tvm.tir.if_then_else(l < G, l, L - G + l), i, k] * + Y[n, j // B + W // 2, j % B, k], + axis=k) + + R = te.compute(out_shape, qk_mm, name='R') + + return [X, Y, R] + + # softmax + @auto_scheduler.register_workload + def global_row_softmax(N, L, B, G, dtype): + x = te.placeholder((N, 2 * G, B, B * L)) + out = topi.nn.softmax(x) / (math.sqrt(hidden_size)) + + return [x, out] + + # wv + @auto_scheduler.register_workload + def bigbird_wv_global_row(N, L, B, H, W, G, dtype): + X = te.placeholder((N, 2 * G, B, B * L), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, B * L), name='k') + out_shape = (N, 2 * G, B, H) + + def wv_mm(n, l, i, j): + return te.sum( + X[n, l, i, k] * Y[n, W // 2 + k // B, k % B, j], axis=k) + + R = te.compute( + out_shape, + wv_mm, + name='R', + ) + + return [X, Y, R] + + # udf1 + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_global_window(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, B * (W + 2)) + + def qk_sddmm(n, l, i, j): + return te.sum( + X[n, l + G, i, k] * + Y[n, + tvm.tir.if_then_else( + j < G * B, W // 2, + tvm.tir.if_then_else(j < 2 * G * B, L - 1, l + j // B - + 2)), j % B, k], + axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, Y, Res] + + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_random(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + GatheredK = te.placeholder((N, R, B, H), name='GatheredK', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, B * R) + + def qk_sddmm(n, l, i, j): + return te.sum( + X[n, l + G, i, k] * GatheredK[n, j // B, j % B, k], axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, GatheredK, Res] + + @auto_scheduler.register_workload + def sparse_row_concat_softmax(N, L, B, H, W, R, G, dtype): + X = te.placeholder( + (N, L - 2 * G, B, B * (W + 2)), name='X', dtype=dtype) + Y = te.placeholder((N, L - 2 * G, B, B * R), name='Y', dtype=dtype) + concat_shape = (N, L - 2 * G, B, B * (W + R + 2)) + weight = te.compute( + concat_shape, + lambda i, j, k, l: tvm.tir.if_then_else( + l < B * (W + 2), X[i, j, k, 
l], Y[i, j, k, l - B * (W + 2)]) + ) + out = topi.nn.softmax(weight) / (math.sqrt(hidden_size)) + + return [X, Y, out] + + @auto_scheduler.register_workload + def sparse_row_softmax(N, L, B, H, W, R, G, dtype): + x = te.placeholder((N, L - 2 * G, B, B * (W + R + 2))) + out = topi.nn.softmax(x) / (math.sqrt(hidden_size)) + + return [x, out] + + # N = 1, L = 4096, B = 1, H = 64, W = 1, R = 1, G = 1 + + @auto_scheduler.register_workload + def bigbird_wv_SPMM_global_window(N, L, B, H, W, R, G, dtype): + X = te.placeholder( + (N, L - 2 * G, B, B * (W + R + 2)), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, B * (W + 2 * G)), name='k1') + out_shape = (N, L - 2 * G, B, H) + # (1, 4094, 1, 64) + res = te.compute(out_shape, lambda n, l, i, j: te.sum( + X[n, l, i, k]*Y[ + n, + tvm.tir.if_then_else( + k < G*B, W//2, tvm.tir.if_then_else(k < 2*G*B, L-1+W//2, l+k//B-2)), + k % B, + j], + axis=k), + name='res') + + return [X, Y, res] + + @auto_scheduler.register_workload + def bigbird_wv_SPMM_random(N, L, B, H, W, R, G, dtype): + X = te.placeholder( + (N, L - 2 * G, B, B * (W + R + 2)), name='X', dtype=dtype) + GatheredV = te.placeholder((N, R, B, H), name='GatheredV', dtype=dtype) + k = te.reduce_axis((0, B * R), name='k') + out_shape = (N, L - 2 * G, B, H) + + res = te.compute( + out_shape, lambda n, l, i, j: te.sum( + X[n, l, i, k+B*(W+2)] * GatheredV[n, k//B, k % B, j], axis=k), + name='res') + + return [X, GatheredV, res] + + @auto_scheduler.register_workload + def bigbird_wv_SPMM_reduce(N, L, B, H, W, R, G, dtype): + out_shape = (N, L - 2 * G, B, H) + X = te.placeholder(out_shape, name='X', dtype=dtype) + Y = te.placeholder(out_shape, name='Y', dtype=dtype) + + res = te.compute( + out_shape, + lambda n, l, i, j: X[n, l, i, j] + Y[n, l, i, j], + name='res') + + return [X, Y, res] + + args = (batch_size, len // block_size, block_size, hidden_size, + window_size, random_size, global_size, dtype) + + tasks = [ + tvm.auto_scheduler.SearchTask( + func=bigbird_qk_global_row, + args=(batch_size, len // block_size, block_size, hidden_size, + window_size, global_size, dtype), + target=target), + tvm.auto_scheduler.SearchTask( + func=global_row_softmax, + args=(batch_size, len // block_size, block_size, global_size, + dtype), + target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_wv_global_row, + args=(batch_size, len // block_size, block_size, hidden_size, + window_size, global_size, dtype), + target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_qk_SDDMM_global_window, args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_qk_SDDMM_random, args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=sparse_row_concat_softmax, args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_wv_SPMM_global_window, args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_wv_SPMM_random, args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_wv_SPMM_reduce, args=args, target=target), + ] + + shape = (batch_size, len, block_size, hidden_size, window_size, + global_size) + + log_file = f'ansor.{shape}.json' + + tuner = auto_scheduler.TaskScheduler(tasks) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=200, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, + ) + + # tuner.tune(tune_option) + + funcs = [] + + for task in tasks: + sch, args = task.apply_best(log_file) + funcs.append(tvm.build(sch, 
args, target)) + + dev = tvm.cuda() + + Q = np.ones( + [batch_size, len // block_size, block_size, hidden_size], + dtype="float32") + K = np.ones( + [ + batch_size, + len // block_size + window_size // 2 + window_size // 2, + block_size, hidden_size + ], + dtype="float32") + V = np.ones( + [ + batch_size, + len // block_size + window_size // 2 + window_size // 2, + block_size, hidden_size + ], + dtype="float32") + + GatheredK = np.ones( + [batch_size, random_size, block_size, hidden_size], dtype="float32") + GatheredV = np.ones( + [batch_size, random_size, block_size, hidden_size], dtype="float32") + + Q_tvm = tvm.nd.array(Q, device=dev) + K_tvm = tvm.nd.array(K, device=dev) + V_tvm = tvm.nd.array(V, device=dev) + GatheredK_tvm = tvm.nd.array(GatheredK, device=dev) + GatheredV_tvm = tvm.nd.array(GatheredV, device=dev) + + global_row_weight = tvm.nd.empty( + (batch_size, 2 * global_size, block_size, len), device=dev) + global_row_weight_softmax = tvm.nd.empty( + (batch_size, 2 * global_size, block_size, len), device=dev) + global_row_res = tvm.nd.empty( + (batch_size, 2 * global_size, block_size, hidden_size), device=dev) + + global_window_weight = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + (2 + window_size) * block_size), + device=dev) + random_weight = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + random_size * block_size), + device=dev) + sparse_row_weight_softmax = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + (2 + window_size + random_size) * block_size), + device=dev) + sparse_row_res = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + hidden_size), + device=dev) + sparse_row_res1 = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + hidden_size), + device=dev) + sparse_row_res2 = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + hidden_size), + device=dev) + + inputs_funcs = [ + # --------------------Global Row-------------------- + (Q_tvm, K_tvm, global_row_weight), # bigbird_qk_global_row + (global_row_weight, global_row_weight_softmax), # global_row_softmax + (global_row_weight_softmax, V_tvm, + global_row_res), # bigbird_wv_global_row + + # --------------------Sparse Row-------------------- + (Q_tvm, K_tvm, global_window_weight), # bigbird_qk_SDDMM_global_window + (Q_tvm, GatheredK_tvm, random_weight), # bigbird_qk_SDDMM_random + # sparse_row_concat_softmax + (global_window_weight, random_weight, sparse_row_weight_softmax), + # bigbird_wv_SPMM_global_window + (sparse_row_weight_softmax, V_tvm, sparse_row_res), + (sparse_row_weight_softmax, GatheredV_tvm, + sparse_row_res1), # bigbird_wv_SPMM_random + (sparse_row_res, sparse_row_res1, sparse_row_res2) + ] + + for func, inputs in zip(funcs, inputs_funcs): + func(*inputs) + + warmup_num = 5 + test_num = 10 + time_log = [] + + for func, inputs in zip(funcs, inputs_funcs): + evaluator = func.time_evaluator( + func.entry_name, dev, number=warmup_num) + evaluator(*inputs) + evaluator = func.time_evaluator(func.entry_name, dev, number=test_num) + time_ms = np.median(evaluator(*inputs).results) * 1000 + time_log.append(time_ms) + + blocks = batch_size * (len // block_size - 2 * global_size) * ( + window_size + random_size + + 2 * global_size) + batch_size * 2 * global_size * len // block_size + + operation_per_block = 4 * block_size * block_size * \ + hidden_size + 2 * block_size * block_size + + operations = blocks * operation_per_block + + 
operations = operations >> 25 + GFLOPs = operations / (sum(time_log) / 1000 * 32) + + file_name = f"bigbird_tvm_data_{batch_size}_{hidden_size}_{block_size}" + with open(file_name, 'a', encoding='utf-8') as f: + f.writelines(f"{batch_size}_{hidden_size}_{block_size}\n:") + f.writelines(f'Time breakdown (ms):, {time_log}\n') + f.writelines("Average e2e time: %.3f ms\n" % (sum(time_log))) + f.writelines(f"GFLOPs:{GFLOPs}\n") + + return sum(time_log), GFLOPs + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Bigbird') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=4096) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=32) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=512) + parser.add_argument( + '--block_size', type=int, help='Block size', default=64) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +def output_file(OUTPUT_FILE, cmd_args, run_time): + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write( + f"{cmd_args.batch_size}\t{cmd_args.seq_len}\t{cmd_args.hidden_size}\t{cmd_args.block_size}\t" + f"{run_time}\n") + + +if __name__ == '__main__': + global_size = 1 + window_size = 3 + random_size = 3 + + cmd_args = parse_test_args() + DEFAULT_TEST = cmd_args.default_test + OUTPUT_FILE = cmd_args.output_file + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "batch size\t sequence length\thidden\tblock size\telapsed time(ms)\n" + ) + + if DEFAULT_TEST: + test1_cmd_args = types.SimpleNamespace( + seq_len=4096, batch_size=32, hidden_size=512, block_size=64) + run_time1, _ = tvm_bigbird(32, 512, 4096, 64, window_size, random_size, + global_size, OUTPUT_FILE) + output_file(OUTPUT_FILE, test1_cmd_args, run_time1) + test2_cmd_args = types.SimpleNamespace( + seq_len=8192, batch_size=32, hidden_size=512, block_size=64) + run_time2, _ = tvm_bigbird(32, 512, 8192, 64, window_size, random_size, + global_size, OUTPUT_FILE) + output_file(OUTPUT_FILE, test2_cmd_args, run_time2) + else: + len = cmd_args.seq_len + batch_size = cmd_args.batch_size + hidden_size = cmd_args.hidden_size + block_size = cmd_args.block_size + run_time, _ = tvm_bigbird(batch_size, hidden_size, len, block_size, + window_size, random_size, global_size, + OUTPUT_FILE) + output_file(OUTPUT_FILE, cmd_args, run_time) diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/branch_on_reduce_axis.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/branch_on_reduce_axis.py new file mode 100644 index 000000000..3750b1aca --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/branch_on_reduce_axis.py @@ -0,0 +1,121 @@ +import timeit +import time +import numpy as np + +from tvm import relay +from tvm.relay import testing +import tvm +from tvm import te +from tvm.contrib import graph_executor +import tvm.testing +import math + +import tvm.auto_scheduler as auto_scheduler +from tvm.autotvm.tuner import XGBTuner +from tvm import topi, autotvm +import logging +from datetime import datetime +import sys +import argparse +# Enable debug logs 
+import logging +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +sys.path.append('../..') + +logging.basicConfig() +logging.getLogger().setLevel(logging.DEBUG) + +target_name = 'cuda -libs=cublas' +target = tvm.target.Target(target_name) + +batch_size = 8 +heads = 1 +seq_len = 4096 +hidden_size = 512 +block_size = 32 + +target = tvm.target.Target(target_name) +dtype, itype = 'float32', 'int32' + + +@auto_scheduler.register_workload +def test(B, M, N, K, dtype, itype): + X = te.placeholder((B, M, K), name='X', dtype=dtype) + Y = te.placeholder((B, N, K), name='Y', dtype=dtype) + Rand = te.placeholder((B, N), name='Rand', dtype=itype) + k = te.reduce_axis((0, K), name='k') + out_shape = (B, M, N) + + def algorithm(x, i, j): + return te.sum(X[x, i, k] * Y[x, Rand[x, j], k], axis=k) + + R = te.compute( + out_shape, + algorithm, + name='R', + ) + return [X, Y, Rand, R] + + +################################################################################ +tasks = [ + tvm.auto_scheduler.SearchTask( + func=test, args=(16, 1024, 1024, 1024, dtype, itype), target=target), +] + +shape = (batch_size, heads, seq_len, block_size) + +log_file = f'ansor.{shape}.json' + +tuner = auto_scheduler.TaskScheduler(tasks) +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=20, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, +) + +tuner.tune(tune_option) + +funcs = [] +for task in tasks: + sch, args = task.apply_best(log_file) + funcs.append(tvm.build(sch, args, target)) + +dev = tvm.cuda() + +Q = np.ones([1024, 1024], dtype="float32") +K = np.ones( + [1024, 1024], #padding for window access + dtype="float32") +Rand = np.ones([1024, 1024], dtype="int32") + +Q_tvm = tvm.nd.array(Q, device=dev) +K_tvm = tvm.nd.array(K, device=dev) +Rand_tvm = tvm.nd.array(Rand, device=dev) +res = tvm.nd.empty((1024, 1024), device=dev) + +inputs_funcs = [(Q_tvm, K_tvm, Rand_tvm, res)] + +for func, inputs in zip(funcs, inputs_funcs): + func(*inputs) + +# Evaluation +warmup_num = 5 +test_num = 10 +time_log = [] +for func, inputs in zip(funcs, inputs_funcs): + evaluator = func.time_evaluator(func.entry_name, dev, number=warmup_num) + evaluator(*inputs) + evaluator = func.time_evaluator(func.entry_name, dev, number=test_num) + time_ms = np.median(evaluator(*inputs).results) * 1000 + time_log.append(time_ms) +print(f"{warmup_num} warmup, {test_num} repeats for evalution") +print('Time breakdown (ms):', time_log) +print("Average e2e time: %.3f ms" % (sum(time_log))) + +block_num = batch_size * heads * (seq_len // block_size - 2) * 4 +operations = block_num * ( + 4 * block_size * block_size * hidden_size + 2 * block_size * block_size) +operations = operations >> 25 +print(f"GFLOPs:{operations/(sum(time_log)/1000*32)}") diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/test_tvm.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/test_tvm.py new file mode 100644 index 000000000..397dcb8d9 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/test_tvm.py @@ -0,0 +1,265 @@ +from asyncio import gather +import numpy as np + + +def tvm_bigbird(batch_size, hidden_size, len, block_size, window_size, + random_size, global_size): + from tvm import relay + from tvm.relay import testing + import tvm + from tvm import te + from tvm.contrib import graph_executor + import tvm.testing + import math + + import tvm.auto_scheduler as auto_scheduler + from tvm.autotvm.tuner import XGBTuner + from tvm import topi, autotvm + import logging + from 
datetime import datetime + import sys + import argparse + # Enable debug logs + import logging + import os + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + logging.basicConfig() + logging.getLogger().setLevel(logging.DEBUG) + + target_name = 'cuda -libs=cublas' + target = tvm.target.Target(target_name) + dtype, itype = 'float32', 'int32' + + # udf1 + + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_global_window(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, B * (W + 2)) + + def qk_sddmm(n, l, i, j): + return te.sum( + X[n, l + G, i, k] * + Y[n, + tvm.tir.if_then_else( + j < G * B, W // 2, + tvm.tir.if_then_else(j < 2 * G * B, L - 1, l + j // B - + 2)), j % B, k], + axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, Y, Res] + + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_global(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, 2 * B) + + def qk_sddmm(n, l, i, j): + return te.sum( + X[n, l + G, i, k] * + Y[n, + tvm.tir.if_then_else(j < G * B, W // 2, L - 1), j % B, k], + axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, Y, Res] + + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_window(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, W * B) + + def qk_sddmm(n, l, i, j): + return te.sum( + X[n, l + G, i, k] * Y[n, l + j // B - 2, j % B, k], axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, Y, Res] + + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_global_A1(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, B) + + def qk_sddmm(n, l, i, j): + return te.sum(X[n, l + G, i, k] * Y[n, W // 2, j % B, k], axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, Y, Res] + + @auto_scheduler.register_workload + def bigbird_qk_SDDMM_global_A2(N, L, B, H, W, R, G, dtype): + X = te.placeholder((N, L, B, H), name='X', dtype=dtype) + Y = te.placeholder( + (N, L + W // 2 + W // 2, B, H), name='Y', dtype=dtype) + k = te.reduce_axis((0, H), name='k') + out_shape = (N, L - 2 * G, B, B) + + def qk_sddmm(n, l, i, j): + return te.sum(X[n, l + G, i, k] * Y[n, L - 1, j % B, k], axis=k) + + Res = te.compute( + out_shape, + qk_sddmm, + name='Res', + ) + + return [X, Y, Res] + + args = (batch_size, len // block_size, block_size, hidden_size, + window_size, random_size, global_size, dtype) + + tasks = [ + tvm.auto_scheduler.SearchTask( + func=bigbird_qk_SDDMM_global_window, args=args, target=target), + # tvm.auto_scheduler.SearchTask( + # func=bigbird_qk_SDDMM_global , args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_qk_SDDMM_global_A1, args=args, target=target), + tvm.auto_scheduler.SearchTask( + func=bigbird_qk_SDDMM_global_A2, args=args, target=target), + 
tvm.auto_scheduler.SearchTask( + func=bigbird_qk_SDDMM_window, args=args, target=target), + ] + + shape = (batch_size, len, block_size, hidden_size, window_size, + global_size) + + log_file = f'ansor.{shape}.json' + + tuner = auto_scheduler.TaskScheduler(tasks) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=64 * 4, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, + ) + + tuner.tune(tune_option) + + funcs = [] + + for task in tasks: + sch, args = task.apply_best(log_file) + funcs.append(tvm.build(sch, args, target)) + + dev = tvm.cuda() + + Q = np.ones( + [batch_size, len // block_size, block_size, hidden_size], + dtype="float32") + K = np.ones( + [ + batch_size, + len // block_size + window_size // 2 + window_size // 2, + block_size, hidden_size + ], + dtype="float32") + V = np.ones( + [ + batch_size, + len // block_size + window_size // 2 + window_size // 2, + block_size, hidden_size + ], + dtype="float32") + + GatheredK = np.ones( + [batch_size, len // block_size, random_size, block_size, hidden_size], + dtype="float32") + GatheredV = np.ones( + [batch_size, len // block_size, random_size, block_size, hidden_size], + dtype="float32") + + Q_tvm = tvm.nd.array(Q, device=dev) + K_tvm = tvm.nd.array(K, device=dev) + V_tvm = tvm.nd.array(V, device=dev) + + global_window_weight = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + (2 + window_size) * block_size), + device=dev) + global_weight_A1 = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + block_size), + device=dev) + global_weight_A2 = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + block_size), + device=dev) + window_weight = tvm.nd.empty( + (batch_size, len // block_size - 2 * global_size, block_size, + (window_size) * block_size), + device=dev) + + inputs_funcs = [(Q_tvm, K_tvm, global_window_weight), (Q_tvm, K_tvm, + global_weight_A1), + (Q_tvm, K_tvm, global_weight_A2), (Q_tvm, K_tvm, + window_weight)] + + for func, inputs in zip(funcs, inputs_funcs): + func(*inputs) + + warmup_num = 5 + test_num = 10 + time_log = [] + + for func, inputs in zip(funcs, inputs_funcs): + evaluator = func.time_evaluator( + func.entry_name, dev, number=warmup_num) + evaluator(*inputs) + evaluator = func.time_evaluator(func.entry_name, dev, number=test_num) + time_ms = np.median(evaluator(*inputs).results) * 1000 + time_log.append(time_ms) + + file_name = "test_tvm_data" + with open(file_name, 'a', encoding='utf-8') as f: + f.writelines(f"{batch_size}_{hidden_size}_{block_size}\n:") + f.writelines(f'Time breakdown (ms):, {time_log}\n') + + return 0 + + +if __name__ == '__main__': + len = 4096 + global_size = 1 + window_size = 3 + random_size = 3 + + batch_size = 1 + hidden_size = 512 + block_size = 64 + + tvm_bigbird(batch_size, hidden_size, len, block_size, window_size, + random_size, global_size) diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/README.md b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/README.md new file mode 100644 index 000000000..00017f145 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/README.md @@ -0,0 +1,19 @@ +

+Fig. Compose back-to-back GEMMs using parallel operator nesting.
+
+Fig. Extended task dependence graph representations for back-to-back GEMMs.
+
+Fig. AccessMap annotation attached to the extended task dependence graph.
+
+Fig. Translate into hierarchical dataflow on the CUDA device.
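The figure captions above describe how two half-precision GEMMs, D = (A @ B) @ C, are composed with parallel operator nesting and then lowered to a single fused kernel. As a rough point of reference, the sketch below shows the unfused computation that the baselines in this benchmark directory implement; it is only an illustration in PyTorch under assumed shapes taken from one of the benchmark test cases, not the FractalTensor implementation itself.

```python
import torch

# Unfused reference for the back-to-back GEMM benchmark: D = (A @ B) @ C.
# Shapes follow one of the benchmark test cases ([M, K, N, P] = [8192, 64, 256, 64]).
M, K, N, P = 8192, 64, 256, 64

A = torch.randn(M, K, device="cuda", dtype=torch.float16)
B = torch.randn(K, N, device="cuda", dtype=torch.float16)
C = torch.randn(N, P, device="cuda", dtype=torch.float16)

P_mid = A @ B   # first GEMM:  [M, K] x [K, N] -> [M, N], materialized in global memory
D = P_mid @ C   # second GEMM: [M, N] x [N, P] -> [M, P]

# A fused kernel instead keeps each [BLOCK_M, BLOCK_N] tile of A @ B in shared
# memory or registers and multiplies it with the matching tile of C right away,
# so the intermediate [M, N] matrix is never written back to global memory.
```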

diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/README.md b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/README.md new file mode 100644 index 000000000..f6a09ed22 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/README.md @@ -0,0 +1 @@ +[TBD] diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/jax/fused_two_hgemms.py b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/jax/fused_two_hgemms.py new file mode 100644 index 000000000..397298c48 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/jax/fused_two_hgemms.py @@ -0,0 +1,104 @@ +import os +import time +import numpy as np +import argparse +import jax +import torch +from jax import jit +import jax.numpy as jnp +jax.config.update("jax_enable_x64", True) + +os.environ['XLA_FLAGS'] = ( + '--xla_gpu_enable_triton_softmax_fusion=true ' + '--xla_gpu_triton_gemm_any=True ' + '--xla_gpu_enable_async_collectives=true ' + '--xla_gpu_enable_latency_hiding_scheduler=true ' + '--xla_gpu_enable_highest_priority_async_stream=true ') + + +@jit +def backToBackGemm(a, b, c): + d = jnp.dot(jnp.dot(a, b), c) + return d + + +@jit +def backToBackGemm2(a, b, c): + d = jnp.linalg.multi_dot([a, b, c]) + return d + + +def accept_test(M, K, N, P): + a = np.random.normal(size=(M, K)).astype(np.float16) + b = np.random.normal(size=(K, N)).astype(np.float16) + c = np.random.normal(size=(N, P)).astype(np.float16) + d = np.dot(np.dot(a, b), c) + + a_j = jax.device_put(a) + b_j = jax.device_put(b) + c_j = jax.device_put(c) + d_j = backToBackGemm(a_j, b_j, c_j) + + print(f"NumPy_output={d}") + print(f"JAX_output={d_j}") + if np.allclose(d, d_j, atol=1e-1, rtol=0): + print("✅ JAX and NumPy match") + else: + print("❌ JAX and NumPy differ") + + +def run_test(test_case): + warmup = 25 + iter = 100 + for case in test_case: + M, K, N, P = case + a = jax.device_put(np.random.normal(size=(M, K)).astype(np.float16)) + b = jax.device_put(np.random.normal(size=(K, N)).astype(np.float16)) + c = jax.device_put(np.random.normal(size=(N, P)).astype(np.float16)) + # quantiles = [0.5, 0.2, 0.8] + + for _ in range(warmup): + backToBackGemm(a, b, c) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + total_time = 0 + for _ in range(iter): + torch.cuda.synchronize() + start_event.record() + backToBackGemm(a, b, c) + torch.cuda.synchronize() + end_event.record() + elapsed = start_event.elapsed_time(end_event) + total_time += elapsed + + print(f"[{M}, {K}][{K}, {N}][{N}, {P}]\t" + f"Baseline(ms): {total_time * 1000 / iter}ms") + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write(f"[{M}, {K}][{K}, {N}][{N}, {P}]\t" + f"Baseline(ms): {total_time * 1000 / iter}ms\n") + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='BacktoBack GEMMs') + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + return parser.parse_args() + + +if __name__ == '__main__': + + test_case = [[8192, 64, 256, 64], [8192, 64, 512, 64], + [16384, 64, 256, 64], [16384, 64, 256, 64]] + cmd_args = parse_test_args() + OUTPUT_FILE = cmd_args.output_file + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write("GEMM Shape\tJAX(ms)\n") + run_test(test_case) + + # Print the HLO + # xla_comp = jax.xla_computation(backToBackGemm)(a, b, c) + # print(xla_comp.as_hlo_text()) diff --git 
a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/triton/fused_two_hgemms.py b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/triton/fused_two_hgemms.py new file mode 100644 index 000000000..7c37053d8 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/triton/fused_two_hgemms.py @@ -0,0 +1,291 @@ +import torch + +import triton +import triton.language as tl +import argparse + +import os +os.environ['CUDA_LAUNCH_BLOCKING'] = '1' + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_M': 256, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 256, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 256, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 64, + 'BLOCK_SIZE_P': 32 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 256, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 64, + 'BLOCK_SIZE_P': 32 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 64 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 128 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 32, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 32, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32, + 'BLOCK_SIZE_P': 32 + }, + num_stages=2, + num_warps=4), + ], + key=['M', 'N', 'K', 'P'], +) +@triton.jit +def backToBackGemm_kernel( + a_ptr, b_ptr, c_ptr, d_ptr, M, K, N, P, stride_am, stride_ak, + stride_bk, stride_bn, stride_cn, stride_cp, stride_dm, stride_dp, + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_P: tl.constexpr): + pid_m = tl.program_id(0) + pid_p = tl.program_id(1) + a_block_ptr = tl.make_block_ptr( + base=a_ptr, + shape=(M, K), + strides=(stride_am, stride_ak), + offsets=(pid_m * BLOCK_SIZE_M, 0), + block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_K), + order=(1, 0), + ) + b_block_ptr = tl.make_block_ptr( + base=b_ptr, + shape=(K, N), + strides=(stride_bk, stride_bn), + offsets=(0, 0), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_N), + order=(1, 0), + ) + c_block_ptr = tl.make_block_ptr( + base=c_ptr, + shape=(N, P), + 
strides=(stride_cn, stride_cp), + offsets=(0, pid_p * BLOCK_SIZE_P), + block_shape=(BLOCK_SIZE_N, BLOCK_SIZE_P), + order=(1, 0), + ) + d = tl.zeros([BLOCK_SIZE_M, BLOCK_SIZE_P], dtype=tl.float32) + for start_n in range(0, N, BLOCK_SIZE_N): + c = tl.load(c_block_ptr, boundary_check=(0, ), padding_option='zero') + p = tl.zeros([BLOCK_SIZE_M, BLOCK_SIZE_N], dtype=tl.float32) + for start_k in range(0, K, BLOCK_SIZE_K): + a = tl.load( + a_block_ptr, boundary_check=(1, ), padding_option='zero') + b = tl.load( + b_block_ptr, boundary_check=(0, 1), padding_option='zero') + p += tl.dot(a, b) + # p = p.to(tl.float16) + a_block_ptr = tl.advance(a_block_ptr, (0, BLOCK_SIZE_K)) + b_block_ptr = tl.advance(b_block_ptr, (BLOCK_SIZE_K, 0)) + + p = p.to(tl.float16) + d += tl.dot(p, c) + a_block_ptr = tl.advance(a_block_ptr, (0, -K)) + b_block_ptr = tl.advance(b_block_ptr, (-K, BLOCK_SIZE_N)) + c_block_ptr = tl.advance(c_block_ptr, (BLOCK_SIZE_N, 0)) + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_p = pid_p * BLOCK_SIZE_P + tl.arange(0, BLOCK_SIZE_P) + d_ptrs = d_ptr + offs_m[:, None] * stride_dm + offs_p[None, :] * stride_dp + mask = (offs_m < M)[:, None] & (offs_p < P)[None, :] + d = d.to(tl.float16) + tl.store(d_ptrs, d, mask=mask) + + +def backToBackGemm(a, b, c): + assert a.shape[1] == b.shape[0], "incompatible dimensions a-b" + assert b.shape[1] == c.shape[0], "incompatible dimensions b-c" + M, K = a.shape + K, N = b.shape + N, P = c.shape + + d = torch.empty((M, P), device=a.device, dtype=torch.float16) + + def grid(META): + return (triton.cdiv(M, META['BLOCK_SIZE_M']), + triton.cdiv(P, META['BLOCK_SIZE_P']), 1) + + backToBackGemm_kernel[grid]( + a_ptr=a, + b_ptr=b, + c_ptr=c, + d_ptr=d, + M=M, + N=N, + K=K, + P=P, + stride_am=a.stride(0), + stride_ak=a.stride(1), + stride_bk=b.stride(0), + stride_bn=b.stride(1), + stride_cn=c.stride(0), + stride_cp=c.stride(1), + stride_dm=c.stride(0), + stride_dp=c.stride(1)) + return d + + +def accept_test(a, b, c): + triton_output = backToBackGemm(a, b, c) + torch_output = torch.matmul(torch.matmul(a, b), c) + print(f"Triton_output={triton_output}") + print(f"Torch_output={torch_output}") + if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0): + print("✅ Triton and Torch match") + else: + print("❌ Triton and Torch differ") + + +def run_test(test_case, OUTPUT_FILE): + torch.manual_seed(0) + for case in test_case: + M, K, N, P = case + a = torch.randn((M, K), device='cuda', dtype=torch.float16) + b = torch.randn((K, N), device='cuda', dtype=torch.float16) + c = torch.randn((N, P), device='cuda', dtype=torch.float16) + # accept_test(a, b, c) + ms = triton.testing.do_bench( + lambda: backToBackGemm(a, b, c), + warmup=25, + rep=100, + return_mode='mean') + print(f"[{M}, {K}][{K}, {N}][{N}, {P}]\t" f"Baseline(ms): {ms}ms") + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write(f"[{M}, {K}][{K}, {N}][{N}, {P}]\t" + f"Baseline(ms): {ms}ms\n") + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='BacktoBack GEMMs') + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + return parser.parse_args() + + +if __name__ == '__main__': + test_case = [[8192, 64, 256, 64], [8192, 64, 512, 64], + [16384, 64, 256, 64], [16384, 64, 256, 64]] + cmd_args = parse_test_args() + OUTPUT_FILE = cmd_args.output_file + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write("GEMM Shape\tTriton(ms)\n") + torch.manual_seed(0) + run_test(test_case, OUTPUT_FILE) diff --git 
a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/tvm/fused_two_hgemms.py b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/tvm/fused_two_hgemms.py new file mode 100644 index 000000000..499214bae --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/baseline/tvm/fused_two_hgemms.py @@ -0,0 +1,160 @@ +import logging +import sys +import warnings + +import torch +import numpy as np +import time +import math +import argparse + +from pathlib import Path +import tvm +from tvm import te +from tvm import testing +from tvm import autotvm +from tvm.target import Target +from tvm import auto_scheduler +from tvm import topi +from tvm.autotvm.tuner import XGBTuner + + +def tvm_solver(parameter, + target, + dtype, + use_logFile, + logFile="backToBackGemm.json"): + @auto_scheduler.register_workload + def backToBackGemm_kernel(M, K, N, P, dtype): + A = te.placeholder((M, K), name="A", dtype=dtype) + B = te.placeholder((K, N), name="B", dtype=dtype) + C = te.placeholder((N, P), name="C", dtype=dtype) + + k = te.reduce_axis((0, K), name="L-K") + n = te.reduce_axis((0, N), name="L-N") + P_ = te.compute( + (M, N), + lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), + name="matmul", + attrs={"layout_free_placeholders": [B]}, + ) + D = te.compute( + (M, P), + lambda i, j: te.sum(P_[i, n] * C[n, j], axis=n), + name="matmul", + attrs={"layout_free_placeholders": [C]}, + ) + return [A, B, C, D] + + M, K, N, P = parameter + logFile = f"backToBackGemm_{M}_{K}_{N}_{P}.json" + + tasks = [ + tvm.auto_scheduler.SearchTask( + func=backToBackGemm_kernel, + args=(M, K, N, P, dtype), + target=target), + ] + + tuning_rounds = 1000 + tuner = auto_scheduler.TaskScheduler(tasks) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=tuning_rounds * len(tasks), + measure_callbacks=[auto_scheduler.RecordToFile(logFile)], + verbose=2, + ) + + if use_logFile is False: + autosch_time_start = time.time() + + tuner.tune(tune_option) + + autosch_time_end = time.time() + print("auto scheduler cost", (autosch_time_end - autosch_time_start), + "s") + + backToBackGemm_kernel_sch, backToBackGemm_kernel_args = tasks[ + 0].apply_best(logFile) + backToBackGemm_mod = tvm.lower( + backToBackGemm_kernel_sch, + backToBackGemm_kernel_args, + simple_mode=True) + backToBackGemm = tvm.build(backToBackGemm_kernel_sch, + backToBackGemm_kernel_sch, target) + + return backToBackGemm + + +def run_test(test_case): + torch.manual_seed(0) + warmup = 25 + iter = 100 + for case in test_case: + dtype = "float16" + dev = tvm.cuda() + M, K, N, P = case + + a = np.zeros([M, K], dtype=dtype) + b = np.zeros([K, N], dtype=dtype) + c = np.zeros([N, P], dtype=dtype) + d = (np.matmul(a, b), c) + + a_tvm = tvm.nd.array(np.zeros([M, K], dtype=dtype), dev) + b_tvm = tvm.nd.array(np.zeros([K, N], dtype=dtype), dev) + c_tvm = tvm.nd.array(np.zeros([N, P], dtype=dtype), dev) + d_tvm = tvm.nd.array(np.zeros([M, P], dtype=dtype), dev) + + parameter = [M, K, N, P] + use_logFile = False + target = "cuda -libs=cublas" + backToBackGemm = \ + tvm_solver(parameter, target, dtype, use_logFile) + backToBackGemm(a_tvm, b_tvm, c_tvm, d_tvm) + + for _ in range(warmup): + backToBackGemm(a_tvm, b_tvm, c_tvm, d_tvm) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + total_time = 0 + for _ in range(iter): + torch.cuda.synchronize() + start_event.record() + backToBackGemm(a_tvm, b_tvm, c_tvm, d_tvm) + torch.cuda.synchronize() + end_event.record() + elapsed = 
start_event.elapsed_time(end_event) + total_time += elapsed + + print(f"[{M}, {K}][{K}, {N}][{N}, {P}]\t" + f"Baseline(ms): {total_time * 1000 / iter}ms") + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write(f"[{M}, {K}][{K}, {N}][{N}, {P}]\t" + f"Baseline(ms): {total_time * 1000 / iter}ms\n") + + # print(f"TVM_output={d}") + # print(f"NumPy_output={d_tvm}") + # if torch.allclose(d, d_tvm, atol=1e-2, rtol=0): + # print("✅ TVM and NumPy match") + # else: + # print("❌ TVM and NumPy differ") + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='BacktoBack GEMMs') + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + return parser.parse_args() + + +if __name__ == '__main__': + test_case = [[8192, 64, 256, 64], [8192, 64, 512, 64], + [16384, 64, 256, 64], [16384, 64, 256, 64]] + cmd_args = parse_test_args() + OUTPUT_FILE = cmd_args.output_file + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write("GEMM Shape\tTVM(ms)\n") + run_test(test_case) diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/CMakeLists.txt b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/CMakeLists.txt new file mode 100644 index 000000000..bc5d942d0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/CMakeLists.txt @@ -0,0 +1,76 @@ +cmake_minimum_required(VERSION 3.18) +project(fused_two_hgemms CXX C) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake") +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/../../../cmake/Modules/") + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(CUDA QUIET REQUIRED) +find_package(CuDNN QUIET REQUIRED) + +cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") +message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +if(CUTLASS_NATIVE_CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) +else() + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) +endif() + +set(CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS} -w + ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS_DEBUG} + -w ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS} + ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") +set(CMAKE_CXX_FLAGS_DEBUG + "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") +set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall + -Wno-sign-compare") +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_DIRS}) + +# FIXME(ying): this requires to build the main project first. And it is a quite +# a tricky way to build the benchmark. 
+include_directories( + "../../../build/third_party/cutlass/src/extern_cutlass/include") +include_directories( + "../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") +include_directories("../../../build/third_party/install/glog/include") +include_directories( + "../../../build/third_party/gflags/src/extern_gflags-build/include") +include_directories("../../../") +link_directories("../../../build/kaleido/core") +link_directories("../../../build/kaleido/core/operators") + +cuda_add_executable(hgemm_b2b fused_two_hgemms.cu) +target_link_libraries( + hgemm_b2b + fill_op + print_op + expect_eq_op + fractaltensor_core + ${CUDA_LIBRARIES} + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARIES}) diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/Makefile b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/Makefile new file mode 100644 index 000000000..58dc33cc1 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/Makefile @@ -0,0 +1,12 @@ +BENCH_NAME ?= back2back_hgemm +BUILD_DIR := build + +.PHONY: build clean + +build: + @mkdir -p build && cd build && cmake .. && make -j12 + +$(BUILD_DIR)/$(BENCH_NAME): build + +clean: + @rm -rf build diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png new file mode 100644 index 000000000..229790b72 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png differ diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png new file mode 100644 index 000000000..e6d15b792 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png differ diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png new file mode 100644 index 000000000..49bf34254 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png differ diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png new file mode 100644 index 000000000..e9404cc5a Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png differ diff --git a/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/fused_two_hgemms.cu b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/fused_two_hgemms.cu new file mode 100644 index 000000000..eb4b047b2 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/fused_two_hgemms/fractaltensor/fused_two_hgemms.cu @@ -0,0 +1,174 @@ +#include "kaleido/core/cuda_allocator.h" +#include "kaleido/core/device/cuda_timer.h" +#include "kaleido/core/device/cuda_utils.h" +#include "kaleido/core/device/gpu_context.h" +#include "kaleido/core/device/kernels/gemm.h" +#include "kaleido/core/operators/expect_eq_op.h" +#include "kaleido/core/operators/fill_op.h" +#include 
"kaleido/core/operators/tests/b2b_gemm_test_utils.h" +#include "kaleido/core/place.h" +#include "kaleido/core/tensor.h" +#include "kaleido/core/tile_shape.h" + +#include + +#include +#include +#include +#include + +using namespace kaleido::core; + +template +void run_test(std::ofstream& fout) { + cudaStream_t stream; + CudaCheck(cudaStreamCreate(&stream)); + auto allocator = std::make_shared(); + allocator->add_track_stream(stream); + CUDAPlace place = CUDAPlace(0); + GPUContext context(place); + + cudaDeviceProp m_dev_prop; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + cudaGetDeviceProperties(&m_dev_prop, device_idx); + + ops::FillOp fill; + + const int kM = dim_size<0, WholeShape>; + const int kN = dim_size<1, WholeShape>; + const int kK = dim_size<2, WholeShape>; + const int kP = dim_size<3, WholeShape>; + + const int kTM = dim_size<0, CtaTileShape>; + const int kTN = dim_size<1, CtaTileShape>; + const int kTK = dim_size<2, CtaTileShape>; + const int kTP = dim_size<3, CtaTileShape>; + + int shm_input = + (kTM * kTK /*A tile*/ + kTK * kTN /*B tile*/ + kTN * kTP /*B tile*/); + int shm_output = kTM * kTP /*output tile*/; + + // output tile reuse the shared memory buffer for the input tiles + int shm_size = shm_input < shm_output ? shm_output * sizeof(Element) + : shm_input * sizeof(Element); + LOG(INFO) << "shared memory size:" << shm_size / 1024 << "KB"; + + int num_blocks = CeilDiv * CeilDiv; + int num_threads = dim_size<0, WarpShape> * 32; + + fout << "[" << kM << ", " << kK << "][" << kK << ", " << kN << "][" << kN + << ", " << kP << "]\t"; + fout << "[" << kTM << ", " << kTK << "][" << kTK << ", " << kTN << "][" + << kTN << ", " << kTP << "]\t" << shm_size / 1024 << "\t" << num_blocks + << "\t" << num_threads << "\t"; + + kaleido::core::Tensor A({kM, kK}, allocator); + kaleido::core::Tensor B({kK, kN}, allocator); + kaleido::core::Tensor C({kN, kP}, allocator); + kaleido::core::Tensor D({kM, kP}, allocator); + + kaleido::core::Tensor ref_P({kM, kN}, allocator); + kaleido::core::Tensor ref_D({kM, kP}, allocator); + + fill(A, 0., 1e-3); + fill(B, 0., 1e-3); + fill(C, 0., 1e-3); + + fill(D, 0.); + fill(ref_P, 0.); + fill(ref_D, 0.); + + cublasHandle_t handle; + CublasCheck(cublasCreate(&handle)); + + using Gemm = + cuda_kernel::B2BGemm; + Gemm gemm; + + ops::ExpectEqOp check; + + bool passed_unittest = false; + + const int warm_up = 50; + for (int i = 0; i < warm_up; ++i) { // warm up + gemm(A.data(), B.data(), C.data(), + D.mutable_data()); + + cublas_two_hgemms(handle, A, B, C, // inputs + ref_P, // ref_P = A @ B + ref_D /*ref_D = ref_P @ C*/); + + if (!passed_unittest) { // check correctness + check(D, ref_D, 3e-3); + passed_unittest = true; + } + } + + const int iter = 200; + + CudaTimer timer; + timer.Start(); + for (int i = 0; i < iter; ++i) { + cublas_two_hgemms(handle, A, B, C, // inputs + ref_P, // ref_P = A @ B + ref_D /*ref_D = ref_P @ C*/); + } + float time1 = timer.Stop() / iter; + CublasCheck(cublasDestroy(handle)); + + timer.Start(); + for (int i = 0; i < iter; ++i) { + gemm(A.data(), B.data(), C.data(), + D.mutable_data()); + } + float time2 = timer.Stop() / iter; + + fout << time1 << "\t" << time2 << "\t" << time2 / time1 << "\n"; +} + +int main(int argc, char** argv) { + assert(argc == 2); + const char* filename = argv[1]; + + google::InitGoogleLogging("back-to-back gemms"); + + std::ofstream fout; + fout.setf(std::ios::fixed); + fout.precision(4); + + std::stringstream file_name; + file_name << filename; + fout.open(file_name.str(), 
std::ios::out); + fout << "GEMM Shape\tCTA Tile Shape\tShared " + "Memory(KB)\tblocks\tthreads\tcuBLAS(ms)\tFused two " + "GEMMs(ms)\tRatio " + "to cuBLAS\n"; + + run_test, + TileShape<128 /*kTM*/, 64 /*kTN*/, 64 /*kTK*/, 64 /*kTP*/>, + TileShape<4, 1> /*b2b gemm requires 1 warp per column*/ + >(fout); + + run_test, + TileShape<128 /*kTM*/, 64 /*kTN*/, 64 /*kTK*/, 64 /*kTP*/>, + TileShape<4, 1> /*b2b gemm requires 1 warp per column*/ + >(fout); + + run_test, + TileShape<128 /*kTM*/, 64 /*kTN*/, 64 /*kTK*/, 64 /*kTP*/>, + TileShape<8, 1> /*b2b gemm requires 1 warp per column*/ + >(fout); + + run_test, + TileShape<128 /*kTM*/, 64 /*kTN*/, 64 /*kTK*/, 64 /*kTP*/>, + TileShape<8, 1> /*b2b gemm requires 1 warp per column*/ + >(fout); + + return 0; +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/README.md b/artifacts/FractalTensor/benchmarks/multi-head_attention/README.md new file mode 100644 index 000000000..16c185af2 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/README.md @@ -0,0 +1,21 @@ +# Program Flash Attention with Parallel Operator Nesting + +

+Fig. Program flash attention with parallel operator nesting.
+
+Fig. Parse into nested ETDG representation.
+
+Fig. Fused ETDG representation.
+
+Fig. Mapping to CUDA memory and compute hierarchy (the left part represents the imperative equivalent of Fused ETDG).
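+
+The figures above describe only the control structure. As a rough illustration, a minimal NumPy sketch of the same nesting is given below: an outer, embarrassingly parallel map over query blocks and an inner fold over key/value blocks that maintains an online softmax. This is illustrative pseudocode for the reference semantics only, not the FractalTensor frontend or its API.
+
+```python
+# Minimal NumPy sketch of blocked attention with an online softmax.
+# Illustrative only; assumes seq_len is divisible by blk.
+import numpy as np
+
+def attend_one_query_block(q_blk, k_blks, v_blks):
+    m = np.full(q_blk.shape[0], -np.inf)            # running row-wise max
+    l = np.zeros(q_blk.shape[0])                    # running normalizer
+    acc = np.zeros((q_blk.shape[0], v_blks[0].shape[1]))
+    for k_blk, v_blk in zip(k_blks, v_blks):        # inner sequential fold
+        s = q_blk @ k_blk.T / np.sqrt(q_blk.shape[1])
+        m_new = np.maximum(m, s.max(axis=1))
+        p = np.exp(s - m_new[:, None])
+        scale = np.exp(m - m_new)                   # rescale old partial sums
+        l = l * scale + p.sum(axis=1)
+        acc = acc * scale[:, None] + p @ v_blk
+        m = m_new
+    return acc / l[:, None]
+
+def blocked_attention(q, k, v, blk=128):
+    k_blks = np.split(k, k.shape[0] // blk)
+    v_blks = np.split(v, v.shape[0] // blk)
+    # outer map: every query block is independent and can run in parallel
+    return np.concatenate([
+        attend_one_query_block(q_blk, k_blks, v_blks)
+        for q_blk in np.split(q, q.shape[0] // blk)
+    ])
+```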
diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/README.md b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/README.md
new file mode 100644
index 000000000..e006f8582
--- /dev/null
+++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/README.md
@@ -0,0 +1,28 @@
+[Flash attention](https://github.com/HazyResearch/flash-attention) and Triton (```pip3 install -U --pre triton==2.0.0.dev20221202```) must be installed to run this test.
+
+# Benchmark multi-head attention
+
+## OPT model size
+
+Fig. OPT model size.
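+
+For reference, the per-head size used in these tests follows directly from the model width: head_size is model_dim divided by nheads. The small Python sketch below is illustrative only; the tuples mirror a few rows of model_config.csv consumed by the cutlass test script, and the OPT-variant labels in the comments are assumptions.
+
+```python
+# Illustrative only: head size follows from model_dim / nheads,
+# the same arithmetic test_cutlass_fused_mha.sh performs with `expr`.
+opt_configs = [  # (seq_len, num_layers, nheads, model_dim)
+    (1536, 24, 16, 1024),   # opt-350M-like row (assumed mapping)
+    (2048, 32, 32, 4096),   # opt-6.7B-like row (assumed mapping)
+    (4096, 96, 96, 12288),  # opt-175B-like row (assumed mapping)
+]
+for seq_len, num_layers, nheads, model_dim in opt_configs:
+    head_size = model_dim // nheads  # 64, 128, 128
+    print(f"seq_len={seq_len} nheads={nheads} head_size={head_size}")
+```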
+ +Implementations to compare: + +|Methods|Codes| +|:--|:--| +|M1|[MHA using PyTorch operators](https://github.com/lcy-seso/FractalTensor/blob/master/benchmarks/multi-head_attention/baseline/MultiHeadAttention/pt_model/pt_attn.py#L87)| +|M2|[Author's original Flash Attention implementation](https://github.com/HazyResearch/flash-attention)| +|M3|[Flash Attention Triton implementation](https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py)| +|M4|[cutlass fused MHA](https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu)| + +## Test results + +1. [2080Ti](./figures/pt_data_2080Ti.tsv) + + ***triton and cublass fused multi-head attention can only be compiled on CUDA campability larger than 8.0.*** +1. [A6000](./figures/pt_data_A6000.tsv) + +1. [A100](./figures/pt_data_A100.tsv) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/CMakeLists.txt b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/CMakeLists.txt new file mode 100644 index 000000000..f2c3eacb1 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.0) +project(benchmarks CXX C) + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/../../../../../cmake/Modules/") + +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(CUDA QUIET REQUIRED) +find_package(CuDNN QUIET REQUIRED) + +cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") +message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") +set(CUDA_PROPAGATE_HOST_FLAGS OFF) +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) + +set(CMAKE_BUILD_TYPE Release) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") +set(CMAKE_CXX_FLAGS_DEBUG + "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") +set(CMAKE_CXX_FLAGS_RELEASE + "$ENV{CXXFLAGS} -fPIC -O3 -Werror -Wno-sign-compare") +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_DIRS}) + +# cuda_add_executable(cudnn_mha multiHeadAttention.cpp) +cuda_add_executable(cudnn_mha main.cu) +target_link_libraries(cudnn_mha ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARIES}) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/README.md b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/README.md new file mode 100644 index 000000000..18a523d66 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/README.md @@ -0,0 +1,26 @@ +If there is any question, refer to the [original implementation](https://github.com/johnpzh/cudnn_samples_v8/tree/master/multiHeadAttention) + +```text +CUDA Version 11.6 +CUDNN Version 8.4.1 + +GeForce RTX 2080 Ti, Compute Capability 7.5 +``` + + +|Hyper Parameter|value| 
+|:--|:--| +|batch_size|32| +|num_heads|16| +|q/k/v_size|512| + +|q/k/v seq length|Elapsed Time(ms)| +|:--|:--| +|128|16.178| +|256|30.283| +|384|47.769| +|512|70.292| +|640|95.845| +|768|124.774| +|896|162.018| +|1024|199.508| diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/main.cu b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/main.cu new file mode 100644 index 000000000..36865991f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/main.cu @@ -0,0 +1,15 @@ +#include "multi_head_attention.h" + +int main(int argc, char** argv) { + TestOpts opts; + ParseAttnParameters(argc, argv, &opts); + + MultiheadAttentionTest attn_test; + attn_test.SetUp(opts); + attn_test.Run(); + attn_test.TearDown(); + + fflush(stdout); + + return 0; +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/multi_head_attention.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/multi_head_attention.h new file mode 100644 index 000000000..1bb5046b0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/multi_head_attention.h @@ -0,0 +1,877 @@ +#include "utils.h" + +#include +#include +#include +#include +#include + +struct TestOpts { + TestOpts() { memset(this, 0, sizeof(*this)); } + int attn_train; + int attn_data_type; + int attn_query_map; + int attn_num_heads; + int attn_batch_size; + int attn_beam_size; + double attn_sm_scaler; + float attn_dropout_rate; + int attn_q_size; + int attn_k_size; + int attn_v_size; + int attn_proj_q_size; + int attn_proj_k_size; + int attn_proj_v_size; + int attn_proj_o_size; + int attn_seq_len_q; + int attn_seq_len_k; + int attn_data_layout; + int attn_res_link; + int attn_sweep; + int attn_rand_geom; + int attn_rand_seed; +}; + +static void ParseAttnParameters(int argc, char** argv, TestOpts* opts) { + struct cmdParams { + const char* name; + const char* fmt; + size_t offs; + const char* desc; + } param[] = { + {"attn_train", "%d", offsetof(TestOpts, attn_train), + "selects API mode (0-inference, 1-training)"}, + {"attn_data_type", "%d", offsetof(TestOpts, attn_data_type), + "selects data format (0-FP32, 1-FP64)"}, + {"attn_num_heads", "%d", offsetof(TestOpts, attn_num_heads), + "number of attenton heads"}, + {"attn_batch_size", "%d", offsetof(TestOpts, attn_batch_size), + "batch size for Q, R, K, V and O arguments"}, + {"attn_beam_size", "%d", offsetof(TestOpts, attn_beam_size), + "number of sentence candidates in Q, R inputs"}, + {"attn_sm_scaler", "%lg", offsetof(TestOpts, attn_sm_scaler), + "softmax smoothing or sharpening coefficient"}, + {"attn_dropout_rate", "%g", offsetof(TestOpts, attn_dropout_rate), + "dropout rate settings applied during training"}, + {"attn_q_size", "%d", offsetof(TestOpts, attn_q_size), + "original vector length for 'queries'"}, + {"attn_k_size", "%d", offsetof(TestOpts, attn_k_size), + "original vector length for 'keys'"}, + {"attn_v_size", "%d", offsetof(TestOpts, attn_v_size), + "original vector length for 'values'"}, + {"attn_proj_q_size", "%d", offsetof(TestOpts, attn_proj_q_size), + "length of 'queries' vector after projection"}, + {"attn_proj_k_size", "%d", offsetof(TestOpts, attn_proj_k_size), + "length of 'keys' vector after projection"}, + {"attn_proj_v_size", "%d", offsetof(TestOpts, attn_proj_v_size), + "length of 'values' vector after projection"}, + {"attn_proj_o_size", "%d", offsetof(TestOpts, attn_proj_o_size), + "length of 'output' vector after 
projection"}, + {"attn_seq_len_q", "%d", offsetof(TestOpts, attn_seq_len_q), + "largest sequence length for Q, R, O arguments"}, + {"attn_seq_len_k", "%d", offsetof(TestOpts, attn_seq_len_k), + "largest sequence length for K, V arguments"}, + {"attn_data_layout", "%d", offsetof(TestOpts, attn_data_layout), + "data layout for Q, K, V, O inputs"}, + {"attn_res_link", "%d", offsetof(TestOpts, attn_res_link), + "enable/disable residual connections"}, + {"attn_sweep", "%d", offsetof(TestOpts, attn_sweep), + "sweep all time-steps in one inference API call"}, + {"attn_rand_geom", "%d", offsetof(TestOpts, attn_rand_geom), + "randomize attention task dimensions"}, + {"attn_rand_seed", "%d", offsetof(TestOpts, attn_rand_seed), + "seed for the random number generator"}, + }; + + if (argc == 1) { + printf("This is the cuDNN multi-head attention API test.\n\n"); + printf("Usage: ./%s [OPTIONS]\n\nProgram options:\n\n", + BaseFile(*argv)); + + for (int i = 0; i < COUNTOF(param); i++) { + char buf[64]; + sprintf(buf, "-%s<%s>", param[i].name, param[i].fmt); + printf("%-20s - %s\n", buf, param[i].desc); + } + printf("\n"); + + exit(-1); + } + + while (argc > 1) { + argc--; + argv++; + + int i; + + for (i = 0; i < COUNTOF(param); i++) { + const char* pname = param[i].name; + size_t plen = strlen(pname); + if (strncmp(*argv + 1, pname, plen) == 0) { + int count = sscanf(*argv + plen + 1, param[i].fmt, + (char*)opts + param[i].offs); + if (count != 1) { + fprintf( + stderr, + "ERROR: missing numerical argument in option '%s'\n\n", + *argv); + exit(-1); + } + break; + } + } + + if (i >= COUNTOF(param)) { + fprintf(stderr, "ERROR: unknown switch '%s'\n\n", *argv); + exit(-1); + } + } +} + +struct AttnConfig { + AttnConfig() { + memset(this, 0, sizeof(*this)); + } // sets query_map=ALL_TO_ONE + + cudnnAttnQueryMap_t query_map; // query_map mode + + int num_heads; // number of attention heads + int beam_size; // number of candidates of the same sentence + double sm_scaler; // softmax smoothing or sharpening coefficient + float dropout_rate; // dropout probability + int q_size; // original vector length of "queries" + int k_size; // original vector length of "keys" + int v_size; // original vector length of "values" + int q_proj_size; // "queries" after projection (0=no projection) + int k_proj_size; // "keys" after projection (0=no projection) + int v_proj_size; // "values" after projection (0=no projection) + int o_proj_size; // "output" after projection (0=no projection) + int seq_len_q; // max seq length for Q, R, O buffers + int seq_len_k; // max seq length for K, V buffers + int batch_size; // batch size for Q, R, K, V, O buffers + bool res_link; // enable/disable residual connections + int sweep; // sweep all time-steps in inference mode + int rand_geom; // randomize problem dimensions + int rand_seed; // random number generator seed + + // Attention window boundaries for every time-step. + int* lo_win_idx; + int* hi_win_idx; + + // Query and key sequence lengths (for each batch/beam sentence). + int* q_seq_len; + int* k_seq_len; + + int data_layout; // data layout, map to one of 6 possible dataAxes + cudnnSeqDataAxis_t + data_axes[CUDNN_SEQDATA_DIM_COUNT]; // data order for T, N, and B dim + + cudnnDataType_t data_type; // data type for Q,K,V inputs, weights, output + cudnnDataType_t comp_prec; // compute precision + + int q_length() { + return this->q_proj_size > 0 ? this->q_proj_size : this->q_size; + } + + int k_length() { + return this->k_proj_size > 0 ? 
this->k_proj_size : this->k_size; + } + + int v_length() { + return this->v_proj_size > 0 ? this->v_proj_size : this->v_size; + } + + int o_length() { + return this->o_proj_size > 0 ? this->o_proj_size + : this->v_length() * this->num_heads; + } + + size_t qo_tokens() { + return size_t(this->seq_len_q) * this->batch_size * this->beam_size; + } + + size_t kv_tokens() { + size_t t = size_t(this->seq_len_k) * this->batch_size; + if (this->query_map == CUDNN_ATTN_QUERYMAP_ONE_TO_ONE) { + t *= this->beam_size; + } + return t; + } + + size_t q_all_data() { return this->qo_tokens() * this->q_size; } + + size_t k_all_data() { return this->kv_tokens() * this->k_size; } + + size_t v_all_data() { return this->kv_tokens() * this->v_size; } + + size_t o_all_data() { return this->qo_tokens() * this->o_length(); } + + size_t q_all_weights() { + size_t q_weights = + (this->q_proj_size > 0 ? size_t(this->q_size) * this->q_proj_size + : 0); + return q_weights * this->num_heads; + } + + size_t k_all_weights() { + size_t k_weights = + (this->k_proj_size > 0 ? size_t(this->k_size) * this->k_proj_size + : 0); + return k_weights * this->num_heads; + } + + size_t v_all_weights() { + size_t v_weights = + (this->v_proj_size > 0 ? size_t(this->v_size) * this->v_proj_size + : 0); + return v_weights * this->num_heads; + } + + size_t o_all_weights() { + size_t o_weights = (this->o_proj_size > 0 + ? size_t(this->v_length()) * this->o_proj_size + : 0); + return o_weights * this->num_heads; + return o_weights * this->num_heads; + } + + size_t q_seq_len_count() { return this->batch_size * this->beam_size; } + + size_t k_seq_len_count() { + return this->batch_size * + (this->query_map == CUDNN_ATTN_QUERYMAP_ONE_TO_ONE + ? this->beam_size + : 1); + } +}; + +template +class MultiheadAttentionTest { + public: + cudnnHandle_t handle; + + AttnConfig main_cfg; + + cudnnAttnDescriptor_t attn_desc; + cudnnDropoutDescriptor_t drop_desc; + cudnnSeqDataDescriptor_t q_desc; + cudnnSeqDataDescriptor_t k_desc; + cudnnSeqDataDescriptor_t v_desc; + cudnnSeqDataDescriptor_t o_desc; + + // Attention in/out buffers on the GPU side. + T_ELEM* dev_q; + T_ELEM* dev_k; + T_ELEM* dev_v; + T_ELEM* dev_o; + T_ELEM* dev_w; + + // Buffers with in/out data and weights on the CPU side. + T_ELEM* host_q; + T_ELEM* host_k; + T_ELEM* host_v; + T_ELEM* host_o; + T_ELEM* host_w; + + // Work-space and reserve-space GPU buffers required by API. + T_MATH* dev_wk_space; + T_MATH* dev_reserve; + + // Capacity of weight/wkspace/reserve buffers (in bytes). + size_t max_weights; + size_t max_wk_space; + size_t max_reserve; + + // Capacity of each "seq" data container (in elements). + size_t max_elem_q; + size_t max_elem_k; + size_t max_elem_v; + size_t max_elem_o; + size_t max_elem_a; + + size_t max_elem_q_bar; + size_t max_elem_k_bar; + size_t max_elem_v_bar; + size_t max_elem_h_bar; + + // Dropout descriptor settings. + size_t dropout_buf_size; + void* dropout_buf; + + // Sequence length arrays for Q,R,O and K,V. + int* q_seq_array; + int* k_seq_array; + + int* dev_q_seq_array; + int* dev_k_seq_array; + + // Attention window. 
+ int* lo_win_idx; + int* hi_win_idx; + + void SetUp(TestOpts& opts); + + void Run(); + + void TearDown(void); + + void TestGen(AttnConfig* test_desc, bool debug_info = false); +}; + +template +void MultiheadAttentionTest::SetUp(TestOpts& opts) { + attn_desc = NULL; + drop_desc = NULL; + q_desc = NULL; + k_desc = NULL; + v_desc = NULL; + o_desc = NULL; + + dropout_buf = NULL; + dropout_buf_size = 0; + + dev_q = NULL; + dev_k = NULL; + dev_v = NULL; + dev_o = NULL; + dev_w = NULL; + + host_q = NULL; + host_k = NULL; + host_v = NULL; + host_o = NULL; + host_w = NULL; + + dev_wk_space = NULL; + dev_reserve = NULL; + + max_weights = 0; + max_wk_space = 0; + max_reserve = 0; + + max_elem_q = 0; + max_elem_k = 0; + max_elem_v = 0; + max_elem_o = 0; + + q_seq_array = NULL; + k_seq_array = NULL; + + lo_win_idx = NULL; + hi_win_idx = NULL; + + main_cfg.num_heads = opts.attn_num_heads; + main_cfg.batch_size = opts.attn_batch_size; + main_cfg.beam_size = opts.attn_beam_size; + main_cfg.sm_scaler = opts.attn_sm_scaler; + main_cfg.dropout_rate = opts.attn_dropout_rate; + main_cfg.q_size = opts.attn_q_size; + main_cfg.k_size = opts.attn_k_size; + main_cfg.v_size = opts.attn_v_size; + main_cfg.q_proj_size = opts.attn_proj_q_size; + main_cfg.k_proj_size = opts.attn_proj_k_size; + main_cfg.v_proj_size = opts.attn_proj_v_size; + main_cfg.o_proj_size = opts.attn_proj_o_size; + main_cfg.seq_len_q = opts.attn_seq_len_q; + main_cfg.seq_len_k = opts.attn_seq_len_k; + main_cfg.res_link = opts.attn_res_link == 0 ? false : true; + main_cfg.sweep = opts.attn_sweep; + main_cfg.rand_geom = opts.attn_rand_geom != 0 ? 1 : 0; + main_cfg.rand_seed = opts.attn_rand_seed; + main_cfg.data_type = cudnnDataType_t(opts.attn_data_type); + main_cfg.comp_prec = main_cfg.data_type; + + if (main_cfg.num_heads <= 0 || main_cfg.batch_size <= 0 || + main_cfg.beam_size <= 0) { + fprintf( + stderr, + "ERROR: wrong attention NumHeads/BatchSize/BeamSize arguments\n\n"); + exit(-1); + } + + int q_proj_len = main_cfg.q_length(); + int k_proj_len = main_cfg.k_length(); + int out_len = main_cfg.o_length(); + + main_cfg.data_layout = opts.attn_data_layout; + + switch (main_cfg.data_layout) { + case 0: // data_axes = [T, N, B] + main_cfg.data_axes[0] = CUDNN_SEQDATA_TIME_DIM; + main_cfg.data_axes[1] = CUDNN_SEQDATA_BATCH_DIM; + main_cfg.data_axes[2] = CUDNN_SEQDATA_BEAM_DIM; + break; + + case 1: // data_axes = [T, B, N] + main_cfg.data_axes[0] = CUDNN_SEQDATA_TIME_DIM; + main_cfg.data_axes[1] = CUDNN_SEQDATA_BEAM_DIM; + main_cfg.data_axes[2] = CUDNN_SEQDATA_BATCH_DIM; + break; + + case 2: // data_axes = [N, T, B] + main_cfg.data_axes[0] = CUDNN_SEQDATA_BATCH_DIM; + main_cfg.data_axes[1] = CUDNN_SEQDATA_TIME_DIM; + main_cfg.data_axes[2] = CUDNN_SEQDATA_BEAM_DIM; + break; + + case 3: // data_axes = [N, B, T] + main_cfg.data_axes[0] = CUDNN_SEQDATA_BATCH_DIM; + main_cfg.data_axes[1] = CUDNN_SEQDATA_BEAM_DIM; + main_cfg.data_axes[2] = CUDNN_SEQDATA_TIME_DIM; + break; + + case 4: // data_axes = [B, T, N] + main_cfg.data_axes[0] = CUDNN_SEQDATA_BEAM_DIM; + main_cfg.data_axes[1] = CUDNN_SEQDATA_TIME_DIM; + main_cfg.data_axes[2] = CUDNN_SEQDATA_BATCH_DIM; + break; + + case 5: // data_axes = [B, N, T] + main_cfg.data_axes[0] = CUDNN_SEQDATA_BEAM_DIM; + main_cfg.data_axes[1] = CUDNN_SEQDATA_BATCH_DIM; + main_cfg.data_axes[2] = CUDNN_SEQDATA_TIME_DIM; + break; + + default: + fprintf(stderr, "ERROR: wrong -attn_data_layout%d option\n\n", + opts.attn_data_layout); + exit(-1); + } + main_cfg.data_axes[3] = CUDNN_SEQDATA_VECT_DIM; + + 
CHECK_CUDNN_ERR(cudnnCreate(&handle)); + CHECK_CUDNN_ERR(cudnnCreateAttnDescriptor(&attn_desc)); + CHECK_CUDNN_ERR(cudnnCreateDropoutDescriptor(&drop_desc)); + CHECK_CUDNN_ERR(cudnnCreateSeqDataDescriptor(&q_desc)); + CHECK_CUDNN_ERR(cudnnCreateSeqDataDescriptor(&k_desc)); + CHECK_CUDNN_ERR(cudnnCreateSeqDataDescriptor(&v_desc)); + CHECK_CUDNN_ERR(cudnnCreateSeqDataDescriptor(&o_desc)); + + size_t max_q_tokens = + size_t(main_cfg.seq_len_q) * main_cfg.batch_size * main_cfg.beam_size; + size_t max_k_tokens = size_t(main_cfg.seq_len_k) * main_cfg.batch_size; + + // Buffer Q/K/V/O capacity in elements. + max_elem_q = max_q_tokens * main_cfg.q_size; + max_elem_k = max_k_tokens * main_cfg.k_size; + max_elem_v = max_k_tokens * main_cfg.v_size; + max_elem_o = max_q_tokens * out_len; + max_elem_a = max_q_tokens * main_cfg.num_heads * main_cfg.seq_len_k; + + max_elem_q_bar = max_q_tokens * main_cfg.num_heads * main_cfg.q_proj_size; + max_elem_k_bar = max_k_tokens * main_cfg.num_heads * main_cfg.k_proj_size; + max_elem_v_bar = max_k_tokens * main_cfg.num_heads * main_cfg.v_proj_size; + max_elem_h_bar = max_q_tokens * main_cfg.num_heads * main_cfg.v_proj_size; + + // Allocate input and output buffers (forward/inference pass). + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_q, max_elem_q * sizeof(T_ELEM))); + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_k, max_elem_k * sizeof(T_ELEM))); + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_v, max_elem_v * sizeof(T_ELEM))); + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_o, max_elem_o * sizeof(T_ELEM))); + + // Allocate input and output buffers (backward/training pass). + CHECK_CUDNN_ERR(cudnnDropoutGetStatesSize(handle, &dropout_buf_size)); + CHECK_CUDA_ERR(cudaMalloc((void**)&dropout_buf, dropout_buf_size)); + + CHECK_CUDNN_ERR( + cudnnSetDropoutDescriptor(drop_desc, handle, main_cfg.dropout_rate, + dropout_buf, dropout_buf_size, 0)); + + CHECK_CUDNN_ERR(cudnnSetAttnDescriptor( + attn_desc, main_cfg.query_map, main_cfg.num_heads, main_cfg.sm_scaler, + main_cfg.data_type, main_cfg.comp_prec, CUDNN_DEFAULT_MATH, drop_desc, + NULL, main_cfg.q_size, main_cfg.k_size, main_cfg.v_size, + main_cfg.q_proj_size, main_cfg.k_proj_size, main_cfg.v_proj_size, + main_cfg.o_proj_size, main_cfg.seq_len_q, main_cfg.seq_len_k, + main_cfg.batch_size, main_cfg.beam_size)); + + CHECK_CUDNN_ERR(cudnnGetMultiHeadAttnBuffers( + handle, attn_desc, &max_weights, &max_wk_space, NULL)); + + if (max_weights > 0) { + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_w, max_weights)); + } + if (max_wk_space > 0) { + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_wk_space, max_wk_space)); + } + if (max_reserve > 0) { + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_reserve, max_reserve)); + + // Fill with -NaN to deterct incorrect segment write for debugging. + CHECK_CUDA_ERR(cudaMemset(dev_reserve, 0xff, max_reserve)); + } + + q_seq_array = + (int*)calloc(main_cfg.batch_size * main_cfg.beam_size, sizeof(int)); + k_seq_array = (int*)calloc(main_cfg.batch_size, sizeof(int)); + + if (lo_win_idx == NULL && hi_win_idx == NULL) { + lo_win_idx = (int*)calloc(main_cfg.seq_len_q, sizeof(int)); + hi_win_idx = (int*)calloc(main_cfg.seq_len_q, sizeof(int)); + } + + // Allocate weight and data buffers on the CPU side. 
+ if (max_weights > 0) { + host_w = (T_ELEM*)malloc(max_weights); + } + + host_q = (T_ELEM*)malloc(max_elem_q * sizeof(T_ELEM)); + host_k = (T_ELEM*)malloc(max_elem_k * sizeof(T_ELEM)); + host_v = (T_ELEM*)malloc(max_elem_v * sizeof(T_ELEM)); + host_o = (T_ELEM*)malloc(max_elem_o * sizeof(T_ELEM)); +} + +template +void MultiheadAttentionTest::Run() { + AttnConfig test_cfg; + + TestGen(&test_cfg); + + CHECK_CUDNN_ERR( + cudnnSetDropoutDescriptor(drop_desc, handle, test_cfg.dropout_rate, + dropout_buf, dropout_buf_size, 0)); + + // Set attention descriptor according to generated test_cfg. + CHECK_CUDNN_ERR(cudnnSetAttnDescriptor( + attn_desc, test_cfg.query_map, test_cfg.num_heads, test_cfg.sm_scaler, + test_cfg.data_type, test_cfg.comp_prec, CUDNN_DEFAULT_MATH, drop_desc, + NULL, test_cfg.q_size, test_cfg.k_size, test_cfg.v_size, + test_cfg.q_proj_size, test_cfg.k_proj_size, test_cfg.v_proj_size, + test_cfg.o_proj_size, test_cfg.seq_len_q, test_cfg.seq_len_k, + test_cfg.batch_size, test_cfg.beam_size)); + + size_t size_weights = 0, size_wk_space = 0, size_reserve = 0; + + CHECK_CUDNN_ERR(cudnnGetMultiHeadAttnBuffers( + handle, attn_desc, &size_weights, &size_wk_space, NULL)); + + // Sanity check so we do not over-run the allocated buffers. + if (size_weights > max_weights || size_wk_space > max_wk_space || + size_reserve > max_reserve) { + fprintf(stderr, + "ERROR: cudnnGetMultiHeadAttnBuffers() reported inconsistent " + "buffer sizes\n\n"); + exit(-1); + } + + int q_seq_array_size = test_cfg.beam_size * test_cfg.batch_size; + int k_seq_array_size = test_cfg.batch_size; + + // host-to-device copies + size_t size = sizeof(q_seq_array[0]) * q_seq_array_size; + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_q_seq_array, size)); + CHECK_CUDA_ERR( + cudaMemcpy(dev_q_seq_array, q_seq_array, size, cudaMemcpyHostToDevice)); + + size = sizeof(k_seq_array[0]) * k_seq_array_size; + CHECK_CUDA_ERR(cudaMalloc((void**)&dev_k_seq_array, size)); + CHECK_CUDA_ERR( + cudaMemcpy(dev_k_seq_array, k_seq_array, size, cudaMemcpyHostToDevice)); + + // Length of output vectors. + int o_len = test_cfg.o_length(); + + int dim_a[CUDNN_SEQDATA_DIM_COUNT]; + + dim_a[CUDNN_SEQDATA_BEAM_DIM] = test_cfg.beam_size; + dim_a[CUDNN_SEQDATA_BATCH_DIM] = test_cfg.batch_size; + dim_a[CUDNN_SEQDATA_TIME_DIM] = test_cfg.seq_len_q; + dim_a[CUDNN_SEQDATA_VECT_DIM] = test_cfg.q_size; + + CHECK_CUDNN_ERR(cudnnSetSeqDataDescriptor( + q_desc, test_cfg.data_type, CUDNN_SEQDATA_DIM_COUNT, dim_a, + test_cfg.data_axes, q_seq_array_size, q_seq_array, NULL)); + + dim_a[CUDNN_SEQDATA_BEAM_DIM] = test_cfg.beam_size; + dim_a[CUDNN_SEQDATA_BATCH_DIM] = test_cfg.batch_size; + dim_a[CUDNN_SEQDATA_TIME_DIM] = test_cfg.seq_len_q; + dim_a[CUDNN_SEQDATA_VECT_DIM] = o_len; + + CHECK_CUDNN_ERR(cudnnSetSeqDataDescriptor( + o_desc, test_cfg.data_type, CUDNN_SEQDATA_DIM_COUNT, dim_a, + test_cfg.data_axes, q_seq_array_size, q_seq_array, NULL)); + + // seq-k + dim_a[CUDNN_SEQDATA_BEAM_DIM] = + test_cfg.query_map == CUDNN_ATTN_QUERYMAP_ONE_TO_ONE + ? test_cfg.beam_size + : 1; + dim_a[CUDNN_SEQDATA_BATCH_DIM] = test_cfg.batch_size; + dim_a[CUDNN_SEQDATA_TIME_DIM] = test_cfg.seq_len_k; + dim_a[CUDNN_SEQDATA_VECT_DIM] = test_cfg.k_size; + + CHECK_CUDNN_ERR(cudnnSetSeqDataDescriptor( + k_desc, test_cfg.data_type, CUDNN_SEQDATA_DIM_COUNT, dim_a, + test_cfg.data_axes, k_seq_array_size, k_seq_array, NULL)); + + // seq-v + dim_a[CUDNN_SEQDATA_BEAM_DIM] = + test_cfg.query_map == CUDNN_ATTN_QUERYMAP_ONE_TO_ONE + ? 
test_cfg.beam_size + : 1; + dim_a[CUDNN_SEQDATA_BATCH_DIM] = test_cfg.batch_size; + dim_a[CUDNN_SEQDATA_TIME_DIM] = test_cfg.seq_len_k; + dim_a[CUDNN_SEQDATA_VECT_DIM] = test_cfg.v_size; + + CHECK_CUDNN_ERR(cudnnSetSeqDataDescriptor( + v_desc, test_cfg.data_type, CUDNN_SEQDATA_DIM_COUNT, dim_a, + test_cfg.data_axes, k_seq_array_size, k_seq_array, NULL)); + + size_t q_num_elem = test_cfg.q_all_data(); + size_t k_num_elem = test_cfg.k_all_data(); + size_t v_num_elem = test_cfg.v_all_data(); + size_t o_nmb_elem = test_cfg.o_all_data(); + + size_t q_nmb_weights = test_cfg.q_all_weights(); + size_t k_nmb_weights = test_cfg.k_all_weights(); + size_t v_nmb_weights = test_cfg.v_all_weights(); + size_t o_nmb_weights = test_cfg.o_all_weights(); + + // Sanity check so we do not over-run the allocated buffers. + if (q_num_elem > max_elem_q || k_num_elem > max_elem_k || + v_num_elem > max_elem_v || o_nmb_elem > max_elem_o) { + fprintf(stderr, "ERROR: inconsistent data buffer sizes\n\n"); + exit(-1); + } + + if (q_num_elem == 0 || k_num_elem == 0 || o_nmb_elem == 0) { + fprintf(stderr, "ERROR: Q/K/O data buffers cannot be zero size\n\n"); + exit(-1); + } + + if (size_weights > 0) { + InitBuffer(host_w, size_weights / sizeof(T_ELEM), INIT_MEAN, + INIT_VAR); + } + + InitBuffer(host_q, q_num_elem, INIT_MEAN, INIT_VAR); + InitBuffer(host_k, k_num_elem, INIT_MEAN, INIT_VAR); + InitBuffer(host_v, v_num_elem, INIT_MEAN, INIT_VAR); + + // Fill output surface with NaN-s. + CHECK_CUDA_ERR(cudaMemset(dev_o, 0xFF, o_nmb_elem * sizeof(dev_o[0]))); + + // Copy the data from GPU (device) to CPU (host) + CHECK_CUDA_ERR( + cudaMemcpy(dev_w, host_w, size_weights, cudaMemcpyHostToDevice)); + CHECK_CUDA_ERR(cudaMemcpy(dev_q, host_q, sizeof(dev_q[0]) * q_num_elem, + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERR(cudaMemcpy(dev_k, host_k, sizeof(dev_k[0]) * k_num_elem, + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERR(cudaMemcpy(dev_v, host_v, sizeof(dev_v[0]) * v_num_elem, + cudaMemcpyHostToDevice)); + + if (size_reserve != 0) { + fprintf(stderr, + "ERROR: non-zero reserve buffer size in inference mode\n\n"); + exit(-1); + } + + for (int i = 0; i < 5; ++i) + CHECK_CUDNN_ERR(cudnnMultiHeadAttnForward( + handle, attn_desc, -1 /*All q time steps are availiable*/, + lo_win_idx, hi_win_idx, dev_q_seq_array, dev_k_seq_array, q_desc, + dev_q, main_cfg.res_link ? dev_q : NULL, k_desc, dev_k, v_desc, + dev_v, o_desc, dev_o, size_weights, size_weights > 0 ? dev_w : NULL, + size_wk_space, dev_wk_space, 0 /*reserveSpaceSizeInBytes*/, + NULL /*reserveSpace*/)); + + CHECK_CUDA_ERR(cudaDeviceSynchronize()); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + const int ITERS = 10; + cudaEventRecord(start, 0); + float elapsed = 0.; + for (int i = 0; i < ITERS; ++i) { + CHECK_CUDNN_ERR(cudnnMultiHeadAttnForward( + handle, attn_desc, -1, lo_win_idx, hi_win_idx, dev_q_seq_array, + dev_k_seq_array, q_desc, dev_q, main_cfg.res_link ? dev_q : NULL, + k_desc, dev_k, v_desc, dev_v, o_desc, dev_o, size_weights, + size_weights > 0 ? dev_w : NULL, size_wk_space, dev_wk_space, 0, + NULL)); + } + + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed, start, stop); + + printf("%.*f|\n", 3, elapsed / ITERS); + + // Copy forward output to host. + CHECK_CUDA_ERR(cudaMemcpy(host_o, dev_o, o_nmb_elem * sizeof(dev_o[0]), + cudaMemcpyDeviceToHost)); +} + +// Teardown destroys various descriptors and free memories. 
+template +void MultiheadAttentionTest::TearDown() { + cudnnDestroyAttnDescriptor(attn_desc); + attn_desc = NULL; + + cudnnDestroyDropoutDescriptor(drop_desc); + drop_desc = NULL; + + cudnnDestroySeqDataDescriptor(q_desc); + q_desc = NULL; + + cudnnDestroySeqDataDescriptor(k_desc); + k_desc = NULL; + + cudnnDestroySeqDataDescriptor(v_desc); + v_desc = NULL; + + cudnnDestroySeqDataDescriptor(o_desc); + o_desc = NULL; + + cudaFree(dropout_buf); + dropout_buf = NULL; + + cudaFree(dev_q); + dev_q = NULL; + + cudaFree(dev_k); + dev_k = NULL; + + cudaFree(dev_v); + dev_v = NULL; + + cudaFree(dev_o); + dev_o = NULL; + + cudaFree(dev_w); + dev_w = NULL; + + cudaFree(dev_wk_space); + dev_wk_space = NULL; + + cudaFree(dev_reserve); + dev_reserve = NULL; + + free(q_seq_array); + q_seq_array = NULL; + + free(k_seq_array); + k_seq_array = NULL; + + free(lo_win_idx); + lo_win_idx = NULL; + + free(hi_win_idx); + hi_win_idx = NULL; + + free(host_w); + host_w = NULL; + + free(host_q); + host_q = NULL; + + free(host_k); + host_k = NULL; + + free(host_v); + host_v = NULL; + + free(host_o); + host_o = NULL; +} + +template +void MultiheadAttentionTest::TestGen(AttnConfig* test_cfg, + bool debug_info) { + *test_cfg = this->main_cfg; + + // Initialize q_seq_array and k_seq_array values and attention window + size_t q_batches = test_cfg->q_seq_len_count(); + size_t k_batches = test_cfg->k_seq_len_count(); + + // Set random number generator seed. + srand48(test_cfg->rand_seed); + + // No problem size randomization when the RNG seed is zero. + if (test_cfg->rand_geom != 0) { + for (size_t i = 0; i < q_batches; ++i) { + q_seq_array[i] = RandRangeInt(1, test_cfg->seq_len_q); + } + + for (size_t i = 0; i < k_batches; ++i) { + k_seq_array[i] = RandRangeInt(1, test_cfg->seq_len_k); + } + + // Set the random size of attention window in all time-steps. + for (int i = 0; i < test_cfg->seq_len_q; ++i) { + lo_win_idx[i] = RandRangeInt(0, test_cfg->seq_len_k - 1); + hi_win_idx[i] = RandRangeInt(lo_win_idx[i], test_cfg->seq_len_k); + } + } else { + // Fixed lengths for all sequences in a batch. + for (size_t i = 0; i < q_batches; ++i) { + q_seq_array[i] = test_cfg->seq_len_q; + } + + for (size_t i = 0; i < k_batches; ++i) { + k_seq_array[i] = test_cfg->seq_len_k; + } + + // Set the maximum attention window in all time-steps. + for (int i = 0; i < test_cfg->seq_len_q; ++i) { + lo_win_idx[i] = 0; + hi_win_idx[i] = test_cfg->seq_len_k; + } + } + + const char standard_axes[CUDNN_SEQDATA_DIM_COUNT] = {'T', 'N', 'B', 'V'}; + char data_axes[CUDNN_SEQDATA_DIM_COUNT]; + for (int ii = 0; ii < CUDNN_SEQDATA_DIM_COUNT; ++ii) { + data_axes[ii] = standard_axes[test_cfg->data_axes[ii]]; + } + + if (debug_info) { + printf("Test parameters:\n\n"); + printf("#### attnDataType = %d (FP%d)\n", test_cfg->data_type, + int(8 * sizeof(T_ELEM))); + printf("#### attnNumHeads = %d\n", test_cfg->num_heads); + printf("#### attnBatchSize = %d\n", test_cfg->batch_size); + printf("#### attnBeamSize = %d\n", test_cfg->beam_size); + printf("#### attnSmScaler = %.4e\n", test_cfg->sm_scaler); + printf("#### attnDropoutRate = %.4f\n", test_cfg->dropout_rate); + printf("#### attnQsize = %d\n", test_cfg->q_size); + printf("#### attnKsize = %d\n", test_cfg->k_size); + printf("#### attnVsize = %d\n", test_cfg->v_size); + printf("#### attnProjQsize = %d%s\n", test_cfg->q_proj_size, + test_cfg->q_proj_size ? "" : " (no Q weights)"); + printf("#### attnProjKsize = %d%s\n", test_cfg->k_proj_size, + test_cfg->k_proj_size ? 
"" : " (no K weights)"); + printf("#### attnProjVsize = %d%s\n", test_cfg->v_proj_size, + test_cfg->v_proj_size ? "" : " (no V weights)"); + printf("#### attnProjOsize = %d%s\n", test_cfg->o_proj_size, + test_cfg->o_proj_size ? "" : " (no O weights)"); + printf("#### attnSeqLenQ = %d\n", test_cfg->seq_len_q); + printf("#### attnSeqLenK = %d\n", test_cfg->seq_len_k); + printf("#### attn_data_layout = %d (%c,%c,%c,%c)\n", + test_cfg->data_layout, data_axes[0], data_axes[1], data_axes[2], + data_axes[3]); + printf("#### attnResLink = %d\n", test_cfg->res_link); + printf("#### attnSweep = %d\n", test_cfg->sweep); + printf("#### attnRandGeom = %d\n", test_cfg->rand_geom); + printf("#### attnRandSeed = %d\n", test_cfg->rand_seed); + + for (size_t i = 0; i < q_batches; ++i) { + printf("sequence_length_q[idx=%lu]=%d\n", i, q_seq_array[i]); + } + printf("\n"); + + for (size_t i = 0; i < k_batches; ++i) { + printf("sequence_length_k[idx=%lu]=%d\n", i, k_seq_array[i]); + } + printf("\n"); + + for (int i = 0; i < test_cfg->seq_len_q; ++i) { + printf("attention_window[time=%d]=%d:%d\n", i, lo_win_idx[i], + hi_win_idx[i]); + } + printf("\n"); + } +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/run.sh b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/run.sh new file mode 100755 index 000000000..d56b7d050 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/run.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +SEED=1234 + + +for seq_len in {128..1024..128} +do + echo $seq_len + + ./build/cudnn_mha \ + -attn_train0 \ + -attn_data_type0 \ + -attn_res_link1 \ + -attn_data_layout3 \ + -attn_num_heads16 \ + -attn_beam_size1 \ + -attn_batch_size32 \ + -attn_q_size512 \ + -attn_k_size512 \ + -attn_v_size512 \ + -attn_proj_q_size512 \ + -attn_proj_k_size512 \ + -attn_proj_v_size512 \ + -attn_proj_o_size512 \ + -attn_res_link0 \ + -attn_seq_len_q$seq_len \ + -attn_seq_len_k$seq_len \ + -attn_sm_scaler1.0 \ + -attn_sweep1 \ + -attn_rand_seed$SEED + +done diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/utils.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/utils.h new file mode 100644 index 000000000..35ea58b4d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cuDNN/utils.h @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +#define COUNTOF(arr) int(sizeof(arr) / sizeof(arr[0])) +#define INIT_MEAN 0.0 +#define INIT_VAR 0.5 +#define WGROUP_COUNT 4 + +inline void CheckCudaError(cudaError_t code, const char* expr, const char* file, + int line) { + if (code) { + fprintf(stderr, "ERROR: CUDA error at %s:%d, code=%d (%s) in '%s'\n\n", + file, line, (int)code, cudaGetErrorString(code), expr); + exit(1); + } +} + +inline void CheckCudnnError(cudnnStatus_t code, const char* expr, + const char* file, int line) { + if (code) { + fprintf(stderr, "CUDNN error at %s:%d, code=%d (%s) in '%s'\n\n", file, + line, (int)code, cudnnGetErrorString(code), expr); + exit(1); + } +} + +#define CHECK_CUDA_ERR(...) \ + do { \ + CheckCudaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \ + } while (0) + +#define CHECK_CUDNN_ERR(...) \ + do { \ + CheckCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \ + } while (0) + +// Returns uniformly distributed integer values between [lower,..,upper], +// ie, with both lower and upper limits included. +inline int RandRangeInt(int lower, int upper) { + int lo = (lower < upper ? 
lower : upper); + int hi = (lower > upper ? lower : upper); + return lo + int(drand48() * (hi - lo + 1)); +} + +// Returns uniformly distributed floating point values between [bias, +// bias+range) assuming range>0, ie, including the lower limit but excluding the +// upper bound. +inline double RandRangeDbl(double bias, double range) { + return range * drand48() + bias; +} + +// Initializes buffer with uniformly distributed values with the given mean and +// variance. +template +void InitBuffer(T_ELEM* image, size_t imageSize, double mean, double var) { + double range = sqrt(12.0 * var); + double bias = mean - 0.5 * range; + for (size_t index = 0; index < imageSize; index++) { + image[index] = (T_ELEM)RandRangeDbl(bias, range); + } +} + +static char* BaseFile(char* fname) { + char* base; + for (base = fname; *fname != '\0'; fname++) { + if (*fname == '/' || *fname == '\\') { + base = fname + 1; + } + } + return base; +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/cutlass_a100.txt b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/cutlass_a100.txt new file mode 100644 index 000000000..2c19947d9 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/cutlass_a100.txt @@ -0,0 +1,310 @@ +length = 1536, nheads = 16, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 64, 64, 16, 16}. + + Runtime: 1.82262 ms + GFLOPs: 107036 + +Passed +length = 1536, nheads = 32, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 64, 64, 32, 16}. + + Runtime: 3.61759 ms + GFLOPs: 107854 + +Passed +length = 1536, nheads = 32, head_size = 80, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 80, 80, 32, 16}. + + Runtime: 5.64506 ms + GFLOPs: 86236 + +Passed +length = 1536, nheads = 32, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 32, 16}. + + Runtime: 6.10396 ms + GFLOPs: 127248 + +Passed +length = 1536, nheads = 40, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 40, 16}. + + Runtime: 7.75388 ms + GFLOPs: 125214 + +Passed +length = 1536, nheads = 56, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 56, 16}. + + Runtime: 10.7907 ms + GFLOPs: 125965 + +Passed +length = 1536, nheads = 72, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 72, 16}. 
+ + Runtime: 13.9274 ms + GFLOPs: 125480 + +Passed +length = 1536, nheads = 96, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 96, 16}. + + Runtime: 18.5577 ms + GFLOPs: 125563 + +Passed +length = 2048, nheads = 16, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 64, 64, 16, 16}. + + Runtime: 3.20092 ms + GFLOPs: 108349 + +Passed +length = 2048, nheads = 32, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 64, 64, 32, 16}. + + Runtime: 6.38659 ms + GFLOPs: 108608 + +Passed +length = 2048, nheads = 32, head_size = 80, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 80, 80, 32, 16}. + + Runtime: 10.0532 ms + GFLOPs: 86085.8 + +Passed +length = 2048, nheads = 32, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 32, 16}. + + Runtime: 10.7842 ms + GFLOPs: 128043 + +Passed +length = 2048, nheads = 40, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 40, 16}. + + Runtime: 13.6061 ms + GFLOPs: 126857 + +Passed +length = 2048, nheads = 56, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 56, 16}. + + Runtime: 19.0663 ms + GFLOPs: 126740 + +Passed +length = 2048, nheads = 72, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 72, 16}. + + Runtime: 24.4556 ms + GFLOPs: 127041 + +Passed +length = 2048, nheads = 96, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 96, 16}. + + Runtime: 32.6771 ms + GFLOPs: 126771 + +Passed +length = 3072, nheads = 16, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 64, 64, 16, 16}. + + Runtime: 7.13124 ms + GFLOPs: 109426 + +Passed +length = 3072, nheads = 32, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 64, 64, 32, 16}. 
+ + Runtime: 14.2363 ms + GFLOPs: 109627 + +Passed +length = 3072, nheads = 32, head_size = 80, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 80, 80, 32, 16}. + + Runtime: 22.4432 ms + GFLOPs: 86762.5 + +Passed +length = 3072, nheads = 32, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 32, 16}. + + Runtime: 24.1227 ms + GFLOPs: 128795 + +Passed +length = 3072, nheads = 40, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 40, 16}. + + Runtime: 30.4105 ms + GFLOPs: 127706 + +Passed +length = 3072, nheads = 56, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 56, 16}. + + Runtime: 42.4895 ms + GFLOPs: 127961 + +Passed +length = 3072, nheads = 72, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 72, 16}. + + Runtime: 54.9754 ms + GFLOPs: 127156 + +Passed +length = 3072, nheads = 96, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 96, 16}. + + Runtime: 73.1794 ms + GFLOPs: 127367 + +Passed +length = 4096, nheads = 16, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 64, 64, 16, 16}. + + Runtime: 12.6059 ms + GFLOPs: 110050 + +Passed +length = 4096, nheads = 32, head_size = 64, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 64, 64, 32, 16}. + + Runtime: 25.1185 ms + GFLOPs: 110458 + +Passed +length = 4096, nheads = 32, head_size = 80, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 80, 80, 32, 16}. + + Runtime: 39.6852 ms + GFLOPs: 87230.1 + +Passed +length = 4096, nheads = 32, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 128, 128, 32, 16}. + + Runtime: 42.7858 ms + GFLOPs: 129092 + +Passed +length = 4096, nheads = 40, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 128, 128, 40, 16}. 
+ + Runtime: 54.0344 ms + GFLOPs: 127773 + +Passed +length = 4096, nheads = 56, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 128, 128, 56, 16}. + + Runtime: 75.2296 ms + GFLOPs: 128484 + +Passed +length = 4096, nheads = 72, head_size = 128, batch = 16 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 128, 128, 72, 16}. + + Runtime: 97.0811 ms + GFLOPs: 128011 + +Passed diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/cutlass_a6000.txt b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/cutlass_a6000.txt new file mode 100644 index 000000000..8d0da22b6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/cutlass_a6000.txt @@ -0,0 +1,288 @@ +length = 1536, nheads = 16, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 64, 64, 16, 12}. + + Runtime: 1.68085 ms + GFLOPs: 87047.7 + +Passed +length = 1536, nheads = 32, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 64, 64, 32, 12}. + + Runtime: 3.55564 ms + GFLOPs: 82299.7 + +Passed +length = 1536, nheads = 32, head_size = 80, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 80, 80, 32, 12}. + + Runtime: 7.12228 ms + GFLOPs: 51262.4 + +Passed +length = 1536, nheads = 32, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 32, 12}. + + Runtime: 8.07004 ms + GFLOPs: 72185.2 + +Passed +length = 1536, nheads = 40, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 40, 12}. + + Runtime: 10.5764 ms + GFLOPs: 68848.6 + +Passed +length = 1536, nheads = 56, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 56, 12}. + + Runtime: 14.7115 ms + GFLOPs: 69295.8 + +Passed +length = 1536, nheads = 72, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 72, 12}. + + Runtime: 18.7096 ms + GFLOPs: 70055.7 + +Passed +length = 1536, nheads = 96, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {1536, 1536, 128, 128, 96, 12}. 
+ + Runtime: 25.687 ms + GFLOPs: 68034.8 + +Passed +length = 2048, nheads = 16, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 64, 64, 16, 12}. + + Runtime: 3.17102 ms + GFLOPs: 82028.3 + +Passed +length = 2048, nheads = 32, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 64, 64, 32, 12}. + + Runtime: 6.56732 ms + GFLOPs: 79214.5 + +Passed +length = 2048, nheads = 32, head_size = 80, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 80, 80, 32, 12}. + + Runtime: 12.5356 ms + GFLOPs: 51778.6 + +Passed +length = 2048, nheads = 32, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 32, 12}. + + Runtime: 14.2153 ms + GFLOPs: 72852.6 + +Passed +length = 2048, nheads = 40, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 40, 12}. + + Runtime: 18.4883 ms + GFLOPs: 70018.7 + +Passed +length = 2048, nheads = 56, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 56, 12}. + + Runtime: 26.1055 ms + GFLOPs: 69423.7 + +Passed +length = 2048, nheads = 72, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 72, 12}. + + Runtime: 33.1248 ms + GFLOPs: 70344.6 + +Passed +length = 2048, nheads = 96, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {2048, 2048, 128, 128, 96, 12}. + + Runtime: 45.0376 ms + GFLOPs: 68983.9 + +Passed +length = 3072, nheads = 16, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 64, 64, 16, 12}. + + Runtime: 7.47832 ms + GFLOPs: 78260.3 + +Passed +length = 3072, nheads = 32, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 64, 64, 32, 12}. + + Runtime: 15.1228 ms + GFLOPs: 77400.5 + +Passed +length = 3072, nheads = 32, head_size = 80, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 80, 80, 32, 12}. 
+ + Runtime: 28.2343 ms + GFLOPs: 51725 + +Passed +length = 3072, nheads = 32, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 32, 12}. + + Runtime: 31.9871 ms + GFLOPs: 72846.6 + +Passed +length = 3072, nheads = 40, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 40, 12}. + + Runtime: 41.8281 ms + GFLOPs: 69634.8 + +Passed +length = 3072, nheads = 56, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 56, 12}. + + Runtime: 57.3156 ms + GFLOPs: 71145.9 + +Passed +length = 3072, nheads = 72, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 72, 12}. + + Runtime: 73.7557 ms + GFLOPs: 71083.9 + +Passed +length = 3072, nheads = 96, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {3072, 3072, 128, 128, 96, 12}. + + Runtime: 98.232 ms + GFLOPs: 71162.8 + +Passed +length = 4096, nheads = 16, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 64, 64, 16, 12}. + + Runtime: 13.3732 ms + GFLOPs: 77801.6 + +Passed +length = 4096, nheads = 32, head_size = 64, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 64, 64, 32, 12}. + + Runtime: 26.8941 ms + GFLOPs: 77374.1 + +Passed +length = 4096, nheads = 32, head_size = 80, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 80, 80, 32, 12}. + + Runtime: 49.9699 ms + GFLOPs: 51957.4 + +Passed +length = 4096, nheads = 32, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 128, 128, 32, 12}. + + Runtime: 57.1778 ms + GFLOPs: 72449.3 + +Passed +length = 4096, nheads = 40, head_size = 128, batch = 12 + +CUTLASS Attention: +==================================================== + {seq length Q, seq length KV, head size, head size V, head number, batch size} = {4096, 4096, 128, 128, 40, 12}. 
+ + Runtime: 73.2764 ms + GFLOPs: 70665.5 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/model_config.csv b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/model_config.csv new file mode 100644 index 000000000..96438bea0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/model_config.csv @@ -0,0 +1,32 @@ +1536, 24, 16, 1024 +1536, 24, 32, 2048 +1536, 32, 32, 2560 +1536, 32, 32, 4096 +1536, 40, 40, 5120 +1536, 48, 56, 7168 +1536, 64, 72, 9216 +1536, 96, 96, 12288 +2048, 24, 16, 1024 +2048, 24, 32, 2048 +2048, 32, 32, 2560 +2048, 32, 32, 4096 +2048, 40, 40, 5120 +2048, 48, 56, 7168 +2048, 64, 72, 9216 +2048, 96, 96, 12288 +3072, 24, 16, 1024 +3072, 24, 32, 2048 +3072, 32, 32, 2560 +3072, 32, 32, 4096 +3072, 40, 40, 5120 +3072, 48, 56, 7168 +3072, 64, 72, 9216 +3072, 96, 96, 12288 +4096, 24, 16, 1024 +4096, 24, 32, 2048 +4096, 32, 32, 2560 +4096, 32, 32, 4096 +4096, 40, 40, 5120 +4096, 48, 56, 7168 +4096, 64, 72, 9216 +4096, 96, 96, 12288 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/post_process.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/post_process.py new file mode 100644 index 000000000..f9e324d6f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/post_process.py @@ -0,0 +1,27 @@ +def post_process(): + elapsed_times = [] + # with open('cutlass_a6000.txt', 'r') as flog: + with open('cutlass_a100.txt', 'r') as flog: + for line in flog: + if not 'Runtime: ' in line: + continue + + line_splits = line.split() + elapsed_times.append(float(line_splits[1])) + + # with open('../figures/MHA_A6000.tsv', 'r') as fin: + with open('../figures/MHA_A100.tsv', 'r') as fin: + data = fin.readlines() + + assert len(elapsed_times) == len(data) - 1 + + header = data[0] + with open('../figures/MHA_A100-2.tsv', 'w') as fout: + fout.write('%s\tcutlass(ms)\n' % (header.strip())) + + for i in range(1, len(data)): + fout.write('%s\t%.4f\n' % (data[i].strip(), elapsed_times[i - 1])) + + +if __name__ == '__main__': + post_process() diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/test_cutlass_fused_mha.sh b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/test_cutlass_fused_mha.sh new file mode 100755 index 000000000..509dbdc7d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/cutlass_test/test_cutlass_fused_mha.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +file="model_config.csv" +exec 3<&0 +exec 0<$file + +batch_size=16 +while read line +do + IFS=',' read -r length layer_num nheads model_dim <<< "$line" + head_size=$(expr $model_dim / $nheads) + echo "length = ${length}, nheads = ${nheads}, head_size = ${head_size}, batch = ${batch_size}" + + ./41_fused_multi_head_attention_fixed_seqlen \ + --nheads="$nheads" \ + --batch_size=$batch_size \ + --head_size=$head_size \ + --head_size_v=$head_size \ + --seq_length="$length" \ + --seq_length_kv=$length \ + --causal=false +done + +exec 0<&3 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/MHA_A100.tsv b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/MHA_A100.tsv new file mode 100644 index 000000000..4ef5d82f6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/MHA_A100.tsv @@ -0,0 +1,32 @@ +model name batch_size seqlens nheads 
head_dim Flash Attention1 (ms) Flash Attention2 (ms) MHA(ms) Triton Flash Attention(ms) cutlass(ms) +opt-350M 16 1536 16 64 2.3588 0.8675 5.8320 1.1906 1.8226 +opt-1.3B 16 1536 32 64 4.7151 1.8516 11.6845 2.3648 3.6176 +opt-2.7B 16 1536 32 80 9.8522 2.7404 12.4421 4.4142 5.6451 +opt-6.7B 16 1536 32 128 11.2126 3.3833 14.0614 4.9635 6.1040 +opt-13B 16 1536 40 128 12.8936 4.2373 17.6012 6.3572 7.7539 +opt-30B 16 1536 56 128 18.8833 5.9039 24.7626 8.6800 10.7907 +opt-66B 16 1536 72 128 23.9488 7.5769 32.1560 11.1666 13.9274 +opt-175B 16 1536 96 128 32.3544 10.0978 43.3470 14.9200 18.5577 +opt-350M 16 2048 16 64 4.2250 1.6635 9.0049 2.1031 3.2009 +opt-1.3B 16 2048 32 64 8.3304 3.3019 18.0538 4.1695 6.3866 +opt-2.7B 16 2048 32 80 17.4390 4.8724 19.2794 7.6119 10.0532 +opt-6.7B 16 2048 32 128 19.7743 5.9602 21.5278 8.6008 10.7842 +opt-13B 16 2048 40 128 23.1090 7.4734 26.9242 10.9717 13.6061 +opt-30B 16 2048 56 128 33.3865 10.5031 38.0049 15.0835 19.0663 +opt-66B 16 2048 72 128 42.7090 13.4357 49.3808 19.3681 24.4556 +opt-175B 16 2048 96 128 57.3354 17.9748 66.6501 25.7631 32.6771 +opt-350M 16 3072 16 64 9.4224 3.7569 22.0180 4.6964 7.1312 +opt-1.3B 16 3072 32 64 18.7048 7.4281 44.1669 9.4207 14.2363 +opt-2.7B 16 3072 32 80 39.2399 10.8292 46.4823 16.7917 22.4432 +opt-6.7B 16 3072 32 128 43.9232 13.3484 50.6383 18.9730 24.1227 +opt-13B 16 3072 40 128 52.5141 16.8165 63.3623 24.0978 30.4105 +opt-30B 16 3072 56 128 75.4349 23.6305 89.7973 33.3812 42.4895 +opt-66B 16 3072 72 128 96.9988 30.3833 116.8217 42.9747 54.9754 +opt-175B 16 3072 96 128 129.4830 40.7591 -1.0000 57.4144 73.1794 +opt-350M 16 4096 16 64 16.0001 6.6702 34.6097 8.3682 12.6059 +opt-1.3B 16 4096 32 64 30.9640 13.2943 69.3133 16.7299 25.1185 +opt-2.7B 16 4096 32 80 68.1062 19.5225 73.0911 29.2998 39.6852 +opt-6.7B 16 4096 32 128 75.1283 24.1571 79.5454 33.2059 42.7858 +opt-13B 16 4096 40 128 94.7431 30.2228 101.3300 42.0873 54.0344 +opt-30B 16 4096 56 128 134.5259 42.4741 -1.0000 58.6414 75.2296 +opt-66B 16 4096 72 128 173.2026 54.7330 -1.0000 75.5838 97.0811 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/MHA_A6000.tsv b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/MHA_A6000.tsv new file mode 100644 index 000000000..515c3989a --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/MHA_A6000.tsv @@ -0,0 +1,30 @@ +model name batch_size seqlens nheads head_dim Flash Attention1 (ms) Flash Attention2 (ms) MHA(ms) Triton Flash Attention(ms) cutlass(ms) +opt-350M 12 1536 16 64 4.1985 1.0447 8.7216 1.2471 1.6808 +opt-1.3B 12 1536 32 64 8.4356 2.1463 17.6430 2.6725 3.5556 +opt-2.7B 12 1536 32 80 15.2308 3.2556 17.5715 4.7712 7.1223 +opt-6.7B 12 1536 32 128 20.5499 4.2969 18.7476 5.4093 8.0700 +opt-13B 12 1536 40 128 23.5704 5.3637 23.6955 6.8730 10.5764 +opt-30B 12 1536 56 128 32.8473 7.5063 33.2637 9.5484 14.7115 +opt-66B 12 1536 72 128 45.3323 9.6428 43.2613 12.2588 18.7096 +opt-175B 12 1536 96 128 61.3206 12.8705 57.7979 16.3388 25.6870 +opt-350M 12 2048 16 64 7.2307 1.9526 16.7500 2.3967 3.1710 +opt-1.3B 12 2048 32 64 13.6955 3.9047 33.5726 4.6024 6.5673 +opt-2.7B 12 2048 32 80 26.0028 5.7447 33.6131 8.4920 12.5356 +opt-6.7B 12 2048 32 128 35.4402 7.5753 35.4033 9.4914 14.2153 +opt-13B 12 2048 40 128 41.9852 9.4511 44.2082 12.0556 18.4883 +opt-30B 12 2048 56 128 58.5144 13.2435 62.0802 16.7006 26.1055 +opt-66B 12 2048 72 128 79.8859 17.1971 80.2886 21.4919 33.1248 +opt-175B 12 2048 96 128 109.5652 22.8927 108.0913 28.6930 45.0376 
+opt-350M 12 3072 16 64 15.7385 4.3581 30.4792 5.1502 7.4783 +opt-1.3B 12 3072 32 64 32.0268 8.6193 60.9558 10.1846 15.1228 +opt-2.7B 12 3072 32 80 59.4587 12.8951 59.6855 19.2198 28.2343 +opt-6.7B 12 3072 32 128 80.3155 17.0643 62.9577 21.2009 31.9871 +opt-13B 12 3072 40 128 94.8047 21.4570 78.7499 27.0376 41.8281 +opt-30B 12 3072 56 128 132.0720 29.9611 110.9677 37.3730 57.3156 +opt-66B 12 3072 72 128 177.7049 38.7192 -1.0000 47.9376 73.7557 +opt-175B 12 3072 96 128 247.3243 51.6388 -1.0000 63.9850 98.2320 +opt-350M 12 4096 16 64 27.3220 7.7363 58.5485 9.1497 13.3732 +opt-1.3B 12 4096 32 64 52.3220 15.4407 117.5960 18.1860 26.8941 +opt-2.7B 12 4096 32 80 101.6562 23.0857 112.1556 34.3485 49.9699 +opt-6.7B 12 4096 32 128 139.4661 30.5207 116.1683 37.7007 57.1778 +opt-13B 12 4096 40 128 168.6970 38.1357 -1.0000 47.6345 73.2764 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/opt-model-size.png b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/opt-model-size.png new file mode 100644 index 000000000..f11fa8b0b Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/opt-model-size.png differ diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/process.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/process.py new file mode 100644 index 000000000..878965deb --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/process.py @@ -0,0 +1,19 @@ +# data1 = open("pt_data_a6000_flash_attn1.tsv", "r").readlines() +# data2 = open("pt_data_a6000_flash_attn2.tsv", "r").readlines() + +data1 = open("pt_data_a100_flash_attn1.tsv", "r").readlines() +data2 = open("pt_data_a100_flash_attn2.tsv", "r").readlines() + +header = data1[0].strip().split("\t") +new_header = header[:5] + ["Flash Attention1 (ms)" + ] + ["Flash Attention2 (ms)"] + header[6:] + +# with open("MHA_A6000.tsv", "w") as fdata: +with open("MHA_A100.tsv", "w") as fdata: + fdata.write("%s\n" % ("\t".join(new_header))) + for i in range(len(data1) - 1): + items1 = data1[i + 1].strip().split("\t") + items2 = data2[i + 1].strip().split("\t") + ft2 = items2[5] + new_items = items1[0:6] + [ft2] + items2[6:] + fdata.write("%s\n" % "\t".join(new_items)) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/pt_data_2080Ti.tsv b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/pt_data_2080Ti.tsv new file mode 100644 index 000000000..e82256200 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/pt_data_2080Ti.tsv @@ -0,0 +1,38 @@ +model name batch_size seqlens nheads head_dim Flash Attention(ms) MHA(ms) +opt-350M 1 1024 16 64 0.284054 0.354809 +opt-1.3B 1 1024 32 64 0.606547 0.685507 +opt-2.7B 1 1024 32 80 0.961802 0.652739 +opt-6.7B 1 1024 32 128 0.996740 0.699388 +opt-13B 1 1024 40 128 1.084028 0.875701 +opt-30B 1 1024 56 128 1.630003 1.219136 +opt-66B 1 1024 72 128 2.460680 1.562858 +opt-175B 1 1024 96 128 2.991258 2.073724 +opt-350M 4 1024 16 64 1.472973 1.454408 +opt-1.3B 4 1024 32 64 1.946002 2.755240 +opt-2.7B 4 1024 32 80 3.394489 2.916603 +opt-6.7B 4 1024 32 128 4.254305 3.406902 +opt-13B 4 1024 40 128 4.617503 4.236389 +opt-30B 4 1024 56 128 6.110026 5.930894 +opt-66B 4 1024 72 128 7.807382 7.580436 +opt-175B 4 1024 96 128 10.105235 10.140352 +opt-350M 4 1536 16 64 2.525576 3.184356 +opt-1.3B 4 1536 32 64 3.805533 6.179598 +opt-2.7B 4 1536 32 80 
6.978655 6.618598 +opt-6.7B 4 1536 32 128 8.597419 7.539310 +opt-13B 4 1536 40 128 9.801096 9.517523 +opt-30B 4 1536 56 128 13.134855 13.196135 +opt-66B 4 1536 72 128 16.926805 16.888568 +opt-175B 4 1536 96 128 22.006179 22.647004 +opt-350M 8 1536 16 64 4.015063 6.179225 +opt-1.3B 8 1536 32 64 7.137822 12.371377 +opt-2.7B 8 1536 32 80 13.360916 13.355338 +opt-6.7B 8 1536 32 128 15.441924 15.068967 +opt-13B 8 1536 40 128 19.112467 18.891531 +opt-30B 8 1536 56 128 26.643906 26.486567 +opt-66B 8 1536 72 128 32.575415 34.199014 +opt-175B 8 1536 96 128 42.976816 -1.000000 +opt-350M 16 1536 16 64 7.317994 12.342639 +opt-1.3B 16 1536 32 64 14.137993 24.738528 +opt-2.7B 16 1536 32 80 28.221682 26.810784 +opt-6.7B 16 1536 32 128 29.417253 30.459492 +opt-13B 16 1536 40 128 35.579116 -1.000000 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/tvm_data.tsv b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/tvm_data.tsv new file mode 100644 index 000000000..e960cebd5 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/figures/tvm_data.tsv @@ -0,0 +1,22 @@ +model name query len total time(ms) +pt_model_time 128 4.796 +onnx_model_time 128 14.274 +onnx_model_runtime 128 247.236 +pt_model_time 256 14.875 +onnx_model_time 256 35.040 +onnx_model_runtime 256 514.620 +pt_model_time 384 34.060 +onnx_model_time 384 63.518 +onnx_model_runtime 384 776.247 +pt_model_time 512 51.911 +onnx_model_time 512 93.142 +onnx_model_runtime 512 1104.498 +pt_model_time 640 98.558 +onnx_model_time 640 148.598 +onnx_model_runtime 640 1401.410 +pt_model_time 768 152.341 +onnx_model_time 768 212.699 +onnx_model_runtime 768 1692.065 +pt_model_time 896 212.152 +onnx_model_time 896 282.281 +onnx_model_runtime 896 1996.061 diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/__init__.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/__init__.py new file mode 100644 index 000000000..5489a39f5 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/__init__.py @@ -0,0 +1,2 @@ +from . import pt_attn +from . 
import triton_attn diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/pt_attn.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/pt_attn.py new file mode 100644 index 000000000..00c1cec07 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/pt_attn.py @@ -0,0 +1,96 @@ +import torch +from torch import Tensor +import torch.nn.functional as F + +import math + +from einops import rearrange + +from flash_attn.flash_attn_interface import _flash_attn_forward + +__all__ = [ + 'MultiHeadAttention', + 'MultilHeadFlashAttention', + 'GenerateQKV', + 'AttentionRef', +] + + +def GenerateQKV(x: Tensor, Wqkv, nheads: int, qkvpacked: bool): + """ + Arguments: + x: (batch_size, seqlen, nheads * d) + Wqkv: nn.Linear(nheads * d, 3 * nheads * d) + """ + batch_size, seqlen, _ = x.shape + q, k, v = Wqkv(x).chunk(3, dim=-1) + + q_unpad = rearrange(q, 'b s (h d) -> (b s) h d', h=nheads) + cu_seqlens = torch.arange( + 0, (batch_size + 1) * seqlen, + step=seqlen, + dtype=torch.int32, + device=q_unpad.device) + max_seqlen = seqlen + + k_unpad = rearrange(k, 'b s (h d) -> (b s) h d', h=nheads) + v_unpad = rearrange(v, 'b s (h d) -> (b s) h d', h=nheads) + + if qkvpacked: + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + return (qkv_unpad, cu_seqlens, max_seqlen) + + else: + q, k, v = [ + rearrange(z, 'b s (h d) -> b s h d', h=nheads).detach() + for z in [q, k, v] + ] + + return (q_unpad, k_unpad, v_unpad, cu_seqlens, max_seqlen) + + +def MultiHeadAttention(query: Tensor, + key: Tensor, + value: Tensor, + num_heads: int, + dropout_p: float = 0.) -> Tensor: + """ + Arguments: + query: (batch_size, seqlen, model_dim) + wq: (model_dim, model_dim) + """ + + # Transpose. 
After transposed, the layout is: + # [batch, num_heads, length, head_dim] + query = query.view(query.shape[0], query.shape[1], num_heads, -1).permute( + 0, 2, 1, 3) + # [batch, num_heads, head_dim, length] + key = key.view(key.shape[0], key.shape[1], num_heads, -1).permute( + 0, 2, 3, 1) + # [batch, num_heads, length, head_dim] + value = value.view(value.shape[0], value.shape[1], num_heads, -1).permute( + 0, 2, 1, 3) + + d = query.shape[-1] + # MHA + scores = torch.matmul(query, key / math.sqrt(d)) + attn = torch.softmax(scores, dim=-1) + attn = F.dropout(attn, dropout_p) + out = torch.matmul(attn, value) + + # transpose + out = out.permute(0, 2, 1, 3) + out = out.reshape(out.shape[0], out.shape[1], -1) + return out + + +def AttentionRef(qkv: Tensor): + q = qkv[:, :, 0] + k = qkv[:, :, 1] + v = qkv[:, :, 2] + d = q.shape[-1] + scores = torch.einsum('bthd,bshd->bhts', q / math.sqrt(d), k) + attention = torch.softmax(scores, dim=-1) + output = torch.einsum('bhts,bshd->bthd', attention, v) + + return output diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/triton_attn.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/triton_attn.py new file mode 100644 index 000000000..f459d6631 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/pt_model/triton_attn.py @@ -0,0 +1,704 @@ +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao (https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team + +Extra Credits: +- Original flash attention paper (https://arxiv.org/abs/2205.14135) +- Rabe and Staats (https://arxiv.org/pdf/2112.05682v2.pdf) + +""" + +import pytest +import torch + +import triton +import triton.language as tl + + +@triton.jit +def _attn_fwd_inner( + acc, + l_i, + m_i, + q, # + K_block_ptr, + V_block_ptr, # + start_m, + qk_scale, # + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, # + STAGE: tl.constexpr, + offs_m: tl.constexpr, + offs_n: tl.constexpr, # + N_CTX: tl.constexpr): + # range of values handled by this stage + if STAGE == 1: + lo, hi = 0, start_m * BLOCK_M + elif STAGE == 2: + lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M + lo = tl.multiple_of(lo, BLOCK_M) + # causal = False + else: + lo, hi = 0, N_CTX + K_block_ptr = tl.advance(K_block_ptr, (0, lo)) + V_block_ptr = tl.advance(V_block_ptr, (lo, 0)) + # loop over k, v and update accumulator + for start_n in range(lo, hi, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(K_block_ptr) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + if STAGE == 2: + mask = offs_m[:, None] >= (start_n + offs_n[None, :]) + qk = qk * qk_scale + tl.where(mask, 0, -1.0e6) + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + qk -= m_ij[:, None] + else: + m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale) + qk = qk * qk_scale - m_ij[:, None] + p = tl.math.exp2(qk) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + alpha = tl.math.exp2(m_i - m_ij) + l_i = l_i * alpha + l_ij + # -- update output accumulator -- + acc = acc * alpha[:, None] + # update acc + v = tl.load(V_block_ptr) + acc += tl.dot(p.to(tl.float16), v) + # update m_i and l_i + m_i = m_ij + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + return acc, l_i, m_i + + +# We don't run auto-tuning every time to keep the tutorial fast. 
Uncommenting +# the code below and commenting out the equivalent parameters is convenient for +# re-tuning. +# @triton.autotune( +# configs=[ +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=4, num_warps=8), +# triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64}, num_stages=3, num_warps=8), +# triton.Config({'BLOCK_M': 256, 'BLOCK_N': 32}, num_stages=3, num_warps=8), +# triton.Config({'BLOCK_M': 256, 'BLOCK_N': 32}, num_stages=3, num_warps=4), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32}, num_stages=3, num_warps=4), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32}, num_stages=4, num_warps=4), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=3, num_warps=4), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=4, num_warps=4), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=3, num_warps=8), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=7, num_warps=8), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32}, num_stages=7, num_warps=8), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32}, num_stages=6, num_warps=8), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32}, num_stages=5, num_warps=8), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32}, num_stages=4, num_warps=8), +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=6, num_warps=4), +# ], +# key=['N_CTX'], +# ) + + +@triton.jit +def _attn_fwd( + Q, + K, + V, + sm_scale, + M, + Out, # + stride_qz, + stride_qh, + stride_qm, + stride_qk, # + stride_kz, + stride_kh, + stride_kn, + stride_kk, # + stride_vz, + stride_vh, + stride_vk, + stride_vn, # + stride_oz, + stride_oh, + stride_om, + stride_on, # + Z, + H, # + N_CTX: tl.constexpr, # + BLOCK_M: tl.constexpr, # + BLOCK_DMODEL: tl.constexpr, # + BLOCK_N: tl.constexpr, # + STAGE: tl.constexpr # +): + start_m = tl.program_id(0) + off_hz = tl.program_id(1) + off_z = off_hz // H + off_h = off_hz % H + qvk_offset = off_z.to(tl.int64) * stride_qz + off_h.to( + tl.int64) * stride_qh + + # block pointers + Q_block_ptr = tl.make_block_ptr( + base=Q + qvk_offset, + shape=(N_CTX, BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + V_block_ptr = tl.make_block_ptr( + base=V + qvk_offset, + shape=(N_CTX, BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0), + ) + K_block_ptr = tl.make_block_ptr( + base=K + qvk_offset, + shape=(BLOCK_DMODEL, N_CTX), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1), + ) + O_block_ptr = tl.make_block_ptr( + base=Out + qvk_offset, + shape=(N_CTX, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # load scales + qk_scale = sm_scale + qk_scale *= 1.44269504 # 1/log(2) + # load q: it will stay in SRAM throughout + q = tl.load(Q_block_ptr) + # stage 1: off-band + # For causal = True, STAGE = 3 and _attn_fwd_inner gets 1 as its STAGE + # For causal = False, STAGE = 1, and _attn_fwd_inner gets 3 as its STAGE + if STAGE & 1: + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + 
V_block_ptr, # + start_m, + qk_scale, # + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, # + 4 - STAGE, + offs_m, + offs_n, + N_CTX # + ) + # stage 2: on-band + if STAGE & 2: + # barrier makes it easier for compielr to schedule the + # two loops independently + tl.debug_barrier() + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, # + start_m, + qk_scale, # + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, # + 2, + offs_m, + offs_n, + N_CTX # + ) + # epilogue + m_i += tl.math.log2(l_i) + acc = acc / l_i[:, None] + m_ptrs = M + off_hz * N_CTX + offs_m + tl.store(m_ptrs, m_i) + tl.store(O_block_ptr, acc.to(Out.type.element_ty)) + + +@triton.jit +def _attn_bwd_preprocess( + O, + DO, # + Delta, # + Z, + H, + N_CTX, # + BLOCK_M: tl.constexpr, + D_HEAD: tl.constexpr # +): + off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + off_hz = tl.program_id(1) + off_n = tl.arange(0, D_HEAD) + # load + o = tl.load(O + off_hz * D_HEAD * N_CTX + off_m[:, None] * D_HEAD + + off_n[None, :]) + do = tl.load(DO + off_hz * D_HEAD * N_CTX + off_m[:, None] * D_HEAD + + off_n[None, :]).to(tl.float32) + delta = tl.sum(o * do, axis=1) + # write-back + tl.store(Delta + off_hz * N_CTX + off_m, delta) + + +# The main inner-loop logic for computing dK and dV. +@triton.jit +def _attn_bwd_dkdv( + dk, + dv, # + Q, + k, + v, + sm_scale, # + DO, # + M, + D, # + # shared by Q/K/V/DO. + stride_tok, + stride_d, # + H, + N_CTX, + BLOCK_M1: tl.constexpr, # + BLOCK_N1: tl.constexpr, # + BLOCK_DMODEL: tl.constexpr, # + # Filled in by the wrapper. + start_n, + start_m, + num_steps, # + MASK: tl.constexpr): + offs_m = start_m + tl.arange(0, BLOCK_M1) + offs_n = start_n + tl.arange(0, BLOCK_N1) + offs_k = tl.arange(0, BLOCK_DMODEL) + qT_ptrs = Q + offs_m[None, :] * stride_tok + offs_k[:, None] * stride_d + do_ptrs = DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d + # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work. + tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0) + curr_m = start_m + step_m = BLOCK_M1 + for blk_idx in range(num_steps): + qT = tl.load(qT_ptrs) + # Load m before computing qk to reduce pipeline stall. + offs_m = curr_m + tl.arange(0, BLOCK_M1) + m = tl.load(M + offs_m) + qkT = tl.dot(k, qT) + pT = tl.math.exp2(qkT - m[None, :]) + # Autoregressive masking. + if MASK: + mask = (offs_m[None, :] >= offs_n[:, None]) + pT = tl.where(mask, pT, 0.0) + do = tl.load(do_ptrs) + # Compute dV. + ppT = pT + ppT = ppT.to(tl.float16) + dv += tl.dot(ppT, do) + # D (= delta) is pre-divided by ds_scale. + Di = tl.load(D + offs_m) + # Compute dP and dS. + dpT = tl.dot(v, tl.trans(do)).to(tl.float32) + dsT = pT * (dpT - Di[None, :]) + dsT = dsT.to(tl.float16) + dk += tl.dot(dsT, tl.trans(qT)) + # Increment pointers. + curr_m += step_m + qT_ptrs += step_m * stride_tok + do_ptrs += step_m * stride_tok + return dk, dv + + +# the main inner-loop logic for computing dQ +@triton.jit +def _attn_bwd_dq( + dq, + q, + K, + V, # + do, + m, + D, + # shared by Q/K/V/DO. + stride_tok, + stride_d, # + H, + N_CTX, # + BLOCK_M2: tl.constexpr, # + BLOCK_N2: tl.constexpr, # + BLOCK_DMODEL: tl.constexpr, + # Filled in by the wrapper. 
+ start_m, + start_n, + num_steps, # + MASK: tl.constexpr): + offs_m = start_m + tl.arange(0, BLOCK_M2) + offs_n = start_n + tl.arange(0, BLOCK_N2) + offs_k = tl.arange(0, BLOCK_DMODEL) + kT_ptrs = K + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d + vT_ptrs = V + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d + # D (= delta) is pre-divided by ds_scale. + Di = tl.load(D + offs_m) + # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work. + tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0) + curr_n = start_n + step_n = BLOCK_N2 + for blk_idx in range(num_steps): + kT = tl.load(kT_ptrs) + vT = tl.load(vT_ptrs) + qk = tl.dot(q, kT) + p = tl.math.exp2(qk - m) + # Autoregressive masking. + if MASK: + offs_n = curr_n + tl.arange(0, BLOCK_N2) + mask = (offs_m[:, None] >= offs_n[None, :]) + p = tl.where(mask, p, 0.0) + # Compute dP and dS. + dp = tl.dot(do, vT).to(tl.float32) + ds = p * (dp - Di[:, None]) + ds = ds.to(tl.float16) + # Compute dQ. + # NOTE: We need to de-scale dq in the end, because kT was pre-scaled. + dq += tl.dot(ds, tl.trans(kT)) + # Increment pointers. + curr_n += step_n + kT_ptrs += step_n * stride_tok + vT_ptrs += step_n * stride_tok + return dq + + +@triton.jit +def _attn_bwd( + Q, + K, + V, + sm_scale, # + DO, # + DQ, + DK, + DV, # + M, + D, + # shared by Q/K/V/DO. + stride_z, + stride_h, + stride_tok, + stride_d, # + H, + N_CTX, # + BLOCK_M1: tl.constexpr, # + BLOCK_N1: tl.constexpr, # + BLOCK_M2: tl.constexpr, # + BLOCK_N2: tl.constexpr, # + BLK_SLICE_FACTOR: tl.constexpr, # + BLOCK_DMODEL: tl.constexpr): + LN2: tl.constexpr = 0.6931471824645996 # = ln(2) + + bhid = tl.program_id(2) + off_chz = (bhid * N_CTX).to(tl.int64) + adj = (stride_h * (bhid % H) + stride_z * (bhid // H)).to(tl.int64) + pid = tl.program_id(0) + + # offset pointers for batch/head + Q += adj + K += adj + V += adj + DO += adj + DQ += adj + DK += adj + DV += adj + M += off_chz + D += off_chz + + # load scales + offs_k = tl.arange(0, BLOCK_DMODEL) + + start_n = pid * BLOCK_N1 + start_m = start_n + + MASK_BLOCK_M1: tl.constexpr = BLOCK_M1 // BLK_SLICE_FACTOR + offs_n = start_n + tl.arange(0, BLOCK_N1) + + dv = tl.zeros([BLOCK_N1, BLOCK_DMODEL], dtype=tl.float32) + dk = tl.zeros([BLOCK_N1, BLOCK_DMODEL], dtype=tl.float32) + + # load K and V: they stay in SRAM throughout the inner loop. + k = tl.load(K + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d) + v = tl.load(V + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d) + + num_steps = BLOCK_N1 // MASK_BLOCK_M1 + + dk, dv = _attn_bwd_dkdv( + dk, + dv, # + Q, + k, + v, + sm_scale, # + DO, # + M, + D, # + stride_tok, + stride_d, # + H, + N_CTX, # + MASK_BLOCK_M1, + BLOCK_N1, + BLOCK_DMODEL, # + start_n, + start_m, + num_steps, # + MASK=True # + ) + + start_m += num_steps * MASK_BLOCK_M1 + num_steps = (N_CTX - start_m) // BLOCK_M1 + + # Compute dK and dV for non-masked blocks. + dk, dv = _attn_bwd_dkdv( # + dk, + dv, # + Q, + k, + v, + sm_scale, # + DO, # + M, + D, # + stride_tok, + stride_d, # + H, + N_CTX, # + BLOCK_M1, + BLOCK_N1, + BLOCK_DMODEL, # + start_n, + start_m, + num_steps, # + MASK=False # + ) + + dv_ptrs = DV + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d + tl.store(dv_ptrs, dv) + + # Write back dK. 
+ dk *= sm_scale + dk_ptrs = DK + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d + tl.store(dk_ptrs, dk) + + # THIS BLOCK DOES DQ: + start_m = pid * BLOCK_M2 + end_n = start_m + BLOCK_M2 + + MASK_BLOCK_N2: tl.constexpr = BLOCK_N2 // BLK_SLICE_FACTOR + offs_m = start_m + tl.arange(0, BLOCK_M2) + + q = tl.load(Q + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d) + dq = tl.zeros([BLOCK_M2, BLOCK_DMODEL], dtype=tl.float32) + do = tl.load(DO + offs_m[:, None] * stride_tok + + offs_k[None, :] * stride_d) + + m = tl.load(M + offs_m) + m = m[:, None] + + # Compute dQ for masked (diagonal) blocks. + # NOTE: This code scans each row of QK^T backward (from right to left, + # but inside each call to _attn_bwd_dq, from left to right), but that's + # not due to anything important. I just wanted to reuse the loop + # structure for dK & dV above as much as possible. + num_steps = BLOCK_M2 // MASK_BLOCK_N2 + dq = _attn_bwd_dq( + dq, + q, + K, + V, # + do, + m, + D, # + stride_tok, + stride_d, # + H, + N_CTX, # + BLOCK_M2, + MASK_BLOCK_N2, + BLOCK_DMODEL, # + start_m, + end_n - num_steps * MASK_BLOCK_N2, + num_steps, # + MASK=True # + ) + end_n -= num_steps * MASK_BLOCK_N2 + # stage 2 + num_steps = end_n // BLOCK_N2 + dq = _attn_bwd_dq( + dq, + q, + K, + V, # + do, + m, + D, # + stride_tok, + stride_d, # + H, + N_CTX, # + BLOCK_M2, + BLOCK_N2, + BLOCK_DMODEL, # + start_m, + end_n - num_steps * BLOCK_N2, + num_steps, # + MASK=False # + ) + # Write back dQ. + dq_ptrs = DQ + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d + dq *= LN2 + tl.store(dq_ptrs, dq) + + +empty = torch.empty(128, device="cuda") + + +class _attention(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, causal, sm_scale): + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + assert Lq == Lk and Lk == Lv + assert Lk in {16, 32, 64, 128} + o = torch.empty_like(q) + BLOCK_M = 128 + BLOCK_N = 64 if Lk <= 64 else 32 + num_stages = 4 if Lk <= 64 else 3 + num_warps = 4 + stage = 3 if causal else 1 + # Tuning for H100 + if torch.cuda.get_device_capability()[0] == 9: + num_warps = 8 + num_stages = 7 if Lk >= 64 else 3 + grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1) + M = torch.empty( + (q.shape[0], q.shape[1], q.shape[2]), + device=q.device, + dtype=torch.float32) + _attn_fwd[grid]( + q, + k, + v, + sm_scale, + M, + o, # + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), # + k.stride(0), + k.stride(1), + k.stride(2), + k.stride(3), # + v.stride(0), + v.stride(1), + v.stride(2), + v.stride(3), # + o.stride(0), + o.stride(1), + o.stride(2), + o.stride(3), # + q.shape[0], + q.shape[1], # + N_CTX=q.shape[2], # + BLOCK_M=BLOCK_M, # + BLOCK_N=BLOCK_N, # + BLOCK_DMODEL=Lk, # + STAGE=stage, # + num_warps=num_warps, # + num_stages=num_stages # + ) + + ctx.save_for_backward(q, k, v, o, M) + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = Lk + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + q, k, v, o, M = ctx.saved_tensors + assert do.is_contiguous() + assert q.stride() == k.stride() == v.stride() == o.stride( + ) == do.stride() + dq = torch.empty_like(q) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + BATCH, N_HEAD, N_CTX = q.shape[:3] + PRE_BLOCK = 128 + NUM_WARPS, NUM_STAGES = 4, 5 + BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 32, 128, 128, 32 + BLK_SLICE_FACTOR = 2 + RCP_LN2 = 1.4426950408889634 # = 1.0 / ln(2) + arg_k = k + arg_k = arg_k * (ctx.sm_scale * RCP_LN2) + PRE_BLOCK = 128 + assert N_CTX % 
PRE_BLOCK == 0 + pre_grid = (N_CTX // PRE_BLOCK, BATCH * N_HEAD) + delta = torch.empty_like(M) + _attn_bwd_preprocess[pre_grid]( + o, + do, # + delta, # + BATCH, + N_HEAD, + N_CTX, # + BLOCK_M=PRE_BLOCK, + D_HEAD=ctx.BLOCK_DMODEL # + ) + grid = (N_CTX // BLOCK_N1, 1, BATCH * N_HEAD) + _attn_bwd[grid]( + q, + arg_k, + v, + ctx.sm_scale, + do, + dq, + dk, + dv, # + M, + delta, # + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), # + N_HEAD, + N_CTX, # + BLOCK_M1=BLOCK_M1, + BLOCK_N1=BLOCK_N1, # + BLOCK_M2=BLOCK_M2, + BLOCK_N2=BLOCK_N2, # + BLK_SLICE_FACTOR=BLK_SLICE_FACTOR, # + BLOCK_DMODEL=ctx.BLOCK_DMODEL, # + num_warps=NUM_WARPS, # + num_stages=NUM_STAGES # + ) + + return dq, dk, dv, None, None + + +attention = _attention.apply diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/run.sh b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/run.sh new file mode 100755 index 000000000..3324df5ca --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python3 test_pt_model.py 2>&1 | tee figures/baseline.tsv diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_pt_model.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_pt_model.py new file mode 100644 index 000000000..7deeba62d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_pt_model.py @@ -0,0 +1,134 @@ +import os +import warnings + +import torch +from torch import Tensor +import flash_attn + +is_sm80 = torch.cuda.get_device_capability('cuda') >= (8, 0) + +try: + from flash_attn.flash_attn_triton import flash_attn_func +except (ImportError, AttributeError): + flash_attn_func = None + +try: + # flash_attn2 + from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as ft_fwd_fun +except (ImportError, AttributeError): + + from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func as ft_fwd_fun + +from einops import rearrange +from pt_model import pt_attn + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +warnings.filterwarnings('ignore') + + +@torch.no_grad() +def test_pt_mha(qkv: Tensor, batch_size: int, warmup=10, iters=50): + nnz, n, nheads, d = qkv.shape + qkv = qkv.view(batch_size, nnz // batch_size, n, nheads, d) + + qkv = qkv.detach().requires_grad_(False) + for _ in range(warmup): # warmup + output_ref = pt_attn.AttentionRef(qkv) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(iters): + output_ref = pt_attn.AttentionRef(qkv) + end_event.record() + torch.cuda.synchronize() + + elapsed = start_event.elapsed_time(end_event) / iters + return elapsed, output_ref + + +@torch.no_grad() +def test_flash_attention(qkv: Tensor, + cu_seqlens: Tensor, + max_seqlen: Tensor, + batch_size: int, + dropout_p=0., + warmup=10, + iters=50): + qkv = qkv.detach().requires_grad_(False) + for _ in range(warmup): # warmup + output, _, _ = ft_fwd_fun( + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + return_attn_probs=True, + causal=False) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(iters): + output, _, _ = ft_fwd_fun( + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + return_attn_probs=True, + causal=False) + end_event.record() + torch.cuda.synchronize() + + elapsed = 
start_event.elapsed_time(end_event) / iters + + output = rearrange(output, '(b s) h d -> b s h d', b=batch_size) + return elapsed, output + + +def make_test( + batch_size: int, + nheads: int, + hidden: int, # this is head_dim + seqlen: int, + device='cuda', + dtype=torch.float16): + + x = torch.randn( + batch_size, + seqlen, + nheads * hidden, + device=device, + dtype=dtype, + requires_grad=False) + Wqkv = torch.nn.Linear( + nheads * hidden, 3 * nheads * hidden, device=device, dtype=dtype) + + qkv, cu_seqlens, max_seqlen = pt_attn.GenerateQKV( + x, Wqkv, nheads, qkvpacked=True) + + time, _ = test_flash_attention(qkv, cu_seqlens, max_seqlen, batch_size) + print(f"{seqlen}\t{hidden}\t{time}") + + +if __name__ == '__main__': + torch.random.manual_seed(0) + print('hidden\tseqlen\tflash_attn') + batch = 32 + nheads = 8 + for hidden in [ + 128, + 256, + ]: + for seqlen in [ + 128, + 256, + 512, + 768, + 1024, + 1536, + 2048, + 4096, + ]: + make_test(batch, nheads, hidden, seqlen) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_tf_model.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_tf_model.py new file mode 100644 index 000000000..5df8bf283 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_tf_model.py @@ -0,0 +1,39 @@ +import os +import tensorflow as tf +from tf_model import tf_attn +import tensorflow as tf + +from model_config import model_settings + +from time import time + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +if __name__ == '__main__': + device = "cuda" + + batch_size = 1 + length = 512 + for setting in model_settings: + _, num_heads, model_dim = setting + + query_len = length + kv_len = length + query_size = model_dim + value_size = model_dim + + with tf.device('/GPU:0'): + model = tf_attn.MutilHeadAttention( + num_heads=num_heads, d_model=query_size) + query = tf.random.uniform((batch_size, query_len, query_size)) + key = tf.random.uniform((batch_size, kv_len, query_size)) + value = tf.random.uniform((batch_size, kv_len, value_size)) + + for _ in range(10): # warmup + out = model(query, key, value) + + start = time() + for _ in range(100): + model(query, key, value) + elapsed_time = time() - start + print('%.6f' % (elapsed_time / 100.)) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_triton_model.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_triton_model.py new file mode 100644 index 000000000..ba1efec6b --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_triton_model.py @@ -0,0 +1,132 @@ +import os +import warnings + +import torch +from torch import Tensor +import flash_attn +import math + +is_sm80 = torch.cuda.get_device_capability('cuda') >= (8, 0) + +try: + from flash_attn.flash_attn_triton import flash_attn_func +except (ImportError, AttributeError): + flash_attn_func = None + +try: + # flash_attn2 + from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as ft_fwd_fun +except (ImportError, AttributeError): + + from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func as ft_fwd_fun + +from einops import rearrange +from pt_model import pt_attn +from pt_model import triton_attn + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +warnings.filterwarnings('ignore') + + +@torch.no_grad() +def test_pt_mha(qkv: Tensor, batch_size: int, warmup=10, iters=50): + nnz, n, nheads, d = qkv.shape + qkv = qkv.view(batch_size, nnz // batch_size, n, nheads, d) 
+ + qkv = qkv.detach().requires_grad_(False) + for _ in range(warmup): # warmup + output_ref = pt_attn.AttentionRef(qkv) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(iters): + output_ref = pt_attn.AttentionRef(qkv) + end_event.record() + torch.cuda.synchronize() + + elapsed = start_event.elapsed_time(end_event) / iters + return elapsed, output_ref + + +@torch.no_grad() +def test_triton_flash_attention(qkv: Tensor, + batch_size: int, + warmup: int = 10, + iters: int = 50): + + qkv = rearrange(qkv, '(b s) n h d -> b s n h d', b=batch_size) + query = qkv[:, :, 0].detach().requires_grad_(False) + key = qkv[:, :, 1].detach().requires_grad_(False) + value = qkv[:, :, 2].detach().requires_grad_(False) + + d = qkv.size(4) + sm_scale = 1.0 / math.sqrt(d) + + query = query.permute(0, 2, 1, 3).contiguous() + key = key.permute(0, 2, 1, 3).contiguous() + value = value.permute(0, 2, 1, 3).contiguous() + + for _ in range(warmup): # warmup + output = triton_attn.attention(query, key, value, False, sm_scale) + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(iters): + output = triton_attn.attention(query, key, value, False, sm_scale) + end_event.record() + torch.cuda.synchronize() + + elapsed = start_event.elapsed_time(end_event) / iters + + return elapsed, output + + +def make_test( + batch_size: int, + nheads: int, + hidden: int, # this is head_dim + seqlen: int, + device='cuda', + dtype=torch.float16): + + x = torch.randn( + batch_size, + seqlen, + nheads * hidden, + device=device, + dtype=dtype, + requires_grad=False) + Wqkv = torch.nn.Linear( + nheads * hidden, 3 * nheads * hidden, device=device, dtype=dtype) + + qkv, cu_seqlens, max_seqlen = pt_attn.GenerateQKV( + x, Wqkv, nheads, qkvpacked=True) + + time, _ = test_triton_flash_attention(qkv, batch_size) + + print(f"{seqlen}\t{hidden}\t{time}") + + +if __name__ == '__main__': + torch.random.manual_seed(0) + print('hidden\tseqlen\ttriton_flash_attn') + batch = 32 + nheads = 8 + for hidden in [ + 128, + # 256, # the triton implementation does not support hidden 256 + ]: + for seqlen in [ + 128, + 256, + 512, + 768, + 1024, + 1536, + 2048, + 4096, + ]: + make_test(batch, nheads, hidden, seqlen) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_tvm.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_tvm.py new file mode 100644 index 000000000..02ccda21a --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/test_tvm.py @@ -0,0 +1,240 @@ +import tvm +from tvm import relay +from tvm.contrib import graph_executor +import numpy as np +from time import perf_counter +from pt_model import pt_attn +import torch +import onnx +import onnxruntime as ort +import numpy as np +import time +import logging +import csv +logging.basicConfig(level=logging.DEBUG) + + +def load_pt_trace_model_to_tvm(traced_module, input_infos, device, query_shape, + key_shape, value_shape, output_shape): + mod, params = relay.frontend.from_pytorch(traced_module, input_infos) + + target = tvm.target.Target(target=device, host='llvm') + + # compile on CUDA + print("############################") + print("Deploy on CUDA, build the relay.") + + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + + dev = 
tvm.device(str(target), 0) + + module = graph_executor.GraphModule(lib["default"](dev)) + + ###################################### + # TVM runtime + print("#############################") + print("TVM runtime") + tvm_dtype = "float32" + # print(mod) + query = tvm.nd.array(np.random.uniform(size=query_shape).astype(tvm_dtype)) + key = tvm.nd.array(np.random.uniform(size=key_shape).astype(tvm_dtype)) + value = tvm.nd.array(np.random.uniform(size=value_shape).astype(tvm_dtype)) + + module.set_input("query", query) + module.set_input("key", key) + module.set_input("value", value) + # warmup execution + for i in range(10): + module.run() + tvm_output = module.get_output( + 0, tvm.nd.empty(output_shape, dtype=tvm_dtype)).numpy() + + num = 10 # number of times we run module for a single measurement + rep = 3 # number of measurements (we derive std dev from this) + timer = module.module.time_evaluator("run", dev, number=num, repeat=rep) + + tcost = timer() + start = perf_counter() + + module.run() + output_shape = output_shape + + tvm_output = module.get_output(0, + tvm.nd.empty(output_shape, + dtype=tvm_dtype)).numpy() + + mean = tcost.mean * 1000 + print("TVM Average per sample inference time: %.2fms" % (mean)) + return mean + + +def load_onnx_model_to_tvm(onnx_model, shape_dict, device, query_shape, + key_shape, value_shape, output_shape): + + mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) + + target = tvm.target.Target(target=device, host='llvm') + + # compile on CUDA + print("############################") + print("Deploy on CUDA, build the relay.") + + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + + dev = tvm.device(str(target), 0) + + module = graph_executor.GraphModule(lib["default"](dev)) + + ###################################### + # TVM runtime + print("#############################") + print("TVM runtime") + tvm_dtype = "float32" + # print(mod) + query = tvm.nd.array(np.random.uniform(size=query_shape).astype(tvm_dtype)) + key = tvm.nd.array(np.random.uniform(size=key_shape).astype(tvm_dtype)) + value = tvm.nd.array(np.random.uniform(size=value_shape).astype(tvm_dtype)) + + module.set_input("query", query) + module.set_input("key", key) + module.set_input("value", value) + # warmup execution + for i in range(10): + module.run() + tvm_output = module.get_output( + 0, tvm.nd.empty(output_shape, dtype=tvm_dtype)).numpy() + + num = 10 # number of times we run module for a single measurement + rep = 3 # number of measurements (we derive std dev from this) + timer = module.module.time_evaluator("run", dev, number=num, repeat=rep) + + tcost = timer() + start = perf_counter() + + module.run() + output_shape = output_shape + + tvm_output = module.get_output(0, + tvm.nd.empty(output_shape, + dtype=tvm_dtype)).numpy() + + mean = tcost.mean * 1000 + print("TVM Average per sample inference time: %.2fms" % (mean)) + return mean + + +def onnx_runtime(path, batch_size, query_len, kv_len, query_size, value_size): + ort_session = ort.InferenceSession(path) + + # warmup + for i in range(5): + outputs = ort_session.run( + None, { + "query": + np.random.randn(batch_size, query_len, query_size).astype( + np.float32), + "key": + np.random.randn(batch_size, kv_len, query_size).astype( + np.float32), + "value": + np.random.randn(batch_size, kv_len, value_size).astype( + np.float32) + }) + torch.cuda.synchronize() + t0 = time.time() + for i in range(10): + outputs = ort_session.run( + None, { + "query": + np.random.randn(batch_size, 
query_len, query_size).astype( + np.float32), + "key": + np.random.randn(batch_size, kv_len, query_size).astype( + np.float32), + "value": + np.random.randn(batch_size, kv_len, value_size).astype( + np.float32) + }) + torch.cuda.synchronize() + t1 = time.time() + return round(((t1 - t0) / 10.0) * 1000, 3) + + +def test_run(len): + batch_size = 32 + num_heads = 16 + query_len = len + kv_len = len + query_size = 512 + value_size = 512 + pt_dtype = torch.float32 + device = 'cuda' + onnx_path = "pt.onnx" + + query = torch.randn( + (batch_size, query_len, query_size), + device=device, + dtype=pt_dtype, + ) + key = torch.randn( + (batch_size, kv_len, query_size), + device=device, + dtype=pt_dtype, + ) + value = torch.randn( + (batch_size, kv_len, value_size), + device=device, + dtype=pt_dtype, + ) + + model = pt_attn.MutilHeadAttention(num_heads=num_heads, d_model=query_size) + model.eval() + + torch.onnx.export( + model, + (query, key, value), + onnx_path, + opset_version=13, + do_constant_folding=True, + input_names=['query', 'key', 'value'], + output_names=['output'], + ) + + traced_module = torch.jit.trace(model.to(device), + [query, key, value]).eval() + + query_shape = (batch_size, query_len, query_size) + key_shape = (batch_size, kv_len, query_size) + value_shape = (batch_size, kv_len, value_size) + output_shape = (batch_size, query_len, value_size) + + input_infos = [("query", (query_shape, "float32")), + ("key", (key_shape, "float32")), ("value", (value_shape, + "float32"))] + + shape_dict = {"query": query_shape, "key": key_shape, "value": value_shape} + onnx_model = onnx.load(onnx_path) + pt_model_time = load_pt_trace_model_to_tvm(traced_module, input_infos, + device, query_shape, key_shape, + value_shape, output_shape) + onnx_model_time = load_onnx_model_to_tvm(onnx_model, shape_dict, device, + query_shape, key_shape, + value_shape, output_shape) + onnx_model_runtime = onnx_runtime(onnx_path, batch_size, query_len, kv_len, + query_size, value_size) + return format(pt_model_time, ".3f"), format(onnx_model_time, + ".3f"), format( + onnx_model_runtime, ".3f") + + +if __name__ == "__main__": + with open('figures/tvm_data.tsv', 'w') as f: + tsv_w = csv.writer(f, delimiter='\t') + tsv_w.writerow(["model name", 'query len', "total time(ms)"]) + for len in [128, 256, 384, 512, 640, 768, 896]: + pt_model_time, onnx_model_time, onnx_model_runtime = test_run(len) + tsv_w.writerow(["pt_model_time", len, pt_model_time]) + tsv_w.writerow(["onnx_model_time", len, onnx_model_time]) + tsv_w.writerow(["onnx_model_runtime", len, onnx_model_runtime]) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/tf_model/__init__.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/tf_model/__init__.py new file mode 100644 index 000000000..f8a5c448e --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/tf_model/__init__.py @@ -0,0 +1 @@ +from . 
import tf_attn diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/tf_model/tf_attn.py b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/tf_model/tf_attn.py new file mode 100644 index 000000000..19a2d6fb6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/baseline/tf_model/tf_attn.py @@ -0,0 +1,37 @@ +import tensorflow as tf +import time + +__all__ = [ + 'MultiHeadAttention', +] + + +class MutilHeadAttention(tf.keras.layers.Layer): + def __init__(self, num_heads: int, d_model: int): + super(MutilHeadAttention, self).__init__() + self.num_heads = num_heads + self.wq = tf.keras.layers.Dense(d_model) + self.wk = tf.keras.layers.Dense(d_model) + self.wv = tf.keras.layers.Dense(d_model) + self.xo = tf.keras.layers.Dense(d_model) + + def call(self, value: tf.Tensor, key: tf.Tensor, query: tf.Tensor): + query = self.wq(query) + key = self.wk(key) + value = self.wv(value) + query = tf.reshape( + query, [query.shape[0], query.shape[1], self.num_heads, -1]) + key = tf.reshape(key, [key.shape[0], key.shape[1], self.num_heads, -1]) + value = tf.reshape( + value, [value.shape[0], value.shape[1], self.num_heads, -1]) + + query = tf.transpose(query, perm=[0, 2, 1, 3]) + key = tf.transpose(key, perm=[0, 2, 3, 1]) + value = tf.transpose(value, perm=[0, 2, 1, 3]) + scores = tf.matmul(query, key) + attn = tf.nn.softmax(scores, axis=-1) + out = tf.matmul(attn, value) + out = tf.transpose(out, perm=[0, 2, 1, 3]) + out = tf.reshape(out, [out.shape[0], out.shape[1], -1]) + out = self.xo(out) + return out diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/CMakeLists.txt b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/CMakeLists.txt new file mode 100644 index 000000000..bb134fb24 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/CMakeLists.txt @@ -0,0 +1,59 @@ +cmake_minimum_required(VERSION 3.18) +project(flash_attention CXX C CUDA) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake") +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/../../../cmake/Modules/") + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(CUDA QUIET REQUIRED) +find_package(CuDNN QUIET REQUIRED) + +cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") +message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +if(CUTLASS_NATIVE_CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) +else() + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) +endif() + +set(CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS} -w + ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS_DEBUG} + -w ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS} + ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") +set(CMAKE_CXX_FLAGS_DEBUG + "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 
+set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall + -Wno-sign-compare") +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") + +include_directories( + "../../../build/third_party/cutlass/src/extern_cutlass/include") +include_directories( + "../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_DIRS}) + +include_directories("../../../") + +cuda_add_executable(main main.cu) diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/Makefile b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/Makefile new file mode 100644 index 000000000..895f0625b --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/Makefile @@ -0,0 +1,12 @@ +BENCH_NAME ?= mha_attention +BUILD_DIR := build + +.PHONY: build clean + +build: + @mkdir -p build && cd build && cmake .. && make -j12 + +$(BUILD_DIR)/$(BENCH_NAME): build + +clean: + @rm -rf build diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/ETDG.png b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/ETDG.png new file mode 100644 index 000000000..70dd2a087 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/ETDG.png differ diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/flash_attention_with_parallel_operators.png b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/flash_attention_with_parallel_operators.png new file mode 100644 index 000000000..1cc69d787 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/flash_attention_with_parallel_operators.png differ diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/fused_etdg.png b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/fused_etdg.png new file mode 100644 index 000000000..a3ef27b78 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/fused_etdg.png differ diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/map_to_cuda.png b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/map_to_cuda.png new file mode 100644 index 000000000..1bc34d17c Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/figures/map_to_cuda.png differ diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/flash_attn.cu b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/flash_attn.cu new file mode 100644 index 000000000..36132622f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/flash_attn.cu @@ -0,0 +1,352 @@ +#include "util/cuda_timer.hpp" +#include "utils/util.hpp" + +template +__global__ void KeFlashAttention(const InType* dA, const InType* dB, + const InType* dC, InType* dD, int kM, int kN, + int kK, int kP, int kTM, int kTN, int kTK, + int kTP) { + // Advance to the global data tile to the current CTA. 
+ const InType* A = dA + blockIdx.z * (kM * kK) + blockIdx.x * (kTM * kK); + const InType* B = dB + blockIdx.z * (kK * kN); + const InType* gC_ptr = + dC + blockIdx.z * (kN * kP) + blockIdx.y * (kTP * kN); + + InType* gD_ptr = dD + blockIdx.z * (kM * kP) + blockIdx.x * (kTM * kP) + + (blockIdx.y * kTP); + + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* shm = reinterpret_cast(shared_buf); + + InType* sA_ptr = shm; + InType* sB_ptr = shm + SharedA::kNumel; + InType* sC_ptr = shm + SharedA::kNumel + SharedB::kNumel; + + GIteratorA gAs(A); + SharedA sA(sA_ptr); + RegA rA; + + SharedALoader load_sa; + RegALoader load_ra; + + GIteratorB gBs(B); + SharedB sB(sB_ptr); + RegB rB; + + SharedBLoader load_sb; + RegBLoader load_rb; + + GIteratorC gCs(gC_ptr); + SharedC sC(sC_ptr); + + SharedCLoader load_sc; + RegCLoader load_rc; + RegC rC; + + // RegD rD_f32; + RegD unnormized_attn_block_f32; + + RegDCast rD; + RegDCast unnormized_attn_block; + + RegAcc attn_block_f32; + RegAccCast attn_block; + + RegVec prev_norm_vec; + RegVec cur_norm_vec; + + RegVec prev_max_vec; + RegVec cur_max_vec; + RegVec new_max_vec; + + RegVec prev_sum_vec; + RegVec cur_sum_vec; + RegVec new_sum_vec; + + RowMax row_max; + CopyVec copy_vec; + + BroadcastSub broadcast_sub; + BroadcastMul broadcast_mul; + + BlockExp block_exp; + BlockAdd block_add; + + VecMax vec_max; + VecAdd vec_add; + VecSub vec_sub; + VecMul vec_mul; + VecExp vec_exp; + + ConvertAcc cast_acc; // Convert acc to half precision + ConvertO cast_o; // Convert half precision to float. + + for (int n = 0; n < GIteratorC::sc0; ++n) { + load_sc(gCs(n), sC); + + for (int k = 0; k < GIteratorA::sc1; ++k) { + load_sa(gAs(k), sA); + load_sb(gBs(k, n), sB); + __copy_async(); + __syncthreads(); + + load_ra(sA, rA); + load_rb(sB, rB); + __syncthreads(); + + compute::gemm_(rA, rB, attn_block_f32); + } + load_rc(sC, rC); + __syncthreads(); + + cast_acc(attn_block_f32, attn_block); + + // Copy `cur_max_vec`, `cur_norm_vec` into `prev_max_vec`, + // `prev_norm_vec`. + copy_vec(cur_max_vec, prev_max_vec); + copy_vec(cur_norm_vec, prev_norm_vec); + + // Compute row max. + row_max(attn_block, cur_max_vec); + + // Broadcast subtract from `attn_block`. + broadcast_sub(cur_max_vec, attn_block); + + // Compute exp in `attn_block`. + block_exp(attn_block, attn_block); + + // Compute new max vector. + vec_max(cur_max_vec, prev_max_vec, new_max_vec); + + // Renormalization for the previous block. + vec_sub(prev_max_vec, new_max_vec, prev_norm_vec); + vec_exp(prev_norm_vec, prev_norm_vec); + + // Renormalization for the current block. + vec_sub(cur_max_vec, new_max_vec, cur_norm_vec); + vec_exp(cur_norm_vec, cur_norm_vec); + + // Update normalization factor l(x) + vec_mul(prev_norm_vec, prev_sum_vec, prev_sum_vec); + vec_mul(cur_norm_vec, cur_sum_vec, cur_sum_vec); + vec_add(prev_sum_vec, cur_sum_vec, new_sum_vec); + + // Compute unnormized attention block. + compute::gemm_(attn_block, rC, unnormized_attn_block_f32); + + __syncthreads(); + + cast_o(unnormized_attn_block_f32, unnormized_attn_block); + + vec_mul(prev_sum_vec, prev_norm_vec, prev_norm_vec); + broadcast_mul(prev_norm_vec, rD); + + broadcast_mul(cur_norm_vec, unnormized_attn_block); + + block_add(rD, unnormized_attn_block, rD); + + // Cear the accumulator. + attn_block_f32.clear(); + + // Update max vector and sum vector. 
+ copy_vec(new_max_vec, prev_max_vec); + copy_vec(new_sum_vec, prev_sum_vec); + } + __syncthreads(); + + GlobalD gD(gD_ptr); + DStorer storer_d; // Store D tile from register to global. + storer_d(rD, gD); +} + +template +void run(bool check = true) { + using InType = __half; + using AccType = float; + + static constexpr int kM = dim_size<0, WholeShape>; + static constexpr int kN = dim_size<1, WholeShape>; + static constexpr int kK = dim_size<2, WholeShape>; + static constexpr int kP = dim_size<3, WholeShape>; + + static constexpr int kTM = dim_size<0, CtaTileShape>; + static constexpr int kTN = dim_size<1, CtaTileShape>; + static constexpr int kTK = dim_size<2, CtaTileShape>; + static constexpr int kTP = dim_size<3, CtaTileShape>; + + static_assert(kK == kTK, + "The current implementation requires kTK == K for now."); + static_assert(kP == kTP, + "The current implementation requires kTP == P for now."); + + // initialize data + thrust::host_vector h_a(kM * kK * kBatch); + + for (int i = 0; i < h_a.size(); ++i) + h_a[i] = static_cast(rand_float()); + + thrust::host_vector h_b(kK * kN * kBatch); + for (int i = 0; i < h_b.size(); ++i) + h_b[i] = static_cast(rand_float()); + + thrust::host_vector h_c(kN * kP * kBatch); + for (int i = 0; i < h_c.size(); ++i) + h_c[i] = static_cast(rand_float()); + + thrust::host_vector h_d(kM * kP * kBatch); + thrust::fill(h_d.begin(), h_d.end(), 0.); + + thrust::device_vector d_a = h_a; + thrust::device_vector d_b = h_b; + thrust::device_vector d_c = h_c; + thrust::device_vector d_d = h_d; + + const InType* A = thrust::raw_pointer_cast(d_a.data()); + const InType* B = thrust::raw_pointer_cast(d_b.data()); + const InType* C = thrust::raw_pointer_cast(d_c.data()); + InType* D = thrust::raw_pointer_cast(d_d.data()); + + using Config = + FlashAttentionTraits; + + using RegA = typename Config::RegA; + using RegB = typename Config::RegB; + using RegC = typename Config::RegC; + using RegD = typename Config::RegD; + using RegDCast = typename Config::RegDCast; + using RegAcc = typename Config::RegAcc; + using RegAccCast = typename Config::RegAccCast; + + using GIteratorA = typename Config::GIteratorA; + using SharedA = typename Config::SharedA; + using SharedALoader = typename Config::SharedALoader; + using RegALoader = typename Config::RegALoader; + + using GIteratorB = typename Config::GIteratorB; + using SharedB = typename Config::SharedB; + using SharedBLoader = typename Config::SharedBLoader; + using RegBLoader = typename Config::RegBLoader; + + using GIteratorC = typename Config::GIteratorC; + using SharedC = typename Config::SharedC; + using SharedCLoader = typename Config::SharedCLoader; + using RegCLoader = typename Config::RegCLoader; + + using DStorer = typename Config::DStorer; + + using ConvertAcc = typename Config::ConvertHalf; + using ConvertO = typename Config::ConvertO; + + using RegVec = typename Config::RegVec; + + using CopyVec = typename Config::CopyVec; + using RowMax = typename Config::RowMax; + + using BroadcastSub = typename Config::BroadcastSub; + using BroadcastMul = typename Config::BroadcastMul; + using BroadcastDiv = typename Config::BroadcastDiv; + + using BlockExp = typename Config::BlockExp; + using BlockAdd = typename Config::BlockAdd; + + using VecMax = typename Config::VecMax; + using VecAdd = typename Config::VecAdd; + using VecSub = typename Config::VecSub; + using VecMul = typename Config::VecMul; + using VecExp = typename Config::VecExp; + + int block_x = CeilDiv; + int block_y = CeilDiv; + int block_z = kBatch; + + dim3 
grid(block_x, block_y, block_z); + dim3 block(Config::kThreads, 1, 1); + + int shm_input = (kTM * kTK + kTK * kTN + kTN * kTP); + int shm_output = kTM * kTP; + int shm_size = shm_input < shm_output ? shm_output * sizeof(InType) + : shm_input * sizeof(InType); + + auto kernel = + &KeFlashAttention; + + if (shm_size > 48 * 1024) { + cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + } + + kernel<<>>(A, B, C, D, kM, kN, kK, kP, kTM, kTN, + kTK, kTP); +} + +int main() { + // hidden 128 + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + run, + FlashAttentionShape<64, 128, 128, 128>, 64>(); + + // hidden 256 + run, + FlashAttentionShape<64, 128, 256, 256>, 64>(); + + run, + FlashAttentionShape<64, 128, 256, 256>, 64>(); + + run, + FlashAttentionShape<64, 128, 256, 256>, 64>(); + + run, + FlashAttentionShape<64, 128, 256, 256>, 64>(); + + run, + FlashAttentionShape<64, 128, 256, 256>, 64>(); + + run, + FlashAttentionShape<64, 64, 256, 256>, 64>(); + + run, + FlashAttentionShape<64, 64, 256, 256>, 64>(); + + return 0; +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/kernel.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/kernel.h new file mode 100644 index 000000000..871dd53c3 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/kernel.h @@ -0,0 +1,316 @@ +#pragma once +#include "kaleido/core/device/kernels/tiled_copy.h" +#include "utils/elementwise.h" +#include "utils/gmem_copy.h" +#include "utils/matmul.h" +#include "utils/misc.h" +#include "utils/reduce.h" + +using namespace kaleido::core::cuda_kernel; +using namespace cute; + +template +__global__ void __launch_bounds__(Nthreads) + flashattn(InType* dQ, InType* dK, InType* dV, InType* dO, int length_k, + int length_q) { + constexpr float softmax_scale = 1.250000e-01f; + + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + InType* shared = reinterpret_cast(shared_buf); + InType* shared_k = shared + num_stages_qk * kTileSizeRow * BlockKSmem; + InType* shared_v = shared_k + kTileSizeCol * BlockKSmem * num_stages_qk; + + int Tr = length_q / kTileSizeRow; + int len = length_k; + int iters = (len + kTileSizeCol - 1) / kTileSizeCol; + + int q_offset = (int)blockIdx.x * Kd * kTileSizeRow; + int k_offset = ((int)blockIdx.x / Tr) * Kd * length_k; + int v_offset = ((int)blockIdx.x / Tr) * D * length_k; + int o_offset = (int)blockIdx.x * D * kTileSizeRow; + + InType* q_ptr = dQ + q_offset; + InType* k_ptr = dK + k_offset; + + typename Traits::TiledMma tiled_mma; + typename Traits::CopyG2S g2s_copy; + auto g2s_thrd_copy = g2s_copy.get_thread_slice(threadIdx.x); + + Tensor gQ = make_tensor(make_gmem_ptr(q_ptr), + Shape, Int>{}, + make_stride(Int{}, _1{})); + Tensor gQs = g2s_thrd_copy.partition_S(gQ); + + Tensor sQ = + make_tensor(make_smem_ptr(shared), typename Traits::SmemLayoutQ{}); + Tensor sQs = g2s_thrd_copy.partition_D(sQ); + + Tensor gK = make_tensor(make_gmem_ptr(k_ptr), + Shape, Int>{}, + make_stride(Int{}, _1{})); + Tensor gKs = g2s_thrd_copy.partition_S(gK); + + Tensor sK = make_tensor(shared_k, typename Traits::SmemLayoutK{}); + Tensor sKs = g2s_thrd_copy.partition_D(sK); + + CopyTilesG2S 
g2s_qk(g2s_copy, gQs, sQs, gKs, sKs, BlockKSmem, size(sQ), + BlockKSmem, size(sK), num_stages_qk); + + CopyTileG2S g2s_k(g2s_copy, gKs, sKs, BlockKSmem, size(sK), num_stages_qk); + + Tensor gV = make_tensor(make_gmem_ptr(dV + v_offset), + Shape, Int>{}, + make_stride(Int{}, _1{})); + Tensor gVs = g2s_thrd_copy.partition_S(gV); + + Tensor sV = make_tensor(shared_v, typename Traits::SmemLayoutV{}); + Tensor sVs = g2s_thrd_copy.partition_D(sV); + + CopyTileG2S g2s_v(g2s_copy, gVs, sVs, BlockKSmem2 * D, size(sV), + num_stages_v); + + auto thr_mma = tiled_mma.get_thread_slice(threadIdx.x); + Tensor rQ_org = thr_mma.partition_fragment_A(sQ); + Tensor rK_org = thr_mma.partition_fragment_B(sK); + + Tensor acc_o = + partition_fragment_C(tiled_mma, Shape, Int>{}); + Tensor acc_s = partition_fragment_C( + tiled_mma, Shape, Int>{}); + + auto s2r_copyQ = + make_tiled_copy_A(typename Traits::SmemCopyAtom{}, tiled_mma); + auto s2r_thr_copyQ = s2r_copyQ.get_thread_slice(threadIdx.x); + + Tensor sQs_copy = s2r_thr_copyQ.partition_S(sQ); + auto smem_tiled_copy_K = + make_tiled_copy_B(typename Traits::SmemCopyAtom{}, tiled_mma); + auto smem_thr_copy_K = smem_tiled_copy_K.get_thread_slice(threadIdx.x); + + Tensor sKs_copy = smem_thr_copy_K.partition_S(sK); + Tensor sVt = + make_tensor(sV.data(), typename Traits::SmemLayoutVtransposed{}); + Tensor sVtNoSwizzle = make_tensor( + sVt.data(), typename Traits::SmemLayoutVtransposedNoSwizzle{}); + + Tensor rQ = s2r_thr_copyQ.retile_D(rQ_org); + Tensor rK = smem_thr_copy_K.retile_D(rK_org); + + auto s2r_copyV = + make_tiled_copy_B(typename Traits::SmemCopyAtomTransposed{}, tiled_mma); + auto thr_copy_rV = s2r_copyV.get_thread_slice(threadIdx.x); + + Tensor sVst_copy = thr_copy_rV.partition_S(sVt); + Tensor rVt = thr_mma.partition_fragment_B(sVtNoSwizzle); + Tensor rVt_copy_view = thr_copy_rV.retile_D(rVt); + + CopyTilesS2R qk_s2r(s2r_copyQ, sQs_copy, rQ, smem_tiled_copy_K, sKs_copy, + rK, tiled_mma, rQ_org, rK_org, acc_s, size(sQ), + size(sK), num_stages_qk); + + CopyTileS2R v_s2r(s2r_copyV, sVst_copy, rVt_copy_view, tiled_mma, rVt, + acc_o, size(sV), num_stages_v); + + g2s_qk.prologue(); + + Tensor m_new = make_tensor(Shape(acc_s)>>{}); + Tensor lse_new = make_fragment_like(m_new); + fill(lse_new, 0.0f); + fill(m_new, -INFINITY); + clear(acc_o); + + for (int i = 0; i < iters - (unrollLastIter ? 
1 : 0); ++i) { + clear(acc_s); + for (int ax0 = 0; ax0 < Kd / BlockKSmem - 1; ++ax0) { + cp_async_wait_flash<0>(); + __syncthreads(); + g2s_qk.body(); + qk_s2r.body(); + } + cp_async_wait_flash<0>(); + __syncthreads(); + g2s_v.prologue(); + qk_s2r.epilogue(); + + auto scores = + make_tensor(acc_s.data(), convert_layout_scores(acc_s.layout())); + + Tensor m_old = make_fragment_like(m_new); + copy(m_new, m_old); + + Tensor scores_max = make_fragment_like(m_new); + reduce_max<4, true>(scores, scores_max); + + for (int ax0 = 0; ax0 < size<0>(m_new); ++ax0) + m_new(ax0) = max(m_new(ax0), scores_max(ax0)); + + auto acc_o_rowcol = + make_tensor(acc_o.data(), convert_layout_scores(acc_o.layout())); + + for (int ax0 = 0; ax0 < size<0>(acc_o_rowcol); ++ax0) { + float scale = exp((m_old(ax0) - m_new(ax0)) * softmax_scale); + lse_new(ax0) = lse_new(ax0) * scale; + for (int ax1 = 0; ax1 < size<1>(acc_o_rowcol); ax1++) { + acc_o_rowcol(ax0, ax1) *= scale; + } + } + + for (int ax0 = 0; ax0 < size<0>(scores); ++ax0) { + float m_scaled = m_new(ax0) * softmax_scale; + for (int ax1 = 0; ax1 < size<1>(scores); ax1++) { + scores(ax0, ax1) = + exp(scores(ax0, ax1) * softmax_scale - m_scaled); + } + } + + Tensor scores_sum = make_fragment_like(lse_new); + reduce_sum<4>(scores, scores_sum); + + for (int ax0 = 0; ax0 < size<0>(lse_new); ++ax0) { + lse_new(ax0) = lse_new(ax0) + scores_sum(ax0); + } + + auto frag = convert_type(scores); + + Tensor rP = make_tensor(make_rmem_ptr(&frag), scores.layout()); + Tensor rP_Aregs = + make_tensor(rP.data(), convert_layout_rowcol_Aregs(rP.layout())); + + for (int ax0 = 0; ax0 < kTileSizeCol / BlockKSmem2 - 1; ++ax0) { + cp_async_wait_flash<0>(); + __syncthreads(); + g2s_v.body(); + v_s2r.body(rP_Aregs); + } + + cp_async_wait_flash<0>(); + __syncthreads(); + if (i < iters - 1) { + gKs.data() = gKs.data() + (-Kd) + kTileSizeCol * Kd; + if (load_q_once) { + g2s_k.prologue(); + } else { + gQs.data() = gQs.data() + (-Kd); + g2s_qk.prologue(); + } + } + v_s2r.epilogue(rP_Aregs); + } + + if (unrollLastIter) { + clear(acc_s); + for (int ax0 = 0; ax0 < Kd / BlockKSmem - 1; ++ax0) { + cp_async_wait_flash<0>(); + __syncthreads(); + g2s_qk.body(); + qk_s2r.body(); + } + + cp_async_wait_flash<0>(); + __syncthreads(); + g2s_v.prologue(); + qk_s2r.epilogue(); + + Tensor scores = + make_tensor(acc_s.data(), convert_layout_scores(acc_s.layout())); + + Tensor m_old = make_fragment_like(m_new); + copy(m_new, m_old); + + Tensor scores_max = make_fragment_like(m_new); + reduce_max<4, true>(scores, scores_max); + + for (int ax0 = 0; ax0 < size<0>(m_new); ++ax0) { + m_new(ax0) = max(m_new(ax0), scores_max(ax0)); + } + + Tensor acc_o_rowcol = + make_tensor(acc_o.data(), convert_layout_scores(acc_o.layout())); + + for (int ax0 = 0; ax0 < size<0>(acc_o_rowcol); ++ax0) { + float scale = exp((m_old(ax0) - m_new(ax0)) * softmax_scale); + lse_new(ax0) = lse_new(ax0) * scale; + for (int ax1 = 0; ax1 < size<1>(acc_o_rowcol); ++ax1) { + acc_o_rowcol(ax0, ax1) *= scale; + } + } + + for (int ax0 = 0; ax0 < size<0>(scores); ++ax0) { + float m_scaled = m_new(ax0) * softmax_scale; + for (int ax1 = 0; ax1 < size<1>(scores); ++ax1) { + scores(ax0, ax1) = + exp(scores(ax0, ax1) * softmax_scale - m_scaled); + } + } + + Tensor scores_sum = make_fragment_like(lse_new); + reduce_sum<4>(scores, scores_sum); + for (int ax0 = 0; ax0 < size<0>(lse_new); ++ax0) { + lse_new(ax0) = lse_new(ax0) + scores_sum(ax0); + } + + auto frag = convert_type(scores); + + Tensor rP = make_tensor(make_rmem_ptr(&frag), scores.layout()); + 
Tensor rP_Aregs = + make_tensor(rP.data(), convert_layout_rowcol_Aregs(rP.layout())); + + for (int ax0 = 0; ax0 < kTileSizeCol / BlockKSmem2 - 1; ++ax0) { + cp_async_wait_flash<0>(); + __syncthreads(); + g2s_v.body(); + v_s2r.body(rP_Aregs); + } + + cp_async_wait_flash<0>(); + __syncthreads(); + v_s2r.epilogue(rP_Aregs); + } + + Tensor acc_o_rowcol = + make_tensor(acc_o.data(), convert_layout_scores(acc_o.layout())); + for (int ax0 = 0; ax0 < size<0>(acc_o_rowcol); ++ax0) { + float scale = 1 / lse_new(ax0); + lse_new(ax0) = m_new(ax0) * softmax_scale + log(lse_new(ax0)); + for (int ax1 = 0; ax1 < size<1>(acc_o_rowcol); ++ax1) { + acc_o_rowcol(ax0, ax1) *= scale; + } + } + + auto frag2 = convert_type(acc_o); + Tensor acc_o_f16 = + make_tensor(make_rmem_ptr(&frag2), acc_o.layout()); + + Tensor sO = make_tensor(make_smem_ptr((InType*)(shared)), + typename Traits::SmemLayoutO{}); + auto smem_tiled_copy_O = + make_tiled_copy_C(typename Traits::SmemCopyAtomO{}, tiled_mma); + + auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(threadIdx.x); + Tensor sOs = smem_thr_copy_O.partition_D(sO); + Tensor rO_copy_view = smem_thr_copy_O.retile_S(acc_o_f16); + + Tensor gO = make_tensor(make_gmem_ptr(dO + o_offset), + Shape, Int>{}, + make_stride(Int{}, _1{})); + + typename Traits::GmemTiledCopyO gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(threadIdx.x); + + Tensor gO_partition = gmem_thr_copy_O.partition_D(gO); + Tensor sO_partition = gmem_thr_copy_O.partition_S(sO); + __syncthreads(); + copy(smem_tiled_copy_O, rO_copy_view, sOs); + __syncthreads(); + + for (int m = 0; m < size<1>(gO_partition); ++m) { + for (int k = 0; k < size<2>(gO_partition); ++k) { + copy(gmem_tiled_copy_O, sO_partition(_, m, k), + gO_partition(_, m, k)); + } + } +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/main.cu b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/main.cu new file mode 100644 index 000000000..2612d0361 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/main.cu @@ -0,0 +1,123 @@ +#include "kaleido/core/device/cuda_timer.h" +#include "kernel.h" +#include "utils/kernel_traits.h" + +#include +#include + +namespace { +float rand_float(float a = 1e-1, float b = 5e-2) { + float random = ((float)rand()) / (float)RAND_MAX; + float diff = b - a; + float r = random * diff; + return a + r; +} +} // namespace + +template +void run_test() { + using InType = cutlass::half_t; + using AccType = float; + + thrust::host_vector h_q(batch * head * length * hidden_qk); + for (int i = 0; i < h_q.size(); ++i) + h_q[i] = static_cast(rand_float()); + + thrust::host_vector h_k(batch * head * length * hidden_qk); + for (int i = 0; i < h_k.size(); ++i) + h_k[i] = static_cast(rand_float()); + + thrust::host_vector h_v(batch * head * length * hidden_v); + for (int i = 0; i < h_v.size(); ++i) + h_v[i] = static_cast(rand_float()); + + thrust::host_vector h_o(batch * head * length * hidden_v); + for (int i = 0; i < h_o.size(); ++i) + h_o[i] = static_cast(rand_float()); + + thrust::device_vector d_q = h_q; + thrust::device_vector d_k = h_k; + thrust::device_vector d_v = h_v; + thrust::device_vector d_o = h_o; + + const int kDimK = hidden_qk; + const int kDimV = hidden_v; + + const bool load_q_once = (kBlockKSmem == kDimK); + constexpr const int SmemKAtom = 64; + constexpr const int kSwizzle = SmemKAtom == 32 ? 
2 : 3; + + constexpr int shared_in = + stages_qk * kTileSizeRow * kBlockKSmem * sizeof(InType) + + stages_qk * kTileSizeCol * kBlockKSmem * sizeof(InType) + + stages_v * kBlockKSmem * kDimV * sizeof(InType); + + constexpr int shared_out = kTileSizeRow * kDimV * sizeof(InType); + constexpr int shared_mem = shared_in > shared_out ? shared_in : shared_out; + + using Traits = KeTraits; + + auto kernel = + &flashattn; + + if (shared_mem > 48 * 1024) + cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem); + + int warm_up = 10; + for (int i = 0; i < warm_up; ++i) { + kernel<<>>( + thrust::raw_pointer_cast(d_q.data()), + thrust::raw_pointer_cast(d_k.data()), + thrust::raw_pointer_cast(d_v.data()), + thrust::raw_pointer_cast(d_o.data()), length, length); + } + + int iter = 20; + kaleido::core::CudaTimer timer; + timer.Start(); + for (int i = 0; i < iter; ++i) { + kernel<<>>( + thrust::raw_pointer_cast(d_q.data()), + thrust::raw_pointer_cast(d_k.data()), + thrust::raw_pointer_cast(d_v.data()), + thrust::raw_pointer_cast(d_o.data()), length, length); + } + cudaDeviceSynchronize(); + float time = timer.Stop() / iter; + + std::cout << length << "\t" << hidden_qk << "\t" << time << std::endl; +} + +int main() { + // length, hidden_q, hidden_v, batch, head, + run_test<128, 128, 128, 32, 8>(); + run_test<256, 128, 128, 32, 8>(); + run_test<512, 128, 128, 32, 8>(); + run_test<768, 128, 128, 32, 8>(); + run_test<1024, 128, 128, 32, 8>(); + run_test<1536, 128, 128, 32, 8>(); + run_test<2048, 128, 128, 32, 8>(); + run_test<4096, 128, 128, 32, 8>(); + + run_test<128, 256, 256, 32, 8>(); + run_test<256, 256, 256, 32, 8>(); + run_test<512, 256, 256, 32, 8>(); + run_test<768, 256, 256, 32, 8>(); + run_test<1024, 256, 256, 32, 8>(); + run_test<1536, 256, 256, 32, 8>(); + run_test<2048, 256, 256, 32, 8>(); + run_test<4096, 256, 256, 32, 8>(); + + return 0; +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/elementwise.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/elementwise.h new file mode 100644 index 000000000..15829066e --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/elementwise.h @@ -0,0 +1,51 @@ +#pragma once +#include "reduce.h" + +#include +#include + +using namespace cute; + +template +__device__ inline void multiply_mask(SmemTiledCopy1 smem_tiled_copy_mask, + STensor1& sMask_copypartition, + RTensor1& rMask_copy_view, + Tensor0& acc_s_fragment, Tensor1& rMask) { + using namespace cute; + cute::copy(smem_tiled_copy_mask, sMask_copypartition(_, _, _0{}), + rMask_copy_view(_, _, _0{})); +#pragma unroll + for (int ax0 = 0; ax0 < size<2>(acc_s_fragment); ax0++) { + if (ax0 < size<2>(acc_s_fragment) - 1) { + cute::copy(smem_tiled_copy_mask, sMask_copypartition(_, _, ax0 + 1), + rMask_copy_view(_, _, ax0 + 1)); + } +#pragma unroll + for (int ax1 = 0; ax1 < size<1>(acc_s_fragment); ax1++) { +#pragma unroll + for (int ax2 = 0; ax2 < size<0>(acc_s_fragment); ax2++) { + acc_s_fragment(ax2, ax1, ax0) = + acc_s_fragment(ax2, ax1, ax0) * + __half2float(rMask(ax2, ax1, ax0)); + } + } + } +} + +template +__device__ inline void update_r(Tensor0& r_new_fragment, + Tensor1& r_wo_clamp_fragment, Tensor2& scores) { + using namespace cute; + Tensor r_wo_clamp_fragment_tmp = make_fragment_like(r_wo_clamp_fragment); + reduce_sumabs(scores, r_wo_clamp_fragment_tmp); +#pragma unroll + for (int ax0 = 0; ax0 < size<0>(r_wo_clamp_fragment); ax0++) { + r_wo_clamp_fragment(ax0) 
+= r_wo_clamp_fragment_tmp(ax0); + } + +#pragma unroll + for (int ax0 = 0; ax0 < size<0>(r_wo_clamp_fragment); ax0++) { + r_new_fragment(ax0) = max(r_wo_clamp_fragment(ax0), 1.0f); + } +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/gmem_copy.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/gmem_copy.h new file mode 100644 index 000000000..aab6c0c9e --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/gmem_copy.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include + +using namespace cute; + +template +CUTE_HOST_DEVICE void cp_async_wait_flash() { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#endif +} + +template +__device__ void copy_Global2Reg(GmemTiledCopy gmem_tiled_copy_QKV, + GTensor1& gQ_partition, + RTensor1& rQ_partition) { + CUTE_STATIC_ASSERT_V(size<0>(gQ_partition) == size<0>(rQ_partition)); + CUTE_STATIC_ASSERT_V(size<1>(gQ_partition) == size<2>(rQ_partition)); +#pragma unroll + for (int i = 0; i < size<1>(gQ_partition); i++) { + cute::copy(gmem_tiled_copy_QKV, gQ_partition(_, i, _0{}), + rQ_partition(_, _0{}, i)); + } +} + +template +__device__ void copy_Reg2Global(GmemTiledCopy gmem_tiled_copy_QKV, + RTensor1& rQ_partition, + GTensor1& gQ_partition) { + CUTE_STATIC_ASSERT_V(size<0>(gQ_partition) == size<0>(rQ_partition)); + CUTE_STATIC_ASSERT_V(size<1>(gQ_partition) == size<2>(rQ_partition)); +#pragma unroll + for (int i = 0; i < size<1>(gQ_partition); i++) { + cute::copy(gmem_tiled_copy_QKV, rQ_partition(_, _0{}, i), + gQ_partition(_, i, _0{})); + } +} diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/kernel_traits.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/kernel_traits.h new file mode 100644 index 000000000..357cc1769 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/kernel_traits.h @@ -0,0 +1,84 @@ +#pragma once + +#include "kaleido/core/device/kernels/tiled_copy.h" + +template +struct KeTraits { + static_assert(kDimK % (BlockKSmem) == 0, "kDimK%(BlockKSmem)!=0"); + static_assert(kTileSizeCol % (BlockKSmem2) == 0, + "kTileSizeCol%(BlockKSmem2)!=0"); + + static_assert(BlockKSmem % SmemKAtom == 0, "BlockKSmem%SmemKAtom!=0"); + static_assert(BlockKSmem2 % (kThreads / (SmemKAtom / 8)) == 0, + "gmem load V fail"); + static_assert(kTileSizeRow % (kThreads / (SmemKAtom / 8)) == 0, + "gmem load Q fail"); + static_assert(kTileSizeCol % (kThreads / (SmemKAtom / 8)) == 0, + "gmem load K fail"); + + constexpr static int kWarps = kThreads / 32; + + using TiledMma = + TiledMMA, + Layout, _1, _1>>, Layout>>; + + using GmemCopyLayoutAtom = + Layout, Int>, + Stride, _1>>; + using CopyG2S = decltype(make_tiled_copy( + Copy_Atom, InType>{}, + GmemCopyLayoutAtom{}, Layout>{})); + + using SmemLayoutAtom = decltype(composition( + Swizzle{}, + Layout>, Stride, _1>>{})); + + using GmemLayoutQ = Layout, Int>, + Stride, _1>>; + + using GmemLayoutK = Layout, Int>, + Stride, _1>>; + + using SmemLayoutQ = decltype(tile_to_shape( + SmemLayoutAtom{}, Shape, Int>{})); + + using SmemLayoutK = decltype(tile_to_shape( + SmemLayoutAtom{}, Shape, Int>{})); + + using SmemCopyAtom = Copy_Atom; + + using SmemLayoutAtomVtransposedNoSwizzle = + Layout, Int>, + Stride<_1, Int>>; + + using SmemLayoutAtomVtransposed = decltype(composition( + Swizzle{}, SmemLayoutAtomVtransposedNoSwizzle{})); + + using SmemLayoutV = 
decltype(tile_to_shape( + SmemLayoutAtom{}, Shape, Int>{})); + + using SmemLayoutVtransposed = decltype(tile_to_shape( + SmemLayoutAtomVtransposed{}, Shape, Int>{})); + + using SmemLayoutVtransposedNoSwizzle = + decltype(tile_to_shape(SmemLayoutAtomVtransposedNoSwizzle{}, + Shape, Int>{})); + + using SmemCopyAtomTransposed = Copy_Atom; + + using SmemLayoutAtomO = decltype(composition( + Swizzle{}, + Layout, Int>, Stride, _1>>{})); + using SmemLayoutO = decltype(tile_to_shape( + SmemLayoutAtomO{}, Shape, Int>{})); + using SmemCopyAtomO = Copy_Atom; + + using GmemTiledCopyO = decltype(make_tiled_copy( + Copy_Atom{}, GmemCopyLayoutAtom{}, + Layout>{})); +}; diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/matmul.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/matmul.h new file mode 100644 index 000000000..d9c76a272 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/matmul.h @@ -0,0 +1,325 @@ +#pragma once +#include +#include +#include +#include +#include + +using namespace cute; + +template +class MatmulQK_s2r { + public: + __device__ MatmulQK_s2r(SmemTiledCopy1 smem_tiled_copy_Q, + STensor1& sQ_copypartition, RTensor1& rQ_copy_view, + SmemTiledCopy2 smem_tiled_copy_K, + STensor2& sK_copypartition, RTensor2& rK_copy_view, + TiledMMAType tiled_mma1, RTensor3& rQ, RTensor4& rK, + RTensor5& acc_s_fragment, int sQ_stride, + int sK_stride, int num_stage = 2) + : smem_tiled_copy_Q(smem_tiled_copy_Q), + sQ_copypartition(sQ_copypartition), + rQ_copy_view(rQ_copy_view), + smem_tiled_copy_K(smem_tiled_copy_K), + sK_copypartition(sK_copypartition), + rK_copy_view(rK_copy_view), + tiled_mma1(tiled_mma1), + rQ(rQ), + rK(rK), + acc_s_fragment(acc_s_fragment), + sQ_stride(sQ_stride), + sK_stride(sK_stride), + cur_iter(0), + num_stage(num_stage) {} + inline __device__ void prologue() { + cur_iter = 0; + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, _0{}), + rQ_copy_view(_, _, _0{})); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, _0{}), + rK_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rK); ++i) { + if (i < size<2>(rK) - 1) { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, i + 1), + rQ_copy_view(_, _, i + 1)); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, i + 1), + rK_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma1, rQ(_, _, i), rK(_, _, i), acc_s_fragment); + } + sQ_copypartition.data() = sQ_copypartition.data() + sQ_stride; + sK_copypartition.data() = sK_copypartition.data() + sK_stride; + cur_iter++; + } + inline __device__ void body() { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, _0{}), + rQ_copy_view(_, _, _0{})); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, _0{}), + rK_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rK); ++i) { + if (i < size<2>(rK) - 1) { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, i + 1), + rQ_copy_view(_, _, i + 1)); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, i + 1), + rK_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma1, rQ(_, _, i), rK(_, _, i), acc_s_fragment); + } + sQ_copypartition.data() = sQ_copypartition.data() + sQ_stride; + sK_copypartition.data() = sK_copypartition.data() + sK_stride; + if ((cur_iter + 1) % num_stage == 0) { + sQ_copypartition.data() = + sQ_copypartition.data() + (-sQ_stride * num_stage); + sK_copypartition.data() = + sK_copypartition.data() + (-sK_stride * num_stage); + } + cur_iter++; + } + inline __device__ 
void epilogue() { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, _0{}), + rQ_copy_view(_, _, _0{})); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, _0{}), + rK_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rK); ++i) { + if (i < size<2>(rK) - 1) { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, i + 1), + rQ_copy_view(_, _, i + 1)); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, i + 1), + rK_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma1, rQ(_, _, i), rK(_, _, i), acc_s_fragment); + } + sQ_copypartition.data() = sQ_copypartition.data() + sQ_stride; + sK_copypartition.data() = sK_copypartition.data() + sK_stride; + if ((cur_iter + 1) % num_stage == 0) { + sQ_copypartition.data() = + sQ_copypartition.data() + (-sQ_stride * num_stage); + sK_copypartition.data() = + sK_copypartition.data() + (-sK_stride * num_stage); + } + cur_iter++; + } + + private: + int cur_iter; + SmemTiledCopy1 smem_tiled_copy_Q; + STensor1& sQ_copypartition; + RTensor1& rQ_copy_view; + SmemTiledCopy2 smem_tiled_copy_K; + STensor2& sK_copypartition; + RTensor2& rK_copy_view; + TiledMMAType tiled_mma1; + RTensor3& rQ; + RTensor4& rK; + RTensor5& acc_s_fragment; + int sQ_stride, sK_stride; + int num_stage; +}; + +template +class CopyTilesS2R { + public: + __device__ CopyTilesS2R(SmemTiledCopy1 smem_tiled_copy_Q, + STensor1& sQ_copypartition, RTensor1& rQ_copy_view, + SmemTiledCopy2 smem_tiled_copy_K, + STensor2& sK_copypartition, RTensor2& rK_copy_view, + TiledMMAType tiled_mma1, RTensor3& rQ, RTensor4& rK, + RTensor5& acc_s_fragment, int sQ_stride, + int sK_stride, int num_stage = 2) + : smem_tiled_copy_Q(smem_tiled_copy_Q), + sQ_copypartition(sQ_copypartition), + rQ_copy_view(rQ_copy_view), + smem_tiled_copy_K(smem_tiled_copy_K), + sK_copypartition(sK_copypartition), + rK_copy_view(rK_copy_view), + tiled_mma1(tiled_mma1), + rQ(rQ), + rK(rK), + acc_s_fragment(acc_s_fragment), + sQ_stride(sQ_stride), + sK_stride(sK_stride), + cur_iter(0), + num_stage(num_stage), + cur_iter_sq(0) {} + inline __device__ void prologue() { + cur_iter = 0; + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, _0{}), + rQ_copy_view(_, _, _0{})); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, _0{}), + rK_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rK); ++i) { + if (i < size<2>(rK) - 1) { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, i + 1), + rQ_copy_view(_, _, i + 1)); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, i + 1), + rK_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma1, rQ(_, _, i), rK(_, _, i), acc_s_fragment); + } + sQ_copypartition.data() = sQ_copypartition.data() + sQ_stride; + sK_copypartition.data() = sK_copypartition.data() + sK_stride; + cur_iter++; + } + inline __device__ void body() { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, _0{}), + rQ_copy_view(_, _, _0{})); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, _0{}), + rK_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rK); ++i) { + if (i < size<2>(rK) - 1) { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, i + 1), + rQ_copy_view(_, _, i + 1)); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, i + 1), + rK_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma1, rQ(_, _, i), rK(_, _, i), acc_s_fragment); + } + sQ_copypartition.data() = sQ_copypartition.data() + sQ_stride; + sK_copypartition.data() = sK_copypartition.data() + sK_stride; + if ((cur_iter + 1) % num_stage == 0) { + sK_copypartition.data() = + 
sK_copypartition.data() + (-sK_stride * num_stage); + } + cur_iter++; + cur_iter_sq++; + } + inline __device__ void epilogue() { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, _0{}), + rQ_copy_view(_, _, _0{})); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, _0{}), + rK_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rK); ++i) { + if (i < size<2>(rK) - 1) { + cute::copy(smem_tiled_copy_Q, sQ_copypartition(_, _, i + 1), + rQ_copy_view(_, _, i + 1)); + cute::copy(smem_tiled_copy_K, sK_copypartition(_, _, i + 1), + rK_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma1, rQ(_, _, i), rK(_, _, i), acc_s_fragment); + } + sQ_copypartition.data() = + sQ_copypartition.data() + (-sQ_stride * cur_iter_sq); + sK_copypartition.data() = sK_copypartition.data() + sK_stride; + if ((cur_iter + 1) % num_stage == 0) { + sK_copypartition.data() = + sK_copypartition.data() + (-sK_stride * num_stage); + } + cur_iter++; + cur_iter_sq = 0; + } + + private: + int cur_iter, cur_iter_sq; + SmemTiledCopy1 smem_tiled_copy_Q; + STensor1& sQ_copypartition; + RTensor1& rQ_copy_view; + SmemTiledCopy2 smem_tiled_copy_K; + STensor2& sK_copypartition; + RTensor2& rK_copy_view; + TiledMMAType tiled_mma1; + RTensor3& rQ; + RTensor4& rK; + RTensor5& acc_s_fragment; + int sQ_stride, sK_stride; + int num_stage; +}; + +template +class CopyTileS2R { + public: + __device__ CopyTileS2R(SmemTiledCopy2 smem_tiled_copy_V, + STensor2& sV_copypartition, RTensor2& rV_copy_view, + TiledMMAType tiled_mma, RTensor4& rV, + RTensor5& acc_o_fragment, int sV_stride, + int num_stage = 2) + : smem_tiled_copy_V(smem_tiled_copy_V), + sV_copypartition(sV_copypartition), + rV_copy_view(rV_copy_view), + tiled_mma(tiled_mma), + rV(rV), + acc_o_fragment(acc_o_fragment), + sV_stride(sV_stride), + cur_iter(0), + cur_iter_sv(0), + num_stage(num_stage) {} + template + inline __device__ void prologue(RTensor3& rP) { + cur_iter = 0; + cute::copy(smem_tiled_copy_V, sV_copypartition(_, _, _0{}), + rV_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rV); ++i) { + if (i < size<2>(rV) - 1) { + cute::copy(smem_tiled_copy_V, sV_copypartition(_, _, i + 1), + rV_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma, rP(_, _, cur_iter * size<2>(rV) + i), + rV(_, _, i), acc_o_fragment); + } + sV_copypartition.data() = sV_copypartition.data() + sV_stride; + cur_iter++; + } + template + inline __device__ void body(RTensor3& rP) { + cute::copy(smem_tiled_copy_V, sV_copypartition(_, _, _0{}), + rV_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rV); ++i) { + if (i < size<2>(rV) - 1) { + cute::copy(smem_tiled_copy_V, sV_copypartition(_, _, i + 1), + rV_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma, rP(_, _, cur_iter_sv * size<2>(rV) + i), + rV(_, _, i), acc_o_fragment); + } + sV_copypartition.data() = sV_copypartition.data() + sV_stride; + if ((cur_iter + 1) % num_stage == 0) { + sV_copypartition.data() = + sV_copypartition.data() + (-sV_stride * num_stage); + } + cur_iter++; + cur_iter_sv++; + } + template + inline __device__ void epilogue(RTensor3& rP) { + cute::copy(smem_tiled_copy_V, sV_copypartition(_, _, _0{}), + rV_copy_view(_, _, _0{})); +#pragma unroll + for (int i = 0; i < size<2>(rV); ++i) { + if (i < size<2>(rV) - 1) { + cute::copy(smem_tiled_copy_V, sV_copypartition(_, _, i + 1), + rV_copy_view(_, _, i + 1)); + } + cute::gemm(tiled_mma, rP(_, _, cur_iter_sv * size<2>(rV) + i), + rV(_, _, i), acc_o_fragment); + } + sV_copypartition.data() = sV_copypartition.data() + sV_stride; + 
if ((cur_iter + 1) % num_stage == 0) { + sV_copypartition.data() = + sV_copypartition.data() + (-sV_stride * num_stage); + } + cur_iter++; + cur_iter_sv = 0; + } + + private: + int cur_iter, cur_iter_sv; + SmemTiledCopy2 smem_tiled_copy_V; + STensor2& sV_copypartition; + RTensor2& rV_copy_view; + TiledMMAType tiled_mma; + RTensor4& rV; + RTensor5& acc_o_fragment; + int sV_stride; + int num_stage; +}; diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/misc.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/misc.h new file mode 100644 index 000000000..648911dc6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/misc.h @@ -0,0 +1,60 @@ +#pragma once +#include "cute/layout.hpp" +#include "cute/tensor.hpp" +#include "cutlass/numeric_conversion.h" + +namespace cute { + +template +CUTE_DEVICE auto convert_type(cute::Tensor const& tensor) { + using From_type = typename Engine::value_type; + constexpr int numel = decltype(size(tensor))::value; + cutlass::NumericArrayConverter convert_op; + auto frag = + convert_op(*reinterpret_cast*>( + tensor.data())); + return make_tensor(make_rmem_ptr(&frag), tensor.layout()); +} + +template +inline __device__ auto convert_layout_rowcol_Aregs(Layout rowcol_layout) { + using namespace cute; + static_assert(decltype(size<0, 0>(rowcol_layout))::value == 2); + static_assert(decltype(size<1, 0>(rowcol_layout))::value == 2); + auto l = logical_divide(rowcol_layout, + Shape>>{}); + + return make_layout(make_layout(get<0>(get<1>(l)), get<0>(get<0>(l)), + get<0>(get<1>(get<1>(l)))), + get<1>(get<0>(l)), get<1>(get<1>(get<1>(l)))); +} + +inline __device__ auto convert_layout_C_Aregs() { + using namespace cute; + auto layout_s = Layout, _2, _16>>{}; + auto l = logical_divide(layout_s, Shape{}); + + return make_layout( + make_layout(get<0>(get<0>(l)), get<1>(get<0>(l)), get<0>(get<2>(l))), + get<1>(l), get<1>(get<2>(l))); +} + +template +inline __device__ auto convert_layout_scores(LayoutType layout_s) { + using namespace cute; + static_assert(decltype(size<0>(layout_s))::value == 4); + static_assert(decltype(rank(layout_s))::value == 3); + + auto l = logical_divide(layout_s, Shape<_2>{}); + return make_layout(make_layout(get<1>(get<0>(l)), get<1>(l)), + make_layout(get<0>(get<0>(l)), get<2>(l))); +} + +template +inline __device__ auto convert_layout_scores_copyview(LayoutType layout_s) { + using namespace cute; + + auto l = logical_divide(layout_s, Shape>{}); + return make_layout(get<0>(get<1>(l)), get<0>(l), get<1>(get<1>(l))); +} +} // namespace cute diff --git a/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/reduce.h b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/reduce.h new file mode 100644 index 000000000..3ac4c38cc --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/multi-head_attention/fractaltensor/utils/reduce.h @@ -0,0 +1,137 @@ +#pragma once + +#include + +using namespace cute; + +struct MaxOp_float { + __device__ inline float operator()(float const& x, float const& y) { + return max(x, y); + } +}; + +template +struct SumOp { + __device__ inline T operator()(T const& x, T const& y) { return x + y; } +}; +template +struct SumAbsOp { + __device__ inline T operator()(T const& x, T const& y) { + return x + abs(y); + } +}; + +template +struct Allreduce { + static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || + THREADS == 4); + template + static __device__ inline T run(T x, Operator& 
op) { + constexpr int OFFSET = THREADS / 2; + x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); + return Allreduce::run(x, op); + } +}; + +template <> +struct Allreduce<2> { + template + static __device__ inline T run(T x, Operator& op) { + x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); + return x; + } +}; + +template +__device__ inline void thread_reduce_( + cute::Tensor const& tensor, + cute::Tensor& summary, Operator& op) { + using namespace cute; + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 1, "Only support 1D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor)); +#pragma unroll + for (int mi = 0; mi < size<0>(tensor); mi++) { + summary(mi) = + zero_init ? op(0, tensor(mi, 0)) : op(summary(mi), tensor(mi, 0)); +#pragma unroll + for (int ni = 1; ni < size<1>(tensor); ni++) { + summary(mi) = op(summary(mi), tensor(mi, ni)); + } + } +} + +template +__device__ inline void quad_allreduce_(cute::Tensor& dst, + cute::Tensor& src, + Operator& op) { + using namespace cute; + CUTE_STATIC_ASSERT_V(size(dst) == size(src)); +#pragma unroll + for (int i = 0; i < size(dst); i++) { + dst(i) = Allreduce<4>::run(src(i), op); + } +} + +template +__device__ inline void eight_allreduce_(cute::Tensor& dst, + cute::Tensor& src, + Operator& op) { + using namespace cute; + CUTE_STATIC_ASSERT_V(size(dst) == size(src)); +#pragma unroll + for (int i = 0; i < size(dst); i++) { + dst(i) = Allreduce<8>::run(src(i), op); + } +} + +template +__device__ inline void allreduce_(cute::Tensor& dst, + cute::Tensor& src, + Operator& op) { + using namespace cute; + CUTE_STATIC_ASSERT_V(size(dst) == size(src)); +#pragma unroll + for (int i = 0; i < size(dst); i++) { + dst(i) = Allreduce::run(src(i), op); + } +} + +template +__device__ inline void reduce_(cute::Tensor const& tensor, + cute::Tensor& summary, + Operator& op) { + thread_reduce_(tensor, summary, op); + allreduce_(summary, summary, op); +} + +template +__device__ inline void reduce_max(cute::Tensor const& tensor, + cute::Tensor& max) { + MaxOp_float max_op; + reduce_(tensor, max, max_op); +} + +template +__device__ inline void reduce_sum(cute::Tensor const& tensor, + cute::Tensor& sum) { + SumOp sum_op; + reduce_(tensor, sum, sum_op); +} + +template +__device__ inline void reduce_sumabs( + cute::Tensor const& tensor, + cute::Tensor& sum) { + SumAbsOp sumabs_op; + reduce_(tensor, sum, sumabs_op); +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/README.md b/artifacts/FractalTensor/benchmarks/rnn/baselines/README.md new file mode 100644 index 000000000..c13aff888 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/README.md @@ -0,0 +1,45 @@ +# Test Environment + +``` {.text} +OS: Ubuntu 16.04.7 LTS +TensorFlow version: 2.2.3, compiled by gcc 5.0 +PyTorch v1.9.0 +CUDA Version 10.2 +CUDNN Version 7.6.5 +``` +## CPU information + +```bash +lscpu +``` + +``` {.text} +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Byte Order: Little Endian +CPU(s): 12 # virtual CPU +On-line CPU(s) list: 0-11 +Thread(s) per core: 2 +Core(s) per socket: 6 +Socket(s): 1 +NUMA node(s): 1 +Vendor ID: GenuineIntel +CPU family: 6 +Model: 63 +Model name: Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz +Stepping: 2 +CPU MHz: 1200.117 +CPU max MHz: 3700.0000 +CPU min MHz: 1200.0000 +BogoMIPS: 7000.36 +Virtualization: VT-x +L1d cache: 32K +L1i cache: 32K +L2 cache: 256K +L3 cache: 15360K +NUMA node0 CPU(s): 0-11 +``` + +### GPU information + +GeForce RTX 2080 Ti, Compute Capability 7.5 diff --git 
a/artifacts/FractalTensor/benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png b/artifacts/FractalTensor/benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png new file mode 100644 index 000000000..31f714034 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png differ diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/figures/figures.pptx b/artifacts/FractalTensor/benchmarks/rnn/baselines/figures/figures.pptx new file mode 100644 index 000000000..b10e071a3 Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/rnn/baselines/figures/figures.pptx differ diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/README.md b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/README.md new file mode 100644 index 000000000..f3f9a5a03 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/README.md @@ -0,0 +1,18 @@ +# gridLSTM + +## Hyper-parameters + +1. `batch_size`=20 +2. `seq_len`=64 +3. `hidden_size`=`input_size`=128 +4. `rnn_cell`=`LSTM` +5. `iters` = 20, `warmup` = 10 + +## Result + +|Name|PyTorch Average Time| TF_graph Average Time| +|:--|:--|:--| +|gridlstm_gpu:0_forward| 2.6266 |2.5567| +|gridlstm_cpu_forward| 8.6012 |3.7226| + +> tf_graph: using tf.compat.v1.session diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py new file mode 100644 index 000000000..1996f60df --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py @@ -0,0 +1,209 @@ +from time import time +import unittest +import torch +import logging +import argparse +from pt_model import StackedGridModel + +from torch.profiler import profile +from torch.profiler import record_function +from torch.profiler import ProfilerActivity + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=10) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=32) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=4) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +class PytorchGrid(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN_SIZE = cmd_args.hidden_size + DEPTH = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tPyTorch(ms)\n") + + LOG_DEBUG_INFO = 1 + PROFILER_ENABLE = 0 + + def setUp(self): + self.shape = (PytorchGrid.SEQ_LEN, PytorchGrid.BATCH_SIZE, + PytorchGrid.HIDDEN_SIZE) + + def _report(self, test_name, test_case, elapsed): + seq_len, batch_size, hidden, num_layers = test_case + + print(f"\nbench-grid\tdepth\t{num_layers}\tseq_length\t{seq_len}\t" + 
f"batch_size\t{batch_size}\t" + f"hidden_size\t{hidden}\tPyTroch(ms)\t{elapsed}") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {seq_len}, {batch_size}, {hidden}]\t" + f"{elapsed}\n") + + def _apply_forward(self, test_name, test_case, source, target, model): + + for i in range(PytorchGrid.WARM_UP): + output = model(source, target) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() + + if PytorchGrid.PROFILER_ENABLE: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=True) as prof: + with record_function("model_inference"): + for i in range(PytorchGrid.ITERS): + output = model(source, target) + + prof.export_chrome_trace("trace_" + test_name + ".json") + else: + for i in range(PytorchGrid.ITERS): + output = model(source, target) + + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time(end_event) / PytorchGrid.ITERS + + self._report(test_name, test_case, elapsed) + + def test_grid_forward(self): + if not self.DEFAULT_TEST: + for enable_jit in [ + # False, + True, + ]: + for device in [ + "cuda:0", + # "cpu", + ]: + target = torch.randn(*self.shape, device=device) + source = torch.randn(*self.shape, device=device) + model = StackedGridModel( + PytorchGrid.DEPTH, + PytorchGrid.SEQ_LEN, + PytorchGrid.SEQ_LEN, + PytorchGrid.BATCH_SIZE, + PytorchGrid.HIDDEN_SIZE, + device, + enable_jit=enable_jit).to(device) + test_name = f"gridlstm_{device}_forward" + ("_JIT" + if enable_jit + else "") + test_case = [ + PytorchGrid.SEQ_LEN, PytorchGrid.BATCH_SIZE, + PytorchGrid.HIDDEN_SIZE, PytorchGrid.DEPTH + ] + self._apply_forward(test_name, test_case, source, target, + model) + + def test_default_data(self): + if self.DEFAULT_TEST: + for device in [ + "cuda:0", + # "cpu", + ]: + test_name = f"gridlstm_{device}_forward_JIT" + print("default test:", test_name) + + def build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + target = torch.randn( + (seq_len, batch_size, hidden), + device=device, + ) + source = torch.randn( + (seq_len, batch_size, hidden), + device=device, + ) + model = StackedGridModel( + num_layers, + seq_len, + seq_len, + batch_size, + hidden, + device, + enable_jit=True).to(device) + return target, source, model + + test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [10, 32, 256, 32], + [10, 32, 512, 32], + [10, 32, 1024, 32], + + # scale with depth + [10, 32, 256, 1], + [10, 32, 256, 2], + [10, 32, 256, 4], + [10, 32, 256, 8], + [10, 32, 256, 16], + [10, 32, 256, 32], + [10, 32, 1024, 1], + [10, 32, 1024, 2], + [10, 32, 1024, 4], + [10, 32, 1024, 8], + [10, 32, 1024, 16], + [10, 32, 1024, 32], + + # scale with length + [5, 32, 256, 32], + [7, 32, 256, 32], + [10, 32, 256, 32], + [5, 32, 1024, 32], + [7, 32, 1024, 32], + [10, 32, 1024, 32], + ] + + for test_case in test_cases: + target, source, model = build_data(test_case) + self._apply_forward(test_name, test_case, source, target, + model) + del target + del source + del model + torch.cuda.empty_cache() + + +if __name__ == "__main__": + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py new file mode 100644 index 000000000..0454d2b51 --- /dev/null +++ 
b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py @@ -0,0 +1,149 @@ +import time +import sys +import math +import unittest +import os +import logging +import datetime + +import test_utils as tu +import tensorflow as tf + +from tf_model import WhileOpGridLSTMNet +from tf_model import BaseWhileOpGridLSTMNet +from tf_model import FineGrainedOpGridLSTMNet + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Only print error information. +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=10) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=32) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=4) + return parser.parse_args() + +class TFGraphGridLSTM(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN_SIZE = cmd_args.hidden_size + DEPTH = cmd_args.depth + + LOG_DEBUG_INFO = 1 + PROFILER_ENABLE = 0 + + def setUp(self): + tf.compat.v2.random.set_seed(1234) + self._init_logger() + + self.stddev = 1.0 / math.sqrt(TFGraphGridLSTM.HIDDEN) + self.shape = (TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + TFGraphGridLSTM.HIDDEN) + + def _init_logger(self): + self.logger = logging.getLogger() + logging.basicConfig( + level=(logging.DEBUG + if TFGraphGridLSTM.LOG_DEBUG_INFO else logging.INFO), + filename="grid_lstm_results_tensorflow_graph.txt", + filemode="w", + format="%(message)s") + + def _report(self, test_name, start): + """ + Args: + test_name (String): Name of the test. + start (String): Timestamp of the start time. + """ + elapsed_time = time.time() - start + average_time = elapsed_time / TFGraphGridLSTM.ITERS + seq_per_sec = ( + TFGraphGridLSTM.ITERS * TFGraphGridLSTM.BATCH_SIZE) / elapsed_time + self.logger.info(("|%s|%.4f\t|%.4f\t|%.4f|") % + (test_name, average_time, elapsed_time, seq_per_sec)) + print(( + "|test_name = %s|average_time = %.4f s|elapsed_time = %.4f s|seq_per_sec = %.4f|" + ) % (test_name, average_time, elapsed_time, seq_per_sec)) + + def _apply_forward(self, dev, test_name, model): + """Only Test the forward computation. + Args: + dev, String: Device that on which the test is running. cpu or gpu. + test_name, String: Name of the test. + model, Callable: The tested model. It should be a callable object. 
+ """ + + with tf.device(tu.device(dev)): + source = tf.random.uniform( + self.shape, minval=-self.stddev, maxval=self.stddev) + target = tf.random.uniform( + self.shape, minval=-self.stddev, maxval=self.stddev) + + output = model(source, target) + + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + + for _ in range(TFGraphGridLSTM.WARM_UP): + sess.run(output) + + if TFGraphGridLSTM.PROFILER_ENABLE: + log_dir = "logs/" + datetime.datetime.now().strftime( + "%Y%m%d-%H%M%S") + "_" + test_name + tf.profiler.experimental.start(log_dir) + + start = time.time() + for _ in range(TFGraphGridLSTM.ITERS): + sess.run(output) + + if TFGraphGridLSTM.PROFILER_ENABLE: + tf.profiler.experimental.stop() + + self._report(test_name, start) + + def test_fine_grained_op_lstm_forward(self): + for device in [ + "cpu", + "gpu", + ]: + model = FineGrainedOpGridLSTMNet( + TFGraphGridLSTM.NUM_LAYERS, TFGraphGridLSTM.SEQ_LEN, + TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + TFGraphGridLSTM.HIDDEN) + self._apply_forward( + device, f"graph_finegrained_op_lstm_{device}_forward", model) + + def test_while_op_lstm_forward(self): + for device in [ + "cpu", + "gpu", + ]: + model = WhileOpGridLSTMNet( + TFGraphGridLSTM.NUM_LAYERS, TFGraphGridLSTM.SEQ_LEN, + TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + TFGraphGridLSTM.HIDDEN) + self._apply_forward(device, + f"graph_while_op_lstm_{device}_forward", model) + + def test_base_while_op_lstm_forward(self): + for device in [ + "cpu", + "gpu", + ]: + model = BaseWhileOpGridLSTMNet(TFGraphGridLSTM.HIDDEN) + self._apply_forward( + device, f"graph_base_while_op_lstm_{device}_forward", model) + + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_triton.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_triton.py new file mode 100644 index 000000000..3e9c67309 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_triton.py @@ -0,0 +1,208 @@ +from time import time +import unittest +import torch +import logging +import argparse + +from triton_model import StackedGridModel + +from torch.profiler import profile +from torch.profiler import record_function +from torch.profiler import ProfilerActivity + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=10) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=32) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=4) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +class TritonGrid(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN_SIZE = cmd_args.hidden_size + DEPTH = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test 
+ + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tTriton(ms)\n") + + LOG_DEBUG_INFO = 1 + PROFILER_ENABLE = 0 + + def setUp(self): + self.shape = (TritonGrid.SEQ_LEN, TritonGrid.BATCH_SIZE, + TritonGrid.HIDDEN_SIZE) + + # self._init_logger() + + # def _init_logger(self): + # self.logger = logging.getLogger() + # logging.basicConfig( + # level=(logging.DEBUG + # if TritonGrid.LOG_DEBUG_INFO else logging.INFO), + # filename="grid_lstm_results_triton.txt", + # filemode="w", + # format="%(message)s") + + def _report(self, test_name, test_case, elapsed): + seq_len, batch_size, hidden, num_layers = test_case + # elapsed_time = time() - start + # average_time = elapsed_time / TritonGrid.ITERS + # seq_per_sec = (TritonGrid.ITERS * TritonGrid.BATCH_SIZE) / elapsed_time + + print( + f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + f"hidden_size: {hidden}, Triton(ms): {elapsed}ms") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {seq_len}, {batch_size}, {hidden}]\t" + f"{elapsed}\n") + + def _apply_forward(self, test_name, test_case, source, target, model): + + for i in range(TritonGrid.WARM_UP): + output = model(source, target) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() + + if TritonGrid.PROFILER_ENABLE: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=True) as prof: + with record_function("model_inference"): + for i in range(TritonGrid.ITERS): + output = model(source, target) + + prof.export_chrome_trace("trace_" + test_name + ".json") + else: + for i in range(TritonGrid.ITERS): + output = model(source, target) + + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time(end_event) / TritonGrid.ITERS + self._report(test_name, test_case, elapsed) + + def test_grid_forward(self): + if not self.DEFAULT_TEST: + for device in [ + "cuda:0", + # "cpu", + ]: + target = torch.randn(*self.shape, device=device) + source = torch.randn(*self.shape, device=device) + model = StackedGridModel( + TritonGrid.DEPTH, TritonGrid.SEQ_LEN, TritonGrid.SEQ_LEN, + TritonGrid.BATCH_SIZE, TritonGrid.HIDDEN_SIZE, + device).to(device) + test_name = f"gridlstm_{device}_forward" + test_case = [ + TritonGrid.SEQ_LEN, TritonGrid.BATCH_SIZE, + TritonGrid.HIDDEN_SIZE, TritonGrid.DEPTH + ] + self._apply_forward(test_name, test_case, source, target, + model) + + def test_default_data(self): + if self.DEFAULT_TEST: + for device in [ + "cuda:0", + # "cpu", + ]: + test_name = f"gridlstm_{device}_forward" + print("default test:", test_name) + + def build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + target = torch.randn( + (seq_len, batch_size, hidden), + device=device, + # dtype=torch.float16 + ) + source = torch.randn( + (seq_len, batch_size, hidden), + device=device, + # dtype=torch.float16 + ) + model = StackedGridModel(num_layers, seq_len, seq_len, + batch_size, hidden, + device).to(device) + return target, source, model + + test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [10, 32, 256, 32], + [10, 32, 512, 32], + [10, 32, 1024, 32], + # scale with depth + [10, 32, 256, 1], + [10, 32, 256, 2], + [10, 32, 256, 4], + [10, 32, 256, 8], + [10, 32, 256, 16], + [10, 32, 256, 32], + [10, 32, 1024, 1], + [10, 32, 1024, 2], + [10, 32, 1024, 4], + [10, 
32, 1024, 8], + [10, 32, 1024, 16], + [10, 32, 1024, 32], + # scale with length + [5, 32, 256, 32], + [7, 32, 256, 32], + [10, 32, 256, 32], + [5, 32, 1024, 32], + [7, 32, 1024, 32], + [10, 32, 1024, 32], + ] + + for test_case in test_cases: + target, source, model = build_data(test_case) + self._apply_forward(test_name, test_case, source, target, + model) + del target + del source + del model + torch.cuda.empty_cache() + + +if __name__ == "__main__": + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/pt_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/pt_model/__init__.py new file mode 100644 index 000000000..56bc7dec0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/pt_model/__init__.py @@ -0,0 +1,10 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .model import StackedGridModel + +__all__ = [ + "StackedGridModel", +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/pt_model/model.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/pt_model/model.py new file mode 100644 index 000000000..d7e84a019 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/pt_model/model.py @@ -0,0 +1,190 @@ +from typing import List, Tuple, Optional + +import torch +from torch import Tensor +from torch.nn import Module, Parameter +from torch.nn.init import zeros_ +from torch.nn.init import xavier_normal_ as init + +__all__ = ['StackedGridModel'] + + +class VanillaRNNCell(Module): + def __init__(self, hidden_size, grid_dim=2): + """ + Args: + hidden_size(int): hidden dimension + grid_dim(int): grid dimension + """ + super(VanillaRNNCell, self).__init__() + + # learnable paramters + self.W = Parameter(Tensor(hidden_size, hidden_size)) + self.U = Parameter(Tensor(hidden_size * grid_dim, hidden_size)) + self.b = Parameter(Tensor(1, hidden_size)) + + self.init_weights() + + def init_weights(self): + for p in self.parameters(): + if p.data.ndimension() >= 2: + init(p.data) + else: + zeros_(p.data) + + def forward(self, x_t: Tensor, y_t: Tensor, + state: Tensor) -> Tuple[Tensor, Tensor]: + """ + Args: + x_t(Tensor): + the shape is (batch_size, hidden_size) + y_t(Tensor): + the shape is (batch_size, hidden_size) + state(Tensor): + the shape is (batch_size, grid_dim * hidden_size) + Returns: + (h_x, h_y): Tuple[Tensor, Tensor] + h_x: + the shape is (batch_size, hidden_size) + h_y: + the shape is (batch_size, hidden_size) + """ + temp = torch.mm(state, self.U) + self.b + + h_x = torch.tanh(torch.mm(x_t, self.W) + temp) + h_y = torch.tanh(torch.mm(y_t, self.W) + temp) + return h_x, h_y + + +class GridRNNNaive(Module): + def __init__(self, depth: int, src_len: int, trg_len: int, batch_size: int, + hidden_size: int, device: str): + """ + Args: + depth(int): the number of stacked RNN layer + src_len(int): source sequence length + trg_len(int): target sequence length + batch_size(int): the number of samples + hidden_size(int): hidden dimension + """ + super(GridRNNNaive, self).__init__() + + self.depth = depth + self.src_len = src_len + self.trg_len = trg_len + self.batch_size = batch_size + self.hidden_size = hidden_size + self.device = device + + # We stack 2d GridLSTMs to get 3d GridLSTM. 
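+ # `self.cells` below holds one VanillaRNNCell per grid layer (depth), and
+ # `self.h_output` caches every cell's (h_x, h_y) pair with layout
+ # (depth, src_len, trg_len, grid_dim=2, batch_size, hidden_size), so that
+ # layer d can read layer d-1's outputs as well as the (i-1, j) and (i, j-1)
+ # neighbour states during the triple loop in `forward`.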
+ self.cells = torch.nn.ModuleList( + [VanillaRNNCell(hidden_size).to(device) for _ in range(depth)]) + + self.h_output = torch.zeros( + self.depth, + src_len, + trg_len, + 2, + batch_size, + self.hidden_size, + device=self.device) + + def forward(self, src_array_batch: Tensor, trg_array_batch: Tensor): + """ + Args: + src_array_batch(Tensor): + the shape is (src_len, batch_size, hidden_size) + trg_array_batch(Tensor): + the shape is (trg_len, batch_size, hidden_size) + Returns: + h_output(Tensor): + the shape is (depth, src_len, trg_len, grid_dim, batch_size, hidden_size) + """ + # dim 1: stack Grid LSTM Cell to form depth. + d = 0 + for m in self.cells: + # dim 2: iterate over source sequence length. + for i in range(0, self.src_len, 1): + # dim 3: iterate over target sequence length. + for j in range(0, self.trg_len, 1): + + # print("depth:", m, " src:", i, " trg:", j) + if d == 0: + x_t = src_array_batch[i] + y_t = trg_array_batch[j] + else: + x_t = self.h_output[d - 1][i][j][0] + y_t = self.h_output[d - 1][i][j][1] + + if i == 0: + state_x = torch.zeros( + self.batch_size, + self.hidden_size, + device=self.device) + else: + state_x = self.h_output[d][i - 1][j][0] + + if j == 0: + state_y = torch.zeros( + self.batch_size, + self.hidden_size, + device=self.device) + else: + state_y = self.h_output[d][i][j - 1][0] + + state = torch.cat([state_x, state_y], dim=1) + + h_x, h_y = m(x_t, y_t, state) + + self.h_output[d][i][j][0] = h_x + self.h_output[d][i][j][1] = h_y + + d += 1 + + return self.h_output + + +class StackedGridModel(Module): + def __init__(self, depth: int, src_len: int, trg_len: int, batch_size: int, + hidden_size: int, device: str, enable_jit: bool): + """ + Args: + depth(int): the number of stacked RNN layer + src_len(int): source sequence length + trg_len(int): target sequence length + batch_size(int): the number of samples + hidden_size(int): hidden dimension + enable_jit(bool): whether to apply PyTorch JIT + """ + super().__init__() + + self.depth = depth + self.batch_size = batch_size + self.hidden_size = hidden_size + self.device = device + + if enable_jit: + self.m = torch.jit.script( + GridRNNNaive(self.depth, src_len, trg_len, batch_size, + self.hidden_size, self.device)).to(self.device) + else: + self.m = GridRNNNaive(self.depth, src_len, trg_len, batch_size, + self.hidden_size, self.device).to( + self.device) + + def forward(self, source_input, target_input): + """ + Args: + src_array_batch(Tensor): + the shape is (src_len, batch_size, hidden_size) + trg_array_batch(Tensor): + the shape is (trg_len, batch_size, hidden_size) + Returns: + h_output(Tensor): + the shape is (depth, src_len, trg_len, grid_dim, batch_size, hidden_size) + """ + + output = self.m( + src_array_batch=source_input, trg_array_batch=target_input) + + return output diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh new file mode 100755 index 000000000..c110c9541 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +seq_len=10 +batch_size=32 + +# overall +hiddens='256 512 1024' +for hidden in $hiddens; do + python3 gridlstm_pt.py --seq_len=$seq_len \ + --batch_size=$batch_size \ + --hidden_size=$hidden \ + --depth=32 +done + +# scale with depth +depths='1 2 4 8 16 32' +hiddens='256 1024' +for hidden in $hiddens; do + for depth in $depths; do + python3 gridlstm_pt.py --seq_len=$seq_len \ + 
--batch_size=$batch_size \ + --hidden_size=$hidden \ + --depth=$depth + done +done + +# scale with length +lengths='5 7 10' +hiddens='256 1024' +for length in $lengths; do + for hidden in $hiddens; do + python3 gridlstm_pt.py --seq_len=$seq_len \ + --batch_size=32 \ + --hidden_size=$hidden \ + --depth=32 + done +done diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/test_utils.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/test_utils.py new file mode 100644 index 000000000..d7e65e39d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/test_utils.py @@ -0,0 +1,33 @@ +import tensorflow as tf + + +def get_config(): + config = tf.compat.v1.ConfigProto( + gpu_options=tf.compat.v1.GPUOptions( + allow_growth=True, per_process_gpu_memory_fraction=0.2)) + + config.log_device_placement = False + config.allow_soft_placement = True + + config.intra_op_parallelism_threads = 0 + config.inter_op_parallelism_threads = 56 + + return config + + +def device(dtype="cpu"): + """Return the TF device string. + + Args: + dtype: String, "cpu" or "gpu". + + Raises: + ValueError: if dtype is an unknown device. + """ + if dtype == "cpu": + return "/device:CPU:0" + elif dtype == "gpu": + assert tf.test.is_gpu_available(cuda_only=True) + return "/device:GPU:0" + else: + raise ValueError("Unknown device type. Should be cpu or gpu.") diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/tf_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/tf_model/__init__.py new file mode 100644 index 000000000..c8cdd33ad --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/tf_model/__init__.py @@ -0,0 +1,14 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .model import WhileOpGridLSTMNet +from .model import BaseWhileOpGridLSTMNet +from .model import FineGrainedOpGridLSTMNet + +__all__ = [ + "WhileOpGridLSTMNet", + "BaseWhileOpGridLSTMNet", + "FineGrainedOpGridLSTMNet", +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/tf_model/model.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/tf_model/model.py new file mode 100644 index 000000000..eb55811da --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/tf_model/model.py @@ -0,0 +1,413 @@ +from typing import NamedTuple, Tuple +import math + +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +from tensorflow import Tensor, TensorArray +from tensorflow.keras.layers import Dense, Layer, LSTMCell + + +class DimArg(NamedTuple): + step: Tensor + h: Tensor + m: Tensor + + +class DimArrayArg(NamedTuple): + step_array: TensorArray + h_array: TensorArray + m_array: TensorArray + + +class StepArg(NamedTuple): + target_step: DimArg + source_step: DimArg + + +class InnerLoopArg(NamedTuple): + source: DimArg + target_array: DimArrayArg + + +class OuterLoopArg(NamedTuple): + source_array: DimArrayArg + target_array: DimArrayArg + + +class VanillaRNNCell(Layer): + def __init__(self, hidden_size, grid_dim=2): + """ + Args: + hidden_size(int): hidden dimension + grid_dim(int): grid dimension + """ + self.hidden_size = hidden_size + self.grid_dim = grid_dim + super(VanillaRNNCell, self).__init__() + + def build(self, _): + stddev = 1.0 / math.sqrt(self.hidden_size) + with tf.name_scope('weight'): + self.W = tf.random.uniform( + [self.hidden_size, self.hidden_size], + minval=-stddev, + maxval=stddev) + self.U = 
tf.random.uniform( + [self.hidden_size * self.grid_dim, self.hidden_size], + minval=-stddev, + maxval=stddev) + self.b = tf.random.uniform( + [1, self.hidden_size], minval=-stddev, maxval=stddev) + + def call(self, x_t: Tensor, y_t: Tensor, + state: Tensor) -> Tuple[Tensor, Tensor]: + """ + Args: + x_t(Tensor): + the shape is (batch_size, hidden_size) + y_t(Tensor): + the shape is (batch_size, hidden_size) + state(Tensor): + the shape is (batch_size, grid_dim * hidden_size) + Returns: + (h_x, h_y): Tuple[Tensor, Tensor] + h_x: + the shape is (batch_size, hidden_size) + h_y: + the shape is (batch_size, hidden_size) + """ + temp = tf.matmul(state, self.U) + self.b + + h_x = tf.tanh(tf.matmul(x_t, self.W) + temp) + h_y = tf.tanh(tf.matmul(y_t, self.W) + temp) + return h_x, h_y + + +class FineGrainedOpGridLSTMNet(tf.keras.Model): + def __init__(self, depth: int, src_len: int, trg_len: int, batch_size: int, + hidden_size: int): + super(FineGrainedOpGridLSTMNet, self).__init__() + + self.depth = depth + self.src_len = src_len + self.trg_len = trg_len + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.cells = [VanillaRNNCell(hidden_size) for i in range(depth)] + + stddev = 1.0 / math.sqrt(self.hidden_size) + with tf.name_scope('output'): + self.h_output = tf.random.uniform( + [self.depth, src_len, trg_len, 2, batch_size, hidden_size], + minval=-stddev, + maxval=stddev) + + def call(self, src_input_seq, trg_input_seq): + + # dim 1: stack Grid LSTM Cell to form depth. + for d in range(0, self.depth, 1): + # dim 2: iterate over source sequence length. + for i in range(0, self.src_len, 1): + # dim 3: iterate over target sequence length. + for j in range(0, self.trg_len, 1): + + # print("depth:", d, " src:", i, " trg:", j) + if d == 0: + x_t = src_input_seq[i] + y_t = trg_input_seq[j] + else: + x_t = self.h_output[d - 1][i][j][0] + y_t = self.h_output[d - 1][i][j][1] + + if i == 0: + state_x = tf.zeros([self.batch_size, self.hidden_size]) + else: + state_x = self.h_output[d][i - 1][j][0] + + if j == 0: + state_y = tf.zeros([self.batch_size, self.hidden_size]) + else: + state_y = self.h_output[d][i][j - 1][0] + + state = tf.concat([state_x, state_y], 1) + + h_x, h_y = self.cells[d](x_t, y_t, state) + temp = tf.stack([h_x, h_y], 0) + tf.tensor_scatter_nd_update( + self.h_output, [[d, i, j, 0], [d, i, j, 1]], temp) + + return self.h_output + + +class WhileOpGridLSTMNet(Layer): + def __init__(self, depth: int, src_len: int, trg_len: int, batch_size: int, + hidden_size: int): + super(WhileOpGridLSTMNet, self).__init__() + self.depth = depth + self.src_len = src_len + self.trg_len = trg_len + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.cells = [VanillaRNNCell(hidden_size) for i in range(depth)] + + stddev = 1.0 / math.sqrt(self.hidden_size) + with tf.name_scope('output'): + self.h_output = tf.random.uniform( + [self.depth, src_len, trg_len, 2, batch_size, hidden_size], + minval=-stddev, + maxval=stddev) + + self.cur_d = 0 + self.cur_i = 0 + self.cur_j = 0 + + # dim 3: iterate over target sequence length. 
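+ # `inner_loop2` runs a tf.while_loop over target positions; it is invoked
+ # from `inner_loop1` (a while_loop over source positions), which `call`
+ # invokes once per depth step. The current grid coordinates are carried
+ # between the nested loop bodies via the Python-side attributes
+ # self.cur_d / self.cur_i / self.cur_j.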
+ def inner_loop2(self): + init_j = tf.constant(0) + + def cond(j: int): + return tf.less(j, self.trg_len) + + def body(j: int) -> int: + if self.cur_d == 0: + x_t = self.source[self.cur_i] + y_t = self.target[self.cur_j] + else: + x_t = self.h_output[self.cur_d - 1][self.cur_i][self.cur_j][0] + y_t = self.h_output[self.cur_d - 1][self.cur_i][self.cur_j][1] + + if self.cur_i == 0: + state_x = tf.zeros([self.batch_size, self.hidden_size]) + else: + state_x = self.h_output[self.cur_d][self.cur_i - + 1][self.cur_j][0] + + if self.cur_j == 0: + state_y = tf.zeros([self.batch_size, self.hidden_size]) + else: + state_y = self.h_output[self.cur_d][self.cur_i][self.cur_j - + 1][0] + + state = tf.concat([state_x, state_y], 1) + h_x, h_y = self.cells[self.cur_d](x_t, y_t, state) + temp = tf.stack([h_x, h_y], 0) + tf.tensor_scatter_nd_update( + self.h_output, [[self.cur_d, self.cur_i, self.cur_j, 0], + [self.cur_d, self.cur_i, self.cur_j, 1]], temp) + + self.cur_j += 1 + return j + 1 + + return tf.while_loop(cond, body, [init_j]) + + # dim 2: iterate over source sequence length. + def inner_loop1(self): + init_i = tf.constant(0) + + def cond(i: int): + return tf.less(i, self.src_len) + + def body(i: int) -> int: + print("test") + self.inner_loop2() + self.cur_i += 1 + return i + 1 + + return tf.while_loop(cond, body, [init_i]) + + # dim 1: stack Grid LSTM Cell to form depth. + def call(self, source: Tensor, target: Tensor) -> Tensor: + init_d = tf.constant(0) + + self.source = source + self.target = target + + def cond(d: int): + return tf.less(d, self.depth) + + def body(d: int) -> int: + self.inner_loop1() + self.cur_d += 1 + return tf.add(d, 1) + + tf.while_loop(cond, body, [init_d]) + + return self.h_output + + +class GridLSTMBlock(Layer): + def __init__(self, hidden_size: int): + super(GridLSTMBlock, self).__init__() + self.hidden_size = hidden_size + + self.lstm_cell = LSTMCell(self.hidden_size) + self.H2h = Dense(self.hidden_size) + + def call(self, arg: StepArg) -> StepArg: + source_step = arg.source_step + target_step = arg.target_step + + # shape: (batch_size, hidden_size) + s_h = source_step.h + # shape: (batch_size, hidden_size) + t_h = target_step.h + + # shape: (batch_size, hidden_size) + s_m = source_step.m + # shape: (batch_size, hidden_size) + t_m = target_step.m + + H = tf.concat([s_h, t_h], 1) + h = self.H2h(H) + + # shape: (batch_size, hidden_size), (batch_size, hidden_size) + _, [next_s_h, next_s_m] = self.lstm_cell(source_step.step, (h, s_m)) + # shape: (batch_size, hidden_size), (batch_size, hidden_size) + _, [next_t_h, next_t_m] = self.lstm_cell(target_step.step, (h, t_m)) + + return StepArg( + DimArg(next_t_h, next_t_h, next_t_m), + DimArg(next_s_h, next_s_h, next_s_m)) + + +class GridLSTM(Layer): + def __init__(self, hidden_size: int): + super(GridLSTM, self).__init__() + self.hidden_size = hidden_size + + self.block = GridLSTMBlock(hidden_size) + + def inner_loop(self, input: InnerLoopArg) -> InnerLoopArg: + init_i = tf.constant(0) + + target_seq_len = input.target_array.step_array.size() + + def cond(i: int, x): + return tf.less(i, target_seq_len) + + def body(i: int, acc: InnerLoopArg) -> Tuple[int, InnerLoopArg]: + step_array: TensorArray + h_array: TensorArray + m_array: TensorArray + step_array, h_array, m_array = acc.target_array + t_step: Tensor = step_array.read(i) + t_h: Tensor = h_array.read(i) + t_m: Tensor = m_array.read(i) + + t_step_arg: DimArg = DimArg(t_step, t_h, t_m) + s_step_arg: DimArg = acc.source + + step_arg: StepArg = self.block(StepArg(t_step_arg, 
s_step_arg)) + source_step = step_arg.source_step + target_step = step_arg.target_step + return i + 1, InnerLoopArg( + source_step, + DimArrayArg( + step_array.write(i, target_step.step), + h_array.write(i, target_step.h), + m_array.write(i, target_step.m), + )) + + return tf.while_loop(cond, body, (init_i, input))[1] + + def call(self, source: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: + init_i = tf.constant(0) + + batch_size = tf.shape(source)[1] + + source_seq_len = tf.shape(source)[0] + target_seq_len = tf.shape(target)[0] + + def cond(i: int, x): + return tf.less(i, source_seq_len) + + def body(i: int, acc: OuterLoopArg) -> Tuple[int, OuterLoopArg]: + source_array = acc.source_array + target_array = acc.target_array + + step_array: TensorArray + h_array: TensorArray + m_array: TensorArray + step_array, h_array, m_array = source_array + + s_step: Tensor = step_array.read(i) + s_h: Tensor = h_array.read(i) + s_m: Tensor = m_array.read(i) + + source_arg = DimArg(s_step, s_h, s_m) + inner_loop_arg: InnerLoopArg = InnerLoopArg( + source_arg, target_array) + inner_loop_arg = self.inner_loop(inner_loop_arg) + + source = inner_loop_arg.source + s_step, s_h, s_m = source + + step_array = step_array.write(i, s_step) + h_array = h_array.write(i, s_h) + m_array = m_array.write(i, s_m) + + source_array: DimArrayArg = DimArrayArg(step_array, h_array, + m_array) + target_array: DimArrayArg = inner_loop_arg.target_array + + outer_loop_arg: OuterLoopArg = OuterLoopArg( + source_array, target_array) + return i + 1, outer_loop_arg + + s_step_array = tf.TensorArray(tf.float32, size=source_seq_len) + s_h_array = tf.TensorArray(tf.float32, size=source_seq_len) + s_m_array = tf.TensorArray(tf.float32, size=source_seq_len) + + t_step_array = tf.TensorArray(tf.float32, size=source_seq_len) + t_h_array = tf.TensorArray(tf.float32, size=source_seq_len) + t_m_array = tf.TensorArray(tf.float32, size=source_seq_len) + + init_outer_loop_arg: OuterLoopArg = OuterLoopArg( + DimArrayArg( + step_array=s_step_array.unstack(source), + h_array=s_h_array.unstack( + tf.zeros([source_seq_len, batch_size, self.hidden_size])), + m_array=s_m_array.unstack( + tf.zeros([source_seq_len, batch_size, self.hidden_size]))), + DimArrayArg( + step_array=t_step_array.unstack(target), + h_array=t_h_array.unstack( + tf.zeros([target_seq_len, batch_size, self.hidden_size])), + m_array=t_m_array.unstack( + tf.zeros([target_seq_len, batch_size, self.hidden_size])))) + + outer_loop_arg: OuterLoopArg = tf.while_loop( + cond, body, (init_i, init_outer_loop_arg))[1] + + source: Tensor = outer_loop_arg.source_array.step_array.stack() + target: Tensor = outer_loop_arg.target_array.step_array.stack() + + return source, target + + +class BaseWhileOpGridLSTMNet(Layer): + def __init__(self, hidden_size: int): + super(BaseWhileOpGridLSTMNet, self).__init__() + + # As stated in the Section 4.4, hierarchy grows along the third + # dimension. + # We stack two 2d GridLSTM to get 3d GridLSTM. 
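+ # Concretely, gridLSTM_1/2/3 below are applied back to back in `call`,
+ # each layer consuming the (source, target) streams produced by the
+ # previous one.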
+ self.gridLSTM_1 = GridLSTM(hidden_size) + self.gridLSTM_2 = GridLSTM(hidden_size) + self.gridLSTM_3 = GridLSTM(hidden_size) + + def call(self, source_input: Tensor, target_input: Tensor): + + source_output, target_output = self.gridLSTM_1(source_input, + target_input) + + source_output, target_output = self.gridLSTM_2(source_output, + target_output) + + source_output, target_output = self.gridLSTM_3(source_output, + target_output) + + return target_output diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/__init__.py new file mode 100644 index 000000000..56bc7dec0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/__init__.py @@ -0,0 +1,10 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .model import StackedGridModel + +__all__ = [ + "StackedGridModel", +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/model.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/model.py new file mode 100644 index 000000000..dffc68c41 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/model.py @@ -0,0 +1,128 @@ +from typing import Tuple +from typing import List + +import torch.jit as jit +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +from time import time + +from .op import * + + +class VanillaRNNCell(nn.Module): + def __init__(self, + hidden_size: int, + batch_size: int, + device: str, + grid_dim=2, + dtype=torch.float32): + super(VanillaRNNCell, self).__init__() + self.device = device + self.dtype = dtype + self.size = (hidden_size, batch_size, grid_dim) + self.W = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + + self.U = init( + nn.Parameter( + torch.empty( + [hidden_size * grid_dim, hidden_size], + device=device, + dtype=dtype))) + + self.b = nn.Parameter( + torch.zeros([hidden_size], device=device, dtype=dtype)) + + def forward( + self, + x_t: Tensor, + y_t: Tensor, + state: Tensor, + state_resident: Tensor, + ) -> Tuple[Tensor, Tensor]: + + h_x, h_y = Vanilla_scan((self.W, self.U), (x_t, y_t), self.b, state, + state_resident, self.size, self.device, + self.dtype) + + return h_x, h_y + + +class StackedGridModel(nn.Module): + def __init__(self, + depth: int, + src_len: int, + trg_len: int, + batch_size: int, + hidden_size: int, + device: str, + dtype=torch.float32): + super(StackedGridModel, self).__init__() + self.depth = depth + self.src_len = src_len + self.trg_len = trg_len + self.batch_size = batch_size + self.hidden_size = hidden_size + self.device = device + self.dtype = dtype + + self.h_output = torch.zeros( + depth, src_len, trg_len, 2, batch_size, hidden_size, device=device) + + self.cells = torch.nn.ModuleList([ + VanillaRNNCell(hidden_size, batch_size, device, 2, + dtype).to(device) for _ in range(depth) + ]) + + def forward(self, src_array_batch: Tensor, trg_array_batch: Tensor): + h_x_resident = torch.empty( + [self.batch_size, self.hidden_size], + device=self.device, + dtype=self.dtype) + h_y_resident = torch.empty( + [self.batch_size, self.hidden_size], + device=self.device, + dtype=self.dtype) + d = 0 + for m in self.cells: + for i in range(0, self.src_len, 1): + for j in range(0, self.trg_len, 1): + 
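+ # Cell (d, i, j) takes its inputs from layer d-1 at the same (i, j)
+ # position and its recurrent states from positions (i-1, j) and (i, j-1)
+ # within layer d, mirroring the PyTorch reference implementation; the
+ # Triton kernel only replaces the per-cell math.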
if d == 0: + x_t = src_array_batch[i] + y_t = trg_array_batch[j] + else: + x_t = self.h_output[d - 1][i][j][0] + y_t = self.h_output[d - 1][i][j][1] + + if i == 0: + state_x = torch.zeros( + self.batch_size, + self.hidden_size, + device=self.device) + else: + state_x = self.h_output[d][i - 1][j][0] + + if j == 0: + state_y = torch.zeros( + self.batch_size, + self.hidden_size, + device=self.device) + else: + state_y = self.h_output[d][i][j - 1][0] + + state = torch.cat([state_x, state_y], dim=1) + + h_x, h_y = m(x_t, y_t, state, (h_x_resident, h_y_resident)) + + self.h_output[d][i][j][0] = h_x + self.h_output[d][i][j][1] = h_y + + d += 1 + + return self.h_output diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/op.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/op.py new file mode 100644 index 000000000..6b4cdfa66 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/triton_model/op.py @@ -0,0 +1,373 @@ +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['Vanilla_scan'] + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 
'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=2), + ], + key=['hidden_size', 'batch_size'], +) +@triton.jit +def Vanilla_scan_kernel( + W_ptr, + U_ptr, + b_ptr, + x_ptr, + y_ptr, + state_ptr, + h_x_ptr, + h_y_ptr, + hidden_size, + batch_size, + grid_dim, + stride_wk, + stride_wn, + stride_uk, + stride_un, + stride_xm, + stride_xk, + stride_sm, + stride_sk, + BLOCK_SIZE_B: tl.constexpr, + BLOCK_SIZE_H: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_h = tl.program_id(1) + + W_block_ptr = tl.make_block_ptr( + base=W_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + U_block_ptr = tl.make_block_ptr( + base=U_ptr, + shape=(hidden_size * grid_dim, hidden_size), + strides=(stride_uk, stride_un), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + x_block_ptr = tl.make_block_ptr( + base=x_ptr, + shape=(batch_size, hidden_size), + strides=(stride_xm, stride_xk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + y_block_ptr = tl.make_block_ptr( + base=y_ptr, + shape=(batch_size, hidden_size), + strides=(stride_xm, stride_xk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + state_block_ptr = tl.make_block_ptr( + base=state_ptr, + shape=(batch_size, hidden_size * grid_dim), + strides=(stride_sm, stride_sk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + offset_batch = ( + pid_m * BLOCK_SIZE_B + tl.arange(0, BLOCK_SIZE_B)) % batch_size + offset_hidden = ( + pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)) % hidden_size + b_ptrs = b_ptr + offset_hidden[None, :] + b = tl.load(b_ptrs) + b_ = tl.broadcast_to(b, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + temp = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + h_x = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + h_y = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + for k in range(hidden_size * grid_dim // BLOCK_SIZE_K): + state = tl.load(state_block_ptr) + U = tl.load(U_block_ptr) + temp += tl.dot(state, U) + U_block_ptr = tl.advance(U_block_ptr, (BLOCK_SIZE_K, 0)) + state_block_ptr = tl.advance(state_block_ptr, (0, BLOCK_SIZE_K)) + temp = temp + b_ + for k in range(hidden_size // BLOCK_SIZE_K): + x = tl.load(x_block_ptr) + y = tl.load(y_block_ptr) + W = tl.load(W_block_ptr) + h_x += tl.dot(x, W) + h_y += tl.dot(y, W) + W_block_ptr = tl.advance(W_block_ptr, (BLOCK_SIZE_K, 0)) + x_block_ptr = tl.advance(x_block_ptr, (0, BLOCK_SIZE_K)) + y_block_ptr = tl.advance(y_block_ptr, (0, BLOCK_SIZE_K)) + h_x = _tanh(h_x + temp) + h_y = _tanh(h_y + temp) + + h_x_ptrs = h_x_ptr + offset_batch[:, + None] * stride_xm + 
offset_hidden[None, :] * stride_xk + h_y_ptrs = h_y_ptr + offset_batch[:, + None] * stride_xm + offset_hidden[None, :] * stride_xk + tl.store(h_x_ptrs, h_x) + tl.store(h_y_ptrs, h_y) + + +@triton.jit +def _dot(a, b): + return tl.sum(a[:, :, None] * b[None, :, :], axis=1) + + +@triton.jit +def _sigmoid(x): + #\sigma(x) = \frac{1}{1 + 2^{-x \cdot \log_2(e)}} + log2_e = 1.4426950408889634 # log2(e) + neg_log2_e_x = -x * log2_e + exp_neg_log2_e_x = tl.math.exp2(neg_log2_e_x) + return 1 / (1 + exp_neg_log2_e_x) + + +@triton.jit +def _tanh(x): + return 2 * _sigmoid(2 * x) - 1 + + +def Vanilla_scan(weight_, + input_, + blas_, + state_, + resident_, + size_, + device_='cuda', + dtype_=torch.float32): + W, U = weight_ + x_t, y_t = input_ + h_x, h_y = resident_ + hidden_size, batch_size, grid_dim = size_ + grid = lambda META: ( + triton.cdiv(batch_size, META['BLOCK_SIZE_B']), triton.cdiv(hidden_size, META['BLOCK_SIZE_H']), + ) + Vanilla_scan_kernel[grid]( + W_ptr=W, + U_ptr=U, + b_ptr=blas_, + x_ptr=x_t, + y_ptr=y_t, + state_ptr=state_, + h_x_ptr=h_x, + h_y_ptr=h_y, + hidden_size=hidden_size, + batch_size=batch_size, + grid_dim=grid_dim, + stride_wk=W.stride(0), + stride_wn=W.stride(1), + stride_uk=U.stride(0), + stride_un=U.stride(1), + stride_xm=x_t.stride(0), + stride_xk=x_t.stride(1), + stride_sm=state_.stride(0), + stride_sk=state_.stride(1), + ) + return h_x, h_y diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/README.md b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/README.md new file mode 100644 index 000000000..f9b063acb --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/README.md @@ -0,0 +1,29 @@ +# Stacked Dilated LSTM + +## PyTorch implementation + +
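+ The key trick in the PyTorch baseline is a layout transform around every LSTM layer above the first: `rate` consecutive time steps are folded into the batch dimension, so a single `nn.LSTM` call processes all dilated sub-sequences at once and its recurrence only connects positions that are `rate` steps apart in the original sequence. The sketch below illustrates that round trip; the helper names `dilate`/`undilate` are ours for illustration only, while the model code performs the same `split`/`flatten`/`stack` and `unbind`/`split`/`stack` steps inline.
+ 
+ ```python
+ import torch
+ 
+ def dilate(xs: torch.Tensor, rate: int) -> torch.Tensor:
+     # (seq_len, batch, hidden) -> (seq_len // rate, rate * batch, hidden);
+     # seq_len must already be padded to a multiple of `rate`.
+     return torch.stack([c.flatten(0, 1) for c in xs.split(rate)])
+ 
+ def undilate(ys: torch.Tensor, batch_size: int) -> torch.Tensor:
+     # Inverse of `dilate`: restore the (seq_len, batch, hidden) layout.
+     rows = [torch.split(y, batch_size) for y in ys.unbind(0)]
+     return torch.stack([r for row in rows for r in row])
+ 
+ x = torch.randn(8, 4, 16)  # (seq_len, batch, hidden)
+ assert torch.equal(undilate(dilate(x, rate=2), batch_size=4), x)
+ ```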

+ +

+ +## Hyper-parameters + +1. `batch_size` = 32 +2. `seq_len` = 100 +3. `layers_num`= 6, the corresponding `dilation`=`[1, 2, 4, 8, 16, 32]` +4. `input_size`= 64, while size means number of dims +5. `hidden_size` = `output_size` = 64 +6. `rnn_cell` = `LSTM` + +## Results + +`counting_iteration_num` = 50, `warmup_iteration_num` = 20 + +|Test Name|Average Time(s)|Elapsed Time(s)|Throughput(seq/s)| +|:--|:--|:--|:--| +|PyTroch Imperative|0.0085 |0.1698 |3768.5783| +|PyTorch_JITed|0.0059 |0.1176 |5443.7788| +|PyTorch Pad per Layer (cannot be JITed)|0.0092 |0.1843 |3472.5731| +|TensorFlow Eager|0.0656 |3.2781 |488.0861| +|TensorFlow Auto-graph|0.0073 |0.3648 |4386.2863| +|TensorFlow Graph-mode|0.0051 |0.2575 |6214.1577| diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/__init__.py new file mode 100644 index 000000000..c138fe09e --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/__init__.py @@ -0,0 +1,12 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .model import StackedDRNNJIT +from .model import StackedDRNN + +__all__ = [ + 'StackedDRNNJIT', + 'StackedDRNN', +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/model.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/model.py new file mode 100644 index 000000000..58af30dcc --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/model.py @@ -0,0 +1,155 @@ +import math +import random +import numpy as np + +from typing import List +from typing import Tuple + +import torch +import torch.jit as jit +from torch import Tensor +import torch.nn as nn + +__all__ = [ + 'StackedDRNNJIT', + 'StackedDRNN', +] + + +class StackedDRNNJIT(nn.Module): + def __init__(self, + batch_size: int, + seq_len: int, + input_size: int, + hidden_size: int, + dilation: List[int], + device: str, + dtype=torch.float16): + super(StackedDRNNJIT, self).__init__() + + self.batch_size = batch_size + self.seq_len = input_size + + rate = dilation[-1] + self.register_buffer( + 'padding_data', + torch.zeros( + (rate - (seq_len % rate)) % rate, + batch_size, + input_size, + device=device, + dtype=dtype)) + + self.dilation_above_first_layer = dilation[1:] + self.cell1 = nn.LSTM( + input_size, + hidden_size, + num_layers=1, + batch_first=False, + dropout=0., + dtype=dtype) + self.cells = torch.nn.ModuleList([ + nn.LSTM( + input_size, + hidden_size, + num_layers=1, + batch_first=False, + dropout=0., + dtype=dtype) for i in range(len(dilation) - 1) + ]) + + def forward(self, input: Tensor) -> Tensor: + # step 0: pad the input + input_x = torch.cat((input, self.padding_data)) + + # no special treatment for the first layer. + xs, _ = self.cell1(input_x) + + for i, cell in enumerate(self.cells): + # for layers above the frist layer. 
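+ # Shape bookkeeping for the three steps below: `xs` enters as
+ # (padded_len, batch, hidden); splitting by the layer's dilation d and
+ # flattening each chunk yields (padded_len // d, d * batch, hidden), so
+ # each recurrence step of the LSTM spans d original time steps (the
+ # dilated connection).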
+ # step 1: pre-process: form a new batch + xs_splits = xs.split(self.dilation_above_first_layer[i]) + xs_ = torch.jit.annotate(List[Tensor], []) + for x in xs_splits: + xs_.append(x.flatten(start_dim=0, end_dim=1)) + dilated_input = torch.stack(xs_) + + # step 2: call LSTM layer + xs, _ = cell(dilated_input) + + # step 3: post-processing, revert to the original layout + xss = torch.jit.annotate(List[List[Tensor]], []) + for x in xs.unbind(0): + xss.append(torch.split(x, self.batch_size)) + + xs_ = torch.jit.annotate(List[Tensor], []) + for sublist in xss: + for x in sublist: + xs_.append(x) + xs = torch.stack(xs_) + return xs + + +class StackedDRNN(nn.Module): + def __init__(self, + batch_size: int, + seq_len: int, + input_size: int, + hidden_size: int, + dilation: List[int], + device: str, + dtype=torch.float16): + super(StackedDRNN, self).__init__() + + self.batch_size = batch_size + self.seq_len = seq_len + self.hidden_size = hidden_size + self.input_size = input_size + self.device = device + self.dtype = dtype + + self.dilation = dilation + + layers = [] + for i in range(len(dilation)): + c = nn.LSTM( + self.input_size, self.hidden_size, dropout=0., dtype=dtype) + layers.append(c) + self.cells = nn.Sequential(*layers) + + def _forward(self, input_x, cell, rate): + L, N, input_size = input_x.size() + + # padding + pad_num = (rate - (self.seq_len % rate)) % rate + padding_data = torch.zeros( + pad_num, + self.batch_size, + input_size, + device=self.device, + dtype=self.dtype) + + input_x = torch.cat((input_x, padding_data)) + + dilated_input = torch.stack( + tuple( + map(lambda m: m.flatten(start_dim=0, end_dim=1), + input_x.split(rate))), + dim=0) + + output, _ = cell(dilated_input) + + output_split = [ + torch.split(item, self.batch_size) for item in torch.unbind(output) + ] + + output_flatten = torch.stack( + [output for sublist in output_split for output in sublist]) + + y = output_flatten[:self.seq_len] + return y + + def forward(self, x): + for i, (cell, rate) in enumerate(zip(self.cells, self.dilation)): + x = self._forward(x, cell, rate) + return x diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_pytorch.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_pytorch.py new file mode 100644 index 000000000..79a7fa768 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_pytorch.py @@ -0,0 +1,185 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import unittest +from time import time + +import torch +from pt_model import StackedDRNNJIT +from pt_model import StackedDRNN + +from utils import * + + +class PytorchDRNN(unittest.TestCase): + def setUp(self): + torch.manual_seed(1234) + self.log_dir = '' + + def _apply_forward(self, test_name, test_case, model, *inputs): + for i in range(WARMUP): + output = model(*inputs) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() + + for i in range(ITERS): + output = model(*inputs) + + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time(end_event) / ITERS + report(test_name, test_case, OUTPUT_FILE, elapsed) + + # Uncomment to enable PyTorch profiler. 
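+ # The commented-out block below wraps the forward passes in
+ # torch.profiler.profile with a TensorBoard trace handler; enable it only
+ # when debugging, since profiling adds overhead on top of the CUDA-event
+ # timing measured above.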
+ # # with torch.autograd.profiler.profile( + # with torch.profiler.profile( + # activities=[ + # torch.profiler.ProfilerActivity.CPU, + # torch.profiler.ProfilerActivity.CUDA + # ], + # schedule=torch.profiler.schedule(wait=1, warmup=2, active=5), + # on_trace_ready=torch.profiler.tensorboard_trace_handler( + # self.log_dir), + # with_stack=True, + # profile_memory=False, + # record_shapes=True) as prof: + + # for i in range(15): + # output = model.forward(*inputs) + # prof.step() + + # print(prof.table()) + # print(prof.key_averages().table( + # sort_by="cuda_time_total", row_limit=-1)) + + def test_drnn_forward(self): + if not DEFAULT_TEST: + shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + + for jited in [ + # False, + True, + ]: + for device in [ + # 'cpu', + 'cuda:0', + ]: + x = torch.randn(*shape, device=device, dtype=torch.float16) + net = StackedDRNNJIT( + batch_size=BATCH_SIZE, + seq_len=SEQ_LEN, + dilation=DILATION, + input_size=INPUT_SIZE, + hidden_size=HIDDEN_SIZE, + device=device).to(device) + net.eval() + + script_module = net + test_name = f'PyTroch_Stacked_DLSTM_{device}' + if jited: + script_module = torch.jit.script(net) + test_name = 'PyTorch_JITed_' + test_name + + test_case = [SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE, NUM_LAYERS] + self._apply_forward(test_name, test_case, script_module, x) + + # def test_drnn_pad_per_layer_forward(self): + # shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + + # for device in [ + # # 'cpu', + # 'cuda:0', + # ]: + # x = torch.randn(shape, device=device, dtype=torch.float16) + + # net = StackedDRNN( + # batch_size=BATCH_SIZE, + # seq_len=SEQ_LEN, + # input_size=INPUT_SIZE, + # hidden_size=HIDDEN_SIZE, + # dilation=DILATION, + # device=device).to(device) + # net.eval() + + # rate = DILATION[-1] + # padding_data = torch.zeros( + # (rate - (SEQ_LEN % rate)) % rate, # padding number + # BATCH_SIZE, + # INPUT_SIZE, + # device=device) + + # test_name = f'PyTorch_StackedDRNNJIT_pad_per_layer_{device}' + # self._apply_forward(test_name, net, x) + + def test_default_data(self): + if DEFAULT_TEST: + for device in [ + # 'cpu', + 'cuda:0', + ]: + test_name = f'PyTroch_Stacked_DLSTM_JIT_{device}' + print("default test:", test_name) + + def build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + x = torch.randn( + (seq_len, batch_size, hidden), + device=device, + dtype=torch.float16) + net = StackedDRNNJIT( + batch_size=batch_size, + seq_len=seq_len, + input_size=hidden, + hidden_size=hidden, + dilation=DILATION[0:num_layers], + device=device, + dtype=torch.float16).to(device) + net.eval() + script_module = torch.jit.script(net) + + return x, script_module + + test_cases = [ + # overall + [50, 256, 256, 6], + [50, 256, 512, 6], + [50, 256, 1024, 6], + # scale with depth + [50, 256, 256, 1], + [50, 256, 256, 2], + [50, 256, 256, 3], + [50, 256, 256, 4], + [50, 256, 256, 5], + [50, 256, 256, 6], + [50, 256, 1024, 1], + [50, 256, 1024, 2], + [50, 256, 1024, 3], + [50, 256, 1024, 4], + [50, 256, 1024, 5], + [50, 256, 1024, 6], + # scale with seq + [32, 256, 256, 6], + [64, 256, 256, 6], + [128, 256, 256, 6], + [32, 256, 1024, 6], + [64, 256, 1024, 6], + [128, 256, 1024, 6], + ] + + for test_case in test_cases: + x, script_module = build_data(test_case) + self._apply_forward(test_name, test_case, script_module, x) + del x + del script_module + torch.cuda.empty_cache() + + +if __name__ == '__main__': + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tPyTorch(ms)\n") + 
unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow.py new file mode 100644 index 000000000..10bfda5f2 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow.py @@ -0,0 +1,58 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import math +import unittest +from time import time +import tensorflow as tf + +from tf_model import StackedDRNN +from utils import * + + +class TFGraphDRNN(unittest.TestCase): + def setUp(self): + self.shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + self.stddev = 1.0 / math.sqrt(HIDDEN_SIZE) + + self.log_dir = '' + self.logger = init_logger(self.log_dir, 'tensorflow_drnn.txt') + + def _apply_forward(self, test_name, model, *inputs): + for i in range(WARMUP): + output = model(*inputs) + + start = time() + + for i in range(ITERS): + output = model(*inputs) + report(test_name, start, self.logger) + + def test_drnn_forward(self): + shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + stddev = 1.0 / math.sqrt(HIDDEN_SIZE) + + gpus = tf.config.list_physical_devices('GPU') + for device in [ + # 'cpu', + '/device:GPU:0', + ]: + with tf.device(device): + model = StackedDRNN( + batch_size=BATCH_SIZE, + seq_len=SEQ_LEN, + input_size=INPUT_SIZE, + hidden_size=HIDDEN_SIZE, + dilation=DILATION) + + x = tf.random.uniform(shape, minval=-stddev, maxval=stddev) + rate = DILATION[-1] + padding_data = tf.zeros( + ((rate - (SEQ_LEN % rate)) % rate, BATCH_SIZE, INPUT_SIZE), + dtype=tf.dtypes.float32) + test_name = f'TensorFlow_Stacked_DLSTM_{device}' + self._apply_forward(test_name, model, x, padding_data) + + +if __name__ == '__main__': + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py new file mode 100644 index 000000000..1cd08d2d3 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py @@ -0,0 +1,78 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import math +import unittest +from time import time +import tensorflow as tf + +from tf_model import StackedDRNN +from utils import * + +tf.compat.v1.disable_eager_execution() + + +class TFGraphDRNN(unittest.TestCase): + def setUp(self): + self.shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + self.stddev = 1.0 / math.sqrt(HIDDEN_SIZE) + + self.log_dir = '' + self.logger = init_logger(self.log_dir, 'tensorflow_drnn_graph.txt') + + def test_drnn_forward(self): + shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + rate = DILATION[-1] + pad_shape = ((rate - (SEQ_LEN % rate)) % rate, BATCH_SIZE, INPUT_SIZE) + + stddev = 1.0 / math.sqrt(HIDDEN_SIZE) + + with tf.compat.v1.Session() as sess: + for device in [ + 'cpu', + '/device:GPU:0', + ]: + with tf.device(device): + model = StackedDRNN( + batch_size=BATCH_SIZE, + seq_len=SEQ_LEN, + input_size=INPUT_SIZE, + hidden_size=HIDDEN_SIZE, + dilation=DILATION) + + inputs = tf.compat.v1.placeholder(tf.float32, shape=shape) + pads = tf.compat.v1.placeholder( + tf.float32, shape=pad_shape) + res = model(inputs, pads) + + sess.run(tf.compat.v1.global_variables_initializer()) + + gen_x = tf.random.uniform( + shape, minval=-stddev, maxval=stddev) + gen_padding = tf.zeros(pad_shape, 
dtype=tf.dtypes.float32) + + x_data = sess.run(gen_x) + padding_data = sess.run(gen_padding) + + for i in range(WARMUP): + output = sess.run( + res, + feed_dict={ + inputs: x_data, + pads: padding_data + }) + + start = time() + for i in range(ITERS): + sess.run( + res, + feed_dict={ + inputs: x_data, + pads: padding_data + }) + test_name = f'TensorFlow_Stacked_DLSTM_graph_{device}' + report(test_name, start, self.logger) + + +if __name__ == '__main__': + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_triton.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_triton.py new file mode 100644 index 000000000..6c159626c --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_triton.py @@ -0,0 +1,156 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import unittest +from time import time + +import torch +from triton_model import StackedDRNN + +from utils import * + + +class TritonDRNN(unittest.TestCase): + def setUp(self): + torch.manual_seed(1234) + self.log_dir = '' + + def _apply_forward(self, test_name, test_case, model, *inputs): + for i in range(WARMUP): + output = model(*inputs) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() + + for i in range(ITERS): + output = model(*inputs) + + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time(end_event) / ITERS + report(test_name, test_case, OUTPUT_FILE, elapsed) + + def test_drnn_forward(self): + if not DEFAULT_TEST: + shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + for device in [ + # 'cpu', + 'cuda:0', + ]: + x = torch.randn(*shape, device=device, dtype=torch.float16) + net = StackedDRNN( + batch_size=BATCH_SIZE, + seq_len=SEQ_LEN, + input_size=INPUT_SIZE, + hidden_size=HIDDEN_SIZE, + dilation=DILATION, + device=device, + dtype=torch.float16).to(device) + net.eval() + + script_module = net + test_name = f'Triton_Stacked_DLSTM_{device}' + test_case = [SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE, NUM_LAYERS] + self._apply_forward(test_name, test_case, script_module, x) + + # def test_drnn_pad_per_layer_forward(self): + # shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) + + # for device in [ + # # 'cpu', + # 'cuda:0', + # ]: + # x = torch.randn(shape, device=device, dtype=torch.float16) + + # net = StackedDRNN( + # batch_size=BATCH_SIZE, + # seq_len=SEQ_LEN, + # input_size=INPUT_SIZE, + # hidden_size=HIDDEN_SIZE, + # dilation=DILATION, + # device=device).to(device) + # net.eval() + + # rate = DILATION[-1] + # padding_data = torch.zeros( + # (rate - (SEQ_LEN % rate)) % rate, # padding number + # BATCH_SIZE, + # INPUT_SIZE, + # device=device, + # dtype=torch.float16) + + # test_name = f'Triton_Stacked_DLSTM_pad_per_layer_{device}' + # test_case = [SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE, NUM_LAYERS] + # self._apply_forward(test_name, test_case, net, x) + + def test_default_data(self): + if DEFAULT_TEST: + for device in [ + # 'cpu', + 'cuda:0', + ]: + test_name = f'triton_finegrained_op_{device}' + print("default test:", test_name) + + def build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + x = torch.randn( + (seq_len, batch_size, hidden), + device=device, + dtype=torch.float16) + net = StackedDRNN( + batch_size=batch_size, + seq_len=seq_len, + input_size=hidden, + hidden_size=hidden, + dilation=DILATION[0:num_layers], + device=device, + 
dtype=torch.float16).to(device) + net.eval() + + script_module = net + return x, script_module + + test_cases = [ + # overall + [50, 256, 256, 6], + [50, 256, 512, 6], + [50, 256, 1024, 6], + # scale with depth + [50, 256, 256, 1], + [50, 256, 256, 2], + [50, 256, 256, 3], + [50, 256, 256, 4], + [50, 256, 256, 5], + [50, 256, 256, 6], + [50, 256, 1024, 1], + [50, 256, 1024, 2], + [50, 256, 1024, 3], + [50, 256, 1024, 4], + [50, 256, 1024, 5], + [50, 256, 1024, 6], + # scale with seq + [32, 256, 256, 6], + [64, 256, 256, 6], + [128, 256, 256, 6], + [32, 256, 1024, 6], + [64, 256, 1024, 6], + [128, 256, 1024, 6], + ] + + for test_case in test_cases: + x, script_module = build_data(test_case) + self._apply_forward(test_name, test_case, script_module, x) + del x + del script_module + torch.cuda.empty_cache() + + +if __name__ == '__main__': + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tTriton(ms)\n") + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/__init__.py new file mode 100644 index 000000000..d7cd1fd44 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/__init__.py @@ -0,0 +1,10 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .model import StackedDRNN + +__all__ = [ + 'StackedDRNN', +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/model.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/model.py new file mode 100644 index 000000000..210d2d3ac --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/model.py @@ -0,0 +1,60 @@ +from typing import List +import tensorflow as tf + +__all__ = [ + 'StackedDRNN', +] + + +class StackedDRNN(tf.keras.Model): + def __init__(self, batch_size: int, seq_len: int, input_size: int, + hidden_size: int, dilation: List[int]): + super(StackedDRNN, self).__init__() + + self.batch_size = batch_size + self.seq_len = seq_len + self.hidden_size = hidden_size + self.input_size = input_size + self.dilation = dilation + self.num_layers = len(dilation) + + rate = dilation[-1] + self.padded_length = (rate - (seq_len % rate)) % rate + self.seq_len + + self.cells = [] + for i in range(self.num_layers): + self.cells.append( + tf.compat.v1.keras.layers.CuDNNLSTM( + hidden_size, return_sequences=False)) + + # uncomment the following line to enable auto-graph. + # @tf.function + def call(self, input, padding_data): + # step 0: pad the input + input_x = tf.concat((input, padding_data), axis=0) + + # no special treatment for the first layer. + xs = self.cells[0](input_x) + + for i, cell in enumerate(self.cells[1:]): + # for layers above the frist layer. 
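+ # Same dilation trick as the PyTorch baseline: tf.split cuts the padded
+ # sequence into padded_len // d chunks of d steps, each chunk is reshaped
+ # to (d * batch, hidden), and the chunks are stacked before being passed
+ # to the next CuDNNLSTM layer; step 3 then restores the original layout.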
+ # step 1: pre-process: form a new batch + num_split = self.padded_length // self.dilation[i + 1] + + xs_ = [ + tf.reshape(x, (-1, self.hidden_size)) + for x in tf.split(xs, num_or_size_splits=num_split, axis=0) + ] + dilated_input = tf.stack(xs_) + + # step 2: call LSTM layer + xs = cell(dilated_input) + + # step 3: post-processing, revert to the original layout + xss = [ + tf.split(x, self.dilation[i + 1], axis=0) + for x in tf.unstack(xs, axis=0) + ] + + xs = tf.stack([x for sublist in xss for x in sublist]) + return xs diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/__init__.py new file mode 100644 index 000000000..d268cbf4d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/__init__.py @@ -0,0 +1,9 @@ +from .rnn import StackedDRNN +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +__all__ = [ + "StackedDRNN", +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/op.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/op.py new file mode 100644 index 000000000..521efa67d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/op.py @@ -0,0 +1,467 @@ +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['LSTMscan'] + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 64 
+ }, + num_stages=3, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 128 + }, + num_stages=3, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 128 + }, + num_stages=2, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 128 + }, + num_stages=2, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 128 + }, + num_stages=2, + num_warps=2), + ], + key=['hidden_size', 'batch_size'], +) +@triton.jit +def LSTMscan_kernel( + Wi_ptr, + Ui_ptr, + bi_ptr, + Wf_ptr, + Uf_ptr, + bf_ptr, + Wo_ptr, + Uo_ptr, + bo_ptr, + Wg_ptr, + Ug_ptr, + bg_ptr, + h_prev_ptr, + c_prev_ptr, + input_ptr, + h_ptr, + c_ptr, + input_size, + hidden_size, + batch_size, + stride_hm, + stride_hk, + stride_wk, + stride_wn, + BLOCK_SIZE_B: tl.constexpr, + BLOCK_SIZE_H: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_h = tl.program_id(1) + + Wi_block_ptr = tl.make_block_ptr( + base=Wi_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Wf_block_ptr = tl.make_block_ptr( + base=Wf_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Wo_block_ptr = tl.make_block_ptr( + base=Wo_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Wg_block_ptr = tl.make_block_ptr( + base=Wg_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Ui_block_ptr = tl.make_block_ptr( + base=Ui_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Uf_block_ptr = tl.make_block_ptr( + base=Uf_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Uo_block_ptr = tl.make_block_ptr( + base=Uo_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Ug_block_ptr = tl.make_block_ptr( + base=Ug_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + h_prev_block_ptr = tl.make_block_ptr( + base=h_prev_ptr, + shape=(batch_size, hidden_size), + strides=(stride_hm, 
stride_hk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + input_block_ptr = tl.make_block_ptr( + base=input_ptr, + shape=(batch_size, hidden_size), + strides=(stride_hm, stride_hk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + c_prev_block_ptr = tl.make_block_ptr( + base=c_prev_ptr, + shape=(batch_size, hidden_size), + strides=(stride_hm, stride_hk), + offsets=(pid_m * BLOCK_SIZE_B, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_H), + order=(1, 0), + ) + offset_batch = ( + pid_m * BLOCK_SIZE_B + tl.arange(0, BLOCK_SIZE_B)) % batch_size + offset_hidden = ( + pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)) % hidden_size + bi_ptrs = bi_ptr + offset_hidden[None, :] + bf_ptrs = bf_ptr + offset_hidden[None, :] + bo_ptrs = bo_ptr + offset_hidden[None, :] + bg_ptrs = bg_ptr + offset_hidden[None, :] + bi, bf, bo, bg = tl.load(bi_ptrs), tl.load(bf_ptrs), tl.load( + bo_ptrs), tl.load(bg_ptrs) + bi_ = tl.broadcast_to(bi, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + bf_ = tl.broadcast_to(bf, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + bo_ = tl.broadcast_to(bo, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + bg_ = tl.broadcast_to(bg, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + + ig_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + fg_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + og_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + c_candidate_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + for k in range(hidden_size // BLOCK_SIZE_K): + input = tl.load(input_block_ptr) + h_prew = tl.load(h_prev_block_ptr) + Wi, Wf, Wo, Wg = tl.load(Wi_block_ptr), tl.load(Wf_block_ptr), tl.load( + Wo_block_ptr), tl.load(Wg_block_ptr) + Ui, Uf, Uo, Ug = tl.load(Ui_block_ptr), tl.load(Ui_block_ptr), tl.load( + Uo_block_ptr), tl.load(Ug_block_ptr) + + ig_ += tl.dot(input, Wi) + tl.dot(h_prew, Ui) + fg_ += tl.dot(input, Wf) + tl.dot(h_prew, Uf) + og_ += tl.dot(input, Wo) + tl.dot(h_prew, Uo) + c_candidate_ += tl.dot(input, Wg) + tl.dot(h_prew, Ug) + + Wi_block_ptr = tl.advance(Wi_block_ptr, (BLOCK_SIZE_K, 0)) + Wf_block_ptr = tl.advance(Wf_block_ptr, (BLOCK_SIZE_K, 0)) + Wo_block_ptr = tl.advance(Wo_block_ptr, (BLOCK_SIZE_K, 0)) + Wg_block_ptr = tl.advance(Wg_block_ptr, (BLOCK_SIZE_K, 0)) + + Ui_block_ptr = tl.advance(Ui_block_ptr, (BLOCK_SIZE_K, 0)) + Uf_block_ptr = tl.advance(Uf_block_ptr, (BLOCK_SIZE_K, 0)) + Uo_block_ptr = tl.advance(Uo_block_ptr, (BLOCK_SIZE_K, 0)) + Ug_block_ptr = tl.advance(Ug_block_ptr, (BLOCK_SIZE_K, 0)) + + input_block_ptr = tl.advance(input_block_ptr, (0, BLOCK_SIZE_K)) + h_prev_block_ptr = tl.advance(h_prev_block_ptr, (0, BLOCK_SIZE_K)) + + ig = ig_.to(tl.float16) + bi_ + fg = fg_.to(tl.float16) + bf_ + og = og_.to(tl.float16) + bo_ + c_candidate = c_candidate_.to(tl.float16) + bg_ + + ig = _sigmoid(ig) + fg = _sigmoid(fg) + og = _sigmoid(og) + c_candidate = _tanh(c_candidate) + + c_prev = tl.load(c_prev_block_ptr) + c = fg * c_prev + ig * c_candidate + + c_ptrs = c_ptr + offset_batch[:, None] * \ + stride_hm + offset_hidden[None, :] * stride_hk + tl.store(c_ptrs, c) + + c = _tanh(c) + h = og * c + h_ptrs = h_ptr + offset_batch[:, None] * \ + stride_hm + offset_hidden[None, :] * stride_hk + tl.store(h_ptrs, h) + + +@triton.jit +def _dot(a, b): + return tl.sum(a[:, :, None] * b[None, :, :], axis=1) + + +@triton.jit +def _sigmoid(x): + # \sigma(x) = \frac{1}{1 + 2^{-x \cdot \log_2(e)}} + log2_e = 1.4426950408889634 # log2(e) + neg_log2_e_x = -x * log2_e + 
exp_neg_log2_e_x = tl.math.exp2(neg_log2_e_x) + return 1 / (1 + exp_neg_log2_e_x) + + +@triton.jit +def _tanh(x): + return 2 * _sigmoid(2 * x) - 1 + + +def LSTMscan(input_, + weight_, + blas_, + state_, + resident_, + size_, + device_='cuda', + dtype_=torch.float16): + Wi, Wf, Wo, Wg, Ui, Uf, Uo, Ug = weight_ + bi, bf, bo, bg = blas_ + h_prew, c_prew = state_ + input_size, hidden_size, batch_size = size_ + h_resident, c_resident = resident_ + + def grid(META): + return ( + triton.cdiv(batch_size, META['BLOCK_SIZE_B']), + triton.cdiv(hidden_size, META['BLOCK_SIZE_H']), + ) + + LSTMscan_kernel[grid]( + Wi_ptr=Wi, + Ui_ptr=Ui, + bi_ptr=bi, + Wf_ptr=Wf, + Uf_ptr=Uf, + bf_ptr=bf, + Wo_ptr=Wo, + Uo_ptr=Uo, + bo_ptr=bo, + Wg_ptr=Wg, + Ug_ptr=Ug, + bg_ptr=bg, + h_prev_ptr=h_prew, + c_prev_ptr=c_prew, + input_ptr=input_, + h_ptr=h_resident, + c_ptr=c_resident, + input_size=input_size, + hidden_size=hidden_size, + batch_size=batch_size, + stride_hm=h_resident.stride(0), + stride_hk=h_resident.stride(1), + stride_wk=Wi.stride(0), + stride_wn=Wi.stride(1)) + return h_resident, c_resident diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/rnn.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/rnn.py new file mode 100644 index 000000000..0f6313e04 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/rnn.py @@ -0,0 +1,160 @@ +from typing import Tuple +from typing import List + +import torch.jit as jit +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +from time import time + +from .op import * + + +class LSTMCell(nn.Module): + def __init__(self, + input_size: int, + hidden_size: int, + batch_size: int, + device: str, + dtype=torch.float16): + super(LSTMCell, self).__init__() + self.device = device + self.dtype = dtype + self.size = (input_size, hidden_size, batch_size) + self.Wi = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + self.Wf = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + self.Wo = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + self.Wg = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + + self.Ui = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + self.Uf = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + self.Uo = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + self.Ug = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + + self.bi = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + self.bf = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + self.bo = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + self.bg = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + + def forward(self, input: Tensor, state_prev: Tuple[Tensor, Tensor], + state_now: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]: + + h, c = LSTMscan(input, (self.Wi, self.Wf, self.Wo, self.Wg, self.Ui, + self.Uf, self.Uo, self.Ug), + (self.bi, self.bf, self.bo, self.bg), state_prev, + state_now, self.size, self.device, self.dtype) + 
+ return h, c + + +class StackedDRNN(nn.Module): + def __init__(self, + batch_size: int, + seq_len: int, + input_size: int, + hidden_size: int, + dilation: List[int], + device: str, + dtype=torch.float16): + super(StackedDRNN, self).__init__() + self.seq_len = seq_len + self.device = device + self.dtype = dtype + self.batch_size = batch_size + self.input_size = input_size + self.hidden_size = hidden_size + self.dilation = dilation + self.h_resident = [ + torch.empty( + [batch_size * dilation[i], hidden_size], + device=device, + dtype=dtype) for i in range(len(dilation)) + ] + self.c_resident = [ + torch.empty( + [batch_size * dilation[i], hidden_size], + device=device, + dtype=dtype) for i in range(len(dilation)) + ] + self.cells = torch.nn.ModuleList([ + LSTMCell(input_size, hidden_size, batch_size * dilation[i], device, + dtype) for i in range(len(dilation)) + ]) + + def _forward(self, iter, input_x, cell, rate): + + pad_num = (rate - (self.seq_len % rate)) % rate + padding_data = torch.zeros( + pad_num, + self.batch_size, + self.input_size, + device=self.device, + dtype=torch.float16) + input_x = torch.cat((input_x, padding_data)) + + dilated_input = torch.stack( + tuple( + map(lambda m: m.flatten(start_dim=0, end_dim=1), + input_x.split(rate))), + dim=0) + + h = torch.zeros( + (dilated_input.size(1), self.hidden_size), + device=self.device, + dtype=self.dtype) + c = torch.zeros( + (dilated_input.size(1), self.hidden_size), + device=self.device, + dtype=self.dtype) + hs = [] + + for i in range(dilated_input.size(0)): + h, c = cell(dilated_input[i], (h, c), + (self.h_resident[iter], self.c_resident[iter])) + hs.append(h) + + output = torch.stack(hs) + + output_split = [ + torch.split(item, self.batch_size) for item in torch.unbind(output) + ] + + output_flatten = torch.stack( + [output for sublist in output_split for output in sublist]) + xs = output_flatten[:self.seq_len] + return xs + + def forward(self, x): + for i, (cell, rate) in enumerate(zip(self.cells, self.dilation)): + x = self._forward(i, x, cell, rate) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py new file mode 100644 index 000000000..3e89519d9 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py @@ -0,0 +1,81 @@ +import os +import logging +import argparse +from time import time + + +def max_dilation(depth): + depth = int(depth) + if depth > 6: + raise argparse.ArgumentTypeError( + 'To avoid memory errors, %s should not be too large.' 
% depth) + return depth + + + def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True', 'true'): + return True + elif v in ('False', 'false'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + + def parse_test_args(): + parser = argparse.ArgumentParser(description='Stacked dilated LSTM') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=50) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=256) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument( + '--input_size', type=int, help='Input size', default=256) + parser.add_argument( + '--depth', type=max_dilation, help='Depth size', default=1) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN_SIZE = cmd_args.hidden_size + INPUT_SIZE = cmd_args.input_size + NUM_LAYERS = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + DILATION = [1, 2, 4, 8, 16, 32] + if not DEFAULT_TEST: + DILATION = DILATION[0:NUM_LAYERS] + + ITERS = 10 + WARMUP = 5 + LOG_DEBUG_INFO = 0 + + print_header = True + + + def report(test_name, test_case, OUTPUT_FILE, elapsed): + seq_len, batch_size, hidden, num_layers = test_case + # elapsed_time = time() - start + # average_time = elapsed_time / ITERS + # seq_per_sec = (ITERS * BATCH_SIZE) / elapsed_time + + print( + f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + f"hidden_size: {hidden}, Baseline(ms): {elapsed}ms") + + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'a') as fout: + fout.write(f"{num_layers}\t[{seq_len}, {batch_size}, {hidden}]\t" + f"{elapsed}\n") diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/README.md b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/README.md new file mode 100644 index 000000000..ea5a25bce --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/README.md @@ -0,0 +1,19 @@ +# Hyper-parameters + +1. `num_layers` = 8, 8 LSTM layers are stacked +1. LSTM's `hidden_dim` = `output_dim` = 512 +1. All training samples have a fixed length: `seq_len_` = 100 +1. `batch_size` = 64 +1. `warm_up` = 10, `iteration` = 30 + +Explanation for some implementations: + +|Name|Explanation| +|:--|:--| +|Fine-grained Lstm Cell V1|Compute LSTM's four gates separately.| +|Fine-grained Lstm Cell V2|Manually batch the GEMMs for LSTM's four gates into one large GEMM (see the sketch below).| +|Static LSTM cell in TensorFlow|LSTM cell as a single operator.| +
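As a reference for the table above, here is a minimal PyTorch sketch of the "V2" idea: the four gate projections are stored side by side in two `[*, 4 * hidden]` matrices, so one GEMM per operand produces all gates at once instead of eight small per-gate GEMMs. The class name `FusedGateLSTMCell` and all parameter names below are illustrative only and do not appear in the benchmark code.

```python
import torch
import torch.nn as nn


class FusedGateLSTMCell(nn.Module):
    """Sketch of manually batching the four gate GEMMs into one large GEMM."""

    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        # One input-to-hidden and one hidden-to-hidden matrix hold the
        # weights of all four gates (i, f, o, g) concatenated along dim 1.
        self.W = nn.Parameter(torch.empty(input_size, 4 * hidden_size))
        self.U = nn.Parameter(torch.empty(hidden_size, 4 * hidden_size))
        self.b = nn.Parameter(torch.zeros(4 * hidden_size))
        nn.init.xavier_normal_(self.W)
        nn.init.xavier_normal_(self.U)

    def forward(self, x, state):
        h_prev, c_prev = state
        # Two GEMMs in total instead of eight small ones.
        gates = x @ self.W + h_prev @ self.U + self.b
        i, f, o, g = gates.chunk(4, dim=-1)
        i, f, o = torch.sigmoid(i), torch.sigmoid(f), torch.sigmoid(o)
        g = torch.tanh(g)
        c = f * c_prev + i * g
        h = o * torch.tanh(c)
        return h, c


if __name__ == "__main__":
    cell = FusedGateLSTMCell(input_size=512, hidden_size=512)
    x = torch.randn(64, 512)
    h = c = torch.zeros(64, 512)
    h, c = cell(x, (h, c))
    print(h.shape)  # torch.Size([64, 512])
```

The "V1" variant keeps separate weight matrices per gate and issues one GEMM per gate and per operand, which launches more, smaller kernels for the same amount of arithmetic.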

+ +
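For the last row of the table, the entire cell is a single framework-provided operator, so no hand-written gate math or per-gate GEMMs appear in user code. Below is a minimal TensorFlow 2 sketch of that idea; it is illustrative only, and the baseline itself may be written against the older TF1 `static_rnn` API.

```python
import tensorflow as tf

# The whole LSTM cell is one fused operator supplied by the framework.
cell = tf.keras.layers.LSTMCell(512)
layer = tf.keras.layers.RNN(cell, return_sequences=True)

x = tf.random.normal([64, 100, 512])  # [batch_size, seq_len, input_size]
y = layer(x)                          # shape [64, 100, 512]
```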

diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/draw_figure.m b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/draw_figure.m new file mode 100644 index 000000000..875d30c68 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/draw_figure.m @@ -0,0 +1,179 @@ +clc +clear +close all; + +%% +map = addcolorplus(319); +num = 7; + +idx = linspace(5,60,num); +idx = round(idx); +C = map(idx,:); +%% +% t = tiledlayout(2, 1, 'TileSpacing','compact','Padding','compact'); +% nexttile +% +% subtitle('(a). The elapsed time varied according to depth.',... +% 'FontName','Helvetica', 'FontSize', 16) +% hold on + +figure; + +filename1 = 'perf_with_increased_depth_subplot1.tsv'; +data = tdfread(filename1, '\t'); + +x = reshape(data.AvgTime, 6,6); + +%% +figureUnits = 'centimeters'; +figureWidth = 16; +figureHeight = 6; + +set(gcf, 'Units', figureUnits, 'Position', [0 0 figureWidth figureHeight]); + +%% +GO = bar(x,1,'EdgeColor','k', 'LineWidth', 0.5); + + +GO(1).FaceColor = C(1,:); +GO(2).FaceColor = C(2,:); +GO(3).FaceColor = C(3,:); +GO(4).FaceColor = C(5,:); +GO(5).FaceColor = C(6,:); +GO(6).FaceColor = C(7,:); + +%% +set(gca, 'Box', 'on', ... + 'XGrid', 'off', 'YGrid', 'on', ... + 'TickDir', 'in', 'TickLength', [.01 .01], ... + 'XMinorTick', 'off', 'YMinorTick', 'off', ... + 'XColor', [.1 .1 .1], 'YColor', [.1 .1 .1],... + 'YTick', 0:20:80,... + 'Ylim' , [0 85], ... + 'Xticklabel',{'1' '4' '8' '12' '16', '20'},... + 'Yticklabel',{0:20:80}) + +hYLabel = ylabel('Elapsed Time(ms)'); +hLegend = legend([GO(1),GO(2),GO(3),GO(4),GO(5),GO(6)], ... + 'PT-JIT',... + 'TF-WhileOpLSTM',... + 'TF-GraphMode',... + 'TF-AutoGraph',... + 'TVM-Ansor', ... + 'CuDNN',... + 'Location', 'northeast', ... + 'Box', 'off'); + +P = hLegend.Position; +hLegend.Position = P + [0.015 0.03 0 0]; + +set(gca, 'FontName', 'Helvetica', 'FontSize', 12) +set([hYLabel,hLegend], 'FontName', 'Helvetica') +set([hYLabel,hLegend], 'FontSize', 14) + +set(gca,'Color',[1 1 1]) + +%% +figure; +% nexttile +% +% subtitle('(b). The increasing ratio of time varied according to depth.',... +% 'FontName','Helvetica', 'FontSize', 16) + +filename2 = 'perf_with_increased_depth_subplot2.tsv'; +data = tdfread(filename2, '\t'); + +Y = reshape(data.Ratio, 20, 6); +y1 = Y(:,1); +y2 = Y(:,2); +y3 = Y(:,3); +y4 = Y(:,4); +y5 = Y(:,5); +y6 = Y(:,6); +yRef = 1:1:20; + +x = 1:1:20; + +%% +figureUnits = 'centimeters'; +figureWidth = 16; +figureHeight = 8; + +%% +set(gcf, 'Units', figureUnits, 'Position', [0 0 figureWidth figureHeight]); +hold on + +Line1 = line(x,y1); +Line2 = line(x,y2); +Line3 = line(x,y3); +Line4 = line(x,y4); +Line5 = line(x,y5); +Line6 = line(x,y6); +LineRef = line(x, yRef); + +set(Line1, 'LineStyle','--',... + 'Marker', 'o',... + 'MarkerSize', 8,... + 'MarkerEdgeColor',[0,0,0],'MarkerFaceColor',C(1,:),... + 'LineWidth', 1.5,... + 'Color', C(1,:)) +set(Line2, 'LineStyle', '--',... + 'Marker', 'o',... + 'MarkerSize', 8,... + 'MarkerEdgeColor',[0,0,0],'MarkerFaceColor',C(2,:),... + 'LineWidth', 1.5,... + 'Color', C(2,:)) +set(Line3, 'LineStyle', '--',... + 'Marker', 'o',... + 'MarkerSize', 8,... + 'MarkerEdgeColor',[0,0,0],'MarkerFaceColor',C(3,:),... + 'LineWidth', 1.5,... + 'Color', C(3,:)) +set(Line4, 'LineStyle', '--',... + 'Marker', 'o',... + 'MarkerSize', 8,... + 'MarkerEdgeColor',[0,0,0],'MarkerFaceColor',C(5,:),... + 'LineWidth', 1.5,... + 'Color', C(5,:)) +set(Line5, 'LineStyle', '--',... + 'Marker', 'o',... + 'MarkerSize', 8,... 
+ 'MarkerEdgeColor',[0,0,0],'MarkerFaceColor',C(6,:),... + 'LineWidth', 1.5,... + 'Color', C(6,:)) +set(Line6, 'LineStyle','--',... + 'Marker', 'o',... + 'MarkerSize', 8,... + 'MarkerEdgeColor',[0,0,0],'MarkerFaceColor',C(7,:),... + 'LineWidth', 1.5,... + 'Color', C(7,:)) + +set(LineRef,'LineStyle','-',... + 'LineWidth', 4, 'Color', 'k') + +set(gca, 'Box', 'on', ... + 'XGrid', 'on', 'YGrid', 'on', ... + 'TickDir', 'in', 'TickLength', [.01 .01], ... + 'XMinorTick', 'off', 'YMinorTick', 'off', ... + 'XColor', [.1 .1 .1], 'YColor', [.1 .1 .1],... + 'XTick', 1:3:21, 'YTick', 0:5:25,... + 'Xlim',[0.5 20.5],'Ylim' ,[0 26], ... + 'Xticklabel',{1:3:20},... + 'Yticklabel',{0:5:25}) + +hYLabel = ylabel('Ratio'); +hLegend = legend([Line1,Line2,Line3,Line4,Line5,Line6,LineRef], ... + 'PT-JIT',... + 'TF-WhileOpLSTM',... + 'TF-GraphMode',... + 'TF-AutoGraph',... + 'TVM-Ansor', ... + 'CuDNN', ... + 'baseline',... + 'Location', 'northwest', ... + 'Box', 'off'); + +set(gca, 'FontName', 'Helvetica', 'FontSize', 12) +set([hYLabel,hLegend], 'FontName', 'Helvetica') +set([hYLabel,hLegend], 'FontSize', 14) +set(gca,'Color',[1 1 1]) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/for_plot.tsv b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/for_plot.tsv new file mode 100644 index 000000000..0000719cc --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/for_plot.tsv @@ -0,0 +1,7 @@ +Test Name Average Time Elapsed Time Throughput +CuDNN 0.0251 0.7545 2544.7697 +PT 0.2123 6.3702 301.4025 +PT_JITed 0.0505 1.5161 1266.4127 +TF_GraphMode 0.0743 2.2293 861.2472 +TF_WhileOpLSTM 0.1068 3.2042 599.2203 +TF_AutoGraph 0.0501 1.5038 1276.7482 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth.ipynb b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth.ipynb new file mode 100644 index 000000000..796b6439a --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "04db80e0-dbec-4ea5-a826-53ea14e50c4f", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import seaborn as sns\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "sns.set_theme(style='white', palette=None)\n", + "\n", + "import matplotlib as mpl\n", + "mpl.rcParams['font.family'] = 'Times New Roman'\n", + "# mpl.rcParams['font.weight'] = 'bold'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4515fc9e-05ed-4ca5-a44b-8366cf2a5c80", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAArkAAAKxCAYAAABXDbTZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gU1dfA8e9ueg8EpIZu6FKkg/SOdJDei0qTojSRIgmCSC+CIAYhlNClI0UFRCKCdKSXhBZIQnrdef/Iu/PLksJm08P5PA8PycydmTMzm92zd27RKIqiIIQQQgghRC6izeoAhBBCCCGESG+S5AohhBBCiFxHklwhhBBCCJHrSJIrhBBCCCFyHUlyhRBCCCFEriNJrhBCCCGEyHUkyRVCCCGEELmOJLlCCCGEECLXkSRXZLo7d+4QGRlpdPmgoCAeP36cgREJIYQQIreRJFdkmpcvXzJ+/HgOHjyIpaWl0dtZW1vzww8/MHv2bMLDw9MUg6enJ2XLlqV8+fK0adOGvn37Ur9+fcqWLUvZsmWpXLky/fr146OPPqJ69eqULVuWfv36AXDgwAFq1Kihlt25c2eaYkkPQUFBNG/eXI2pbNmy+Pr6ZnVYaRYUFESnTp0Mzis11zs6Oppz584ZLAsJCaFv375Uq1aNH374Ib1DzhV27dpFrVq16NixI0+ePEnz/tJ6H3Or27dv06ZNG4PrcvbsWXX948eP6dixI7Vq1WL37t1ZF+hrzpw5kynH+fHHH6ldu7Z6bZo2bZopx03o9XPduHEjlSpVSvJ+iexLklyRKe7fv0/37t2pWrUqo0aNQqs1/qVnbW3NzJkziY6OZsCAAQQFBaUpFhcXF/bs2cPBgwfZuHEjH3zwgbouf/78bNiwAW9vb3777TeDdW3btqVevXppOnZ6c3Z25uDBg9ja2mZ1KOnK2dmZ3bt3G1z/1Fi0aFGiD6kzZ87w999/Ex4ezurVq9MjzFxn3bp1vHr1ihs3bnD48OE07y+t9zG3KlOmDAcOHKBMmTJJrj98+DA3btzg1atXrFu3LpOjS9rTp0/57LPPMuVYQ4YMydIvor/99hsrV640WNa3b1/Kli2bRREJU0mSKzJcUFAQQ4cOxdnZWa0VNcXEiRN5+PAhY8aMITY21uT9TJkyBTc3tzeWc3R0ZO7cuQa1ztkxmbSwsCBPnjxZHUaGyJcvX6q38fb2TjIxqFOnDtWrV8fW1pZhw4alR3i5zsCBA3F0dKRs2bK0bNky3fZryn3M7TQaDXnz5k1yXYsWLXBzc8PR0ZFBgwZlcmSJBQUFMWzYMF69epVpx3Rxccm0YyV09epVJkyYkOS67Pj+L1JmntUBiNzPw8ODR48e8fXXX6PRaEzej4ODA+3atcPLy4uffvrJpESlZMmS1KhRw+jy+fLlo3v37qk+jsh8iqLw/fffs3Tp0iTXOzo6snnz5kyOKmfp2rUrXbt2zeow3npFixZl7969WR0GAPfu3WPEiBHcvXs3q0PJcL/99hvjx48nLCwsq0MR6USSXJGhbty4wS+//AKQ5CPLqKgoNm/ezLlz5wgKCuLKlSsUK1aMwYMH06lTp0TlmzdvjpeXF99//z09evTA0dExVfE0atQo1efQunXrZNdt376dNWvW8PTpU2rUqIGHhwcFCxY0KBMSEsLKlSv5448/0Gq1WFhYMH78eBo0aGDU8S9fvsyyZct4/vw5r1694v3332fy5MlG144dPnyYAwcOEBERwT///IOTkxPt27dn5MiRai31iBEjOHbsmLrNmDFjCAgI4OrVq9y8eZMaNWowadIkSpcurZaJjIxk4cKF/PHHH9jZ2fHff/8RExODu7u7wReD58+fs3TpUi5cuEBcXBz58+dn4sSJVK5c2SDO69evs2jRIi5evEi1atUoUKBAqmqO1q1bx/bt21EUBYhvX+rj44OtrS0dO3bkq6++IjQ0FIBatWqxYcMGABYuXGjQfGHr1q1s27aN48ePExYWRvny5fnqq68oUqQIK1as4PDhw7x69YqKFSvi7u5ucE1Sc74J7dq1i8mTJxssK1euHHv27CEyMpKOHTty//59LCws+Pnnn6levTo6nY6dO3fy+++/ExYWxoULF3jnnXfo2bMnAwcOVL9Qnjt3ji+//JL79+8D0LlzZ+bMmcPSpUvZvHkz77zzDps3b2bIkCH8+++/6vF//vlnateunerzSut91Dtz5gzbt29XX7cWFha0aNGCCRMmYG9vb1A2NDSUlStXcvbsWUJDQ4mJiaFVq1aMGjUKOzs7tdyLFy9YunQp169f5+XLl5ibm9OpUyeGDx+Oufn/Pg7PnTvHypUr8fX1xc7ODmtra/r06cOHH35oUMaY66qP9a+//mL58uXcuXOH6tWrU6JECaKiohKdt6enJwsXLlTXde7cmblz5xIZGcnMmTPZtWuXWvbXX39l0aJFnDx5Eo1GQ9euXZk4caJBc7DY2FjWrVvHrl270Ol0BAcHExAQAMR/6c+fPz8zZsxIsumEn58fc+fONUhw9U/j+vTpo743GnO9UhIXF8fatWvZvn07FhYWlCpVilq1aqW4zf3791myZAm3b98mPDycd999lylTplC8eHEAOnXqxPXr19Xy06dP57///uPixYs8efKEBg0aMGnSJAoUKADA6dOnWbx4sZrgXr9+XT1Xd3d3db96Op2OH374gU2bNvHq1SsaNmyIu7s7Dg4ORp2zyCSKEBno66+/Vtzc3JSqVasmWhcVFaV0795dqVixohIcHKwoiqKsXbtWcXNzU9zc3JQdO3Yk2ubly5fq+o0bN6ZLjJMmTVL32aRJE6PLtm3bVpk6daoya9YsdVmfPn0MygcGBipt27ZV3NzclAsXLig6nU7p27evUr58eeW33357Y2xHjx5VKlasqHz44YdKdHS0cu/ePaV8+fJKy5YtlZCQELVckyZN1BgePXqkLp8zZ47i5uam/P7774qiKMqlS5fUcpMmTTI41qBBg9R1LVu2VAIDAxVFUZQRI0Yobm5uSq1atRRfX1+1vP68f/nlF0VRFOXu3btK06ZNFW9vb7XMw4cPlQYNGigVK1ZUfH19laioKKVly5bKe++9p1y7dk0td/HiRaVKlSqKm5ubsnPnTkVRFOXvv/9WKlasmOLr4XVLly5Vyy9dutRg3b59+9R1ffv2NVhXu3ZtdV3Xrl2Vc+fOKSdPnlTKli2ruLm5KfXr11cGDx6snD9/Xvnzzz/V5Z07dzbYj7HnmxRPT081Bjc3NyUoKEhdFxMTo1SqVEm5dOmSumzkyJGKm5ubcvv2bUVRFOXgwYPJnvumTZsM7vvkyZPVc3Bzc1MuXryoKIqitGrVSl32119/pfq80us+6q/Fpk2bFEVRlKdPnyqVKlVK8t4FBwcr7du3VypWrKg8fPhQefHihXpuI0aMUMs9fvxYadSokVKrVi0lODhYuXjxohqTh4eHWu7IkSNK+fLllerVqysvX75UFEVRBg8erLi5uSmzZ8826br++uuvSvny5RU3Nzfl7NmziqIoyv79+w3ud8LrvX
r16iT/TkNDQw22GTJkiHLhwgVl4sSJSb4v6nQ65dNPP1Vfq1FRUcqzZ8/Ue9SzZ8833gtFUQyO+brUXK/kjB49WnFzc1NatWqlREVFKdHR0WrcSb0vX7x4UalevbpSp04dJSQkRAkMDFRq1Kih1KlTR3ny5IlarmXLluo+evXqpURGRirR0dFK165dFTc3N6Vp06YG76N//fVXsu8RiqIoffv2Vde3b99e+eabb5Tx48eryyZOnGjU+YrMI21yRYY6deoUgPpt+fV1Fy9eJCYmRq1Vq1Chgrr+yJEjibbJmzevWvt48uTJjAjZaA0aNMDDw4Pp06fj7OwMwD///GNQOzNnzhxu375N2bJlqVq1KhqNhsaNGxMXF8fcuXNT3P+rV6+YNGkSMTExdO3aFQsLC0qUKEGpUqW4f/8+mzZtSnH70NBQ1q9fD6C2UU3p+ubPn1/9ecCAAeo56Wtlg4KCWL58uVpm//79AGzZsoWAgABKlizJokWLDGqRpkyZwvPnz/nggw8oUqQIlpaW1K9fn8jISBYsWKCWmzlzJhEREZibm9OuXTsAatSokWntQhO2tXN3d+f999+nQYMG6jb+/v588cUXVKtWjbp166rLr169avBo09jzTUqfPn0MaosSvr6vXbtGs2bN1FrT//77j19//RUw7t5aWVmpP//++++89957/PbbbzRp0oSWLVtSvnx5wPA1kFBm30d9px/9uRUoUEBtv+rj42PQ+XTBggX8999/VK9eHVdXV1xcXNQOQj4+Pmq56dOn8+TJE5o2bYqDgwMVK1ZUX+P6ciEhIUydOpW4uDiqVq2qHrNFixYAbNiwwaBDozHXNTo6mpkzZxIXF0ehQoXUGsq2bdtStWrVJM8/uddqwlppgPnz51O1alX1WoPhqAC//vqr+oSmefPmWFpa8s4771ClShUAzp8/z6NHj5I8ljFSe72Scvz4cbWTY5s2bbC0tMTCwoLPP/88yfKxsbGMHz+e0NBQ2rVrh729Pc7OzlSvXp2AgAC+//57tWzC1/PHH3+MlZUVFhYWdO7cGQBfX188PT1NOvdu3boxefJkZs2apS47ffq0SfsSGUeSXJFhoqOj1Ud5STUrKFeuHIUKFcLMzIyiRYsCGCSIyQ0Xpn8cdOPGjXSOOHUS9rTVJ0k6nU59NBseHs6BAwcA1POD/12Lu3fvpjhM06FDhwgJCUl2+ze9odra2qofqPrkKaXrm7C9tLW1tfpzyZIl1Z+PHj2qNgfQf+CeO3eODh06cPjwYd577z21Tee9e/f4+++/k43/7NmzxMTEcPfuXa5evQpAwYIFDTr6pWaoufSS8FF4wuMnt1x/j4w93+SYm5sbtDP/8ccf1Z+9vb0N1hUpUkTtPGnMvU3I0tKSXr16UbBgQVatWsWyZcuwsLBItnxW3MfGjRsbnFtMTAw6nS7R+UVGRqqP7xMmNAsWLGDo0KEsWrQIiE9m/vjjD4NyZmZmfP/99wwZMoSvv/4aiB8mMDg4ONH+9MkwxDdRSkpy1/Xs2bP4+/sDUKxYsUTbpIW+w2nCZFvfJAcME96EiXPCn1++fGny8dNyvfT27dun/pzwS15y1+avv/5SE/M3vS8a856m/7KYWvr3/4RfPNI68o9If9ImV2SYhH/wSb1hFS5cmBMnThAdHY2vry+TJk3iwYMH6np9MvU6/b707cqyg4RvpvpE5u7du+rP//zzj9q+KywsjCJFigAQGBhIoUKFktznzZs31Z8XLVqk1soGBASo26dEq9Xy888/ExERQWRkJIsWLeLy5cvq+uSu7+sStjELDg4mMDCQvHnzMmbMGCZNmgTE13SOGTOGzp078/XXX2NpaWkQ/6FDh9QvJcHBwWr8ISEhBu39cmLvZf1IH8aeb3I96gE6duzIsmXLePbsGdeuXeP333+ncuXKvHjxgooVK6rl7O3t2bt3L5GRkQQFBTF79mz+++8/dX1K9zbhfoyRFfdx3rx5zJw5E0VR+PHHH/ntt9+IiIhQ1+sT3tu3b6sTy+iTLYgfouuLL75Qf9cn36+Xq169OtWrV1d/T/jF2cbGRv3ZzMxM/fnWrVtJxpzcdb1z5476c2a8vuPi4tSfEyZ2Cd+jEj5tSe79xxhpuV56Ca9Pwn0kJ+HrcePGjWpNdWBgIEWKFDE4dnISfmF9+PDhG8unJKn3fpF9SJIrMkzCjhzJfegGBwfz7bffsmfPHtzd3enSpQv9+/dPcb/6D7i0jNSQkZI6V2dnZ7VJhinat2/PJ598kurt4uLi8PLyYuXKlXTo0IE1a9YYPNY2xuvXWf8B2alTJ8zMzJg5c6Zae7Rr1y7s7Oz46quvDLZ59913kx3vM+GHcm6R0vmmxNLSksGDB/PNN98AsHr1aho0aEDfvn0TlY2KimL58uWsX7+ekSNHMmjQIJo1a/bGY6S2s2ZCmXkfDx48yHfffUeVKlVYs2YNbdu2VWvN9RImFZcuXUKn0yU5BnfCcgk7170u4d9uwmQp4bklN3xhctc1YQ10ZmvWrJl6v168eKEu19cs16pVK8mmZMZKy/XSS8v1qVu3LrNnz071dglfI9n1c0SkD2muIDKMo6OjmuhGR0cnWh8cHEyPHj3Yvn07PXv2THI0haTo95VSjVh2kPBR2sOHD7l3757B+ujo6BS/+Sfc/uTJk4mS54SPJZPz+eefM3/+fFxdXZkxY4ZRtRyvS9jm1MnJCScnJyC+3WT79u3Zt2+fwUgRe/bsSRT/5cuXCQwMNNhveHg4iqIYlEtYy2aKrPzAMvZ83+Sjjz4yaON96tSpRCNxREdHM2jQINasWUODBg1M+gJkrKy4j/Pnz2fKlCloNBoWL15sUCOZUPHixdV7HhQUpLYTf12JEiXUn69fv8758+eTLPfuu++qPyd83Sechvz10TTeJD2vS2rVqFGDUaNGAfFNC0JDQ7l9+zbnz5+nYMGCzJkzJ037T4/rlfCplDHXJ+H1/OuvvxJ9thjzvpgw1oRNSCThzX0kyRUZxtzcXG23lNQQQps2bVITP33CakybJn1tTsLHg3PmzKFatWp07dqVp0+fpirOhDUJ6Vnr4uzsTJ06ddT9TpgwQZ1yNzAwkGnTpqU4HmPz5s3VLwnnzp1jwYIFalLs4+PDqlWrUjz++fPn1TbBefLkQaPRJEpQkpMwGUvYMaVZs2bqB8Fff/3FlStXKFSoEGvXrqVjx47A/x6RlitXTk0ugoOD+fzzz9UmJn5+fsycOZOYmBjKlSuntsXTD5Oml/BLgDEJorFtHF/fV8LfkzvOm8oYe75vYmtrazBpSq9evRKVOXz4MP/88w9g3N+OKbWs+nPM7Pv47Nkz1q5dC8Q/VrawsCAiIiLJ4bby5s1rMDShu7u7el2ioqJYtGgR/v7+VKxY0WCIrM8//1x9TB4SEsI333xDVFQUbdq0UZsUJPxbSdhuNeGXcWOua7169dQmP3fu3DF4jzHmuhjbrCg5o0ePxt3dHT8/P3r27MkXX3xBv3792LNnD66urkbtI7m/q9Rer6S0atVK/Tlh04aENcAJr0G9e
vXUWvOHDx8ybdo0tY32f//9x7x585I8TnLvac2bN1d/Ts8+APoOclWrVmXQoEFGJd8i/UmSKzJUkyZNAHjy5EmiBDLht/YVK1bQvXt3g04KDx48YMOGDQYfJM+fP1ff/PRj3r569Yr169cTHh7OlStXOHTokNHxxcXFGbQJCwwMTLGtb8J2gQlrKxJ29Em4/PPPP1eTvqtXr9K8eXMaN25M06ZNadWqlUEHjde5uroaNN1Ys2YNNWvWpFGjRsycOVPtiBQdHW3wAaN/LJnw0e6ZM2fo1KkTU6ZMMeiksm7duiR7V3t6eqr70Q9K7+zszOjRo9UyiqIwduxY7t27h0ajUb909OzZE4h/fDlp0iT10eCpU6f44IMPaNq0KZ07d6Zv375YWlpiZmbG559/jkajQafT4eXlBcDFixcNOoUY00Em4ZitL168QFEU9TX1/PlzdV3Cn2NjYw0SxGfPngHx9zrhcv0j3sjISIPrrX+9GHu+xujbty+2trYUKVKENm3aJFqf8G9nx44ddOvWjRUrVqjLXr58iaenp/oaSHi+ySXDiqIYlNNfh8y+jwlft/fv3+fDDz/kk08+Mahl27VrF5cuXQLgq6++UjtgBQUF0bt3bxo2bEjNmjXJmzcv+fPnR6PR4OHhobb59PPz48MPP6Rx48Y0aNCA6tWrY2VlRd68eZkxYwYajYYLFy7w6tUrFEVRO6116dLFoEmIMdfV3t5erU0NCgpS/56OHj1q0HQi4ftOUvcBEnco1H9JTrg84XsUxD9x+eGHHzh79iz79u1j165dfPHFFym+97zuvffeU39+8eIF165d49q1a6m+Xklp3769+t6xb98+goKCiIuLY+HChWqZkJAQtcbW3t6esWPHquv27NlDrVq1aNKkCUOHDmX48OFJHuf7778nLCwMRVHUGn9XV1cGDhyolildurTakUz/Ot23b5/6np7U+//r90Rf5ubNm+zfv5+IiAj+/PPPN44yITKGJLkiQ3Xr1g1LS0uio6MTdUAYOHAg1apVw9ramsqVKzNy5EhWrVpF69atsbKyws7OjjJlyhg8Ytd3dHB2dlaHzXFycmLAgAHY2tpSqVKlFCdvSGjBggW0aNGCK1euqMsiIyNp06aNwRuf3oEDB/jzzz/V3zdv3szNmzdZvHixQdKzePFi9Q2ycuXKeHl50bBhQxwcHLC1tcXV1ZVVq1YZ1X5y4sSJTJ8+HTc3NywtLbG3t6dRo0Zs3LgRJycngoKCaNWqlcEb7aeffqo+4u7UqRM2NjaUKFGCzp07s3z5csaMGYO9vT0uLi5otdoka3MaN27Mt99+S58+fTh8+DBNmzZl8+bNFC5c2KDco0ePaN++PV27dmXjxo2MHTuWTz/9VF3ftGlTfvzxR2rWrImtrS22tra8++67rF+/3uCDs2XLlixdupRy5cqxevVqRo8ezalTp2jTpg1FihShRo0a+Pr6GnQ6SUrdunWZMmUK+fPn58CBA3z77bc0aNCAAwcOMH36dLXcgwcPaN26NWFhYXTu3Nng+o0ePZpr167Rrl07g5r2Tz/9lHPnztG+fXuDD7sRI0aoX5SMPd83cXZ2pkePHgwcONCgbbte586dadSoEdbW1pQtW5a+ffvy/fff079/f2xsbNTkzsHBgW3bthnU+p84cYJu3bol2mfXrl3V0VAgfjgw/fBKmXkfy5Qpw7Bhw7Czs6Nw4cK0adOGVatWMXHiRPLmzYujoyPR0dHqF5pixYrh7e1N27ZtcXZ2xsbGhiJFirBkyRIGDBig7rdq1aps2rSJRo0aYW9vj62tLaVLl+ann34yqE3s1KkTa9euxc3Nje7du9O5c2eCg4OZMWMGHh4eajljryvEv9fNmjWLEiVKMGvWLCZMmICfnx/16tWjWLFi1K5dm4sXL/Lo0SM8PT0Nhuo7c+YMvXr1IjIy0mC4KoDZs2fj6+trEMeNGzfU9v8PHz5k6dKlPHz4kCZNmtC6dWvatGlD+/bt6devH/PmzTNIqJMzb948ateujZWVFRMmTOD27dtq235jr1dyzM3NWbduHV27diUuLo6PPvqI6dOn07p1a5ycnChbtiw1atRg+/btaqLbp08fFi5cSOXKlbG2tsbe3p5q1aqxadOmZGunGzRowLRp0+jatSvXrl2jffv2bNq0yaATmoODAytWrKB06dJqZ+hChQphbW2Nl5eXQUe7NWvW4Ovrq47Moefh4UFkZCRubm60a9cOa2tr6tWrR926dd94LUT60yhpfRYixBssWrSIVatWMWLECD777LM07evrr7/Gy8uLr7/+mh49eqRThAJg8uTJ6nBM33zzDV26dMniiIQQafH48WNatGiRYucvV1dXDhw4kCXD9WW0fv36qWMgvz6Dn3g7SE2uyHCjRo2ievXqbN++PcXxO98kMDCQvXv30rp1a0lwhRDiDQoXLszs2bNT7HD66NGjLB9zXIiMIkmuyHAWFhZ8//33FClShG+//dakfcTFxTFjxgxq1Khh8j5EyuShjhC5y6FDh5g+fTqDBg3i/Pnz3Lhxg+vXr6vtpOvUqYNWq03TMGLZmbynCUlyRaZwdnbm559/xtnZmS+//DLJIcWSExISwhdffEH16tVZsWKFQccpkX70HauAVI9QIYTIfu7cuUNMTAwxMTHY2Nig0WjQarVYW1tjY2NDaGgogwcPzpVJrqIo8p4mpE2uyHwhISFYW1unOJVoQuHh4eobs8gYn3zyCSdOnFB/t7CwoHbt2gZTywohcpbY2Fi8vb05cuQIL1++xNHRESsrK8zMzHB0dKRt27ZGdYDNidq3b2/QwdHW1pY2bdqkeWxgkbNIkiuEEEIIIXIdaa4ghBBCCCFyncQDML7FatSoQXR0NPnz58/qUIQQQgghRBL8/f2xtLTk3LlzKZaTJDeBqKgok6a/FEIIIYQQmSM2Ntao0TMkyU3gnXfeAeDYsWNZHIkQQgghhEiKsR0mpU2uEEIIIYTIdSTJFUIIIYQQuY4kuUIIIYQQIteRJFcIIYQQQuQ6kuQKIYQQQohcR5JcIYQQQgiR60iSK4QQQgghch1JcoUQQgghRK4jSa4QQgghhMh1suWMZydPnmTVqlVotVqio6MpU6YM48ePx8XFRS3z7Nkz3N3defHiBXFxcfTp04eOHTtmSbwxMTEyHbAQGcTMzAwLC4usDkMIIUQOk+2S3L/++ovx48ezYcMGypUrh06nY9q0aQwbNoxt27ZhZmZGQEAAffr04aOPPmL48OG8fPmSTp06ERsbS9euXTMt1uDgYF68eEFUVFSmHVOIt5GVlRX58uXD0dExq0MRQgiRQ2S7JHfbtm3UrVuXcuXKAaDVaunXrx+dOnXi1q1blCtXjiVLlhAWFsbgwYMBcHFxoVevXnh4eNCkSRPy5s2b4XEGBwfj5+eHvb09+fLlw8LCAo1Gk+HHFeJtoigKMTExvHr1Cj8/PwBJdIUQuYai6NBoUtdy1JRt3lbZLsmNiYnh1q1bxMbGYm5uri6zsrLinXfeISIigl27dtG4cWN1PUDNmjVZsmQJ
e/bsYdCgQRke54sXL7C3t6do0aKS3AqRgWxsbHBwcMDX15cXL15IkiuEyDU0Gi1hFw+jCws0qrzWLg92VVplcFS5R7ZLcrt27crw4cOZM2cOX331FRqNhs2bNzNlyhTy5s3L77//TlRUFCVLljTYrlSpUgD4+PikKclVFIXw8PAUy8TGxhIREUHevHnR6XQmH0sIYTwHBwf8/PwIDg42+IIrhBA5kUajwcbGBl1YIHHB/qnaNiIiAkVRMiiy7M/Yc892nxSNGjViypQpzJ07l4CAAOrWrUuHDh2oW7cugPrIMl++fAbbOTg4GKw3VXR0NNevX39jOXNzc3Q6HZGRkWk6nhDCODqdTn3SI4QQOZ2NjQ0VKlQwadt79+4RERGRzhHlHNHR0QAGT/2Tku2SXICBAwfy7Nkz/Pz8mD17Np9//rma5L569QoAa2trg23MzMwA0twJzNLSkvLly6dYJioqisePH2NlZZUoDiFExrGwsKB48eJYWVlldShCCJEmaWnqWLJkybe6JtfS0hLgjU/1smWS6+7uTtu2balevTqLFy/mm2++4enTp0yePFn9cHs9mdX/7uTklKZjazQabG1tUyyj1WrRarWYmZmpybUQImOZmZmh1WqxsbGRL5dCiLeajY1NVoeQpYz9gpDtklwvLy8uXrzItGnTABg7diyxsbGsWbOGxo0bU6xYMQCCgoIMttP/XqhQocwMVwghhBBCZEPZbgyKHTt24OrqarBs3LhxvPPOOxw/fpwaNWpgbm7OvXv3DMo8ePAAgPr162darMnJzo8QsnNsQgghhBDpJdvV5Nra2nL//n2DZWZmZri4uGBjY4OzszNt2rTh9OnTKIqiVln7+Pjg5OREq1ZZP7SGRqPh7J1gQiKy1yxoDjZm1C4twy8JIYQQIvfLdknukCFD+OSTT9ixY4c6e9nRo0d5+vQpPXr0AGDixIl07NiR7du30717d3x9ffH29mbq1KlpbpObXkIi4ggKj83qMNLdvHnz+OWXX3jx4oW6zMLCAmdnZypVqsSAAQOoW7cukydPZt++fZQtWxZbW1uCg4O5ceMGhQsXpmjRosTGxnL37l2CgoL4+++/kx37dMKECRw/flwd1q1AgQL07duXwMBAtm/fTnBwMAB58uThww8/ZODAgTRr1oxChQpRpEgRtFot169fJyQkhJo1a6LRaAgKCuLWrVv069ePL7/8MuMvmhBCCCEyXbZLcps0acKKFStYuXIl69evx8XFBWdnZ7Zt20bhwoUBeOedd9i4cSOzZ89m9+7d6HQ6Zs2aRfPmzbM4+txv0qRJjBw5knr16mFjY8OKFSuwtLTEx8eHFStW8NtvvzFnzhzMzc3ZsmULlSpVAuDs2bP079+fDh06MG7cOAAiIyMZPnx4isdbsGAB9+7do3Xr1uTPn5+jR4+qvSo//fRTGjZsSGxsLAcPHiRPnjz4+vrSqlUrFi1apHYK7NevHz4+Pnh6eqo9Mf/44w9OnTqVUZdJCCGEEFks2yW5AM2bN39jwlq6dGk8PT0zJyBhwN7eXp0Io0aNGgC89957vPPOO3zxxRfMnTuXmTNnqglucqytrRk0aBBabcpNw4sXLw5A0aJF1QQX4qd3dXZ2JjY2ljx58gDxTVsGDhz4xlEvGjZsqA5HJ4QQQojcJ1smuSL702q1iWZ7038xefXqFbVq1TJqP02aNDHqWJD0kCH64dz0ChUqZPQIG+3btzeqnBBCCCFynmw3uoLIuXx9fYH/1bAKIYQQQmQVSXKFyRIOR/b48WODsY3fNAuJEEIIIURGkkxEmCw0NJQvvvgCf39/AgMDKV68OBMnTlTb6QohhBBCZBVJcoXJHBwcmD9/flaHIYQQQgiRiDRXEFlq9+7dVKhQweDf8uXLkyz7plEYhBBCCCH0pCZXZKmmTZuye/dug2UuLi5JlrWzs8uEiIQQQgiRG0iSK0yi0+mIizN+2mL9cGMJO6tB/EgMSc12FhMTw8mTJ6lXrx7W1tYAuLm5Jbnf1/eZ3LFfH/JMCCGEELmXJLkZxMEm5ckIskJ6xRQSEkJAQABxcXEEBASQN2/eN27z8OFDAB48eGDUMU6fPs2nn35Kv379GD16NLa2tuq0zgnjePnyJQBBQUFJDlsWExPD48eP1RjKlClj1PGFEEIIkbNJkpsBFEWhdunEtZPZgaIoSU6qYKx58+axZ88eoqKiAGjbti1NmzZlzpw5SZYPCQlh6NChXL16FYBDhw7Rvn17vvjiCxo2bJjscapVq0bNmjXZsWMHERERrFmzBldX10RxREdHA9CyZUs6dOigDmMGsH79ejw9PdUkt1evXtStW5elS5eafP5CCCGEyBk0ypue9b5FmjVrBsCxY8dSLBcZGcm9e/coWbKk+ihdCJGx5O9OCJEbhfy5hbhgf6PKmjnmx6FezwyOKPszNl+T7upCCCGEECLXkSRXCCGEEELkOpLkCiGEEEKIXEeSXCGEEEIIketIkiuEEEIIIXIdSXKFEEIIIUSuI0muEEIIIYTIdSTJFUIIIYQQuY4kuUIIIYQQIteRJFcIIYQQQuQ6kuQKIYQQQohcR5JcIYQQQgiR60iSmwEURZfVISQrO8cmhBBCCJFezLM6gNxIo9ESdvEwurDArA7FgNYuD3ZVWmV1GFnmr7/+wtPTkzx58vDNN9+kWLZr1664urqyePFiAHx9fdm2bRs7duxgy5YtFC1aNBMiFkIIIYSpJMnNILqwQOKC/bM6jHS3c+dOpkyZgpubG05OTmg0Gnx8fLC3t6dChQooisLjx4/x8/Nj7ty5HD9+nCNHjuDk5ES1atXU/QQHB3PlyhVKlSrFnj17kj3epEmT2Lt3L3FxcQCULl2avXv3smLFCn744QdiYmIAKFq0KDNmzKBhw4YALF++nHXr1hEWFoazszPr16/n5MmTnDhxgs6dO7/xPJ2cnLC3t1d/v3v3Lj4+Pvj7m35P//77bzZu3Ii/vz9mZmaEhoZSvHhxBg8ezHvvvZfq/Z0+fZq5c+dy8+ZNdVnp0qUZPXo0bdq0SXa7p0+fsmDBAh4/fkxUVBS3bt0iMjISgN9//52CBQum6j6vWLGCgwcPcvToUSIjI7GwsODXX3+lUKFCycbg5+dHy5YtiY2NJW/evHTq1IlJkyal+hoIIYQQyZEkV6Ta1KlTGTBggPp72bJlcXNzY8OGDeqyBQsW4ODgwLx58zhy5AilS5dm9erVBvu5ceMG7u7uKR5r3rx51KxZky+//JJy5cqpCfGYMWOoV68effv2RVEUdu/ejYODg7rdqFGjqF69OrNmzWLnzp3Y2dlRsmRJ1q5da9Q5rlu3zuD3hg0b8s8//3D+/Hmjtk9Ip9Ph4eHBkSNHWLRoETVq1FDXHTp0iIEDBzJ48GBGjRqVqv3Wr1+fvXv38tlnn3Ho0CHGjRvHJ598kuI2/v7+dO3alY8//pj58+cDEBoayoIFC9i0aZNBWWPvs/7/S5cu0b17d2JiYli3bh1ffvllsnH89NNPxMbGYmFhwcGDB3F2dk7VuQshhBBvIm1yRar
Y29vTu3fvN5YbOnQoFhYW2NraJlumXLlyRu2rU6dO5M+fn3v37hEaGqour1GjBh988AEA9+7dS7TdvXv36Nu3L3Z2dgBYWVm98VgpMTc37TvhvHnz2LRpE8uXLzdIcAFat27N119/zbJly/D09DRp/yVLlgSgYsWKbyy7ZcsWoqOj6d+/v7rM3t6eGTNmULduXYNlqbnPAJUqVcLR0RErKyu2b99OYGDSzXUCAwPZt28fjo6OODs7S4IrhBAiQ0iSK1KlZcuWalKTEicnJxo1avTGcm3btn1jGXNzc9q3b09UVBS//vqrwbqOHTsCcODAgUTbHT58mHbt2r1x/xnp6tWrrF+/nvr161OlSpUky7Rr145ixYqxcOFCnj17lupjmJmZGfyfkpcvXxIaGsqlS5cSrdNfSzDtPmu1WhwcHOjSpQvh4eEGNb4JeXl50aZNGxwcHNBq5S1ICCFExpBPGJElrly5wrlz54wur29Hu3v3boPl+ra6e/fuJTY2Vl3+6NEjbG1tyZs3b5L7O3v2LNOnT6du3boMGTJErSHW6XT8/vvvjBo1ikGDBhkV2y+//MKQIUPo0qULTZs2Zdu2beq6HTt2oCgKDRo0SHZ7jUZDvXr1iIqKYu/evfj7++Pl5UWnTp2YOnUqx48fZ9y4cdSrV4+hQ4fy+PFjo+JKSv369dHpdAwcOBBvb28URVHXde7cmYIFC5q8b70hQ4ZgZmaGl5cX4eHhBusiIyPZsmULgwcPTvNxhBBCiJRIkiuyxOvJ6pu4ublRsWJFfHx8ePr0qbp879691K9fnxcvXnDy5El1+Z49e+jQoUOS+7py5QpWVlZ8/fXXbNy4kdOnT6tNBYKDg1EUhRMnThgkzcn5+eefOXToEN9//z07d+6kS5cuTJs2jRMnTgBw+fJlAIoVK5bifvRNDi5fvkz+/Pnp0KED169f5/Lly+TLl49FixaxefNm/v33X/r3709UVNQbY0tKixYt6N+/P2FhYXz11Vd06dKF33//3aR9JcfV1ZXWrVsTFBSEt7e3wbodO3ZQu3ZtXF1d0/WYQgghxOskyRWZ4ubNm/Tr149+/frRvn17Nm7cmOp9dOnSBZ1Oxy+//ALEd6KKiopSO2zt2rVLLXvixAmaNWuW5H4qVapE1apVgfjRCPLkycOVK1cAcHZ2pnHjxsnWACcUFhbG4sWLmTBhApaWlgBq7a++k11wcDAANjY2Ke5L3244JCQEQO1EV6FCBXXkheLFi9O7d28ePXrEoUOH3hhfcr788ktWrlxJ0aJFuXbtGsOHD2fAgAHcvXvX5H2+bujQoUB8B7Po6Gggvtb9p59+UtcJIYQQGcnk0RXu3LnDyZMnuXHjBgEBAWg0GvLmzUvFihVp1KiR1NQIA6/3yj927Fiq9/Hhhx8yd+5c9uzZw/Dhw9m3bx8dOnSgevXqlChRghMnTvDq1Svu3btH+fLlje5oZm1tTUREhMEyY9qj/vvvv4SFhTFjxgw0Go26vEiRIupjekdHRwCDDnNJ0Q/hlSdPHoPlCfcL8Z3tVq9ezeXLlw3a0KZWs2bN+OCDD9iwYQOrVq3ir7/+olu3bnh6epo0nNnrKlSoQIMGDTh16hS//PIL3bp149ChQxQvXpzy5cunef9CCCHEm6Q6yb106RLffvst//zzT5Lrd+/ezZw5c2jQoAGTJk2idOnSaQ5S5D6v17JOnTo1URMGT09PatWqpf7u7OxM06ZNOXz4MJcvX+bw4cPqkGBdunRh4cKF7Nu3j9u3b6cqAdRoNOh0qZ8J7uXLl0D88FkFChRIskylSpW4dOkSd+7coUWLFsnuS9/ONrnOaXr6NrP62tG0sLS0ZMiQIXTu3JlJkybxxx9/4OHhwdatW9O8b4Bhw4Zx6tQp1q5dS5cuXfjxxx9lLFwhhBCZJlXNFZYtW0bPnj05d+4ciqIk+0+n0/HHH3/QqVMnNm/enFGxi1zks88+Y/fu3Qb/KlWqlKicvgPawoULKVKkiDphQ6dOndBqtezYsYOLFy8mGqorI+ibFBw+fDjRuv/++0+NC+C3335LcV9nzpzB1taWDz/8MMVy+uYPqZ1xTT/E2o8//pgoQc6bNy/Lli3DxcWFa9eupWq/KalTpw6VK1fm3r17zJo1CzMzM2rXrp1u+xdCCCFSYnRN7vTp0/H29ubdd9+lWbNmlCtXjkKFCmFvb4+lpSWKohAdHU1wcDBPnjzhxo0bnDhxgq+//pqQkBCGDx+ekechsoi+BjRhL/2E9KMfJLder0CBAsnWhib0wQcfkD9/fv7880/WrFljsH29evU4deoUn3zySaLH/BmhWrVqWFpasnDhQhwdHenQoQNarZZr167h5eWFh4cHVapUoVu3bmzfvp0//vhDnZEtoWPHjnH9+nVmzpyZqC3w66MTXLlyBY1GQ9OmTY2O89GjR/z777+ULFmS8PBwDh8+TPv27Q3KWFtbky9fPlxcXJLcx5vus77M6zXiw4YNY8yYMWzZsoVly5a9sbwQQgiRXoxKcjdv3sylS5fYtGkT1atXN2rH7dq1Y8KECZw5c4ZZs2ZRqVIl6tWrl6ZgcxKtXZ43F8pkGRHTgwcPgPhpWmNjYxNNmKCvQXz+/DnR0dFqBy1T6cfM1Y+qkFDnzp05depUsk0V9KMy6JsZAMTExBAWFoa/vz+KoqDRaIiOjiYoKAhra2t1GaCOYfvs2TOKFi2Ks7Mzn3zyCUuXLmXSpEnMnDkTJycnAgICDIYRmzFjBpGRkXzxxRcsWrRI/TtQFIUDBw4wffp0JkyYQK9evRLFfPr0ae7du0fJkiV5/vw569evp0+fPpQpU0Yto0+Ek5p84dmzZ4wbN44ffvhBXTZr1iwsLS1p0aKFOk7tjh07uHPnDqtWrUry2r3pPkN8Mh0QEEBYWJjaka5FixaUKFECjUZD8+bN1bKhoaEEBASg0+kIDQ01mEJZCCGESA9vTHLDwsL47bff2Lp1q0kzRtWtW5dNmzYxderUtybJVRQddlVaZXUYSVIUHRpN+gyqMWnSJI4fPw7EJ7HNmzend+/eaq39hAkT1OGp/Pz8aNy4MT169OCzzz5L03E7d+6MRqNJNPlBixYtqF+/PqVKlUq0zdmzZxk/fjyA2pTmm2++YezYsQQFBREUFESHDh1YtWoVgwcPJiwsjDt37tChQwd27NjBzJkz1dEbRowYwdixY+nVqxcjR47E2dkZT09Pnjx5Qp48eZg7dy7lypVTj21pacmCBQs4fvw469evZ9GiRVhbWxMWFkbp0qXZuHFjsp2xqlevzrfffktQUBCvXr2ia9eujBgxQr3mZ86cYd++fUD805YNGzZgYWFBXFwcISEh3Lt3j+rVqxvUEIeEhDBmzBhcXFxwdXUlPDycfPnysX79+iSbebzpPuvLnDhxgqioKFq1akWnTp34/PPP0Wq16mgK+oT6u+++Y9euXeowaK1bt6Z9+/bSXlcIIUS60ihveI786NEjzM3NKVSoUJoOdOvWLd555x2cnJzStJ+MpO8M9aae/5GRkWrtmrW1dWaEJt5CZc
uWpXPnzsydOzerQ8kW5O9OCJEbhfy5hbhgf6PKmjnmx6FezwyOKPszNl97Y02usUOBPX36lCNHjmBjY0PLli0TJbPvvvuuUfsRQry5DbMQQgghUmbSOLmNGzdW/585cybnzp1j2LBh6lifixcvxtPTUxJbIUz0/PlzIH7CCyGEEEKknkmNM58+fUq3bt2YOXMmgYGBjB07loiICBRFwcLCAo1GI49YhTCRt7e3OvrBqVOn6NKlC2FhYVkclRBCCJGzmFSTW6BAAUaOHAmAu7s7L168QKPRYGdnx9atW3F1daVLly7pGqgQb4uPPvqIjz76KKvDEEIIIXI0k5JcFxcXNBoN+/btY//+/eoQS5MnT1ZnOHt9fE8hhBBCCCEyi0lJrrm5OW3atOHRo0dqgvvBBx/QrVs3AJYuXcqTJ0/SL0ohhBBCCCFSwaQ2uRMnTuTp06fExsaiKAqlS5dm3rx5/PPPP3z66aesXLkyveMUQgghhBDCaCbV5NaoUYP9+/dz/Phx7O3tadu2LZaWlsTFxTFo0CAGDRqU3nEKIYQQQghhNJOSXIDChQvTt29fg2W1atVSfw4JCTE9KiGEEEKkmamzXKbn7JhCZBWTk9yUBAQE8NFHH3H06NGM2L0QQgghjKDRaAm7eBhdWKDR22jt8mTbqemFSA2TktwXL17w9ddfc+7cOYKDg4mLi0vvuIQQQgiRDnRhgUZPGytEbmJSkjt16lROnjwJJD/9qH7UBSGEEEIIITKbSQ1u/v77bxRFSTbBfdvpsvF1yc6x5TTh4eFs3LiRli1bcvbs2awOJ1lBQUH88MMPNGzYEF9f36wORwghhMgUJtXkWltbM3PmTNq0aYOlpWWi9X5+fvTq1SvNwUF8TfGxY8c4cOAABQoUwM3Njc6dOwMQGhqKh4cHd+/eJS4ujrZt2zJ48OB0OW5aaDUajj09Q1B0cFaHYsDZ0pFmBeumaR87d+5kypQpuLm54eTkhEajwcfHB3t7eypUqICiKDx+/Bg/Pz/mzp3L8ePHOXLkCE5OTlSrVk3dT3BwMFeuXKFUqVLs2bPHqGPrdDr27dvHrl27CAkJwcbGBoCiRYvSunVrTp48ybRp09J0fqnx22+/cfDgQR48eJBiub59+3LhwgViY2NxdnbmxIkT2NraJlv+n3/+oXfv3gAULFiQgQMHpmnEkqNHj/LLL7/w7Nkzk/chhBBC5DQmJbmNGzemfPnySSa4AEWKFGHx4sVpiQuAwMBAJk6ciE6nY+7cueTPn19dFx0dzcCBA6lYsSJbt24lIiKCjz76iLCwMEaPHp3mY6dVUHQwL6KMb+ifk0ydOpUBAwaov5ctWxY3Nzc2bNigLluwYAEODg7MmzePI0eOULp0aVavXm2wnxs3buDu7m7UMQMCAhg/fjx+fn64u7tTu3Ztdd2TJ09YsGAB+/fvZ8qUKZiZmaXxDI3Ttm1bXr16xblz51Ist3HjRg4ePMjYsWMJCgrC29ubgQMHJlt+zZo1ALzzzjv8+uuvyf6dGatbt27cu3ePW7dupWk/QgghRE5iUnOFMWPGsGjRomSn7n3+/Dljx45NS1y8evWK/v37o9PpWL16tUGCC+Dp6cm1a9cYN24cADY2NgwdOpTvv/9ePswzkL29vVrLmJKhQ4diYWGRYo1luXLljNpXbGwsH3/8MdevX2f9+vUGCS5AoUKFmD9/Pk2bNiUgIODNJ5GOrKysjCpXqVIl9TXs6elJTExMkuXu3LnDtWvXgPgvi2lNcFMbpxBCCJFbmJTkxsbGcvfuXd5//33Kly+f6F+jRo3w909bT84JEybw/Plz5s+fj7l54grnzZs3U758eZydndVlNWvWJC4ujq1bt6bp2CJ5LVu2xMLC4o3lnJycaNSo0RvLtW3b9o1lVq1axaVLlxg+fDiFCxdOsoxGo+HLL79M8rWSHWg0GkqWLEnDhg158uQJv/zyS5Ll1q5dS//+/dVt0vP4QgghxNvEpIxg5syZPHz4MMWOZ2n5UD127BgnT55k9OjR5M2bN9H6O3fu8PjxY6pXr26wvHDhwlhbW+Pj42PysRVFSbaGWi8qKgqdTkdcXFySw6dl1uNyU2XUkG9v2m/C9VevXiUyMpL3338/xW2io6PZuHEjEJ8Qp3SMAgUKqMe5ffs2+/btY//+/cyaNYt9+/Zx5MgRvvzySzp37szff//N8uXL0Wq13L9/nypVqjB9+nTy5s3LvXv32LlzJ4cOHeKTTz4hNDSU06dPc/36dZo3b86kSZOwtrYG4tsJ6+P09PTk5MmTXLp0iU8++cSgSYJOp0NRFIYMGcIff/zB2rVr6dixo8HfybNnz/j999+ZPHky8+fPT/Ka/vvvv/zwww9ERETw8OFDqlevzvjx4ylUqJBBuf379/Pzzz9jYWGBhYUFLi4uahz6fYaGhrJkyRLu37/PnTt3KF68ONOmTaN06dIp3pOsEhcXh06nIyIiQr3uQojkaTQate+CKSIiIqSDeQZKy/2JjIw06d7klvtp7HmYlOSeP38+Qy+Ut7c3EJ+0zJgxg+vXr2Nra0ufPn1o0aKF2kP89SYMAA4ODvj5+Zl87OjoaK5fv/7Gcubm5kRFRSVartVq0/Smkhmio6PTPUnQ6XRERkYavX7nzp00b948xW0ALl68SGBgIE5OTjg6OiYqf/HiRTw8PAgLCwPi3zSaNWtGhw4dePLkCX5+fnh7e9OhQwdevXqFmZkZd+/e5eOPP2bSpEl07NiRa9eu0bdvX5ydnZk8eTKFChWiYsWK/Pjjjxw+fJiZM2fSo0cPfv31VyZNmkRwcDAeHh4AarODvXv3Mn78eLp3787SpUv57rvv+OCDDyhSpAjwvy9GlStXpnLlyly+fJmDBw/StGlT9VzWrVtHhw4d1Nro16+Zj48PU6ZMwdPTE1dXV168eMHHH39M79692bBhA/ny5QNg+/btLF++nJ9++omSJUty7949tXY4KiqKyMhIYmJiGDJkCH369GHChAmEhYUxaNAghg0bxo4dO7LlazgqKkp9iiSEeDMbGxsqVKhg8vb37t0jIiIiHSMSCZlyfzSWtugURa1oSY24uDiuXr2abHO5nCQ6OhqIb1mQ0hNck5JcCwsLpk+fTrt27ZJsM/j06VO6dOliyq5RFIUzZ86QN29eihQpQvfu3YmIiGDSpEmMGjWKOXPmqI/Lk7rJZmZmSSafxrK0tKR8+fIplomKiuLx48dYWVmZ9ELLaunVzjMhrVab4rW4ffs2H3/8MRDf3vrWrVu0bdv2jddP38bWwsIiybK1a9dm27ZtfPTRR9y6dYsJEyYwZMgQIH6a6b1799K8eXOaNm2qJpS///47kZGRvPfee1hbW1O9enUcHR3x8/NTj6F/gtC6dWu1iUT79u3ZtGkThw4dYsKECRQuXFh9LXbu3Fn90lW7dm08PT25e/euWitqZWWlXqNhw4YxZswY1q9frzbXCAkJYf/+/ezevVuNIeE1VRSFOXPm0K5dO959910gflSJCRMmMGrUKFavXs3s2bN58eIFCxcu5OOPP1Zfx
+XLl6dp06bs3btXfc0eOnQIjUZDx44dgfi/pR49euDh4cGvv/7KRx99lOJ9ySrm5uYUK1ZM2hgLYYS0NlMqWbJkrqn5y45MuT8aCyuTRnDSj6707rvv5op7qs9j3tRE0aQkt379+lSoUCHZZKlgwYKsW7fOlF0TGBhIVFQUtWvXpl69ekD8t50ZM2Zw/PhxFixYwIwZMwCSTGajoqJwcnIy6dgQ/6JLqbMUxCcfWq0WMzOzbN80ISkZFXNK+3Vzc1ObHUB8kxT9NUyJvkYxMDAw2WPY2tpSvnx5bt26ReXKldUy+hd/wYIFDbb74IMP8PLyolKlSkRERHD06FF0Oh0xMTFqOa1Wqx4v4ba1atXi4sWL3LhxA1dXV7VcwnPRv34iIyMN9qfRaDAzM6Nly5aUKlWKy5cv4+PjQ926dfH29qZFixZqk4vXr+mlS5d4+PAhpUqVMoinRYsW2Nracvz4cebMmcOhQ4eIjIykRo0aBuWKFy9uEOeff/7J48ePDZpUhIWFUaRIEQICArLl69rMzEx9UpITv1wKkdNkxyc6Ip6pIzjllntq7BcEk5LcsWPH4u7uzpIlS5JMCJ89e8aIESM4fvx4qvet/3C1t7c3WO7i4kK1atXw8fFR2x8GBQUZlNHpdISEhLyxJlZkvWbNmhn8PnXqVHbv3m2wzNPTk8qVK6PRaIiLi+PChQvUqFEjyf29npymxMLCghIlSuDu7k5QUBDdu3c3+otRwYIFgf89KkmK/o8vuSYhGo2GoUOHMnXqVNasWcP777+Pl5eXwRBsr9M30UmqvXjRokXVR/h37twBMOiQmZSAgADKli3LTz/9lGI5IYQQIqcyKclt3bo1wBs7DZnCycmJAgUK8OTJk0Tr8uXLh7m5OeXKlSNPnjzcu3fPYL2fnx+xsbHUr18/3eMSGeuzzz5LNHZs0aJFsbW1pUmTJhw/fpxt27Ylm+Smxo0bNxgwYACff/453bt3T9W2r169AlDb2pqqffv2LFmyhNOnT+Ph4UG1atUoVqxYsuX1yXVSE0/Y2dmp2+qfrjx9+hQ3N7dk9+fg4MCpU6d49uxZotrj//77j7Jly6b6nIQQQojsxKQhxPTtOfRT+yb1Ly06d+7M5cuXefr0qcHyR48e8cEHH2BpaUmPHj24dOkSISEh6vqzZ89iYWGhzogmMp6+tjK5e67vyf+m14R+NruE//RPCb766ivy5MnDnj17OHXqVKpjfP3YK1asQKPRGJXgvl5zevXqVQoWLEilSpVSHUPCml1LS0s1qd+yZQvDhg1T1yV1TStVqkThwoU5cuRIomY6vr6+fPjhhwDqiCPHjh1LNg6IbzccFhbGiBEj1NrfuLg4PD09uX37dqrOTQghhMiOTEpyIWOHoRg+fDilS5dm1qxZai/Affv28eDBAyZOnAjAxx9/jKurKz/++CMQ33Rh7dq1jBw5khIlSmRYbMKQvmZRX4v+On1t+/Pnz1N8xJ+SwoUL4+XlhaurK59++ikbN240SPSePHnC/fv3MTMzM2g+8+LFC4BEvfFtbW0JDg5WYzt37hxBQUFEREQQERFhMP3trl271JEbzp07x2+//caUKVPU9r76L2IvX75Ut9HX9iYcK/rRo0c8efLEYEiwHj164OTkRP369alYsaJBWYDHjx+r5S0tLZk8eTLBwcHMmzdP/fvz9vbG0dFRnc66ZcuWVK5cmR07dnD06FEgvmnFlStX1H1HR0fTpUsX3NzcuHLlCm3btqVBgwbUrl2bQ4cOqU9qhBBCiJzMpOYK3bp148svv0y2AbO/vz89evQwOSg7Ozs8PT2ZP38+3bt3x9raGmdnZzZv3kypUqWA+ERlw4YNzJo1i549e6LT6RgyZEiqHz9nFGdLx6wOIZH0jmnSpElqu+vnz5/TvHlzevfuzfDhw4H4CT1+//13ID4Jbty4MT169OCzzz5L9bFKly7N/v372blzJ7/++isbN27E2dmZqKgoFEWhdu3azJkzR319zJw5kx07dgAwZ84c7ty5w1dffQXAyJEjuXXrFv3796dRo0Y0aNCABg0acP78efbs2UOXLl24f/8+EF+D+sknnxAVFUV0dDSLFi2iVatWAMydO1ftTPfll19y9+5drKys+OGHH4D4GuPw8HAuXrzIxYsXCQ8Pp0WLFgwYMIABAwZgZ2dHr169qFOnjsE1PXHiBBDftr1Fixb069ePQYMG0apVK1asWMGKFSto06YNhQoVokSJEmzevFn9WzQ3N+fHH39kzpw5TJo0iQoVKlC2bFny5ctHqVKlOH36NPnz5+fdd99l/fr1fPvttxw7doywsDCaNGnCtGnTsmWnMyGEECK1NIoJVbJRUVFvHMInODgYR8fsl+ilRN8ZKrlHvXqRkZHcu3ePkiVLJtnLW6coaLPpDFPZObbs5OzZs/Tv359vvvnG5OHwRPp609+dECJpIX9uIS7Y+FlIzRzz41CvZwZGJBJKzf2xKOSGXZVW7Hh4OFWjK+SzykPXYq1MDTHbMTZfe2NzhevXr6uPPfWMGaPy9QTX09NTHQYqt8vOSWR2ji07yQ3jCAohhBBvszcmueXLl8fLy4tr166ZfJCDBw9y7do18uTJY/I+hMhMz58/B/7XrlcIIYQQOYtRbXK/+OIL+vfvT/fu3enZs6c6sPybnD17lp9//pmLFy+yZ8+eNAUqRGaZNm2a+npdtmwZd+7cYd68eVkclRBCCCFSw6gkt0KFCsybN49x48bh6elJ0aJFKVOmDAUKFMDR0RFLS0sURSE6OppXr17h5+fH9evXCQwMJE+ePKxbtw4XF5eMPhch0oW7uzvu7u5ZHYYQQggh0sDo0RWaNWvGxo0b+fzzz3n48KE6A1NS9O0Z33//febPn0/hwoXTHqkQQgghhBBGStUQYu+99x779u1jy5YtbNu2jVu3biUqo9VqqVGjBr1796ZNmzbpFqgQQgghhBDGSvU4uZaWlvTv35/+/fvj7+/PzZs3CQoKwszMDBcXF8qXL4+9vX1GxCqEEEKIbExRFDQmjOJj6nZCpMSkySD08ufPT968eQkODlZHTkhq1ishhBBC5H4ajYbIs3+hCwk2ehutgyPWteu8uaAQqWRyknvjxg0WLlzI2bNnyZ8/P0ePHiUiIoKZM2fSr18/KlWqlJ5xCiGEECIH0IUEowt6O8bFF9nbG8fJTcqVK1fo1asXf/zxhzqtKoCNjQ1jxoxhyJAh/Pfff+kaqBBCCCGEEMYyKcmdN28e77zzDn369OHTTz81mN2sSJEi2NnZsXjx4vSKUQghhBBCiFQxqbnCixcv2LdvHxYWFgCcO3dOXRcYGMjz588JCwtLnwiFEEIIIYRIJZNqcs3MzHj58qX6u75H5KtXr/jiiy+IjY0lJiYmfSIUQgghhBAilUyqya1atSotW7akYcOGFClShAcPHjBkyBCuXLlCcHAwGo2GihUrpnesOUZ2HgolO8cm0k90dDQHDhzA09OT/v3706VLl6wOSQghhMhUJiW5n332Gb///jvHjh1Tlz1//hyIT6LMzMwYMWJE
+kSYA5kyhEpmSI9hWnbu3MmUKVNwc3PDyckJjUaDj48P9vb2VKhQAUVRePz4MX5+fsydO5fjx49z5MgRnJycqFatmrqf4OBgrly5QqlSpdizZ0+q49i9ezcRERH06tUrTeeTno4fP862bdt49uwZtra2ALzzzju0bNmSCxcuMH78eKysrDIlljNnznD06FGuX7+eKccTQgiRfdmYWb+VYxiblOTmz5+fTZs2MWXKFIP2uADFixdn8uTJ1K1bN10CzKly8xAqU6dOZcCAAervZcuWxc3NjQ0bNqjLFixYgIODA/PmzePIkSOULl2a1atXG+znxo0buLu7mxTDTz/9RExMDD179kyXP77t27dTp04dihYtmuptw8PD+eqrrzhz5gyzZs2iWbNmaLXxLYECAwNZtWoVnp6e9OvXz6T9m6JRo0ZoNBp+/fXXTDmeEEKI7MtKa/FWjmFs8ji5rq6ubNy4kfv373Pz5k0AihYtSrly5dQPeJH72Nvb0759+zeWGzp0KP/++69ao5mUcuXK0bt371THcObMGW7cuAHAyZMnadiwYar3kVB4eDhr1qyhTh3T/pAnTpzIb7/9hre3NxUqVDBYlydPHqZMmYJOp+PFixeZluQCmVZrLIQQImfIzRVwSUlzNlqiRAlatmxJy5YtqVChgprg3rt3L83BieynZcuW6qgaKXFycqJRo0ZvLNe2bdtUx/DTTz/xySefALB+/fpUb59QTEwMkyZN4v79+yZtv2vXLn799Vc++uijRAluQqNHj8bFxcXEKIUQQgiRWmlKciMiInj27BmPHz82+Hfp0iX69++fXjGKXOjKlSuJmroY486dOzx79oyxY8dSqlQpTp8+zZ07dxKVGzduHJUqVaJs2bKcPXsWgNWrV1OtWjXKli3Lzp07AVi+fDlXr14FYPz48fTr14/Hjx8DEBISwuzZsxk4cCCtWrWie/fuBu3QIT7hBvjwww9TjNvR0RFXV1cA/Pz8WLVqFe3atWPnzp0sXLiQatWqsWLFCiC+GceQIUMYOHAgTZo0YciQITx8+BAAX19fVq9eTZs2bVi+fDk7d+5k5MiR1K1bl3HjxhEUFJTk8Xfu3Mno0aOpUaMG8+bNe+N1FkIIIXI6k5LcO3fu0LVrV6pXr07jxo1p1qyZwb8ePXrw4sWL9I5V5CK7d+82abuffvqJfv36odFo6NOnD4qiJFmbu2jRIsaMGWOw7OOPP07UBnjcuHF07twZgIULF7JhwwYKFy5MZGQkvXv3pkiRInh6enLo0CHef/99Ro4cqcb+/PlzdWY/Nze3RDE8evSIjh070rRpU/Xf+PHjiY6OJiIigtu3b3Pw4EGqVatG8+bNcXBwIDQ0lEGDBlGlShU8PT3ZvHkzf//9Nx4eHkB8k6BGjRpx9+5dTp06RfXq1VmxYgXff/89hw8fVmu4Ezpy5Aj169dn2bJlTJgwgXXr1uHj45P6iy+EEELkICYluV999RXXrl1DUZRk/wmR0M2bN+nXrx/9+vWjffv2bNy4MdX7CAgI4NSpU2qb4E6dOmFnZ8cvv/ySZA1mvnz5jFqWlHXr1vHs2TO1g51Go2HcuHHkz58fDw8PwsLCePLkiVre0tIy0T5cXV3ZtWsXJUuWxM/Pj0aNGrFw4UJKliyptv99//33adKkCfPnz6d///48efKEgIAAtelDwYIFKV26tEFzCnt7ewDq1atHiRIlgPhh/dq0acOFCxcS1ZC3bNmSAgUKAFCzZk0AtfZaCCGEyK1MSnL1H5D58uWjcOHCif7Z2Nika5Ai59OPvrBhwwb27t2rPppPDS8vLzp27Kh2qLK3t6dTp05ERETg7e2drvHu27cPV1dXzMzM1GVWVla0bNmS4OBg/v77b4OOXfoh9F6n1WqpXLkygEGbXX3b9Xfeeceg/LvvvsvGjRtp3LgxMTExHD16lMDAwCQnV3l9VIkaNWoA8U1BkqOPOTw8PNkyQgghRG5g0ugKTk5OzJ8/n9q1aye5PjIykg4dOqQpMJG7NWvWzOD3qVOnJmrC4OnpSa1atQCIiopi69atFC5cmPPnz6tl9Mmal5cXgwcPxtzc5AFDDPj5+VGkSJFEy/XLgoODqV+/Pra2toSHh3Pu3LlkR07QJ8oJE+aUlC9fnhUrVnD79m06d+5M4cKFefr06Ru3K1iwIBA/EURy9ImxTqczKhYhhBAipzIpI2jVqhXW1tbJrre2tuabb74xOSjx9vnss88YOHCgwbKESeOePXuoU6cO3333XaJtBw8ezOnTpzl8+DDt2rVTl6dl/NyCBQvy+PFjoqOjDZoi6JsKFC9eHAsLC7p27cqGDRvYvn07nTp1Mvl4ek+fPqV379506NCBZcuWAf/r3PYmr169AkgyORdCCCHeNiY1Vxg5ciTLli3j1q1biUZW8PPz48KFC4wcOTK9YxXZkL5GMLl22HFxcSmu1ytQoABubm4G//Rj7MbFxfHTTz8lO4vep59+CsCPP/5osFz/RSwqKirRNgnjSSoZbt68ORERERw9etRg+cOHDylWrBjvvfceEJ+cFy9enL///tukJhOvXxdPT0/8/PwYPnz4G7d9vcnB1atXsbGxoV69eqmOQwghhMhtTKrJ9fPz48aNG9IkIQVaB8esDiGRjIjpwYMHQPxrIjY2NlFzAf14yc+fP09UK2qsDRs2YG5uTqlSpZJcX6NGDRwcHLh69Srbt2+nW7duwP9GPLhw4QINGzYkJCSEffv2ARh0VHN2dlZjtLGxISQkhOHDh3Pw4EEWLVrE+++/T4ECBfDz8+PAgQN8/fXXamLs4ODAhg0b+PTTT5kxYwbPnj1j0KBBao1vQECAOgKDg4ODesyXL18CcPfuXYNz0Sf2Fy9epG7duty8eZNHjx4RFRVFXFwcjx8/Vo99+PBhhgwZQr58+bhz5w7bt29nzJgx5MmTB0Bt4qA/Fvyvttff39+4iy+EEELkUCYlubNnz+bly5cp1s7l1HmO04OiKNl2Grz0nIN60qRJHD9+HIhPEJs3b07v3r3VWsgJEybw+++/A/FJcOPGjenRowefffaZ0cf47LPPOHToEADt27dn/vz5lCtXTl3/4MEDPv74Y0JCQgD48ssv2bt3L+vXr6d06dKMGzeOtWvXcuPGDapUqULLli359ddfuXXrFufOnaNGjRq0b9+e/fv3M2XKFHr06MGgQYPQaDRs2rSJb7/9lu7du1OyZElsbGz49ttv1Q5eegUKFGD79u3s37+fQ4cO0bVrV/LkyUN0dDQxMTFUq1aNHTt2UKlSJQBWrlzJmjVrgPja5/v376sd8fr168e5c+cYO3YsjRs3pmbNmjRv3py9e/fy448/0rt3bzVBr1atGhMnTiQ8PJywsDC++OILdQa5devWsXTpUgCWLFnCo0ePqF69OvPnzwfipzGOjo5m7ty5Rt8LIYQQIifRKCaM91WlShWio6NxcXFJsmbu1atXhIeHc/369XQJMrPoO0O9PuD/6yIjI7l37x4lS5ZMsW2yEBnB19eXZs2aMWrUKEaPHp3V4WQa+bsTwjQ
hf24hLtj4pzdmjvlxqNfT5OOFHz2Sqqljtc55sG3e0uTj5XSpuT8Whdywq9KKHQ8P8yLK+Gtcxr4YzQrVyzX3xth8zaSaXDs7O77//vtk2/5FREQYdAASQqQfGYdaCCGEeDOTOp41bdo0xbFwbWxsWLlypclBCSGSpx+TV9rVCiGEEMkzKcnt3r0733zzTbKjK/zzzz8MGTIkvWMV4q23bNkyhg4dCsC2bdsSDbsmhBBCiHgmNVfo0aMHGo1GRlcQIpONHj36rWqHK4QQQpgqTdNDyegKQgghhBAiOzI5yXV2dk62XW5QUBAREREmByWEEEIIIURamJTkfvzxx4wbNy7Z9bGxsXTp0sXkoIQQQgghhEgLkzqepZTgApibm7N161aTAhJCCCGEECKtTEpyjdG1a9eM2rUQQgghhBApMqq5wpo1a/jhhx8YNmwYw4cPp3///smWVRSF58+f8/Dhw3QLUgghhBBCiNQwKsn9/vvvCQ8P54cffmD48OH4+PikOHqCoigyuoIQQgghhMgyRjVXqFChAgBly5ZVlymKkuw/IYQQQgghspJRNbmrVq3izJkz1KlTBwB7e3u+/vpr8uXLl2T5Z8+eMX369PSLMofJzjXZ2Tk2IYQQQoj0YlSSa29vT4sWLdTfZ8+eTZs2bVLcxszMLG2R5WAajYZHN6OJCs9etdpWthpc3SyzOgwhhBBCiAxnVJKr72hmZWVF9+7d35jgArRt2zZtkeVwUeEKkWHZK8lNDzt37mTKlCm4ubnh5OSERqPBx8cHe3t7KlSogKIoPH78GD8/P0aPHo23tzfPnj1Dq9UyZMgQ+vTpQ6FChdT9vXjxgpUrV+Ll5UXhwoWZNGkSFy9exNvbm9DQUAC8vb2pUqVKsjGFhYXRpEkTXr16hYODA40aNWLBggWpPrdPPvmEuXPn4uzsnOpthRBCCJG9GJXk+vj40LBhQ7755htcXFwyOiaRzU2dOpUBAwaov5ctWxY3Nzc2bNigLluwYAHlypXjhx9+oGPHjtjb2zNhwoRETSXy5cvH9OnT2b9/P6tXr8bNzY3WrVszcOBAmjVrRkxMDKtXr2blypXJxuPt7c2rV68A2LRpE25ubqk+p+vXr3PixAm8vb0ZPnx4qrcXQgghRPZiVMczOzs7Fi5cKAmuwN7ent69e7+x3NChQ7GwsKBcuXI0aNCA4OBgzp49m2TZc+fOUadOHYPktECBAuTLl4/8+fNz/Phxbt++neS2MTExrF+/nvz58wNQokSJ1J8UsG7dOgC8vLyIjY01aR9CCJEbaCxt0UkncpELGFWTW6ZMGezt7VO148GDB6uJg8g9WrZsaVQ5JycnGjVqBMCAAQM4deoUGzZsUDsvJrRlyxb69OmTaLlWq2XAgAF89913rFmzhnnz5iUqs2/fPtzc3IiIiMDf3x+tNvXzmzx79oxLly7x4Ycfsm/fPg4fPky7du1SvR8h3laKokOjSf3fnqnbiYylsbBCq9Fw7OkZgqKDjd7O1bYQtfK9l4GRCZE6RiW5VlZWqdppYGAg169fNykgkfs0bNiQMmXKcPz4cR49eoSrq6u6LiAggLt371KrVq0kt+3VqxerV69m//79fPbZZxQuXFhdpygK69atY/r06SxdutTk+DZs2EDPnj2pVq0a+/btY/369YmSXJ1Ox/Hjx9m2bRs+Pj4cOXKEjRs3cubMGZ4/f467uzsNGjRQy69Zs4Zjx44RERHBjRs3qFu3Lp6enur6CxcusHr1asLDw3n06BHVq1fn888/p1ChQiiKwtmzZ9m7dy9Hjhxh//79jBkzhgcPHuDp6WkwlJ8Q2YFGoyXs4mF0YYFGb6O1y4NdlVYZGJVIq6DoYF5EGX9PnS0cMjAaIVLPqCT333//pVmzZkbtUFEUAgMDiYyMTFNgIncZMGAAX331FevXr2fatGnq8m3bttGtW7dkt7O3t6dXr1788MMPrFu3zmDb33//HTs7O2rWrGlyXOHh4ezbt4+9e/fi4OBAxYoVuXjxIv/++y9Vq1ZVy2m1Wpo3b87WrVsJDw/n8OHDjBkzhrFjx9K3b1+mTp3KH3/8AcCpU6c4duwYmzZtQqvVcuzYMby8vNR9nTlzhvHjx7N161aKFSuGv78/AwYMoGfPnmzfvh0XFxdsbGy4fPkywcHBeHt7M2TIEDZv3vxWj1oisjddWCBxwf5ZHYYQQqiMek4UHR2t9ph/07/Hjx8TERGR0XGLHKZjx47kyZOHHTt2EBISAsTXju7bt4+OHTumuG3//v2xtLRk+/btBAQEqMvXrFnD0KFD0xTXjh07aNasGQ4O8TUQffv2BWD9+vVJls+TJ49azszMDI1GQ5UqVXj27BkvX74E4MaNG4SFhRETEwNAs2bNqFGjBhD/JXDGjBm0a9eOYsWKAZA/f34mTJjA06dPWbx4MVqtlipVqqg1tt27d6dFixasW7eOMmXKpOl8hRBCiLeF0Y2hZCYzkRZWVlb07NmT8PBwvL29AThx4gS1atXCzs4uxW3z589P586diYiIUEdwuHjxIoGBgUY/YUiKTqdjw4YNamIL0K5dO/LkycORI0d4+vRpom2SavNrY2MDoD69qF+/Pg8ePKBz584cOnQInU7HiBEjALh8+TIPHjygZMmSBvto1qwZtra2HDt2TF2mr7UtUKCAyecohBBCvK2Maq6QL18+PvjgA6N2qNPpuH37NteuXUtTYCL36dOnD2vXrmXjxo0MHDiQTZs2MXXqVKO2HTJkCNu2bWPTpk0MHTpUrcVNafY2/XTUekWKFOHXX39Vfz969ChBQUGJZueztrYmMDCQjRs38vnnnxt9fvovguXLl2fLli14eHjw2WefUaZMGdzd3alWrRq+vr5AfDOJ1xUtWpS7d+8afTwhhBBCJM+oJLdkyZJ88803Ru80Li7O6F744u2RP39+PvzwQ3bt2sUPP/yATqejdOnSRm1bvHhxWrZsyaFDh5g3bx5Xr15l0aJFKW6ze/dug98tLCwMfl+3bh2LFy+mXr16Bsv9/Pxo1aoV27ZtY+TIkWpNbWpUqFABLy8vTpw4wddff82gQYM4ePAgBQsWBODBgweJtrGzs1ObMAghhBAibYxqrpCwHaQxzMzM0m0IpkuXLlGpUqVEY6zeuXNHnUGrZ8+enDx5Ml2OJ1JHp9MBxjdnGThwIABLliyhV69eb9y3fv8Aw4YNA2Dr1q3079/fIGnVl0tY3s3NzeBfwiYC586dA0iU4EJ8jW/79u0JCgpix44dBuuMOU9PT08CA+N7JDdp0oQff/yRiIgI9bVcuHBhjhw5QlRUlMF2vr6+fPjhh4n2J02FhBBCiNQzKsm9f/8+d+7cSdWOx48fb1JACYWGhjJhwgS1A0/CePr06UOXLl3w8vJizpw5jB07ltOnT6f5mOnFylaDtV32+mdlm/yjfVPpayT9/PyMmkShXLly1K5dmwIFCqTYnjY0NJSAgAD18T5ApUqVqFevHk5OTnz00Ufq8tjYWJ48eQLAo0eP3hhDdHQ033zzTYpPG5o2bQrA999/z/Pnz9
Xl+uP4+/+vF7l+tjX9sujoaL788kt1WuLo6GhsbGyoXLkylpaWTJ48meDgYObNm6cmsN7e3jg6OjJ48GB1vy9evACQJgxCCJHNSWVE9mRUc4W4uDgGDBjAlClTaNq0qUmPb00xe/Zs6tevz8OHDw2Wu7u7U6hQIbW2uFSpUrRu3Zpp06Zx5MiRRI+lM5uiKLi6WWZpDMlRFCXFdqypMWnSJI4fPw7A8+fPad68Ob17937jtLgDBw7k+vXryQ6H9d1337Fr1y6ioqLo3r07jRs3VieCGDZsGD4+PmpntfXr17N+/Xr8/PyA+HF1GzZsyHfffZfkvh8+fEifPn14/vw5t2/f5t9//000xu7PP//M8uXLgfhEs02bNowfP569e/dy4cIFALp168asWbPYuXOn2lns008/5auvvgLg2LFjNG7cmHfffRdzc3NWrVqljvHbqlUrVqxYwYoVK2jTpg2FChWiRIkSbN68Wf3b6tSpkzrWdK9evZgyZQqdO3dO8boKIYTIGhqNhrN3ggmJiDN6mwJOFlR2Td1EWyJ1jEpyp0yZAsR/4J87d87oTmhpsWPHDsqUKYOLiwubN29Wlz969IiTJ08a1HgB1KxZk+3bt3PixIksbw+cXklkRkjP2JKagcwYTZs2VWtKk/L5558n2+GrXr16Bk0MBgwYwIABA4w+drFixd7YtKV///70798/0fKkZmVr3Lhxkvt4U6LfrFmzFGuyX29PLIQQInsLiYgjKNz4aeEdrGXc84xmVJKbmiQiPdy9e5cTJ06wbNkydu3aZbBOn6C8PgRTqVKlAPDx8UlTkqsoSpI93xOKiopCp9MRFxdHXJzx39qEEKaLi4tDp9MRERFh0PZaZC2NRpOmp3sRERHyqDeDpPXeZLac+lrIadfZFNnt3hgbi1FJbmaKjo7Gw8ODefPmJVnrqH8snS9fPoPl9vb2BuvTcnxjpiQ2NzdP1HFICJFxoqKiiI2NlTbK2YyNjU2i4fpS4969ezKBUAZJ673JbDn1tZDTrrMpstu9iY6OBuL75JibJ5/KZrskd/78+QwYMCBREqun7+Tz+rcm/UmmNfG0tLSkfPnyKZaJiori8ePHWFlZYW1tnabjCSGMZ25uTrFixbCyssrqUMT/S2sTqJIlS2arGqLcJDs3nUtKTn0t5LTrbIrsdm8sLeP7PaWU4EI2S3JPnDiBhYUFDRs2TLaM/sNNP7uUnj65dXJySlMMGo0GW1vbFMtotVq0Wi1mZmbJdp4SQqQvMzMztFotNjY28uUyF8ntj3mF8eS1kH1lt3tj7BeLbJXk/vTTT5w7dw5PT091mf6bw8CBA9FoNEycOBGAoKAgg231vxcqVCgzQhVCCCGEENlYtkpyPTw8ErX5OHbsGIsXL8bd3Z3KlSurHU7u3btnUE4/Xmv9+vUzJ1ghhBBCCJFtGTUZhCn27duX6m1cXV0TzVJVoEABAIoWLYqbmxvlypWjevXqnDp1ymBbHx8fSpQoQa1atdIlfiGEEEIIkXMZVZOb1JihyVEUhcDAQEqUKJHkFKXpYcaMGfTo0YMzZ85Qt25drly5wvHjx1m2bFmWTwQhhBBCCCGynlFJro+PT6p6DyqKkqG9n8uVK4enpyfffvstK1asQKvVsmrVKmrUqJFhxxRCCCGEEDmH0W1yUzt0xNOnT1MdTFK6dOlCly5dEi2vVq2awUxoQgghhBBC6BmV5Nra2uLu7q6OXfvPP/9w4sQJxowZo45VphcTE8OKFSvo3Llz+kcrhBBCCCGEEYxKcnv06EHbtm3V3728vPjpp5+ws7NLsnz16tWZN28e3bt3T58ohRBCCCGESAWjRleYNGmSwe9+fn7JJrgQP2j7pUuX0hZZDqYouqwOIVnZOTYhhBBCiPRi0ji5L168YNCgQbRv355ixYqpM4QFBwdz69YtvL29061Nbk6k0Wjxu3KK6LDgrA7FgKWdI0UqNcjqMIQQQgghMpxJSW7btm1Zt24df/31V7JlWrZsaXJQuUF0WDCRIQFZHUaGi4uL4/Dhw+zYsYOwsDAsLCzQarVUq1aNdu3aMXfuXH788Uej9nXs2DH279/P/v370Wg0FC9enEKFChEXF4e/vz9FixalYcOG9OzZU20LfuTIEb755hseP34MwNChQ/niiy8S7XvSpEmcOHGCV69e4eLiQvny5fH19eX+/ftA/Gt60aJFibb77rvv2L17N/7+/jg6OtKkSRO+/fZbE6+WEEIIITKLSUnumDFjOH/+PP/++2+S6wsXLsyUKVPSEpfIAV6+fMnYsWN5/vw5c+fOpVq1auq6U6dOMWLECAIDA43eX7NmzahRowb79+9PNHpGREQEGzZs4Ntvv2XTpk2sWLGC0qVL07JlS1q0aMGmTZuYPXs2a9euxc3NjY4dOxrse968edy/f58OHTpw6NAhHB0dgfgkecKECRw4cIB3332XESNGGGz3+eef88knn1CvXj22bt1KqVKlTLlUQgghhMhkJs14Zm1tjZeXF19++SVVq1bFwcEBW1tb3NzcGDVqFLt376ZgwYLpHavIRmJiYvjkk0+4efMmmzZtMkhwARo0aICnpycODg6p2q+Tk1OSy21sbBg+fDiLFy/m/v37jBgxgtDQUAA0Gg19+vShcOHCAHz11VdcvHgx0T6KFStG3rx51QQX4p84VK1aFYClS5dy9OjRRNvZ29uTN29eSpQokapzEUIIIUTWMXlaXzMzM/r168eWLVv4+++/OX/+PL/88gujRo0ySCJE7uTp6cmlS5cYNmwYLi4uSZYpUqQIffv2TdfjNm/enHbt2nH//n02bdqUaH3//v2Jiopi5MiRPHv2zGCdVqtFq036Jd+/f38UReGLL77gxo0bidantK0QQgghsp80fWo/f/6cX375hS1btgAQFhbGP//8ky6BiexLURS2bt0KxCedKenRowe9evWiXLlylC1bFl9fXwC+/vprKlWqRNmyZTl79myqjq+fHGTHjh2J1k2ZMoUmTZrg7+/PiBEjiIyMNGqfAwYMoFevXoSHhzNixAgCAnJ/e2ohhBAiNzM5yV20aBFNmzZl0qRJrFmzBgA7Ozv++OMPFi5cmG4Biuzn+fPnPHr0CAsLizc+wre3t2fz5s189NFHBsunT5/OmDFjTDp++fLlAXj48CExMTEG67RaLQsWLKBcuXJcuXKFqVOnGr3fadOm0aBBA/z8/Bg9ejTR0dEmxSeEEEKIrGdSkrt+/XpWr15NbGxsoul+R40axaZNm9i2bVu6BCiynydPngDg7Oxs9Db58+dPtEw/g15q6dv56nS6JDu22dnZsXr1avLnz8/+/ftZuXKlUfs1NzdnyZIlvPvuu5w7d45Zs2aZFJ8QQgghsp5JSa6XlxcjR45k+/btHDlyxKCTmYWFBdbW1mzYsCHdghTZi5WVFYDRTQHSW0hICBDf4Sy59t8FCxZk1apV2NjYJNuhLCn29vasWrUKFxcXtm/fzvr169MtbiGEEEJkHpOSXEdHR0aPHk2lSpUoVqyYQYecv//+mxcvXvDw4cN0C1JkL
D0an7LvhOBal41b6wuY75I/L47KCbqCQxc5CT740sotRu7JcbkLbvvO8MzelXF4VUaAt6OC8/81vlonC8Nl4JUWJrrY81+1zt1dqm1iiCAMNy0zm/IpNfXjSYyaVOCh1JcRuIJPjl61s7uCocjUGrah/H6cBx9eTefPPNmEymLicNKSgczbXXXss///lPXnjhBW688UYWL17Mvffem9K+3//+93n55ZdZvHgxP/jBD9qruD19QA8dOrTD6/z8fD744PBklA8//BCv18t9993XYTuDwYDH4+GFF17g7rvvTvn62p5oDBkyhBdffJHf//733HnnnZSVlfHAAw8wZswYamqS1ZZQqPNjn4KCAvbv35/y+RS+nez0tCLK/WcddjRlVhMWrYZNzb5jErmSLOGNufDFPMgcfsrnijZh1dmx6ZxdCt2ElCAihomIISJimOawD8/q9Thef5fz3/kYU5MLgGBRMeoLb+L8vAKMWg1s2JA8gCxhirloitSSpjYxwOIg2TUq4442EhKD2HSHJyFJskRjuIGqQBUxKUq5LYcxGSrWNkSZW9IPYQUniVS9alfsc/HnOUP5yeptLBiZxwDH8aXkdcfJFLhH820RuMdi4xWIJlhf6W4Xtesr3QRjInqNivHFdq4/q5jJA5xMLHFgS+v6518Q4I7p5V26K7Rx5/Ty/rnIfuK4J571JHB37dpFQUEBZvPxGU4rfDPIzMzkkksu4fXXX+epp55CkiQGDhyY0r7FxcXMmjWLd999lwcffJCvvvqqvV2mO46OnD7aFeJf//oXf/7znzn77LM7LK+treXCCy/k5Zdf5tZbb22v1PaFoUOHsmjRIj766CN++9vfctNNN/HOO++0V4YPHjzYaR+TydTewqCg0B3bXX6y0/RkGk9MpVElCIzOsLKx2ctVA/P6VOlrE7jemLvTOhm5fblN50SUE4TFMJFEUtQm5DiyLOONJdD+6UlKF73GmKq6wwew25GvuIKXxpzN2AwbadqOX18qQYVN58SksaAW1DSFm4lJUXQqPVnGHEQ5gValQ0CgMdxIVaCSiBgh05BFkbmYNE0aaQPC/L91O1lV7+L8MyD5LRRPpOxV2xSM4o/FEYAB6f3by61w8kjVxuvWaWV8/HUza/YnRe3WWh+iJGNP03LWACe/uGgwkwc4GV1gQ5/i02eTXsPdswYhkHRROFJgG7Qq7pxezk/PdJ/crjh48CBfffUVsVisw/Kvv/6a6upq/vrXv/bHaRROQ6RDVkOpWpPdeOONvP766zz22GO9OiJIkoQkSe0tAj/84Q959913eemll/jFL37RQbS2jUM6wvqop1CIDYeqP0cLXEhWfC+99FJee+01Xn311Q79t6lc5zPPPMPcuXOx2+1MmzaN4uJiLrroIrZu3cq0adPIy8vj/fff59e//jV6/WGhUlNTw4IFCzodr796GhXOfORD1mFjMk7spLCxmTY+q3dRF4qQb+rbTZ4vlrTi0qp0mDUWkk6VEoFEK3Ephi/mwapz0BCuJS7F0Kn0pNW6aMjMYIe3kZZogNn7q7BV1SGnpSHMnQsLFsCFF1IZibNqw9fclefs8twyMvWh+k5etQda95FvKqDAVMgu7w48MQ9OvZOhtmGYtIeLMAVmIxOy7LxdWc+UXAfaU1iV7I7GQJRPK918csDF6DwrGd1U3Y4m26ynJRKjwGw8YxLtFDqTqo3Xj88v44/v7iIUEzlroIObppRw1gAn5VkWVMdR8TZo1dw1s4K7ZlawdEM1NZ4wBXZje4vC6SRwoR9E7meffcaPf/xjRFHscr1Od+Y89lHoO20VydraWhKJRK8hCoMHD2bSpEkcPHiwx37aQCCA2+2mpqamvbo5fPhwzj77bL766iuuvvrq9m0TiQT19cnc8urq6l6rw7FYjD/84Q+dbMKOZPr06bz22mv87//+L7NmzSIrKwug/TzNzc1kZibtedrS1pqbmykoKCAWi/GrX/2Khx56CLPZTCwWw2g0MmLECHQ6Hb/4xS+48847efDBB/n1r3+NIAgsXbqU9PR0vve977WPoaWlBYD9+/enXPFW+GbTEIriisROWKtCG0PtFvRqFZuafX0SuYG4H7WgwaHPRqvSsdffSCAewaw1UJ5eQEyK4o42EYj7yfFrEJa+QWzRItLWf8knSx7DcNZULi0aQ8E998Pc+QiXXQZH9Ox/XtmIVadhmL3z9YuSSHWwqsvUMQmpfXmJpZRiuQSLruv3cG5pLv+1dgef1bmY3k+RyceDLMsc8IT5pNLFJwfc7GgOoFYJjM+zUmw1cHaRncfXVHZwNjiaNq/af+440ME6TOHMI1Ubrzc21fLBT87FkmJoRl9o6/m9aUrpaeOi0B3HLXIfffTRHgMfhg8ffrynUDhNueeee1i5ciUATU1NzJw5k4ULF/Yai3vjjTeyc+fObu2wHn74YV5//XWi0Sjz5s3j/PPP58EHHwSS1dx169a1T1Z79tlnefbZZ6mtTU68WLBgAeeeey4PP/xwl8euqqri2muvpampib1797J58+ZOFeXnnnuu/elDS0sLF110Ef/5n//JW2+9xaZNmwC46qqr+M1vfsNrr73GihUrAPjxj3/Mr3/9awBWrFjB+eefT3l5ORqNhr///e/tHr8XXnghTzzxBE888QQXXXQRubm5lJSUsGTJkvbWiMsvv5ydO3e2X9Mvf/lLvvOd7/T4vip889nu9qNRCQyyn9gWMK1axQhHOhubvVxSktP7DiTFmCSL5BgL2eQ6yGZ3VYeY3VWNuxmntjH6041IS15A89FnCJKEBpAFgVm1ESzFY5Mbj3HCmDEdjp+QJNY2upma6+ym91KmJljd4xhrgzXJYIYevG5zTQYm5zh4u7KBc3Kdxz1r/ViQZJmvmgJ8fMDFJwdcVPkiGDUqziqyM39kHlOK7O32XuG4mJJXrSTLbHb5+eHQkpN0Fd8u+sPh4GhkWeZAS5AtNT62VHsZU5S6jVetN3xSJsGfzgIXQJCPMwJr/Pjx3HPPPRQVFbF69WpMJhNjxoxBkiT+/ve/87e//Y20tBPT4N7ftFUW20RLd0QiEQ4cOEBpaem3zgdYQeFUofzewZ8270WWZX465sRP7viiwc1TOyp5+OzhOAy9P5GTZZmoGGOLu5ovXZWd1tv3VHL1VbegPsLL1j9mBNqF12FceB0cEfTSFV82e3li235+O3EIBebO1eX6UB17/b2nLZWll5Ob1vO5GkMRfrV2B9eUFXBBYVavx+yKvla44qLEhjofHx9w8WmlG1cojt2g5ZwSO+eVOpmYb+vWjiuakHhmU3WPXrX7/EEe3ryHB88adsL6ub9tHK/DwZGIksyexlY213jZUu1lS42PrTVe/JFkETHfZuR3c4fhC8f5ydItvR7vsWtGnzaBDCeCVPXacVdyR40axbx584BkwtmNN97IlVdeSUZGBm+++Savvvrqt85GTEFBQaG/iYkSX3tb+c6AngVafzHSmY5agE0tPmb08NhelmUCCT+RRASrzslmdxWqeIKCz7/E4Gtl92UzAfAOKCRsTy
dmMZN+/U3E580jfcjQbo97NJ/Xuyi2GLsUuAkpQVRMbQJWVIz2mkaWnWbg7Bwnyw42cG5eRq/RtW2kmjrWRiCWYE2Vl08qXayu8hCMieRZ9FxYlsl5pQ5GZqen5BiQilftTo8fq05LRgo3LAq9cywOB23EEhK7Gvxsqfay+VCVdnudj1As2fZZ4jQxutDKT2ZWMKrQxqgCG5mW5I1J0sZr2xll43UqOW6Ra7FYeP3111m0aBHf//73mTlzJnPnzmXcuHGsXLmSDRs2KCJXQUFB4TjZ7Q0Ql+R+j/LtjjSthsE2CxubvV2K3DZx64m6SMhxsnTZ1L7zOlOff5YB73+G0esn5LSz5+JpyBo1slrNy6/+L2GHjfNyhzDMXpDyWPyxOFtdPq4uO7xPTIzREm2mJdyMVW9Dp0pNvOnV+pS8ei8tyWFNg4uPapuZXdS7V3WqqWOuUIzPKt18fMDN+lovcUmmwmni2pF5nFfqpMyRdkwTTXvzqt3rC1JmNSmTWPuBVB0O7ppZgUoQ2F7nO1Sd9bK1xsdXdX5iooQgQEWWhVGFNuaOzmNUgY0RBVbsPUwmPBNtvE4lxy1y582b15469eGHH/KHP/yBV155hQ8++CBpB+P1Hu8pFBQUFL71bHf7seu15JlOXqvG2Ewbi/ZUE4wnMB2y7JJlmWCiFU/MRVyKkb6zCtuLy1C//CrmmsN9oSGnjb0Xnos2HCFmSfbQh53JxMBAvPdq6pGsbUw6NozNMFMXqqUl0owv5kNAwKazYVKbsOntHGjd38FV4WhUqMgypBaukmnUMzU3g+UHGzk/LwNDD44EqaSOAZxX6uDGV7ciCDA6J53bJpdwXomDvPT+/Tc9WuAmJJkD/uBJewrwTSdVh4Nbzi/jkr+uYlutD41KYEhuOqMKrCycVMToAhvD8q2Y+9jWcCbaeJ1KjlvkTpkyhYceeojly5dzww03oNPp+Oc//8nPf/5z9uzZ02G2uIKCgoLCsbHN5We4I/2kVuJGZ1p5fnc1W10+Jmc7CCUCuGMtxMUoRo0Zo9aO8Nyf0DzxFACixcLumWezZ840aieORu5GGJq1qVVTIVmx/aSugWIL7PCtB8Cus1OeXoHTkNEeuytKIvmmgi7dFdrIN6VePQa4pCSH1fUuVtQ0M6eHCXippI4t2lLHwlH5PHjhYEbnpGMznry44OpAiJgkH3OCnUJHUnU4eHVjDX9dMAYZGJqb3m/C80yz8TqV9ItP7qWXXsqll17a/rqwsJAlS5bg8Xi4+OKL+fGPf9wfp1FQUFD4VuKKxKgPRfjOgNyTel67XkepJY11TS5KvLtQv/QazleW8/W9d7FmwnB88TAZ08Zw1oEZJK6ZR/6V1/Jp9boOrgpHoxZUVFh7dmyIiTFc0RaaI81U+v3UBU1cWqLqJGw7HFelptCctBs82idXhYp8UwGF5iLUQuoCwGnQcW5eBu9WNTK9ILNbf9nUU8da+M7Q1Nwq+pO9viAalUCxpe/BNgpJWiNx1ld6SNOpU3Y4qPdFGJFvPSEOBGeSjdep5JhFrtfrpaqqiqKiImw2W6f1O3bs4K677lLaFRQUFBSOk+0uPwIwxG45pv1FWUKdYuW0DVmWCdQdYNq/nyf7jWXkbj88ozv91X9TOO08Jqc5yK84D8OVPwIgLiUY7Sjq0l2hjTGOYpJxux2JSTFckaSw9cW8ANh0dlzhLEyaCJeWjOg1MlYtqCk0FVFoKqIp0khUjKJX69tbFPoicNuYU5LNp/UtfFjdxKWlh28yZFlmrztENCGmnDrWGIh22S97otnjC1BqSTulkbsng/6y8ZJlmWpPmC8OpYWtPeDmqzofkgy/uWwYedbUbhYK7MaTIj4Vgds9xyRy//rXv/Lkk0+2m///7Gc/4/rrr29f/69//YtHH32UeDyuNLkrKCgoHCfb3X4GWk3tfbGpEJeS9kZ7/A3tgQxtFVStquvjSLJEY9hPdd0+Cr93C9mfb2DqoUlUsiAQPnsyqoULGXj1fAZmdI691ao0jMtI2hYd7ZOrFlSMdhQxNqMEzSGf2jZh2xJpxtsubG2UpVeQYchAhYZ/7djOpGxHygKtzQM3Ny2vT32/3WHX6zg/L4P3qpuYlGVne2OAz6s8fFHtpSkY48cTi8g0pZ46drKFpizL7PUGOSvHcVLPe7LoDxuvuCixrdaXFLT73XxxwEW9LwJAWZaZyaUObj5nAJMHOCjPshCKidz7huJwcCbQZ5H7zjvv8Ne//hVBEJBlmXg8zh//+EcmTJhAZmYm99xzD59//nn79krimYKCgsKxk5Bkdrj9XJjCDP/D+4hsbKnsMpBhtKOIcRmlaFRqZFnGEwtS01JH66b17CjPIy6JaIAhNQ2oRInE2LG8P3UG/suvYP60yT2eV5REGkJ1VFgzGO0sZo+vgWAihkmjo9yaQygRpCFUR4Yhgz3+3UcJ23KchowOLgnbXD58sThTco9NoB2vwG2r1rb6JHYcCHHZzi+RZCixGZkxMIOzC+2MyU0nIcs89nlqqWMnG3c0jjcWP2lJZyciFKE7jtXGyxOKsb7SzRf73azd7+LLKg+hmIheo2JMkY1rxhcyqdTBpFInGZbOnsKKw8GZQ59F7iuvvAKASqXCbrfj9XpJJBL893//N/v27cPjSc6ClWWZIUOG8NBDD/XviBUUFBS+Rez3BwmLEsOdqVmHxaUEG1squ2wZEGWpfXlZmpONL/6Twrc/YPCHq0EQiGx+lyx7LiWWfMz/ehby89GUlxPaV8vndS3Mk+RefFtlKgMHkJFJ05jIMGRg0+sR5QTbPVsIJYKoUJGdloNebUgKW30GOnXXxZDV9W5y0wyUWI4tUOhYehUD0QTrar0dqrUGjYp8m564VeQP5w6h3NFxApeYYurYqXiuuccbAGDgCRS5/RmK0Jdzpmrj1eCPsPZAUtB+ccDFzvpWADLMOiYPcHLvRUOYVOpgdKENfQqTthSHgzOHPv/07dmzh5tuuom77roLvV6Pz+fjvvvu47333muv7qrVan7wgx9w++23o9GcmB9wBQUFhW8D291+zFp1ykJPlpOtAl0iSeRs2oF5+V9wfPA5F7S0tK9KFORyVqsO44BByTaz889vXzcmw8byg43s9QUY1ENfcFOkCZnkBKxQIkhVINh5CEi0hJupsA7q8TpCCZFNLV7mlub2qe2tr4EMsiyzxxViTbWHz6s8bG1sRZRkSu1GZg7M4OwiO6Nz04mIIves+YqtHl8nkWvUqrlpbCGCQI+pY90llp1I9vmDZKfpSdedGDeH4wlFOB5StfH68fllXPvPteyo9zM4x8LkAU7umF7OpFInAzOP3TdYcTg4M+izAjUYDPz85z9v/8GwWq3cf//9vP/++8iyTHFxMQ8++CCjR4/u77EqKCgofOvY7vIzzJGOKsUv4z3+hm7dDUY//QpnP/KP9tdSZgbBy2ejXngdxnNmkqbu+ou5ND0Nm07LxhZftyJXkqXUU8ek3n1y1zd6SEhyn3pJUw1kCEQTrK3x8nm1hy+qvDSHYhg1KiYU2Lh7SilnFdo7e
dfq1CpmFGTyYU0zFxRmdRKNqaSOnQr2+gKUpZ+YKm5fqqnHW9GVZRl/JIE7GEOvUfHu9oaUbLxe31TDC9+fhMOkw5Fi73SqKA4Hpz99/qkrLCzsdOdjt9txOp1ccMEF/PznP8doPDzz8KabbuLpp58+/pEqKCgofMvwx+IcbA31GKt7JKIsEYgnJ8xYK2soX/4RDaOHUXP2WAAOzDib8X9fxP4LpsD8BZTMnYdZ33s1SyUIjMm0sqnZy/yy/E7bR8UIETHSr6ljqxtcDHVYsOtTO2aqgQznlzq56bUtiDKU2o1cUHa4WqvrRaTMLspmZU0z71Y1dkhfa6O31LGTTSQhUh0Ic37eiekFTrWaetfMig7LYgkJTyiGO9jFn66Wh2J4gjESUvIpwR+vGEFLio4Wdd4IJc60Ey5AFYF7enJMt1b19fXIckdPwIKCAn74wx/i8XjweDzIskxVVRWbN2/uj3EqKCgofOvY4W5FBoalEOUryzKB/fsoWvwcpS+9RNZXSfGxb9Y57SLXV1LAv1a/jKTTcV7OYAyG1MMBxmTY+Ki2hepAmCJLGpIs0RJpoTHcgDfmwaw1M9w+sl9SxxpDEfb6gtw8tCTl8fUlkOG3MyoYnm0h19K3pDGzVsMFhVm8V9XIhUXZWHtoATjVAhfggD+EJEOZ7cRUclMNRViyrgq1SuBPH+zGHYzRGk10ua3NqMV+qOLqMOkocqQxutDW/rrtz5CcdJZvr09pjCfLxkvh9KTPInf16tVMnz69y3UzZsw47gEpKCgoKCTZ5vZTZDZi0/fQTynL+P/6GLFFz+NctwnroQKEpFZRfdY49s06p8Pmkk6HWlBRnt63UILBdjNGjZovGpuIyXGaw00k5ATp2nTK0yvIMGQiIPRL6tjnDW4MahVjMm0pj+9kBTLMKsziw5pmlh9sZEF53xLUTjZ7fQHSNGpy0/o/CjouSn0KRZhS5uTKsQWdBKvDpMORpsOWpu2TGL16XCG/eG2rYuOl0CPH3CRzdCW3OxSf3G82oijy3nvv8eqrrxIMBtFqtahUKsaMGcOcOXP44x//yP/93/+ldKwVK1awbNkyli1bhiAIFBcXk5ubiyiKNDc3U1BQwLnnnsv8+fPbrenef/99/vCHP1BXVwfAD37wA372s591OvY999zDRx99hM/nw+l0MmTIEGpqaqisrATg4osv5tFHH+2038MPP8wbb7xBc3Mz6enpTJs2TXEMUTgpSLLMVy4/U3KdnVfGYqDT4YuFWNu8j9FP/q29citOnULVpRfw0ZRhRBy2Lo892lGEKCeQZXVKn9ExKUZzuIkCc4J1jc0UpyfITcsly5hDmqbjhLjjTR2TZJnPG9xMyLKj70X0JESJLQ2tgHzSAhlMWg0XFmbx9sEGZhdlpdxOcSrY4wsy0GpKuZ87VYLRBAdagn0KRZg+OJvpg1O3wesNxcZLIRWOSeTa7XYMht7vDD0eD5FI5FhOoXAG4HK5uOuuu2hqauKPf/wjY8aMaV+3atUqbrnllnZLuVSYMWMG48ePZ9myZYwZM4YlS5a0rwuHwzz//PM89NBDLF68mCeeeIKBAwcya9YsLrjgAhYvXszvfvc7/vnPf1JRUcHcuXM7HPvBBx+ksrKSyy67jHfffZf09OTj3/fff5+f/vSnLF++nPLycm655ZYO+91999386Ec/4uyzz+all15iwIABx/JWKSj0mepAGH88wYg267BIBN55B5YsQf7wQ7744n22iD6MGh0Vd9xCa3ML4tXzUBeXkK9OY5jrYLeBDGOcxUTFMIKm+89xWZZxR100hhtwR90ADLHb+bc3Tql5BFndVAePN3VstzeAKxLrWtwDvkicNVUeVlV5WFPloTUmcudZJWSdxECGCwqz+KC6ieUHG7m24vSsFEqyzD5/kNlFWf1zPElm9b4WFq+r4s3NdRQ701h2+zmnLBRBsfFSSIU+i9zrrruO//qv/0pp29bWVi666KI+D0rh9Ccej/OjH/2Iqqoqli9fjtPZ8Qtp6tSpPPPMM1x33XV9Oq7Vau1yudFo5Oabb2bAgAHcdttt3HLLLbz66quYzWYEQeDaa6/l//7v/6itreXXv/41JSUljBo1qsMxioqKcDgc7QIXYNasWYwePZp169bx+OOPU1FRwcyZMzvsZzabcTgclJSU9OlaFBSOh+0uP0ZZonztanjpJXjtNfD7gWQobmjZvxk6/yry0vRICy7Cp9KRrrVh0pqpDVYxzJ7DGGcxe/yNBOJRzFo95enZxKQojeEa8k3FXZ43mAjSGGqgKdJIXIpj0pgptQwky5hFQlKx/OBWNrf4mFXUvUA+ntSx1fUuMg06yg/5usqyTKU3zKqDbj476GFrgx9JhsGZJq4Zkcc5JQ4GZZiIJCT+fJICGYwaNbOLsnnzQD2zi7JxGk6/am59MEI4IVJmTb3vuiv2NLXy4vpqXlpfTZU7RInTxO3Ty5g/oQi9RnVKq6mKjZdCb/RZ5M6ePTvlbS0WC/PmzevrKc5oZFkmFBNP9TC6JU2X2uPJ3njmmWfYunUrP/vZzzoJ3Dby8/P7LHJ7Y+bMmcyZM4e3336bxYsXc/PNN3dYf/311/Pcc89x66238uqrr5KdffjxmEqlQtVNBadtv5/97GcsWbKEwYMHd1jf074KCicC33vv8z/3/idqt6t9WTA7kz0XnU/rvDnknzUeky4Ns8aCSWNBp04mM0myhEWbtBxTCVBksiLKImpBfWiZgEXbcSJbQkrQHGmiMdxAa7wVjaAhy5hNtjEHs/awSNKqYKjdwsZmH7NSTGCTJFClqDWiosiGZi8XFGSxvtbHqoNuVh30UOOPoNeomFRg4xfnDGRKsZ1MU8ckKgFOaiDDjIJM3qtuYlllA9cPLuqno/Yfe3xBVAIMSO97kIYnFOO1jbUsWVfFuko36QYNV4wtYOHEIiaVOjp8h5zqaqpi46XQE30WuePHj+/T9nfeeWdfTwFAdXU1Tz/9NDU1NTz11FOd1jc2NvLAAw/Q0tKCKIpce+21nR5Rn2xkWWbWnz9l7QH3KR1HT0we4OC9O889LqEryzIvvfQSQKeq59Fcc801LFiwgE2bNiHLMitWrKCgoIDf/va3LF26lHg8znPPPcekSZNSPv8VV1zB22+/zauvvtpJ5P7yl7+kurqajz76iFtuuYVFixal1Fpzww03EI/HWbJkCbfccguvvPIKDsc3M+sdkv+GSr/8yaXH91yWYdu2ZK/t+PGEEyIbHdks8LgRHXb2XXguX110LvKUCQx1ZFNkdGA+QtgeiUpQYdHZqA5U9dgXKyDgiXpoDDfgirQgIeHQOxhiG4pD7+y2+jo208azu6rwx+LdBgz0NZABwBOO89y2avZUh9mx9yDhuESWScfUYgc/LbYzLt+KQdO9WDrZgQwGjZqLi7N5dV8tFxdnk2Hs/G9xKtnnC1BoTkPfjffx0cRFiQ93NLJkfRXLtzUgyjIzBmfx9I0TuHh4LkZd18c5naqpisBVOJrTMo7siy++4OOPP2bRokVMnDix03q32821117L1Vdf
zc0334zL5eLyyy8nkUhw5ZVXnoIRH+bboBuampqorq5Gq9X2+gjfbDazZMkS7rvvvnZhDHDfffeRk5PDI4880ufzDxkyBICqqiri8Tha7eEvWpVKxSOPPMLChQvZvn079957L3/6059SOu5//dd/UV1dzapVq7j99tt5+umn2ye4fROQDk0WDcYTJCQZjUrApE1+BPT3xBSFJNKhfthA3E9CjqMRtJgPVVFVggr27YMlS5J/duyACy5Afu89Nrc043Zmsmzx36gZWoLdbGass4hic3aXwvZIREmkOljVpcOBhNS+3KF3sN2zFaPaSJG5mCxjNvpejg0wOsPKs8CWFh/n5GV0Wp9qIIMsy+z3hPis0sOqg262NSbt0uwmDQtG53FOsZ1yZ98SqU52IMO0/EzerWrkrcoGbhrSdfvHqSAuShRb0sjpxVVBlmW21vhYvK6Kl7+spiUQY3heOvdfOpR54wvJTk/NlUGppiqcrpyWInfy5MlMnjyZN954o8v1jz32GMFgkO9973sAOJ1OFixYwO9//3umTZt2yipwgiDw3p3nfuPbFerrk/6ENpst5X0yMzv3wWVkdP6CTAWLJZm4JEkSHo+HrKyOEytMJhNPPvkkV111FcuWLaOsrKzThLKu0Gg0PPbYY8yfP58NGzbwm9/8ht///vfHNMbTDUmW8cXi+GOJDs4o7micdJ0Gq06rCN1+RpIlvDEXvpinPeoWwFu5Dee/PyFt6b8R1q9vXy7rdMTMeva5tvN+dRi9WsA7fjgzMwcywJLdh3YZmZpgdY9b1AZrKDAVMsY5FpPG3KfPhHSdljKriY3N3k4iN9VAhgvLM7lr+Q7qW6MYNSomFdq4a0oJ79Q38MPhxV2K51Q5mYEMerWKi4tzWLq3houLs8k+AVZdqRKMJpDlpHdttSdEns3INeMLCUYTndLG6n1hlm6oZsm6anbU+8my6Jk/oYgFE4sYkd/1vIhUUQSuwunEaSly2zgyOa2NcDjM66+/zvnnn49Gc3j4EyZM4LHHHuPNN9/kpptuOpnD7IAgCMcdX3i6o9cnqz2nyjmjtbUVSL7XR04iO5KcnBz+/ve/c91113U7oawrzGYzf//737n66qt55ZVXqKio4IYbbujX8Z9s2gSuLxrvtE6W5fblitDtP9oErjfmRqvSYdZYABUgYbz5JgyfrAFAVqmIT5uK/8qLcF98PvtUKiqbmmgIWRnqMLFw4Lg+TdoCaIo0dRDVXY4PieZIE7lpecd0fWMzbby6v45IQuzQQpBqIMOCkXlcPiSbwZlmxuZa0WtULKtsIM2lYnyW/ZjG1BUnI5Dh/LwM3j2YrOb+oA/hFf1JJC7y6Ie7efyovth7X9/GHdPLuXvWIGRZ5u1t9SxeW8VHXzehVauYMyKX31w2jBmDsxRxqvCN5LRWY11VF9atW0c0GqW0tLTD8jZrp3Xr1h2XyJVlmVCoZ4PraDSKJEmIoogonr5V2xNFcXExOp2O1tZWXC5XShVdSZLa/257z7padiTdvbfbtm0DoLy8HK1W276dLMsd9hkyZAgPPvggd955Jz/72c9YtGhRp23a9jtyDLm5ufz1r3/lxhtv5MEHH2TAgAFd7ncmIAgCCAK+WNcJQ234YwmsOi2SJKXsgX2yEUURSZIIh8PtPzunI4IgoDfoCHmbyHt/HdqXX2PvQ7/B60jHrDUwaOENiDEJ/5UXkX7tD2k0xTnQ6uNrf3KCWWlaPuvFEJMcTiLhvt1IarQaomJqfrFRMUosHiMR7/lnoysGm/UkJJkv65sZc8jiTKvV8t4eV0qBDCv3u7huRA7xeBwxFiEYlVlV18JIuwU5FiUU6/OQTikX5Dl4pbKRGdlWsk9yb64oaHhs5d4uHQ4icYmH3vsaWZaZMSSb7z+7gYnFNh68fCiXjczBaky2esWiEc6wt1zhW06q31OntcjtitraZJXg6EfdbY+w29YfK7FYjJ07d/a6nUajIRpN7cvkm8iMGTN45513ePfdd7n88su73S6RSOD3+9sFYjQaba8Ax+PJCmIsFutUFZYkqdtK8csvvwzApZde2mEbWZY77TN16lRuv/12Hn/88faWha7OdeS4AAYNGsT999/Pvffey09+8pMuj30moNVqiSAkJzb1gCzLBOIJDMjt/y79jUqlQq1Otsu03TT0RaxGo1ESiQT79+8/IePrD4RYjOKvv0az/G3yl72DcOiGuWl4Edu++x0AVp1TxujLX2CMs5hdvlrW1VYSkRJkJfTkxw3scAeTNd/6anY2pHZeSS0hGhPk5eSh06TWR65T6WhpbmlvP+orThV8VlmLoamWiChjyc6nIZCaVGoMRGlxuamrTbYvNCSgMSIwURVl507fMY3nVOKUwSTAizv2c5nDgN1uR1CrkUXxhPrFG41GCkrLeGzFnh63+8tHe7l9ejlvXjeAdHUCCFBXuZe6EzIqBYUTTyyW/KxJJBIdnuofzRkncn2+5Afg0TPm1YdmkB6v8NTpdO0Tm7ojGo1SV1eHXq9Paeb+N5Gf/vSnrF69mieffJLZs2d3W819+umnufrqqzGbD9sQtb1nbRPGtFpt+7K2uzOVStXle/vuu+/ywQcfMHToUL773e92mBgmyzI6na5T7+KPfvQjamtrefXVV8nLy+vyuEeOoY25c+dSX1/P448/3mHcZxKCICD2UsVtQ5RlNFpN++9Sf44h6dskEBXjSTsrlQq91gDIyf9SvSvXaCgqKmpvmTldEOrq0D7wAOo330TwetuX+wrz2DNnGlXnTGhfJiLzpasSgAKTk1yTndHpBaRrk+1Zq3ZVU66XGTmkZ1uqhJzAE3fjjrsIiSEEVESlGHnGfA607u/gqnA0KlRkGbOJq+J96q0/kvKv63iv0kOloGNrU5CbxsbJTEs9kCHD6cCanixObD3QgC0e4IIRA8/Ylpl5Lj9ZFhNF5jTWNrpxB+I4DFomDxqMJEkIYqLfn5JotVpeWFdDNNHzzWIkLvH6plqumzj0hN3EKiicTNq++3sSuHAGity2L7ejxWzb6+7CBFJFEATS0nr2FWzzTFWr1f0uCM4UCgsL+cc//sF//Md/sHDhQu6//34mT57c3mLS2NjIiy++yJw5c0hPT2+/cdiyZQvl5eU0NTWxcuVKAPx+f/v7eGRC2pHvbSAQ4Nlnn+WJJ55gzJgxPPHEEx16tgOBAB6Ph9ra2i4dH37zm99QW1tLdXV1h+PG43EaGhq63e/WW2/l4MGDvPnmm2fsv7VGlZpo0AjCCfEClmWZYCJKOBHr0CkaiEcwanSYNPqUzqtWq1GpVBiNxlN/wyHL4HLBoSdKssMBixcjxONIebnI867mzSkVNAwr79ZyZbO7ijHOEmYXHg4tiYsSe/0hLh+Q1+XnkCiLuKNumsONuKNuZGTsOjsV5sE49U40Kg2iJJJvKujSXaGNfFMBAkKX8x66Iy5KbKr3s+qgm9VVHqp9EQQBrNk67jyrhPNLnVj0Gh5bk1ogg1arRqvVEpckNrpaOTcvA7PJlPJ4Tjcm6Q28VVnPi3tqGJ1hw6RR44s
leHDTbkY6rcwpzkF3nH2v0bjIV/V+Nld72VztZepAJ9Wentvr2qjxhBFUatLSurZ9U1A4k0h1suwZJ3KLipLVDe8R1ZIjX+fm5p7kEX17GT16NO+++y7PPPMM//3f/00kEiE/Px+73U5ZWRk33HBDe5Vo6tSpXHvttfzxj39k1apVTJgwgenTp7Nt2zY2bNhAQUEBO3fuZPny5QBs3LiR888/n5ycHCApfgcPHsyjjz7KrFmzOvyAP/zww7z++utEo1Guvvpqpk2bxoMPPthhrFqtlr/85S/85Cc/aV/27LPP8swzz1BXV8dtt93G6NGjefrppztd5wMPPHDcbTCnEpNWgzsa77GKJAiH7cT6E0mWCSWihBKdH2PLQCgRQwCMGv2ZUcHbvj1p9/Xii8iZmURWrSSUCBDUBUj7wy+IDynHOuMyDoRaaWjY1eOhRFlij7+BYfaC9mW7fQFiksxwx+EJlbIs4415aY400hJpQZRFzFoLpZYBZBqy0Kk7Vk/VKjWF5uTnpCfqxmFwIsgaZCGBO+LCrndQaC7qNV4XoCUU4/MqD6sPelhX4yUYF8ky6ZhSZOeOySW8WVvHmEwr1wxKTmALx8U+BzJsafERTIhMyT1zfakjosintS0MtlqYnpfJq1/WUOcNk2czcue4gdQGI3xc28x5+Rkp+9YeLWg3VXnYUe8nLsqoVQJDcixcODSbfFtqYQ8FdqMyuUzhW4cgn66zTIDp06eTn5/P888/377M6/UyZcoUZs+e3cFjdc2aNdx444387ne/4+qrrz6m882YMQOAFStW9LhdJBLhwIEDlJaWnvqKkoJCL/TkrtCGVa89Ie4KkizjirT2ONdfAJwGS6/nPmW/dwcOHPay3b69fbFkSqNq+wqEjEzSNCbSNGaM6jRkYEPz/vaWhJ4Y5yxlQmZpu4PCi3tqWN/k4X/OGkZIDNIUbqI50kRMimFQG8kyZJFpzCJN07uwCcUSgMB7e5toCMTIMeu4sDwLZJk0Xdc3NJIss6s5wKqDHlZVudnVHEQAhmdbmFJkZ+pR3rWLd1fzZbOX/zl7ePu/XzQh8cym6pQDGR7bshd/LMGvJ3RMGTyTiCZEgnGRv67Yw18/2tsp+eu2aWXcNqMck1aNvotAi94EbXm2maF56QzOTac810xRRhqySqDAbMCs1lD2q+Udznk0Bq2Kfb+/GItBqeIqfDNIVa+d1pVcWZY7VZ9sNhsXXXQRq1ev7pAgtG7dOqxWKxdeeOGpGKqCwmmLShCw6rQggz/esS9QEATStSfOJzcqxnsxs0pWdKNiHGOKE6aOhz4nvd1zDzz00OH9tVpCF5xDZN7lCJddRq41G51K335M+VDl2qRJTYSbtfoOFmFbXV5KLCo2ur4kLIbQqrRkGrLIMmRh1lpSHns0IfHc5touAhkqOwQyAASiCdbWeFld5eHzKg/ucByLTs3kQjsLRuRxVqEdm7FrcTQ208aHNc1UtoYYkJ5sNehLIIMvFmeb28+C8sKUrut0xROJ89TH+3j4/d2d1kXiEg+/vxsB+OF5A6kPt/LO1w3sawxwsClIdXOIRk8USZIRBLCn67DbdYwYbsdq1WGzalEfqsDuJsTu5hA0J4+9oLyAoVYLt00r6/Lcbdw+rQxfLEGaToM6xfYlBYVvAqetyI3FYvj9foxGY6cvpp///OfMnTuXV155hXnz5lFTU8PSpUu59957j7snV0Hhm0gwnsCoUWPVa5OJZ7KMRhBI02iIS1LSfKGfv/tkWUaUU3NPEGXphEUN95o61obXC6+9hjx9OpGCLEKJAPLQApyCQOS8yUSvvgLVFVeRlpmPSdVR9IUSUXb7GtjprUMlCMwtGsvqpt09Xr9aUFFhzSEuxWmONLHf10hDCIY6Ili0dgakD8Sus/f5PUkpkEGASyqy+P0n+9jc4EeUZAbY05gzKIspRXZG5qSn1MtdbjVj1qrZ1OxtF7mQeiDD2gY3AgKTsvvPG/dI4qKE9gQ/ok9IMka1mr+s3Nvjdn/5aC+3Tivntqc3sq3WhyBAls1AfqaRyYMzGJBtYkC2GYteg16tQq9WH/pb1eG1Qa1Cp1ZhOPR6faOb22aUIxw6x9FV5NunlXHrjHL2+FopSE+9B1tB4ZvAaSlyX3zxRf7xj38QCAQIBALMmTOHX/ziF5x77rkAZGVl8cILL/C73/2ON954A0mS+M1vfpOS2b+CwrcNSZbxRuMYNWoyNHosOm27oBQlicZwFLNWg9PQv5VUQRBQpxhkoBJURKUEAqBTafpN7HaXOuaKNmHV2bEljKjeXoa8ZDG88y5CLIb31z/B87P/QC2oSZt7KeHK2RgKB2I86lpEWaIq4GKXt46DgRYQYIAli8HWPFSCitGOoh5bFkY7ihBliQ3NXyAjUxOwISByWfE40o/DPSKlQIbNdSwYkUeJzciMAU7OLrKTl2KE65GoVQKjnFY2Nvu4cmB+l9v0FMiwusHN6Awr5n7sBz86+avQnsbV45OV4hMR1KNRCby6MTWHg9c21vD4/DGIksyIfCtG3fFPZh2VYeOBL3exYHIht88o55Uva6j3Rsi1GbhqXAG1wQh/2rqH/xp35raDKCgcK6elyJ0/fz7z58/vcZuBAwfyzDPPnJwBKSicwQTjCURZTrYsHKJNRKpVKmw6LZ5oDLNWnfKkmFTRq7UE4pFee3INag2t8QhRMYEggF6lRa/WHJfgPTJ1rAOJBIYPV6F9ZRnCshUQDLUXseNDB6EtKCEvrQi9ypA8t63j7u5ogF3eer721RMWY2QaLEzJrqDcmoNBnXyPRSnBKGdSWG12V3Wo6KoPCeBRziLUgkBF+iBsBjvbmmsYkB4/LoEL8N6e5pQDGe45d+BxnQuSLQurG9zUByPkmlIXylWtIaoDYS4v7b/Jwt0lf93z2tb25C+D9th+xn3hOPubA+xvCbKvOcD+5iD7WwIsmFBETYoOB/W+CDcWWPt3ApgA4zLtPLJ1L/kmA+Pz7RQWmohIEo9s3UttMMKlJTndmXwoKHyjOS1FroKCQv8gy3KyF0+j6faxbbpOQyCewB2Jk5Om6rcqqizLSLKEUaPr0l2hjTSNDhCw6tJISCJRMUFUihOJxTsI3mOZI+uLeTotE2Jxsr/3n6gCSWEil5YSu/oKVAuvRTtyDF11n8bEBHv9jez01dEY9mFQaylPz2GILY8Mg6XT9k2RJmqC1VRYBzHaWcweXwPBRAyTRke5NYdQIsg29yYKTIXkpuUhSjJfeVq5sDCrz9fYRigu0hKK0RBIzSu8MRDtsZUgVYY50tGpVGxq8ZJrykl5v88b3Fi0GkY4+6fFLBhN8OiHu3tM/hKAu2ZWdFvR9YRi7eJ1X1Py7/3NSVHrCh7+GXaYdAzIMDEw00xZlrm3rJV2ToTDgUGtZk5x8n1/t6qRNw8cDvfQqgQuLcnpF/syBYUzEUXkKih8gwklRBKSRKax+1YEQRBwGHQ0hiIEE2K/PDpu88aNignsehMCSbuwI7WAQFLgpmkOT9zSqNRoVGpM6A8J3jgRMUFEjJOIxYmIcWqDHk
r02b22QgRiPnRfbsX0ytsYduwlvPx1ENSgkwjeOB9JTBC4ag6WKReQru/cEyrLMnUhDzt9dez3NyHKEoUmJxfmj6DEnIm6G3GYkBJExSgRMcJ2zxbSNCYyDBnY9HpEOcF2zxZCiSCQjNaVZIn9/hDhhMiwI6zDUnmP97hCfFHt4YsaL5vr/dw0tqBPgQzHK3ABdGoVw50WNjX7uLg4NZErSjJfNLqZlG1P2ce5N2SZXpO/Hlu5hztnVrC91sdXdf6kmG0Otldo3UcI2QyzjgEZZgZmmbhgaDYDMk0MyDAzINOE/aj3eEyhnV+8vrVXh4O2ton+RqdWcXFxNhcVZbO20Y0rGsep1zIp24EgoAhchW8tisg9Dk5j9zUFhUNV3DgGTe9tCEaNmjSNBk8kRppGfdxOC23euEn3AAGjRo9Roz+UeCahFlToDz3a765y3CZ40zTJCWz+RCuiJPFJw04+bvmaUksWZenZ5JvsHQXvjh3IixdjenEx6fsOtC+uWb8J16ABmLUGyv/8d2JSFDHaRIJEh0lvrfEIX3vr2OWrxx8PY9UaGZdRyiBrLmZt14/jI2IEd8SFK9qCVWdDpzosgkKJIFWBYJf76dVJd4Xtbj8mjZrS9J6twbzhOGtrvHxR7WVtjYeWUByDRsW4PCt3nlXCOcUObEZtyoEM/cWYDBv/t/MgnmgMu753kb3d7ccfSzAl19lvY3j5y+qU+mKXrKuiJRDlD+/sIsuiZ0CmiYpsCxcNzz0kZE0MyDRj7cZRoisEAe6YXt5lFbmNO6eXp3y8Y6Htd/y8/ExESVZcFBQUUETuMaHVahEEgWAw2KfEIAWFk0lElIiJEtlpqfVJOgxaaoMi3mgcx3FMQgvGo4cezetJ0yT7S9tEs1Gj67OLgiAIaAQ1UjSBWWfgksKx7A+2sNffyC5fHQa1lgGWLIat3ozzd39AtXUbAqAGZKMR96zprL9gEpXpCaRDE8FWNe5mtKOIMc5iomIYUZY44G9mp6+OmqAbjaCmLD2LGbZh5BitncabrFQHcEVcuKIugokAAgJWnQ2DyoDD4GyP1m2r5LYFMrREWgglgsloXUM2ANtdfoY50jvdXCQkma8aW1lT7eGLai87mwPIQJkjjdnlmUwutDMqJ72DLdexBDIcL6MyrKgE2NziY1p+7+J5dYOLApOBInP/fH7GRSnl5K8GX4QfnlPKbdPK+s031qTXcPesQQgkq8VHOxzcOb2cnx5HP3BfUQSugkISReQeA2q1GqvVSnNzM9FolPT0dDSa/psRrqDQH7SEoqiQIa4ikkjtZ9MoJ/AEImhEwzE94gwnYoQSMYxqLWpBJiJG+nyMI5FlmUQigd/vx+/3Y7PZyDIl/0ySzLiiAXYbJPb6Gwi4DnDJ1m1IGjWhGedjuO4GNk0ZyfpIc6fjirLU7nxQbs1h6Z7PiEoJco02puUOpSw9C62q48ejJEv4Yl5cURfuiIuoFEUtqHHoHRSYCnHoHWgO7SNKIsXmEnQqC2kaE+/vaaYxGCfbpOPC8lEEE0FiUisArbEEla0hphUkxWFDa5Qvqj2sqfayvtZLICaSrtcwqcDGlcNymFRgI8vc/eQ0o1bNTWMLEQRSDmQ4XsxaDYNsyZaF3kRuMJ5gc7OPKwbmHfdnpj8c58OdjahVArnW1ARzgd1ITorb9gWDVs1dMyu4a2YFSzdUU+MJU2A3trconCyBq6CgcBhF5B4jOTk5GI1Gmpqa8Pv9p3o4CgodiEsS/lgCs1ZDsA9iNRkhG6dFEEjX9a3KFZcSRMQEOpW6vRWhv1Cr1eTm5mIFeOYZ5CVL4MMPMd71I3Luv5NsYxaxK77D7mCCTVPHQYaTy4rGsnHvqh6Pu9ldxShnMeMzBlBsdmLTmzqsT0gJ3FE37mgL7qgbURbRq/Q4DE6c+gysOmtHr9228arUZBryeHpjDYu27OwgNB/9/OChQIYC1Co1m5pb8AdFPtvj4fFPDlLpDaMSYFiWhYUj85hcaGdIprlP1bm+BDL0F2MyrLy0t5ZQQiSti1SvNtY1eRBlmcnZxxbjW+sJs3x7Pcu21fPZnmbioszlo/N4fP4YfvXGtlPWFwuHLcpumlJKQpSUGF0FhVOMInKPEUEQsNlsWK1WRFEkkUic6iEpKLTzwu4qmqISdw0e0Of+2p2eVp7/uoprK3JSngi139/Ehua9VNhzGJFR2q9PNTSJBOp33kFYsgR5+XKIRtsftQt79+HQZ2DRpKNO18B/3kf5oUlv+w5NFusJUZbY529klLOofdmR/bW+mA8ZGZPGTL6pAKfeiUlj7vX6UgpkAKYNcHLP218jyhD2ephcaOM/JhQxocBG+nF6uqYayNBfjM20sXhPDVtbfEzO6V7Afl7vZrgzHZs+tRshWZbZXudn2dZ6lm+rZ3ONF41KYGpZBr+/fAQXDc+h2GkiGE2c8r7YI1EEroLCqUcRuceJIAhoNBo0GuWtVDg9qAuG+bwlwE2Di0g7hp7xMbkGPmny8WJlEw9kZ6Dv5ct6r7+RT1y7GezIY0rOkP5t25Fl5GHDEPbvB5KODLFBA4lfcwWahddjrBhG2lHnEwQBo0ZHMJFaq0QgHiWaiNIQru/UXzvAMhCHwYlB3beghJQCGbbUsXBUPmeX2yhzpPGjUSUnrOXpRAtcAIdBR7EljY0t3g4i98jUsfpghH3+ID8aVtLjseKixOq9LSzbVs872xuocodIN2iYOTSb22eUccGQbGxHORycbn2xCgoKpx5FmSkofMN4t6oJu17bYzWtNxaUF/DrtTt552ADlw/I63a7A61NfFi7nbL0HM5LUeB2O/FMkmDVKli2DP7wB2JyHH/ch276ZIyJCJGr5qJasBDj2LPQqXoWKmpB1a0TwtGYtXoaIvXUhmpw6J2d+muPhVQDGVbsa+HcUgcVtt6rw2cCYzOtrG/0EIjEAaFT6pgoShSZjYzJsHXa1x+O88HORpZtq+eDHY34wnEK7EYuGp7LnBG5TC3LQNdLm4XSF6ugoHAkishVUPgG4Y7EWNPg5qqBeWiPo3qXnWbgwqIsllc1cnaukyxj54lOVQEX79Vuo9SSyYy8oT22RUiH2gYCcT8JOY5G0GLWpoMso9q0GZYsgZdegprkY/zmmRNonTwCtaBG+MMDGP6Wg0Xbt6p0eXoOqxp399iyoBZUVFhziIhhCk1FXfbXpkpLMMYX1R7sRm2fAhnKLUbKrabeNz4DmJBpZ0q2k0c/3MNfPuqcOnbbtDL+c2ZFe2W3xhNi+bYGlm2rZ9XeZH/tyHwrPz5vIHNG5DKyoLOzRW8ofbEKCgptKCJXQeEbxPvVTRjUKs7NyzjuY11SksPnDW5e2lPD7SM7xr/WBt28U7OFIpOTmfnDexSHbfG6vpgH+VAchLq2AfGZl0l/7V3Ys+/wtulmgpfNQrDbyTbkkaYxI5iPrcKZkOOMdhS1uyh0xWhHEXEpjkXbObWsN+KixJYGP2uqk761e1xBBOAX5w4ky5SaBVumSYdZpzkp7QQnA4tGwyMffM3D7+/utC4Sl3j4/d2oBIH5Ewq56
Zn1bKnxoVEJnFOewX9/ZwQXD8+l0NGzV3BfUASugsK3G0XkKih8QwjEE3xS18KswiyMPcxuTxW9Ws01ZQX8/asDbHP52uNX60NellVvITfNxqz8ET0mj7UJXG/MnWxHOCTmNNV12B98AgDZYEC85GI835mJZs5lWCzZWI6jVQAgJkZxR1oY5Uw+pq4KuigxZ6IRNCTkBJWBZopMTkY5C2kJN2Ew5adUxa3xhfmi2suaag8ban2EExIOo5ZJhTa+OzqfSQU27EYtobjInz/vPZDhgrIMdnpaj+taTydkGf6ycm+P2zy+cg+3nF/G+RVZ3DmjgguGZvcpeEFBQUEhVRSRq6DwDWFFTTOyLDOjoP+SrCZk2fi41szi3TX8dpIFTzTAsupNZBktXFQwCk0vvbE0NyMt+l9yX1lGbGgFrj/dB0B04mhaF8wlPG0KoTkzKModRQaq4+pLjUtxmsJNNIUbcBicaAUtu707GWodyghbKe/ubaIxECPbbGRO+XhESWS3dweZxqxuBW4oLvJlnY8vqpJhDNX+CGqVwKhsC98bV8jkQhvlTlOnVg0Beg1kuHZUHrXBCKXp34xWBUg9dezNzbX87vLhJ2lUCgoK31YUkaug8A0gKop8WN3EOXkZffa37QlBEFhYUcj963fyxv5q/OIB7HozFxeMRtudwPX74Y03YMkShA8+IEMUAdDt3o/rf34FajWoVDQ/+WD7LoF4K+k6W5/HJ8kS7qibpnAD7qgbAIfeQbrWikVrwarL5PlNdSzaUt/Rq3Z1JdeOyuW7Y4ZiOCLyWJZl9rpD7WEMW+r9xCWZXIueswrt3FFoY1y+FbOu54/OVAIZFo7K5+9fHeBnY06erVV/I8syB90h1h1wk52uTzl1rMYTVvplFRQUTjiKyFVQ+AbwaZ2LsChyYWFWvx+7wGxkSo6N96pbOK/AyCWFo9Gpu/nouOsuePJJiCTtuwQgOnoogXmXELji4qTA7YKEHE857leWZQKJAE3hRprCTSTkOGaNmVLLADKNWehUyX7YUCzB85vr+dfGzlZeSa/aWhAErh2ZxxfVbr441FvbHIqh16gYn2fljrNKmFxop8hq6HOVuadABoC/bNtHQT/F2vbGkTZex0MgmmBTlYf1lW7WHXCz/qCH5tbkJLuHrhxJXh9SxxSBq6CgcKJRRK6CwhlOQpJ4r6qRSdkOMrpwQThefLEQoqoGjWCiNZJxOM0skYCVK2HaNNAml4k6LepIhHj5ABLXXEXimqtoLu49UEIjaHsVkTExSlOkicZwA6FECK1KR7Yxm2xjDiZtF4/8BaF3r9rNdSwYcbit4MLyDCYX2hmVk94vqWDdBTI0hCJ87Q1wYVH2cZ+jO4LRBLJMJxsvOOxA0BOyLLO3OcC6A242VCaF7fY6H5IMFr2GccV2bjy7hAklDiYU23Ga9QQiCe49xaljCgoKCm0oIldB4QxnbaMHdzTOxSdAMLXGw7x5cCNpGg1Xl+WzaFcNVTUfULTsTXj5ZWhqIvHWmwQumEIg0Yp448VoL5uKZuwELDobJrWBlsBeZGS0Kh1mjQUBFTISgUQrcSmGgJC0E+sCURZxRVw0hRvwxDwICDgNGZRaBmDXOXoUxql61X64r4V/XD6i1xaE4+VIB4XtLj8aQWCwzXxCzhWJizz64W4eX9nZxuuO6eXc3UUogi8c58uDh6q0lW42VLrxhOIADMq2MLHUwQ/OKWViiZNBOZYuY4YFgdMqdUxBQeHbjSJyFRTOYCRZZvnBRkZnWMnv50ffgXiENw9uRAVc7lZjfPlPjHt+EdaGuvZtRKcDd80OgrHBpGnM2MvHY9SY2idySbKEXedEpzagU+loCjcTk6LoVHpyjYXEpCgxsWMymSzLtMb9NIYbaY40IcoiFm06ZenlZBgy0ap67zlOiFLKXrXNwRiGfqja9oXtbj/lNjOGfnDBOJpgNMGjH+7uUmhG4hIPvfc1AvCj8wby9tZ61le6WX/Qza6GVmQZbEYt40sc/Oi8gUwscTCu2N4pXaw7NBqBO2eWI8syf/lob6fUsdunlXHHzHI0mjM/+EJBQeH0RxG5CgpnMFtafNSHItw0pKhP+/XW/xpKRPl31UYkWeLKsJm0iWMBsALhNBMNF87AcN2lCDNmYE5zkqExd+lQoBJUWHQ2qgNV1AZrkDgseg607iPfVEChORnCEBEjh/psGwmLYfQqPXlp+WQZs0nT9O6dKskyO5oCfFrpJtOkIyNFYZZt1p9Un9q4KLHLE2Buae4JOb4sw2Mr9vS4zWMr9/Dj88t48tOkR/FZA5zcMb2cCSUOyjLNqLqo0qZ67j9u3s3CyYXcPqOcV76sod4bIddm4KpxBdQGIzy4eTf/NW7wMR1fQUFBoS8oIldB4QxFlmWWHWygwmamzNr7Y+9uU8fgsECtrSW+eBH7avcQu/W7zC0ai6ySiI8dSbQwh+BVc3hj8FS2BFX8fvJg7Pqeo3NFSaQ6WEV1sKrzeJDalzsNGWx2bUQlqMjQZ1KWXo5VZ+u1TzeakNhQ6+XTSjefHXTTEopjNWi4cmgO143O5/E1vXvVtk0EO1ns9gWISRIjnL33Kh8Lqdp4vbG5lo/uPh99P1aT1za6qQ9GeWTrXvJNBsbn2yksNBGRJB7ZupfaYKR9u/PyT+77rqCg8O1DEbkKCmcou70B9vtD3DVqYK/bdpU6BuCKNmFrlbH+eyWqF19C/vRTtLLM4DQDxnt+gjdWh4yM/uM3MOttODUWFiZg+xdf8cb+Rm4aUtzLmWVqgtU9blEbrCHfVMBQ2zBsOjvqXrx3fZE4q6s8fFrpZk2Vh3BCoiDdwIXlmZxb4mBkdjpqlUA4LvbqVXvd6DxO9oPz7S4/Np2WfFPPNwipUusJt7ccjC2yp2zjVesJoz4OX+KjSUgyrkj88PGDEWoP1He5rSsaR5TkLvt6FRQUFPoLReQqKJyhLDvYSIHZyAhHzxXBDqljR2B8/1OsTy3CuHI1QiIBJC2/GsYNJ37NZRjVMmk6J2atBa3q8KN/iw6+MyCPF3ZXc15+BgN6CDNoijS1i+o0jYkMQwaCrEEWErREWgglgkhItESayU3L6/Y4tf4In1a6+bTSxeZ6P6IMw7LM3Di2gHNLnAywGztVfVPxqr1xTGG/uCikQpuNV3M4ynBn+jEFXwSjCTZXe9lw0M36Q44H9b5kdbTQbmR8sYN8W2qxuP1t46VRCTgNqXk0O/VaReAqKCiccBSRq6BwBnKwNcR2t5+bh5akJJZ8MQ9EY0kVq0sKVv3WHaS9/wkA0RFD2HPxDLbMOptJ46dQZMpGp+7ejuz8/Aw+rWth0dfV/Gr8oE6JX5AU11ExikFtoNg0iDSNiff3NNMYjJNt0nFh+SiCiSAHg18TFaNIstTeNiHLMjubA4eErZu97hBalcCEfBs/O2cg5xTbyTT1bpfWk1etcGj9iaQrG6/rxhYS66GFog1JktnXHDjkdOBhw0E32+v8iJKMSadmTJGd+RMKmVDiYHyxgxxrsjIciCT45etbT4mN16RsB4v31BCXune10KoE
JmU7+v3cCgoKCkejiFwFhTOQdw42kmHQMSHL3vOGokj4/bfIWPw8aW99gOfPD6BesBBQoVr4XXwxEc+Vs1nrsOCLxbm0cAy5pl6OCagOJaH9ceNuVtW7ODcvo31dOBGmJdKMRqXBqDYy1DbmUOrYzo6pY58fPJQ6NoZgwkNCgi/rPHx6INlf2xSMka7XMKXIzvcPReiajsHmqzuv2hNNX2283MEYGw4e9qT98qAHb/iwhdeEEjvfm1LK+BIHQ3Is3VZhT6mNlwCzi7J5q7Kh201mF2XTj10SCgoKCt2iiFwFhTOMxlCE9U0erqso7PqRryzDF1/AkiXIS5diamxsX2VfuZ5dc68gEI9gzsqi/IFHiMaCiLU7OCengJw0W8rjqLCZOSvbwSv76hjuMBIS3TRHmgnEW1EJKnKNeeQYinluU03PqWPAjIEZXLN4HcG4SK5Fz/QBTs4tcTA6J72fH6mfHIGbio0XwHcnF/P75TtZX+lmX3MQAIdJx4QSO7dNL2N8sYOxRbaULbwgGfRw96xBCCRdFI628bpzejk/7cIntz8wqNXMKc4B4N2qxg4VXa1KYHZRNnOKc9ApaWcKCgonAUGW5Z7d0r9FzJgxA4AVK1ac4pEoKHTPc7uq2Njs5aGzh3cWC4EAjBwJBw60L5IddqSrruLri87js4psxCN0sVpQMcpRxBhnETExgkWX+oz/mBhjv7+BR7e0UGaLcVZODIfeSaYhE4feiVqlJhQTmfXs2h5DGfRqFcu+O54P97kYkWOhzJF2TP2qpxOBSILSe5f16HJg0KrY9duL+OnLW8gw6xhf7GBCiZ3SDFO/XH8wmuyzXrqhmhpPmAK7sU+JZ8dDVBSR5aSLgisax6nXMinbgSCAvptoZwUFBYVUSVWvKZVcBYUzCG80zqp6F3NLc5MCd/9+WL8errkGAMmUhpyVgdDUSHDODKJXX076xfPZ1FrHl67KTscTZYmNrkoEYIyzpNfzx6U4LZFmmiPN+GJeBATOyknnszod8wYMoyTd0mH79/amljq2Yr+LK4blpPo2nNYkRImlKdp4vbm5lqdvnHBCxtEmZG+aUkpClPq1It4bbUL2vPxMxUVBQUHhlKGIXAWFM4gPqptwelq4YNPHsPQlWLsWWaslPO0sAhYNwUQA9d8fQJNfhMmajV1jQZRhs7uzT+2RbHZXMTajpMt1CSnphNASacIT8wBg09kpT6/AachgcpaaPd6dvLinnnvGmhFl2NkcQJbllFPHGgPRk9Iv2+Zw0J/UecOsO5CMwl13wM1V4wpoao30viNQ4wmfFAF6MgXu0SgCV0FB4VShiFwFhTMBj4fo0pcZ+dT/cdXmDQhSskooq1REzpmIq2YHDBqETefEPLKkg+XXLk8NotxzVVGUJXb7GhhmLwCSwtYdddEcacYTdSMjY9VaGZheToY+A5368PFlWWZaThZ/2XiAH1RvZb87TDAmcsukYjJNpz51rCuHg2N9bB9LSGyp8bLugJv1lUlhW+MJA1DkSGPCoRjcbbW+lI7X3zZeCgoKCgqHUUSugsIpordo3Q688AL6O+5g0KGX0YljaL3qYsJXXkpaXilZ2nR0Kn378WJigsaID42gJhA/XFV06E0MtGSjETQk5AT7WhtxR5MTngLxKMF4kIOBSjxRNxISFm06pZYBZBgy0R9hKdYSirGhxsu6Wh/rarw0BWOoBAgaJeaPyOXsIgdDMs3ERInHPj91qWN9dTg4mgZfhHWVLtYdcLP2gJvN1V6iCQm9RsWYIhtXjMlnYqmDiSXOdgsvgCE56fzitd5tvOadABsvBQUFBYUkishVUDiJtEXrxqUYoiyiFtTtVVeVoIJYDN59F5YsgYsuguuvJyHF8Vw6g8hfBlN98SyKfnA5+rIhmDXpONVGAPzxMJWBBhpCXhrCPtzRADIwObMMk0ZPutbIudlDsevMvL+3maZgnCyTgTnl43FHAnzWtAOzVk9LtJmoGKHYUkKGIRODOincQnGR1TXudlG7z51M1SpzpjFzYAYTC2wU2vQ8sPFrHHYNI7KTvblxkVOWOpaKw4EA3DWzApNeQ1yU2Frja287WF/ppurQdRbYjUwscfCdQ6J2ZL4NXQ8euzIyt00r4+H3d3e7ze3TylDm/SooKCicOBSRq6BwkkiGI8RQCWr2+luSNl5aA+WmDORPPkb30isIr70GnmTfa6K+hqarzicihvlSlHn374v49fhisk3pNEcD7PQ20Rj20RD2EhaTfqp2nYkco5URjkJyjDbsujTikkixOZsXNtexeMuuDlXVP39+kIWjcrlu9AQMGhWSJFJsLiEhyexoamVdTRPran1sa2xFlGSyTDomFdi4YUwBE/KtOI+ytppTnMNblQ1MzXWSazKc0tQxWYbHVuzpcZvHVu7htunl/MfzG3h9cy2RuIROrWJ0oY3LRuUxscTBxFIHeTZjn869w+vnthnlCMBfPtrbycbr9mll3DqjnB1eP5NynMdyeQoKCgoKvaCIXAWFk4AkS4iyxFZ3DZvdVckeWVnm7P95iviyjzA1u9q3FXOyCVwxm8BVc1AhoFfb+by+npJ0NRtc+3i3thUJGY2gJtuYzlBbATlpVrKNVgzqzrGqcVFm0eZ6nu7Gq/bpjbUICFw9IocP9rpYX+vjyzofwZiIWadmXJ6Vn55dyoQCG0VWQ48tFrOLsllV72LJnhp+MmoggiCcstSxl1N0OFi6oZrZw3MZUWBjYomDUQVW9L14yIYTIu5IDFc0lvz70B93JM7ZuQ5awjFeP9DAdycXcvuMcl75soZ6b4Rcm4GrxhVQG4zw0OY9TMy2K+4DCgoKCicIReQqKJwEEpLEJlclu7d+gZh/yCpLEHDsOYCp2UUk3YL/0ouw3/gD9o4uxidJeGNRGmsrqWqV8cXMVDgi2HTpDLbmkpNmw6E3tcfgHo0sy4QSQWRAwMALWzoL3CNZtKWOBSPzeGtXExa9mu+OymdigY3BmWY0fRBgOrWKBeUF/GXbfja3+BiTaQNOfuqYPxyn2hNKadsGX4Sbzi5pnwAmyTLuSAx3tE24xnBF4rijMVoOvQ4lxPb9VQLYdTocBi1Og44Mgw5RkmkKR3lk617yTQbG59spLDQRkSQe2bqX2mCyT9qp1yoCV0FBQeEEoYhcBYUTzYEDqBYvpuy5/2PinkqeW7mYYHYyBnfjzQvYtnAu1VPGI+j13FA2lS1VX+KLhck0WCizZLOzJcoQu54fDx3b42lEWcQX8+KOunFHXESlKCMdo1iZolfth/taePqKkcdtsTU6w8pwRzpL9tQwzJHeKbCiv9tQE6LEV3X+9l7adZVu5k8oJNeaWotBgd3Ibl+Af1c24I7E8ERjHPl2GTVqnHotDoOOcqsJR5Ydp0HX/seq6yxUB6SLvLi3hrgkUxuMUHugvtN5tSqBSdmO47p2BQUFBYXuUUSugsKJoLERli5NTiBbswYN4AREjYasbbs4kD0VgLoJow7vI0vs9TdxSeEYjBodakHFVpePpvA+bhxc0uVpomIUd9SFO+rGG/UgIWFQG3A
aMrDr7IiJNBoCqdlZNQdj9EfQmCAILKwo4Ndrd/JOVSNzS3P71cbLFYiyrvKQhdcBN18e9BCMiWhUAiMLbFw4LIfxxcle2l+9sa1Xh4OrxhWw2x/AoddSZjXh1OtwHBKwDoOONM0xJHQJydaNtyobut1kdlF2v7zfCgoKCgpdo4hcBYX+5r334OKLoc3LVhDwTz2LjbMms3/mVKJWS7e7BhNR0jS69jaE5QcbGZCexiCbOXksWSYQb8V1SNgGEwEA0rVWiszF6FQ2djTE+HCHlzXVB7h0cBYZaSffqzYnzcCswizWN7m5ID/zmG28RElmZ33HKu3epuQ1Z1r0TCp1cM/swUwsdTC60EaaTkNzOEo4kSAQT6TkcOCLJRidYWNclr1frh3AoFYzpzjZlvJuVSNx6XBpWKsSmF2UzZzinM6xzAoKCgoK/YYichUUjodQCN56CwwGmDsXgMSkCah1OuIjBuO/8mI011xHTbqFnY27ej2cWatvF7h7fQF2ewPcMrwYV7SlvWIbl+JoBA12vYO8tAKaW/VsONjKmmov2xqqEWUothk5r8TBhHwbFU4Tj685+V61l5bkMNKeziPv7+Z/3k/NxssTirVXaNdXutlQ6aE1mkCtEhiel860QVncc2FS1JY40xAEgXBCZJenlVcP1PGVu5WmcJTLS3PJNOhScjjY42ulIL1v7gmpoFOruLg4m4uKslnb6MYVjePUa5mU7UAQUASugoKCwglGEbkKCn0lFoMPPki2IrzxBgSDyGPH0nrReQQSrUTUIVQ7VqLNKqAmGKIu1Mrs9IGsbtqNKEvdBjKoBRUV1mT1L5wI89q+Shx6CItfsdMrk6ZJI9uYgyCls7NR4t1qL19UH8QTiZOmVTEh38bdUwcwudBOfvrhYIJwXDwlXrUGjZpck4HHV/Zu43XrtDLmP7WG5duTj/cdJh0TSxz85wUVTCx1MLbI3t7WIMkyB/wh3qps4Cu3n33+IJIMWUY9wxwWrnbkM8RuQQAe+HIXC3pwOPjT1j3817jB/Xzlh9GrkxXq8/IzFRcFBQUFhZOMInIVFFJl1Sp4/nl45RVwu9sXJ4oLaZ02Hk+wFqM+Hbs+i2qrkU11lYQTMQbbkgJyUuZAHLr0bgMZPHE/kiyxoXkddcEou7wmLixSUWweQL1Pz7qqEGuqPexqTk5iGpRhYu6QbCYX2hiRbel2wtip9Kp99cualGy8Xv6yhhvPLuWy0flMLHEwMNPUwaqsJRxlfa2Xr9x+dnpaCSVEjBo1Q+0WrqsoZKgjnSyjvuNxRZFxmfYeHQ4uLck5aX2xisBVUFBQOLl8I0TukiVLuP/++zsse/bZZ5k8efKpGZDCGUOP0bqyTAcF9NhjSYELiFmZ7V62TJ6MWZtOgdrM/tZmVtTtwB8PU5Gew4TMAVh1aYhSgsHWAp7ZWMOiLgIZrh2Vx41jC1AJEpmGTFZUhokGw3y5V8/fP60hGBOxGTRMKrAxf0QeEwtsKffaAu1etdePLqDSGyYSFzFo1ZQcCjnoT4GbECX2HOqbPVYbr7YWhK/crXzl9tMYjqISYEC6iQsKsxjmsFBqMfUoHI/ui33zCIcDrUrg0pIcpS9WQUFB4RvMGS9yRVHkhRdeoLS0tH1ZVlaWInAVuqUtWjcQ95OQ42gELWZtOnAoWnfnzmQrwosvwltvIVdUEBaDxBbORW2EwFUXI55/LmaDnWyNBbWgYX9rM+uaN+CJBSm1ZHJRwSicBnP7OYPxBIu3NHbZMhAVpfblF5Q5+fl7bqp9EVQCpGVruW5UPmcVJj1rVcdRdpQkGVmGL3Y3tzscFB1yODhWfOE4X9X52FbrY1tN8u8d9X6iCYlfXjS4TzZe/niC1VUuvnK3ss8fQJQh06BjmDOdqxz5DLGZSdP27SNL6YtVUFBQ+PZyxovc5cuXM3v2bG6//fZTPRSFMwBJlvDGXPhiHmQOz3j37t1MxhsrMb78FsLmze3Lgy/8k+Z7/gNJFtHOmoL54tk4tenoVDpkWaY66GZt816aI60UmBxMzxtKttHafq7WeCsqQYVaMKYcyDA024TBAo9NH05Gmr7HfVIlEheP2eEAkhXvak+YbTVettb62F6bFLSVrmSlVqsWGJKTzogCK1ePL2RkgZWR+TbUKiElG6954wt55uuD7PS0MsRhYWFFIcPs6WT1w/UrfbEKCgoK307OeJH7j3/8gwULFtDc3ExmZv/ODlf4ZtEmcL2xw/20moM1ZP3w5xi+2Ni+TNZoCM84h8BVFxO9ZBYWrRWzxoJOpW9vbagPeVnbvJe6kJcco5W5RWPJS7MTSASoDlThi3nxxX1IssRIx2g+TDGQYcX+FiYNsDJetvabwA1GEzz64W4eei81h4NoXGRnQyvba33tgnZ7rQ9vOA4kJ4WNzLdyycg8RuRbGZFvpSLbgq6LloeGQCQlG69ALMHCisIugxX6E0XgKigoKHx7OKNF7scff8zXX3/N/fffz+9+9ztmzpzJL3/5S3Jzc4/5mLIsEwql1keocOYgCAJ6gw5/SzW6g9XIo0Zi1lgQCi3od+xBFgQiUycQvOoS7Av/g1CajE7WYUKHIAqIokSYMK5ogI3eKmojXhzaNM5xDsCsAVe4moOtOxARUaHCrLaQq8sjGjNS7xFoCMRSGmdTIEaZxcjIdGO//BwKgoAsaHhsRWoOBwv/sYZ3v2okIckIApQ60xiWa+FH5xQzLC+dYbkWctP1nfqYE7EIiaMuUa3RsMXlS8nGa32zh/Nz7EQj4eO+ZgUFBQWFbzZyitGZZ7TILS4u5oknnmDPnj0sX76c9957j/Xr1/PCCy8wcODAYzpmLBZj586d/TxShVOJEIlQsmMHmuX/pvid95ELC5F27aAp0kJMFUV87u9Yx05Bzs0gFGlEq9USc4U5WL+//RhhQaRaG8ajiaNHoFijxWKI45Wq8UZBFVOjjqpRRbRUewS2eyNs94WoDsncPL6QTFNqk8QyTDrMcoLqvd1XPvtCbm4u7+6PpOxwcOWoLMpNUQbadQywazFq26qzMZBb8NW14KtL/dw6tYaHNu/huz3YeD20eQ8XFmbiammhvr5z/K2CgoKCgsKRxGLJqkoikUCj6V7KCnKqcvg0RxRF/v73v/P4448zbNgwXnvttT4fY8aMGciyzNtvv30CRqhwUonHUX30EZqXX0b91lsIra3tq2KDyqj89/+RVTISQVYjCyL1wVqMmjTyTQUE435MKgvxeAJ3zM9GbxU1YT86lUCmQYVdJ2DWmLGo07Fo0hFEI182BFhT42ddrR9PJEG6Ts3E/HTOKkjnrEIbaQYDFz67vtdAhmXXjycciWDpoT82VTyhGIG4wEtf1vTYLtDGz2YN4u4LypDFxHGfG5JVZEGr485VW4lLctLGK8uOQaUiIklsaPJQG4ygVQk8NnUkcjyW8t25goKCgsK3l0suuQSAlStX9rjdGV3JPRK1Ws2tt95KfX09L7/8MpWVlZSUlPT5OIIgkJaW1v8DVDi53HYbPPFE+0upuBD56mtwX3EhaWPPoViloT4UISZK6N
RaBlgGk5BFWsItWPVWvvbvY5evieZIHLUAJWYjFenZOAx20rVWav1xVh90s6qqhU31fkRJZqAjjcuGZDO1yMGwbAuaI/o/UwlkWDgqD180QWEPsb89UecN8/k+F6v3tbBmn4sd9f4+OxwY9TogdWuy3oiIIrOLsnmrsoHaYITaA50rtbOLshEEMBj7P3VMQUFBQeGbR7fWn0fxjRG5bdx00028/PLL+Hy+Uz0UhZOBLMOmTUnLr2uvRR41iqgYJnbReZheWkLgO7MJXT2XtKkzMOmsWGWBlnAck1ZiQ7Wf5mCMTJOOWWU6AnGRDGMWX/tqWd1Yi0pQMdyey7iMgWgFHZvq/bz1lYdVByup9kXQqQXG59n46dmlTCm2k2sxdDvMIwMZPj/oZWqJHZNWTTAusqrSw1lFNq4ZmUtM6rmt4PBly+xrDrJmXwur97n4fJ+LSlcQgPIsM2cPdHLnjHLOq8gg3aBLyeHg6uO0E+vyuEd51calw5VarUpgdlG24lWroKCgoHBC+MaJ3JycHLRa7TFVcRXOIHbvTgrbxYuT/w+Eo600/ffdiLKIeupIYge2YjLYSFenIQgCkUSchARvfNXIkq0dk78e+7ySBSPzuH5MPqXmHMJinCJjLhtqWvnt1krWVnsJxkWyTDqmFNu566xSxudbMfahrUCvUbFwRB7Xjshn6YZq9ja0UmBL44lLhxEVRT6pa+byAfld7itJMl/V+/l8Xwuf73Xx+f4WGv1RBAFG5luZPTyHKQOdnDXASVZ6R7EdjCa4Y3p5l+4Kbdw5vTzl6+griletgoKCgsKp4Bsncjdu3Mh1112H1Wo91UNRSJEeU8eOJBxOtiAsWQIbD1t+SQY9odnnE5w+CZMmHbPGjF5t7HBMSZYQJYHnNtXyzKauAxme2VSDIMCciiz+9lmAnU2bABiWbeG7o/OZWmyn3GlK+THJ0UTiIo+t2NPJq/YXr2/ltmll3D1rUPuyWEJic7WXz/e1sHpfC2v3u/GG42jVAuOK7Vw7sZizy5xMKnViNWp7PK9Jr+HuWYMQSLooHO1wcOf0cn7ai0/u8aJ41SooKCgonGzOWJErSRK/+c1vGDJkCPPmzUOtVrNv3z4+/PBDfvWrX53q4Sn0Qq+pY21Eo6BP+sVKWg088jCqhkZktZrw9CmE5l2G6vLvkObIJUtl6CRAQ4kQjeEG8tPykVGzeGvPgQyLt9SxcGQeo3PSmTcsl7OL7Nh7EZGp0JtX7cPv70YlCMyfUMhdL21mfaWHcFzEpFMzsdTJrdPKmDIwg3HFdoy6votRg1bNXTMruGtmBUs3VFPjCVNgN7a3KJxIgXs0isBVUFBQUDgZnLEiV6VSIUkSDz/8ME8//TQTJ05k3Lhx3H///cdcaVM4OXSXOuaKNmHV2bFFdaje/DcsWYL81VcEv95CUA4RSgSx3H0zKrUW4ap5pOWW4FR19myNS3FaIs00hhtojbeiETQUpBXx3p7UAhk+3NfCbZOL0aj67zG6LNOrV+3jK/dwy/llDM1N58JhOZw9MIORBVa0/fQ436RP/rrfNKWUhCihUdoEFBQUFBS+wZyxIhfgd7/7Hb/73e9O9TAU+kBXqWMAQiSK8YNP0b38NsK7n0AkklwO+Fa/hzx5EjadHdNdv0Kn7pwEJssynpiHxnADrkgLMjImjRWdKpuWSJiGYISmYGqBDM3BGAf8IZ76qpICs4ECs5FCcxqFZiNZRn2fK5GhWIKlG6pT8qp9c3MtD101qk/HPxYUgaugoKCg8E3njBa5Cmcmvpinw2vTy2+T+ZP7UfkD7cvkQYMIXnUJ0vx5ZA0dg1bVta1VMBGkKdxIU7iRqBhFQk9cMtMYjrCz0U21S02dW8M1w91kpB0+xkBHGtMHONsdDlbud7HPnUwYyzbrseg0TMy2Ux0Is7rejTfWCCQdAfJMRgrNxqT4NSX/tuiSv0otrVE2VXvZXO1lc7WHzdVebp1WRksgmtJ7U+MJK1VWBQUFBQWFfkARuQonlUDUi+6LL5HsVrRDR2LVORAGT0DlDyDm59J65UUE5l1C+oTzSdfbuzxGXIrTHGmiMdyIN+onlBCISlrqAnCgJUKtO0GtW0U0oSfbrGNghp49QT//b+RgXv2qgZ9NKaXUnsarG2vY29BKrtXI3y4Zxj53iIdXH+DC8kzStGrmlR12OmiNJagJhqkJhKk+9Ofjg800u6N4vFGC/gQeXwx/MA5AukHD6EIbV4wt4PxBWXyx35XS+1NgNyoCV0FBQUFBoR9QRK7CiUeWYcsW5MWLMb24hPTqGsTv3YT01Cw8sRDxEcMxfvQBlqnnYhQgFGkgQaKD60KyHcFNQ6iB2mAz/rhEWFRT6xepalFT75Gp82qRZBiaaebGMQ7SzSpWNbUQFSXOK85CBTx52XD+/OFunvh4bweXgV+9sY1bzy/jycuG05XEDIYTVNYG2FLta6/Q1vmSLRUWg4bCzDSKBtrQm9VoTCpMaRo0KhUqk5raRJirxxXyi9e29upVO+8EeNUqKCgoKCh8G1FErsKJY+/e5OSxJUsQdu5EANSAbDYT0xtIJGQ+2x+gKRgnK3MosySBSCJGljGfSCKEIAgE40Fqg3XsDzTgicbwx6HeK1DjUlPv0dIc0KBVC0zMt/HdEQ6mFtsJSSKLdlezujbIhCwb15QV4DDoCEYT/GXlHh75oHPEbSQu8cgHu9GoBG6dVsZHu5qOaDvw0uBPClp7mpbRhXbmTyhidKGNUYU2SpxpHSa/heIJaoIRqgPJyq9erSYQT3DbtLIe43Vvn1ZGIJ7ArFd+LRUUFBQUFI4X5dtU4YQgSQm46EJUe/cjAJJeR2z2TNQLryc++yKe3eVm8XMbOgQy/PnzgywclcsNY/JRq3Qsr/qclmgYb1Sm1qOiwa2n2qUiEJOxGzRMLXZwTomDSQU2jNqkkHxtfx2f1LaQZzLwszHlDLEfjshNxeHgsZV7+PH5Zfzm7R00+COMLrRx7aSkoB1daKPIkdare0eaVkOFzUyFzQxAQpJZUd3EbTPKEYC/fLS3k1ft7dPKuHVGOeubPWSm9X1ym4KCgoKCgkJHFJGrcPy4XPDqq8j/fpPAi08TVMUIiSFs8y4mbd1WpAXXoLviGgx2J8FYguc21/D0xs5+tVFR4umNtQjAgpF5rK6E/c1GDrpkEhKU2o1cMdTBuSUOhmVZ2oWgJMt8XNvMq/vqkGSZa8oLmJ6fieYoofjyl6k5HLyxqZb37jqXdIOmX+zoNCoBg0bFQ5v38N3Jhdw+o5xXvqyh3hsh12bgqnEF1AYjPLR5DxcVZSkCV0FBQUFBoR9QRK4CkKw2Hi0KeyQQgDffRF6yGN57HyGRQACCb7+CeOkcnPpMTL97BI26syvCoi11PR560ZZ6FozMp7LRSLpOw62THJxT7KDIZuy07V5fgEW7qznYGmZKjoOryvKx6g6HNzT6I3y6pxlnmo5qTyilS6v1hjHp1P3qtzwp28HiPTU8snUv+SYD4/PtFBaaiEgSj2zdS20wglYlMCnb0W/nVFBQUFBQ+DajiNxvM
RFRBBnWNrpxReI4DdqkyBLAoO4mAWvHDuTf/gb+/RZCOEybDIyNGII4fx4ZU+egMRV32k2WZSRZ6lMgwz8vH4FR2/WPqC8W55W9taxucFNsMXLvuArKrGbcwRhv7azjkz3NfLq7mV0NrQA8Mm8UedbOIrkrTojDgQCzi7J5q7KB2mCE2gP1nTaZXZSNkmOioKCgoKDQPygi91tKTJR452Aj71Y1EpcOi87Fe2qYXZTNnOIcdGoViCJ4vUgOO6FEgEi0iYyXlgKQGFhC4pqrUC/8LrphIzscPypGaY230hrzURtyUx8OMDVrDI1HBDL05FXbHIx1mfSVkGRW1jbz5v46VILAvNI8VCF45qMDfLq7hS21XmQZSpwmzqvI4GcXDuLc8kyy0w0EIgnufWNbrw4HV58AhwODWs2c4hyATu+5ViV0fM8VFBQUFBQUjhtF5H4LiYgi7xxs5K3Khk7r4pLMWwfqsW/6kqkff4DqlaXEpkyk/umHkZHRV5QS/v3/QztjFpqJZ6ERBBJSAk/UQyDeSmu8FW/MhzsaxR+XcIVlajwqGjwaIsWtZKTpKEg38KtzB3brVfvfn+4j26zvFKu709PKczsPsr3ajzEq0NISZdFrB0hIMrlWA+eWZ3LzuQM4pzyDYqep07UJAtwxvZyH3vu62/fmzunlx/8Gd4NOreLi4mwuKspOVs+jcZz6ZPVcEFAEroKCgoKCQj+iiNxvIzK8U9XYaXH+gb1MWvEukz5+n8z6wxPDNF9swK6yYjLaUQsagj/7TzyxVlp9X9MabyUshohLMsE4tMZV7HcnqHGraPQaqPeBJEOh1YAoClw8JJNppQ6eWLm3W6/ap+YOx3QoQSwuSny8t5mn1u5nY6UXtzuKKMk4TDrOLc9g4YQizqvIpCzL3GsPrUmv4e5ZgxBIuigc7XBw5/RyfjprEAZtN60a/YD+UBvIefmZiJKsTDJTUFBQUFA4QSgi91vI2kY3iUOPy/NNBsZn2Zl454/JfevN9m0iBiO+iy7GceO1uM8bR6vop9JTSyAeQEYGGVSCkaCoosav4avGGLVuFQ1eNaGYGqNGxYQCG9eOsHFWoZ38dAMArZE4//vRvh69alUCXDupmJ+9upXP9jQTiUtoNQJjS+z8ZGoZ5w3KYlhuOqpjEIgGrZq7ZlZw18wKlm6opsYTpsD+/9u794CoyrwP4N9hhuEiDMMduQiIjApeQkXUSk2h1rKLeekts1Tc9l1XvKD5qrkZq264ZjdNt9VSMzRNI13X3W1B3Vw3M9DNyDsiMCaIDAjDZa7n/cNg4445M4cZv5//fI7O/B5GnvPl8JzfcWvcomDNgNscAy4REZH1MOTeY4xmATVFakw4sAeq+SkICgrAP/Jv4mp4XwQ4/xXmcb9A6ZNP4/3IWMSFByPcU4ocdT783ZwR7O4OD1k3lNXrcOp6FS7eqMUPFVKUa50AyBHt647Jsd4YHuaN/oGere6plUCCdw+336t2/ZHL+M1D0TA6CYiOVuCx2O6YMzQKChfndv9dZ3X78WELM+6PhNFk5mN0iYiIHBBD7j1Cd7Mc6u0fw+/zfRj37+OQmM3IUgZiVsgw6ExmdOs5GpL0UTB4euHZ2GCkDuyO8xXVqNSZkHPDFXV6M7Q1NdDWmKCtM8NslsDVWY6YAA88098biZF+CFV03L2gs71qPzutxuLH+yDC0x0hHp3rivBzMOASERE5JoZcB2aqqcW1nZ/AvGsXQo8dRZTR2HispF8cDt3QIzTGtUWHg22n1ZBIgKdjgvDql7koKDOhqs4JEgkQ4iVHTIAburlLUA8TbhmM+LK8DF+Wl0EhlyHI3RVB7i4IcneFj1yOikodrpRokVtUidEq/073qr1eWY8X/SPgLGMIJSIiojvHkOtgBEHAlapafFVSjsvfncVrLyU3Hqvtq4Lk2WeAKS9g3n+qsPj+nm12OHjjeAGeGxAMqckDD/aQY2SEP+KDveDh0vS/TJ3RhNJaHX6oqcO31ypx6mol/vLDNRSX1aKiUg9BAJycAH9vV4ztF4AQpXun5hHq7caAS0RERD8bQ64jMJlQ/vcsVH20A5WaCqxfshIezhLE9g3GjYmPwykiCNeeGIkqVRgG+Q3BvwpqsOGxMGw80naHg/WPxeBEcQXeG9+/1a4FpVX1yC2sQE5hBXILNThVVIlbdQYAQHSABx7uFYiYEAVC/d3h6emMcr0eMT4KRHq4Y2nmmQ571U4aHGr5rxMRERHdMxhyu5A7erSuIKD6+FfQbNsOn/2Z8L1ZBl8AZqkUM954BUHhfqgxanFhw0LIJDIEuAXC30mBilqgt68HNh653G6HA4kEeGFEJIxmM3RGAf8pqkRukQa5hRXILaxAcUUdACDA0wVDwr0xd0w0Bod7Y1APJZTuLR/l2zC/Um095jzUC2980fK9G6Q81Au39Ea4y2XsQEBEREQ/C0OuyO700bp1RhOKN2yC/7q18FYXwfPHcb2XAvVP/gKVEx+B1M+Akrrr8HRWwlkIxkl1LXKvleJqeQnmJkQhsacfNhy53G5dG45cRsqYaMzY9g0OnrkOswC4y6WIC1NiQlwIhoT7YHC4N0K93TrsT9tA5iSBurYWc8ZGQ4LbXRSa96pNeagXfjM2GpduVXfqRjYiIiKi1jDkiqizj9Y1XLmCc2Yp/lVnwrflt3B/4Q94QV0Eo5sb6h5LQs2kx/DDg/1Q42SA2SxDcaUbzlw34GxpFcqqqiFAgiBPKZKiFHgg3BufnVJ3qsPB3lw1piaEI6lvEAaHe6NPkOdddyMY6KfEqtzzeHZYGFLGRmNvrhrXK+vRXemKSYNDca2mHm+euYTlg/vc1fsQERHRvY0hVyQ/fbRuwwMZ3KROqDOZkXOjAv889T16bv8Qff52EC4nv8aZlP9D8bP/g5HBMgz+zURoBoagNHEoyqS1KKsVcLEQOFcqxdVyAfWGOrjIgJhAdzzb3w9jIwIR5OkKg8mMsmpdpzsclNyqx4wREZZtsyUBBvt7Y92Zy7fnHeKNsLBuqDebse7MZVyrqcfjEUHo5MVhIiIiolYx5IpFAHLKKvDyfdEI7eaK4lv1MJZr4PvFITx2YB+kR49AYr59tVWQSJCkK0XSfQGoNdahuLoKh+Nicf5sHfLLnFBWLQEgoLtChoejFXi4ZxAGdVeiotaAkwXl2HTkMr4u0OBUUQUWJKrQ3atz2wBCvd0s3kfWVSrFY+FBAIC/FZVif8H1xmPOThI8HhHUeAWbiIiI6OdiyBXJtzcrsfi+aFTUGGEwCjjxbRGmPjkULnX/vcpqjI/HtfFPwueFSTgtqcKJUxdx8YYTijVO0Bmc4SIDYgNdMXWAHx4KD8TNah2+vlKOLUeu4OsCDQpu1gAAunu5IiHSB68+FoNRqgBE+nXDK59/12GHg4ZH3VqaXOqER8MDMa5H4O29yDoDfF1u70WWSMCAS0RERHeNIVcERp0eqn//Cy6nT2H70CmNbbyCA/sivOoGMvuOhMcLU/HitESY6/VI/sdZXNXoADgjWOGEX6gUGBbsB32dgNxCDT75ZzEW
FZ6BVmeE1EmCAaFeeDgmEAmRvhga6YOwZjeH1eiMmDsmGn/4+4U2a5w3JtqqXwOXH2+qGxXiD5NZYBcFIiIisiiGXFsxm6E/ehQV23bA9y8H4K3RAAD2vBiMekUAAGDGI6mocXYFJBIgrx63si9i3lgVhnRXICFIgJsgw9lrVfj8y+tYU3q7BZdvNzkSIn2w6GEVEiJ9EddDCXd5+x9rNxcZFj3cGxIA7xy+1KLDwbwx0Vj4cG+4Orfs7mANDLhERERkaQy5VmY8dx4VG96D29698LhRgsAfx80BAfgweAiEn15hlTfdK7v+yGXMGRONf+SU4OsCDSQSILa7Avf38sWCJBWGRvggyr9bp1t4/ZSrsxTzE1WYn6jCnpxiqCvqEOrt1rhFwVYBl4iIiMgaGHKtwGw0Il9bj69KbsK8MxPTN24AAOg8PVA0dix8ps/Cfq/eSP0sr93XaWjj9er4GBjNAgb38IbCzdlidXb78RG9M+6PhNFktvhNZkRERERiYci1lOJiVG7fAWHXLvz7wTHYN+VFeDpLEDL2AXz77yRkDxmNv0QkoNIsx1v9+kJ9rrRTL2uVNl6tYMAlIiIiR8KQ24zQ8V/5r7IyaHfugu7jDPjmnITyx+H+egN2/WIyzlSYcbRKwAeJLwMmA3TXalFRVYXjPctEbeNFRERE5OgYcpu5VWvAh/8qwJT4MJgFAQrXltsDtHoDaidMgN/f/wYPkwkeAMwSCc6oBuLz2NHYHz0CN3MqUVt7uz2YRALEBCkQ388H8eHeeDDaD34eLp1q4zXZSm28iIiIiBwZQ24zlXUGzN/zHyzJPIOUh6Kx6JHecDPqYcg+jJOD43H8ehn+U1KHX1XqMcpkwnchKnzWdyT2Ro1AoVwJAPASZBge5oP4CF/ER3hjUCt7aTvbxot9B4iIiIjuHENuG4z1epzevAuX3jqD/t8cgXN1Nd5Z8gFy3ENQW2PApajHYYx6Ele8gxEV4I7E6AAkRPoiPsIHPf067njQ1dp4ERERETkShtxmXI16vHn0fTx16Sv41Vc1jl9XBsBw9ipqVb6IDfXAIw+NxQO9AnBfWMd9adt8L7bxIiIiIrIKhtxmgmo0mHX1FACgzE2Bz3uNQND/zkDk40l4t5sLQps9PexusY0XERERkeUx5DZjdJIio89D2Kt6AEfDBsDkJMXLPXrj4SAFXKx8ZZUBl4iIiMgyGHKbUXv64dcjU5qMhSjdrB5wiYiIiMhyeOmwhaZbEVydnTAlnm28iIiIiOwJQ24H5o6JhiDc0SMiiIiIiEhk3K7QBldnJ8z9sY2XG7cqEBEREdkVhtxmlG7OeGXKfZgSHwZBEBhwiYiIiOyQ3YdcvV6Pt956C9988w0kEgmGDx+OuXPnQib7eVPzcnfGzAciLVwlEREREdmS3e/JnTdvHq5cuYLdu3fjk08+QV5eHpYvX/6zX4+P0SUiIiKyf3Ydcg8dOoTDhw9jwYIFkEqlkEqlmD17NjIzM3Hs2DGxyyMiIiIikdh1yM3IyICPjw/69OnTODZgwAC4uLggIyNDxMqIiIiISEx2uydXq9Xi9OnTGDBgQJNxuVyO0NBQ5ObmQhCEO34EryAIqK2ttWSpRERERGQhnW3tarcht7S0FCaTCf7+/i2OeXp6Ij8/H1VVVfDy8rqj19Xr9Th37pylyiQiIiIiC9Lr9QAAo9HYbqMBuw25lZWVAABXV9cWx6TS222/6uvr7yjk3rhxAyaTCS+//LJFaiQiIiIiy9JoNJBKpR120rLbkOvi4gIA0Ol0LY41jCmVyjt+Tb1ef8dbHIiIiIjINmQyGeRyecd/zwa1WEWPHj0AABUVFS2OVVZWwsfHpzEId1ZOTo5FaiMiIiIicdltdwWFQoGYmBgUFBQ0Gdfr9SgpKcGIESNEqoyIiIiIxGa3IRcAnnvuOZSVleHixYuNY7m5uTAajXjmmWdErIyIiIiIxGTXIffpp59GfHw8tmzZAuD2jWYbNmzA5MmTMXToUJGrIyIiIiKxSITONhvrorRaLVavXo1Lly5BIpEgKSkJs2bNgpOTXed3IiIiIroLdh9yiYiIiIia4+VOIiIiInI4DLlERERE5HAYcomIiIjI4TDkEhEREZHDYcglIiIiIofDkEtEREREDochl4iIiIgcDkMuERERETkcmdgFdBXFxcXYunUr1Go1/vSnP9nkPc1mM3bu3ImMjAyo1WqEhIQgOTkZkydPtsn7dxVnzpzBc889hw8++AAJCQlil2MzgiAgOzsbhw4dQmBgIFQqFSZMmCB2WVZx7Ngx/PGPf4STkxP0ej169eqF1NRU+Pr6il2aRXW0jpSWlmLVqlW4efMmTCYTpk6diieffFKESi2rvXlXV1fjnXfeQVZWFjQaDfr164fU1FQMGTJEpGot507OG9u2bcPrr7+OCxcu2Kg66+nsvOvr6/Hpp5/i1KlTiIiIQEJCAoYNG2bDSi2rvXmbzWZ8/PHHOHjwIFxcXFBXV4fExEQkJyfD2dlZpIrvTmczSlde1xhyAZw4cQJHjx5FRkYGhg4darP3ff/996FWq7F69WrU1tZi/fr1WL58OaqqqpCcnGyzOsSk1WqxcOFCGAwGsUuxqYqKCixevBhmsxnp6enw9/cXuySrOXHiBFJTU7Fjxw706dMHZrMZy5cvxy9/+Ut8+umnkEqlYpdoER2tIxqNBlOnTsWUKVPw0ksvoby8HE899RSMRiMmTpwoQsWW0d68BUHAkiVLMHDgQKxfvx4FBQX4wx/+gOnTp+Ojjz7CoEGDRKr67t3JeeP777/HunXrbFSZdXV23ufPn8e8efMwduxYrFmzBnK53IZVWl5H8964cSOys7OxdetWKJVKVFVVYdq0abh58yaWL18uQsV3rzMZpcuvawI1SkhIEJ5//nmbvJdOpxPS09ObjGm1WmHkyJFCXFycoNfrbVKH2BYvXiysWLFCUKlUwokTJ8QuxyYqKyuF8ePHCzNnzhQMBoPY5VhdamqqkJKS0mTs7NmzgkqlEs6dOydSVdbT1jry6quvCsOGDWvymb/33ntCXFycUF5ebssSraK1eefk5AgHDx5sMpaXlyf06dNHmDlzpi3Ls5qOzhtarVaYMWOG8Otf/1pQqVQ2rMy62pt3Xl6eEBcXJ2zYsMHGVVlfW/N+4IEHhB07djQZ27p1qxAfH2+r0iyqsxmlq69r3JP7E25ubjZ7L61W2+Jqbbdu3TB69GjU1NSgsrLSZrWIZd++fejVqxcGDBggdik2tXDhQty4cQNr166FTOb4v0wxGAy4dOkSjEZjkzEXFxcEBASIWJl1tLaO1NXVITMzE/Hx8U0+8/j4eNTU1GD//v22LNEqWpu3QqHAo48+2mQsNjYWERERuHbtmq1Ks6qOzhurV6/GnDlzoFAobFSRbbQ17+rqavzqV79CbGwsZs+ebeOqrK+teRsMBpw/f77FWGhoqC3KsrjOZBR7WNcYcn9CIpHY7L18fHzg5+fXYtzNzQ0eHh7w8fGxWS1iuHLlCo4cOYJZs2aJXYpNZWdn49ixY5g2bZr
Df8YNJk6ciCtXruD3v/89BEEAAOzatQtLly51yK9Ba+vIyZMnodPpEBkZ2WS8Z8+ejcftXWvzjo6ObnXczc0NPXr0sEVZVtfeeePAgQMICwuz620ZbWlr3hs2bEBZWRnmzp1r03OqrbQ1p0mTJmHv3r04dOgQgNv7kbOysvDaa6/ZsDrL6UxGsYd1zfEvI9mZ06dPY/z48Q6zT7E1er0eq1evxpo1axxyEWzPnj17AACBgYFYsWIFzp07B3d3d0ydOhVJSUkiV2cdo0aNwtKlS5Geng6NRoPhw4fjiSeewPDhw8UuzWYarlo2P2l4eno2OX4v0Gq1uHjxosP/gFtUVIS//vWveO+998QuxWYMBgMyMzPh7++Pa9euYe/evTh//jx69OiBlJQUqFQqsUu0mvnz56OoqAgLFy6EWq2G0WjEG2+8gbCwMLFLs6ifZhR7WNd4JbcL+e6773D58mXMnTtX7FKsau3atXjxxRdb/SnRkQmCgK+++go+Pj4ICQlBWloatm/fDoVCgTlz5mDfvn1il2g106dPx4wZM2A2m7Fy5UqHuMP8Tty6dQsA4Orq2mS84YdZnU5n85rEsnv3bsTGxmLcuHFil2I1BoMBv/vd7/Daa6/ByeneOc3m5eXh1q1bCAkJQf/+/bFmzRps2rQJly5dwqRJk5Cfny92iVYjk8mwdu1aPPXUUzh06BA+/vjjLhHyLKl5RrGHde3e+e7r4kwmE1atWoWVK1c6XFulnzpy5AicnZ0xcuRIsUuxuYqKCuh0OvTr1w8jRowAcPtXPytWrICzs7PD3H3dmlWrViEpKQnvvvsuZs2ahddffx3p6elil2UzLi4uAFou+g1/9vLysnlNYigtLUVGRgbS09Md+rc4b775JqZOnYrAwECxS7GpkpISAEBiYiKioqIAAMHBwVi0aBF0Oh3Wr18vZnlW1dApaOnSpdi5cyf69u2LWbNm4YsvvhC7NItoLaPYw7rG7QpdxLp16zBs2LAWN2k4mq1btyInJwfbtm1rHGvYpzl9+nRIJBKcPXtWpOqsq+GnWw8Pjybjvr6+iIuLw8mTJ1FeXu5wP+RkZGTg22+/bWyjM3/+fBiNRmzevBmjR4+2676ZndWw/7T5DaUNf+7evbuNK7I9vV6PRYsWYcWKFS328DmarVu3Yvv27U3GzGYzACAmJgbx8fEtjjuChpuPmq9xo0ePhkwmw8WLF8UoyyaWLVuG8PDwxhsMN23ahJkzZ+KVV17B0KFDoVQqxS3wLrWWUexhXWPI7QJ2796N8vLye+LK1urVq1FXV9dkLDs7G2+//TZWrVqF/v37i1SZ9Xl5eSEwMBDXr19vcczPzw8ymaxxL5Mj2bdvHyIiIpqMLViwAPv378fhw4fviZA7ZMgQyGQyFBQUNBkvLCwEANx///1ilGUzgiBg2bJleOKJJzBq1Cixy7G6AwcOtBh7++23kZ2djc8//9ymnXxsqWHPbfM1TiaTQalUOuSNpsDtq7hffPFFk5vM5HI50tLS8OijjyInJweJiYniFXiX2soo9rCuMeT+hCAIjVcVbeXPf/4zvvzyS7zzzjtNfn1XVlbmkA8IaG0Tfl5eHgAgNDTUoW9MAIAJEyZgy5YtKCkpQVBQUON4cXExHnzwQbtvmN4ad3d3XL16tcmYVCqFr6+vQ57sW1tHlEolxo0bh+PHj0MQhMbv9ZMnT8LLywuPPPKIGKVaVHvrZ1paGnr37t3iSUmOsM61Nu/W1rGGK3yOssa1Nu/w8HAMGjQIWVlZWLBgQeP/85qaGmg0God4yFFr85bL5ZDJZI3hrkFDi0R3d3eb1WdpHWWUrr6ucU/uj/R6PaqqqqDRaGwWdA8cOIAtW7YgJSUFhYWFyM/Px4ULF3DgwAF8+OGHNqmBbOull15CVFQU0tLSGp/ydvDgQRQWFmLx4sUiV2cdycnJ+P7775vcWJeVlYWSkhI888wzIlZmee2tIw2f7969ewEAarUae/bswbJly7rE3rW70da8BUFAWloaqqurMWbMGOTn5yM/Px9nz57Fxo0bcfz4cRGrvntinDe6gvbmnZaWhtLSUmzevBnA7W0aa9euhUqlwvPPPy9GuRbT1rzlcjmmTZuGPXv2NPbKNZvN2LRpEwYOHGi3j6vvTEbp6uuaRLiXvjPb8Mknn2Dz5s1Qq9UAgKioKCxZssSqN0ft378fS5Ysadyn1dyePXswcOBAq71/V/LZZ59h6dKl+Oijj+x2MbgTGo0Ga9euxblz5+Dq6gqlUolFixahV69eYpdmNVlZWdi4cSOMRiN8fX2hVCqRmprqUO11OrOO5OfnY+XKlTAYDDCbzUhOTrbrX2MC7c/7t7/9bWPbvOa8vLxw7NixxptX7M2dnjeWLFmCzMxMu+8s0pl5X7hwAenp6dBqtRAEAbGxsUhNTe0Soefn6mjeZrMZmzdvRmZmJhQKBVxcXNC/f3/Mnj27xR5le3AnGaUrr2sMuURERETkcLhdgYiIiIgcDkMuERERETkchlwiIiIicjgMuURERETkcBhyiYiIiMjhMOQSERERkcNhyCUiIiIih8OQS0REREQOhyGXiIiIiBwOQy4RERERORyGXCIiIiJyOAy5RERERORw/h8AXh2BUAsP/QAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df1 = pd.read_table(\n", + " 'perf_with_increased_depth_subplot1.tsv',\n", + " header=0,\n", + " sep='\\t',\n", + " usecols=['Depth', 'TestName', 'AvgTime', 'Throughput', 'Ratio'])\n", + "\n", + "fig, axes = plt.subplots(2, figsize=(8, 8))\n", + "\n", + "g = sns.barplot(\n", + " ax=axes[0],\n", + " data=df1,\n", + " x='Depth',\n", + " y='AvgTime',\n", + " hue='TestName',\n", + " ci='sd',\n", + " palette='pastel')\n", + "\n", + "g.xaxis.label.set_visible(False)\n", + "g.yaxis.grid(True, clip_on=False)\n", + "g.set_xlabel('Depth', fontsize=16, fontweight='bold')\n", + "g.set_ylabel('Elapsed Time(ms)', fontsize=16, fontweight='bold')\n", + "g.tick_params(labelsize=14)\n", + "\n", + "g.legend(\n", + " title='',\n", + " bbox_to_anchor=(0.00, 0.99),\n", + " loc='upper left',\n", + " fontsize=14,\n", + " title_fontsize=14)\n", + "axes[0].set_title(\n", + " '(a). The elapsed time varied according to depth.',\n", + " fontdict={\n", + " 'fontsize': 16,\n", + " 'fontweight': 'bold'\n", + " })\n", + "\n", + "df2 = pd.read_table(\n", + " 'perf_with_increased_depth_subplot2.tsv',\n", + " header=0,\n", + " sep='\\t',\n", + " usecols=['Depth', 'TestName', 'AvgTime', 'Throughput', 'Ratio'])\n", + "\n", + "x_vs = [v - 1 for v in df2['Depth']]\n", + "palette = sns.color_palette('GnBu', 6)\n", + "g = sns.lineplot(\n", + " ax=axes[1],\n", + " x=x_vs,\n", + " y='Ratio',\n", + " hue='TestName',\n", + " marker='o',\n", + " palette=palette,\n", + " linewidth=1,\n", + " markersize=8,\n", + " data=df2)\n", + "\n", + "ref_x = list(range(20))\n", + "ref_y = list(range(1, 21))\n", + "g_ref = sns.lineplot(\n", + " ax=axes[1],\n", + " x=ref_x,\n", + " y=ref_y,\n", + " linewidth=1.5,\n", + " color='red',\n", + " linestyle='--',\n", + " markers=True)\n", + "\n", + "g.set_xlim(1, 20)\n", + "g.yaxis.grid(True, clip_on=False)\n", + "\n", + "g.xaxis.label.set_visible(False)\n", + "\n", + "g.legend(\n", + " title='',\n", + " bbox_to_anchor=(0.00, 0.99),\n", + " loc='upper left',\n", + " fontsize=14,\n", + " title_fontsize=14)\n", + "\n", + "g.set_xlabel('Depth', fontsize=14, fontweight='bold')\n", + "g.set_ylabel('Ratio', fontsize=14, fontweight='bold')\n", + "axes[1].set_title(\n", + " '(b). 
The increasing ratio of time varied according to depth.',\n", + " fontdict={\n", + " 'fontsize': 16,\n", + " 'fontweight': 'bold'\n", + " })\n", + "\n", + "g.set_xticks(range(20))\n", + "xticks = [str(i) for i in range(1, 21)]\n", + "g.set_xticklabels(xticks)\n", + "g.tick_params(labelsize=14)\n", + "\n", + "xticks = g.xaxis.get_major_ticks()\n", + "for i in range(len(xticks)):\n", + " if (i - 1) % 2:\n", + " xticks[i].set_visible(False)\n", + "xticks[0].set_visible(True)\n", + "\n", + "fig = g.figure\n", + "fig.savefig('stacked_lstm_perf_with_depth.pdf', dpi=500, bbox_inches='tight')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot1.tsv b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot1.tsv new file mode 100644 index 000000000..f658ab4c6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot1.tsv @@ -0,0 +1,37 @@ +Depth TestName AvgTime Throughput Ratio +1 PT-JIT 1.96780 16261.78517 1.00000 +4 PT-JIT 7.77687 4114.76474 3.95206 +8 PT-JIT 15.53171 2060.30052 7.89292 +12 PT-JIT 24.19508 1322.58278 12.29548 +16 PT-JIT 32.10150 996.83803 16.31337 +20 PT-JIT 41.20204 776.66064 20.93808 +1 TF-WhileOpLSTM 3.90668 8191.10010 1.00000 +4 TF-WhileOpLSTM 13.55371 2360.97774 3.46937 +8 TF-WhileOpLSTM 27.79051 1151.47235 7.11359 +12 TF-WhileOpLSTM 43.66595 732.83646 11.17726 +16 TF-WhileOpLSTM 62.98433 508.06287 16.12222 +20 TF-WhileOpLSTM 80.80373 396.02133 20.68348 +1 TF-GraphMode 2.55039 12547.11352 1.00000 +4 TF-GraphMode 8.89519 3597.44963 3.48778 +8 TF-GraphMode 24.10036 1327.78104 9.44969 +12 TF-GraphMode 30.35351 1054.24393 11.90153 +16 TF-GraphMode 43.90387 728.86518 17.21459 +20 TF-GraphMode 59.90020 534.22195 23.48671 +1 TF-AutoGraph 2.50235 12787.99954 1.00000 +4 TF-AutoGraph 6.49833 4924.34298 2.59689 +8 TF-AutoGraph 12.64750 2530.14397 5.05426 +12 TF-AutoGraph 18.24804 1753.61285 7.29237 +16 TF-AutoGraph 55.36173 578.01660 22.12393 +20 TF-AutoGraph 53.05073 603.19619 21.20040 +1 TVM-Ansor 1.1106 28813.254097 1.000000 +4 TVM-Ansor 3.7581 8514.941061 3.383847 +8 TVM-Ansor 7.4149 4315.634735 6.676481 +12 TVM-Ansor 11.1161 2878.707460 10.009094 +16 TVM-Ansor 14.8240 2158.661630 13.347740 +20 TVM-Ansor 18.4955 1730.150577 16.653611 +1 CuDNN 0.37390 85583.48580 1.00000 +4 CuDNN 1.07919 29651.76547 2.88629 +8 CuDNN 2.02688 15787.78251 5.42087 +12 CuDNN 2.97586 10753.18958 7.95889 +16 CuDNN 4.06898 7864.38144 10.88242 +20 CuDNN 4.85620 6589.51845 12.98782 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot2.tsv b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot2.tsv new file mode 100644 index 000000000..dab590415 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot2.tsv @@ -0,0 +1,121 @@ +Depth TestName AvgTime Throughput Ratio +1 PT-JIT 1.96780 16261.78517 1.00000 +2 PT-JIT 3.82791 
8359.66267 1.94527 +3 PT-JIT 5.87914 5442.97304 2.98767 +4 PT-JIT 7.77687 4114.76474 3.95206 +5 PT-JIT 9.61712 3327.39877 4.88724 +6 PT-JIT 11.57076 2765.59103 5.88004 +7 PT-JIT 13.57346 2357.54120 6.89777 +8 PT-JIT 15.53171 2060.30052 7.89292 +9 PT-JIT 17.71576 1806.30087 9.00281 +10 PT-JIT 19.41923 1647.85154 9.86848 +11 PT-JIT 21.31421 1501.34598 10.83147 +12 PT-JIT 24.19508 1322.58278 12.29548 +13 PT-JIT 25.53686 1253.09087 12.97734 +14 PT-JIT 27.64572 1157.50307 14.04902 +15 PT-JIT 30.21741 1058.99218 15.35591 +16 PT-JIT 32.10150 996.83803 16.31337 +17 PT-JIT 34.34094 931.83244 17.45140 +18 PT-JIT 36.62051 873.82737 18.60984 +19 PT-JIT 38.55125 830.06385 19.59101 +20 PT-JIT 41.20204 776.66064 20.93808 +1 TF-WhileOpLSTM 3.90668 8191.10010 1.00000 +2 TF-WhileOpLSTM 6.83922 4678.89632 1.75065 +3 TF-WhileOpLSTM 11.19222 2859.12932 2.86489 +4 TF-WhileOpLSTM 13.55371 2360.97774 3.46937 +5 TF-WhileOpLSTM 17.75204 1802.61019 4.54402 +6 TF-WhileOpLSTM 20.64037 1550.36016 5.28335 +7 TF-WhileOpLSTM 23.96340 1335.36957 6.13396 +8 TF-WhileOpLSTM 27.79051 1151.47235 7.11359 +9 TF-WhileOpLSTM 31.95632 1001.36677 8.17992 +10 TF-WhileOpLSTM 35.40744 903.76486 9.06331 +11 TF-WhileOpLSTM 39.40306 812.11973 10.08607 +12 TF-WhileOpLSTM 43.66595 732.83646 11.17726 +13 TF-WhileOpLSTM 47.41686 674.86541 12.13738 +14 TF-WhileOpLSTM 53.23548 601.10293 13.62678 +15 TF-WhileOpLSTM 62.23625 514.16977 15.93073 +16 TF-WhileOpLSTM 62.98433 508.06287 16.12222 +17 TF-WhileOpLSTM 65.56097 488.09529 16.78176 +18 TF-WhileOpLSTM 70.15835 456.11104 17.95857 +19 TF-WhileOpLSTM 74.84951 427.52451 19.15937 +20 TF-WhileOpLSTM 80.80373 396.02133 20.68348 +1 TF-GraphMode 2.55039 12547.11352 1.00000 +2 TF-GraphMode 4.53552 7055.42639 1.77836 +3 TF-GraphMode 6.64848 4813.12670 2.60685 +4 TF-GraphMode 8.89519 3597.44963 3.48778 +5 TF-GraphMode 11.03347 2900.26582 4.32619 +6 TF-GraphMode 14.85596 2154.01692 5.82498 +7 TF-GraphMode 16.50565 1938.72952 6.47182 +8 TF-GraphMode 24.10036 1327.78104 9.44969 +9 TF-GraphMode 21.63371 1479.17285 8.48252 +10 TF-GraphMode 24.42823 1309.95969 9.57824 +11 TF-GraphMode 28.42625 1125.72007 11.14586 +12 TF-GraphMode 30.35351 1054.24393 11.90153 +13 TF-GraphMode 33.77798 947.36286 13.24425 +14 TF-GraphMode 36.88077 867.66081 14.46085 +15 TF-GraphMode 40.64202 787.36253 15.93562 +16 TF-GraphMode 43.90387 728.86518 17.21459 +17 TF-GraphMode 47.50709 673.58371 18.62740 +18 TF-GraphMode 51.19966 625.00417 20.07525 +19 TF-GraphMode 55.06099 581.17371 21.58927 +20 TF-GraphMode 59.90020 534.22195 23.48671 +1 TF-AutoGraph 2.50235 12787.99954 1.00000 +2 TF-AutoGraph 5.04030 6348.82357 2.01423 +3 TF-AutoGraph 7.59233 4214.78081 3.03408 +4 TF-AutoGraph 6.49833 4924.34298 2.59689 +5 TF-AutoGraph 10.75813 2974.49471 4.29922 +6 TF-AutoGraph 9.46353 3381.40109 3.78186 +7 TF-AutoGraph 24.77981 1291.37377 9.90263 +8 TF-AutoGraph 12.64750 2530.14397 5.05426 +9 TF-AutoGraph 14.61008 2190.26924 5.83855 +10 TF-AutoGraph 15.45693 2070.26870 6.17698 +11 TF-AutoGraph 41.13654 777.89716 16.43919 +12 TF-AutoGraph 18.24804 1753.61285 7.29237 +13 TF-AutoGraph 37.62562 850.48426 15.03614 +14 TF-AutoGraph 23.50631 1361.33673 9.39371 +15 TF-AutoGraph 23.82844 1342.93290 9.52244 +16 TF-AutoGraph 55.36173 578.01660 22.12393 +17 TF-AutoGraph 33.40216 958.02174 13.34834 +18 TF-AutoGraph 43.36034 738.00152 17.32788 +19 TF-AutoGraph 61.50502 520.28270 24.57894 +20 TF-AutoGraph 53.05073 603.19619 21.20040 +1 TVM-Ansor 1.1106 28813.254097 1.000000 +2 TVM-Ansor 1.9611 16317.372903 1.765802 +3 TVM-Ansor 2.8679 11157.990167 2.582298 +4 
TVM-Ansor 3.7581 8514.941061 3.383847 +5 TVM-Ansor 4.7305 6764.612620 4.259409 +6 TVM-Ansor 5.6727 5641.052761 5.107780 +7 TVM-Ansor 6.5658 4873.739681 5.911939 +8 TVM-Ansor 7.4149 4315.634735 6.676481 +9 TVM-Ansor 8.3995 3809.750580 7.563029 +10 TVM-Ansor 9.2911 3444.156236 8.365838 +11 TVM-Ansor 10.1799 3143.449346 9.166126 +12 TVM-Ansor 11.1161 2878.707460 10.009094 +13 TVM-Ansor 12.0683 2651.574787 10.866469 +14 TVM-Ansor 12.9781 2465.692205 11.685665 +15 TVM-Ansor 13.9795 2289.066133 12.587340 +16 TVM-Ansor 14.8240 2158.661630 13.347740 +17 TVM-Ansor 15.7174 2035.960146 14.152170 +18 TVM-Ansor 16.6283 1924.430038 14.972357 +19 TVM-Ansor 17.5877 1819.453368 15.836215 +20 TVM-Ansor 18.4955 1730.150577 16.653611 +1 CuDNN 0.37390 85583.48580 1.00000 +2 CuDNN 0.60998 52460.90498 1.63138 +3 CuDNN 0.85136 37586.87751 2.27695 +4 CuDNN 1.07919 29651.76547 2.88629 +5 CuDNN 1.35605 23597.89159 3.62674 +6 CuDNN 1.56943 20389.56775 4.19742 +7 CuDNN 1.82736 17511.60911 4.88724 +8 CuDNN 2.02688 15787.78251 5.42087 +9 CuDNN 2.26694 14115.94807 6.06289 +10 CuDNN 2.50623 12768.17017 6.70288 +11 CuDNN 2.73955 11680.75610 7.32688 +12 CuDNN 2.97586 10753.18958 7.95889 +13 CuDNN 3.30582 9679.88441 8.84137 +14 CuDNN 3.45818 9253.41692 9.24885 +15 CuDNN 3.67296 8712.32534 9.82327 +16 CuDNN 4.06898 7864.38144 10.88242 +17 CuDNN 4.14866 7713.33745 11.09552 +18 CuDNN 4.39514 7280.76393 11.75474 +19 CuDNN 4.60445 6949.79726 12.31453 +20 CuDNN 4.85620 6589.51845 12.98782 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/plot.ipynb b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/plot.ipynb new file mode 100644 index 000000000..43206bcab --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/plot.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAaQAAAEUCAYAAABkhkJAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABONElEQVR4nO3dd1QT2dsH8G9CkQ5iQykWZEFFxd4QBURRUcG1/Sygi30BxbVhF7EXVFBs2HXXir0hin1dRCy4ooCFoiAIghBpyX3/4M0ssQYlJMjzOcdzZDIzeS4J88y9cwuPMcZACCGEyBlf3gEQQgghACUkQgghCoISEiGEEIVACYkQQohCoIRECCFEISjLOwB5ysvLQ3R0NGrUqAElJSV5h0MIIRWKUChEWloaLC0toaam9sPnq9QJKTo6GsOGDZN3GIQQUqHt27cPrVu3/uHzVOqEVKNGDQDFv0wDAwM5R0MIIRVLSkoKhg0bxl1Lf1SlTkjiZjoDAwMYGRnJORpCCKmYyuqRB3VqIIQQohAoIRFCCFEIlJAIIYQoBEpIhBBCFAIlJEIIIQqhXBJSQUEBZs2aBVtbW7Ro0QL9+vXDlStXAABJSUkwNzdHixYtuH8bNmyQONbHxwctW7ZEp06dsGPHDolz37p1C46OjmjevDlGjBiB5OTk8igSIYSQMlYu3b6LiopQu3Zt7NmzB3Xq1MGVK1cwefJknDx5ktsnIiICysqfhhMQEICXL1/i8uXLSE9Ph6urK0xNTWFjY4OMjAx4eHjAz88PdnZ2WLt2Lby9vXHw4MEfilckYuDzeT90jvJWEWMmhJCSyiUhaWhowNPTk/vZ1tYWRkZGePToEZo0afLVY0NCQrBs2TLo6upCV1cXAwcOREhICGxsbBAaGgozMzP07NkTAODp6Yn27dsjPj4epqam3x0vn89D1LM33328PLRoUFPeIRBCyA+Ry8DY9PR0vHjxAg0bNuS22dragsfjoVOnTpg2bRr09fWRlZWFtLQ0WFhYcPtZWFggLCwMABAbGwtzc3PuNQ0NDZiYmCAuLu6ThJSdnY3s7GyJbSkpKbIoHiGEkO9Q7gmpsLAQU6dOhYuLC0xNTZGbm4vDhw+jUaNGePfuHXx9fTFt2jQEBwdDIBAAALS1tbnjtbW1kZubCwAQCATQ19eXOL+Wlhb3ekm7du1CYGDgZ2OKjo5Gamoq93OrVq1+uJzyEBkZKe8QCCGVSFpaWpmer1wTkkgkwvTp06GiooK5c+cCADQ1NdG0aVMAQPXq1TF37lxYW1sjJycHGhoaAICcnBxUqVKF+7+mpiaA4hpRTk6OxHvk5uZyr5fk5uYGFxcXiW3ieZgsLS1/iqmDKmoiJYRUTElJSWV6vnJLSIwxzJ49G+np6di6dStUVFQ+ux+Px+P219XVRY0aNRATE4NOnToBAGJiYrimPjMzM4SEhHDHCgQCJCQkSDQFiuno6EBHR6esi0UIIaSMlNs4pPnz5yM+Ph6bNm2SWDfj/v37ePbsGUQiETIzM+Hn54e2bdtyzXTOzs4ICgpCVlYW4uPjcejQIa6m4+DggNjYWJw/fx75+fnYsGEDzM3Nf6hDAyGEEPkolxpScnIyDhw4AFVVVVhbW3PbFy5cCD6fjzVr1iAjIwNaWlro2LEj1qxZw+3j5eWF+fPnw9bWFmpqahgzZgxsbGwAAPr6+ggICOCeOzVv3lziWEIIIRUHjzHG5B2EvCQlJcHe3h5hYWGfPEOibt+EEPJ1X7uGfg+aOogQQohCoIRECCFEIVBCIoQQohAoIRFCCFEIlJAIIYQoBEpIhBBCFAIlJEIIIQqBEhIhhBCFQAmJEEKIQqCERAghRCFQQiKEEKIQKCERQghRCJSQCCGEKARKSIQQQhQCJSRCCCEKgRISIYQQhUAJiRBCiEL45hLmjx49Qnh4OJ48eYLs7Gzo6OjA3NwcNjY2aNq0aXnESAghpBL4YkK6du0a/P39kZubi7Zt26Jly5bQ1NREbm4u4uPjMXXqVGhqamLy5MmwsbEpz5gJIYT8hL6YkA4dOoQFCxagWbNmXzz4wYMH2LZtGyUkQgghP+yLCWn9+vXfPLhZs2ZS7UcIIYR8i1SdGq5fv47nz59LbHv27Blu3Lghk6AIIYRUPlIlJF9fX2hqakps09TUhK+vr0yCIoQQUvlIlZDevn2LmjVrSmyrWbMm0tLSZBIUIYSQykeqhGRsbIxbt25JbLt9+zaMjIxkEhQhhJDK55vjkADAw8MDnp6eGDBgAIyNjZGYmIijR49iyZIlso6PEEJIJSFVDalbt27Yvn07BAIBrly5AoFAgG3btqFbt25SvUlBQQFmzZoFW1tbtGjRAv369cOVK1e412/dugVHR0c0b94cI0aMQHJyssSxPj4+aNmyJTp16oQdO3ZInPtrxxJCCKk4pKohAcVdvL82JulrioqKULt2bezZswd16tTBlStXMHnyZJw8eRIaGhrw8PCAn58f7OzssHbtWnh7e+PgwYMAgICAALx8+RKXL19Geno6XF1dYWpqChsbG2RkZHz1WEIIIRWHVDWkgoIC+Pv7w97eHq1atQJQ3BV87969Ur2JhoYGPD09YWRkBD6fD1tbWxgZGeHRo0cIDQ2FmZkZevbsiSpVqsDT0xMxMTGIj48HAISEhGDixInQ1dWFqakpBg4ciJCQEAD45rGEEEIqDqkS0pIlS/D06VOsWrUKPB4PAGBmZoY///zzu940PT0dL168QMOGDREbGwtzc3PuNQ0NDZiYmCAuLg5ZWVlIS0uDhYUF97qFhQXi4uIA4KvHfiw7OxtJSUkS/1JSUr4rfkIIIWVPqia7ixcv4sKFC9DQ0ACfX5zDatWqhdTU1FK/YWFhIaZOnQoXFxeYmppCIBBAX19fYh8tLS3k5uZCIBAAALS1tbnXtLW1kZubCwBfPfZju3btQmBg4Gdjio6OliiLuBZY0URGRso7BEJIJVLWQ3+kSkgqKioQCoUS2zIyMqCnp1eqNxOJRJg+fTpUVFQwd+5cAMW1mpycHIn9cnNzoampCQ0NDQBATk4OqlSpwv1fPEj3a8d+zM3NDS4uLhLbUlJSMGzYMFhaWv4UXdgraiIlhFRMSUlJZXo+qZrsHB0dMWPGDCQmJgIA3rx5A19fX/Tu3VvqN2KMYfbs2UhPT0dAQABUVFQAFDf9xcTEcPsJBAIkJCSgYcOG0NXVRY0aNSRej4mJQcOGDb957Md0dHRgZGQk8c/AwEDq+AkhhMiWVAnJ29sbRkZG6Nu3L7Kzs9GjRw/UrFkTv//+u9RvNH/+fMTHx2PTpk1QU1Pjtjs4OCA2Nhbnz59Hfn4+NmzYAHNzc5iamgIAnJ2dERQUhKysLMTHx+PQoUNcTedbxxJCCKk4eIwxVpoDMjIyULVqVa5zgzSSk5NhZ2cHVVVVKCv/10q4cOFC9O3bFzdv3oSvry9evXqF5s2bY+nSpVwTWkFBAebPn4/z589DTU0NY8aMwahRo7hzfO3Yb0lKSoK9vT3CwsI+OSbq2Rupy6cIWjSo+e2dCCGkDH3tGv
o9pEpIcXFx0NPTQ/Xq1ZGbm4vg4GDw+Xy4u7tDXV39h4OQF0pIhBDy/co6IUnVZDdlyhRkZ2cDAJYvX46IiAjcu3cP8+bN++EACCGEEEDKXnbJyclo0KABGGMIDQ3F6dOnoaamBnt7e1nHRwghpJKQKiFVqVIFOTk5iI+PR+3ataGvr4+ioiLk5+fLOj5CCCGVhFQJycnJCW5ubsjNzcXw4cMBAP/+++9PMXaHEEKIYpAqIc2aNQvXr1+HsrIy2rdvDwDg8Xjw8fGRaXCEEEIqj68mpCFDhqBr167o0qULrK2tJV5r2rSpTAMjhBBSuXy1l93MmTORl5eH2bNno0uXLpgzZw5CQ0M/O1ccIYQQ8iO+WkOysrKClZUVJk+ejLS0NFy5cgUnT57E3LlzYWFhgS5dusDGxoZmRiCEEPLDpF6gr0aNGhgwYAAGDBiAoqIiREZGIjw8HF5eXnB2dsaYMWNkGSchhJCfnNQJSeIgZWW0a9cO7dq1w4wZM1BUVFTWcRFCCKlkpEpIQ4cOlWruun379v1wQIQQQionqRJSu3btcOTIEbi4uKBOnTp49eoVjh07hl9//RXGxsayjpEQQkglIFVCunHjBoKDg2FmZsZt69OnD2bNmoWDBw/KLDhCCCGVh1STq8bHx8PExERim5GREZ49eyaToAghhFQ+UiWkNm3aYObMmXjx4gXy8vLw/PlzzJ49G61bt5Z1fIQQQioJqRLSsmXLABTPaWdlZYU+ffqAMYYlS5bINDhCCCGVh1TPkPT09ODv7w+RSISMjAzo6+uDz5cqlxFCCCFSkTqrxMfHIygoCBs3bgSfz8ezZ88QExMjy9gIIYRUIlIlpLNnz2LYsGFITU3FsWPHAAC5ublcUx4hhBDyo6Rqslu/fj127twJCwsLnD17FgBgYWFBNSRCCCFlRqoaUkZGBszNzQGAm7GBx+NJNXsDIYQQIg2pElKTJk1w/PhxiW2nT59Gs2bNZBIUIYSQykeqJrvZs2fD3d0dhw8fhkAggLu7O54/f47t27fLOj5CCCGVhFQJydTUFGfPnsXly5fRtWtX1K5dG127doWmpqas4yOEEFJJSL38hLq6Onr16gUASExMRGZmJiUkQgghZUaqZ0hTpkzB3bt3AQBHjhxB79694eTkhEOHDsk0OEIIIZWHVAnp1q1bsLS0BADs3LkTO3bswKFDh7B161ap32jv3r3o378/LC0tMXPmTG57UlISzM3N0aJFC+7fhg0buNcLCgrg4+ODli1bolOnTtixY8cnsTk6OqJ58+YYMWIEkpOTpY6JEEKI4pCqya6wsBCqqqpITU3Fu3fv0KpVKwBAenq61G9Us2ZNTJw4EdeuXUN+fv4nr0dEREBZ+dNwAgIC8PLlS1y+fBnp6elwdXWFqakpbGxskJGRAQ8PD/j5+cHOzg5r166Ft7c3LYlBCCEVkFQJqVGjRti8eTOSk5PRtWtXAEBqaiq0tLSkfqPu3bsDAB4+fIjU1FSpjwsJCcGyZcugq6sLXV1dDBw4ECEhIbCxsUFoaCjMzMzQs2dPAICnpyfat2+P+Ph4mJqaSpwnOzsb2dnZEttSUlKkjoMQQohsSZWQFi9ejHXr1kFZWRnTp08HAERFRaFPnz5lFoitrS14PB46deqEadOmQV9fH1lZWUhLS4OFhQW3n4WFBcLCwgAAsbGx3IBdANDQ0ICJiQni4uI+SUi7du1CYGDgZ987OjpaIkmKa4AVTWRkpLxDIIRUImlpaWV6PqkSkomJCVavXi2xzdHREY6Ojj8cQNWqVXH48GE0atQI7969g6+vL6ZNm4bg4GAIBAIAgLa2Nre/trY2cnNzAQACgQD6+voS59PS0uJeL8nNzQ0uLi4S21JSUjBs2DBYWlrCyMjoh8sibxU1kRJCKqakpKQyPd8XOzWIayHfIu1+X6KpqYmmTZtCWVkZ1atXx9y5c3H9+nXk5ORAQ0MDAJCTk8Ptn5OTw3U319DQkHgNKJ709XPd0XV0dGBkZCTxz8DA4IdiJ4QQUna+mJDOnDkDJycnbN68GXfv3kVmZiYKCgqQmZmJqKgobNmyBU5OTtxkq2VFPD8eYwy6urqoUaOGxCSuMTExaNiwIQDAzMxM4jWBQICEhATudUIIIRXHF5vsVq9ejSdPnuDAgQOYPn06kpKSuGRhYmICGxsb+Pv7w8zMTKo3KioqglAohEgkglAoRH5+PpSUlPDo0SNoa2ujXr16yMrKgp+fH9q2bcs10zk7OyMoKAiWlpZIT0/HoUOHuJVqHRwcsGLFCpw/fx5du3bFhg0bYG5u/snzI0IIIYrvq8+QzM3NMW/ePADAhw8fkJ2dDR0dHairq5f6jYKCgiQ6FZw4cQIeHh6oX78+1qxZg4yMDGhpaaFjx45Ys2YNt5+Xlxfmz58PW1tbqKmpYcyYMbCxsQEA6OvrIyAggHvu1Lx5c4ljCSGEVBw8xhj71k5+fn6YM2fOJ9sXL16M2bNnyySw8pCUlAR7e3uEhYV90qkh6tkbOUX1fVo0qCnvEAghlczXrqHfQ6qZGo4ePfrZ7SdOnPjhAAghhBDgG012hw8fBgAIhULu/2KJiYnQ09OTWWCEEEIql68mJPGifIWFhRIL9PF4PFSvXh3Lly+XbXSEEEIqja8mpD179gAA/P394e3tXS4BEUIIqZykmqlh0qRJEIlEn32Nz5fqMRQhhBDyVVIlpMaNG3NjkD72+PHjMg2IEEJI5SRVQvp4eqC0tDRs2bIFtra2MgmKEEJI5SNVQjI0NPzk5+XLl2PAgAEYOHCgTAIjhBBSuXz3A6CcnBxkZGSUZSyEEEIqMalqSNOmTZN4hpSXl4eIiAj07dtXZoERQsjPLD8/H1WqVJF3GKUi65ilSkh169aV+FldXR1DhgxBx44dZRIUIT+qqEgIZWUleYdRKhUxZvL9qlSpAksza3mHUSrRsddlen6pEpKHh4dMgyCkrCkrK2H5xvPyDqNUZkzsIe8QCJErqRISUDyN0OnTp/HmzRvUrFkTvXr1woABA77YHZwQQggpDakS0ooVKxAWFgY3NzcYGhri1atX2L59O54/f47p06fLOkZCCCGVgFQJKSQkBCEhIRJLfnft2hUuLi6UkAghhJQJqbp9a2pqQlNT85NtWlpaMgmKEEJI5SNVDcnNzQ0eHh4YO3YsDAwM8Pr1awQHB2PkyJFITEzk9jM2NpZZoIQQQn5uUiWkxYsXAwBu374tsf3WrVvw8/MDULwkBc1rRwgh5HtJlZBiYmJkHQchhJBKjtaOIIQQohCkqiElJiZi7dq1ePz4MQQCgcRr4eHhsoiLEEJIJSNVQpo6dSqMjY0xY8YMqKuryzomQgghlZBUCSk2NhZ//vknrQ5LCCFEZqTKMG3atMG///4r61gIIYRUYl+sIa1bt477v6GhIUaPHg0HBwdUr15dYr9JkybJLjpCCCGVxhdrSCkpKdy/Dx8+wNbWFkVFRRLbU1JSpH6jvXv3on///rC0tMTMmTMlXrt16xYcHR3RvHlzjBgxAsnJydxrBQUF8PHxQcuWL
dGpUyfs2LFD6mMJIYRUHF+sIS1durRM36hmzZqYOHEirl27hvz8fG57RkYGPDw84OfnBzs7O6xduxbe3t44ePAgACAgIAAvX77E5cuXkZ6eDldXV5iamsLGxuabxxJCCKk4pO72/TmqqqqoUaOGVJ0dunfvDgB4+PAhUlNTue2hoaEwMzNDz549AQCenp5o37494uPjYWpqipCQECxbtgy6urrQ1dXFwIEDERISAhsbm28eSwghpOKQKiE5ODhw6x4xxiTWQOLz+bCzs8P8+fM/eb4kjdjYWJibm3M/a2howMTEBHFxcahevTrS0tJgYWHBvW5hYYGwsLBvHvtxQsrOzkZ2drbEttI0ORJCCJEtqRLSokWL8M8//8DT05ObXDUoKAhWVlZo06YNVq1aBV9fX6xfv77UAQgEAujr60ts09LSQm5uLjcIV1tbm3tNW1sbubm53zz2Y7t27UJgYOBnY4iOjpaotbVq1arU5VAEkZGR8g5BYdBnSBTdz/AdTUtLK9NzS5WQAgICEBoaiipVqgAA6tati/nz56NHjx64evUqli1bxjXJlZaGhgZycnIktuXm5kJTUxMaGhoAgJycHO69c3JyuKUwvnbsx9zc3ODi4iKxLSUlBcOGDYOlpSWMjIy+K35FUlG/4OQ/9BkSRVfyO5qUlFSm55ZqHJJIJPrkjV+9egWRSAQAUFdXh1Ao/K4AzMzMJCZvFQgESEhIQMOGDaGrq4saNWpIvB4TE4OGDRt+89iP6ejowMjISOJfyQUHCSGEyJdUCcnNzQ1ubm7w9/fHn3/+CX9/f4wcORKurq4AgKtXr8LKyuqr5ygqKkJ+fj5EIhGEQiHy8/NRVFQEBwcHxMbG4vz588jPz8eGDRtgbm7OPQNydnZGUFAQsrKyEB8fj0OHDnE1nW8dSwghpOKQKiGNGTMGS5YsQVpaGsLCwvDmzRssXrwYY8eOBQB069YN27Zt++o5goKC0KxZM2zZsgUnTpxAs2bNEBQUBH19fQQEBMDf3x9t2rTBgwcPsGbNGu44Ly8vGBsbw9bWFiNGjIC7uztsbGwA4JvHEkIIqTh4jDEm7yDkJSkpCfb29ggLC/vkGVLUszdyiur7tGhQU94hKJzlG8/LO4RSmTGxh7xDIOXM0sxa3iGUSnTsdYmfv3YN/R5SdWooOY3Qx2jqIEIIIWVBqoT08XidtLQ0REREoFu3bjIJihBCSOUjVUL63DRCV69exenTp8s8IEIIIZXTdy9wZG1tjYsXL5ZlLIQQQiqx75rL7sOHDzh16hRq164tk6AIIV9XWFgEFRWp/nwVRkWMmZSvUs1lJ+6Qp66ujkaNGmHZsmUyDY4Q8nkqKsrw9N4s7zBKJcB/nLxDIApOqoRUcjYEQgghRBakrj8XFRUhKioKqampMDAwgJWVFZSVqfpNCCGkbEiVUeLj4zFhwgTk5eWhdu3aeP36NapUqYJNmzbRND2EEELKhFQJaeHChRg0aBDc3d25tZCCg4OxYMEC7NmzR6YBEkIIqRyk6vYdExODUaNGSSzM5+bmRs+WCCGElBmpElLNmjXxzz//SGy7c+cOatak+dMIIYSUDama7Ly9vTFx4kR07doVderUwatXrxAeHo6VK1fKOj5CCCGVhFQ1JFtbW4SEhMDMzAy5ubkwMzPD0aNHaS47QgghZeabNSShUIgWLVrgzp07mDhxYnnERAghpBL6Zg1JSUkJ9erVQ2ZmZnnEQwghpJKS6hlSnz59MH78eLi6usLAwEDitQ4dOsgkMEIIIZWLVAnpzz//BAAEBARIbOfxeAgLCyv7qAghhFQ6UiWkS5cuyToOQgghlVypJ6MTiUQSP/P5372kEiGEEMKRKiE9evQIvr6+ePLkCfLz8wEAjDHweDw8fvxYpgESQgipHKRKSDNnzoStrS2WLFkCNTU1WcdECCGkEpIqISUnJ8Pb21tiLjtCCJGVgoJCqKqqyDuMUqmIMSsaqVeMvX79Ojp37izreAghBKqqKujby0veYZTKiTPr5R1ChffFhDRt2jSuRlRQUAAPDw+0atUK1atXl9hvxYoVso2QEEJIpfDFhFS3bl2Jnxs2bCjzYAghhFReX0xIHh4eOHXqFJycnMolkBEjRuDevXvcsug1a9bE+fPnAQAnT57EmjVrkJmZiY4dO2LJkiXQ09MDALx79w6zZ8/GjRs3ULVqVUyZMgV9+vQpl5gJIYSUna8OIpo3b155xcG9X1RUFKKiorhkFBsbi3nz5mHFihW4ceMG1NXVsXDhQu4YX19fqKio4MaNG1i5ciUWLFiA2NjYco2bEELIj/tqpwbGWHnF8UUnT56EnZ0d2rRpAwCYNGkSevXqhZycHPD5fFy4cAEnT56EpqYmWrduDTs7Oxw/fhxTp06VOE92djays7MltqWkpJRbOQghhHzdVxOSSCTC33///dXEVJaTq65evRqrVq1C/fr14e3tjXbt2iE2NhYtWrTg9jExMYGKigpevHgBPp8PJSUl1K9fn3vdwsICERERn5x7165dCAwM/Oz7RkdHIzU1lfu5VatWZVam8hQZGSnvEBTGz/4ZUvkUU2n+Bn+GMqalpZXpub+akAoKCjB79uwvJqSynFx16tSpMDU1haqqKk6fPo3x48fj+PHjEAgE0NbWlthXS0sLubm5UFJSgpaWlsRr2trayM3N/eT8bm5ucHFxkdiWkpKCYcOGwdLSEkZGRmVSDnmqqF9w8p+f/TOk8lV8JcuYlJRUpuf+akJSV1cvt9m8mzdvzv3fxcUFp06dwpUrV6ChoYGcnByJfXNycqCpqQk+n//F1z6mo6MDHR0d2QRPCCHkhynszKg8Hg+MMZiZmSEmJobbnpiYiMLCQtSrVw/16tWDUCjEixcvuNdjYmKoizohhFRAX01I5dWpITs7G9euXUN+fj6Kiopw4sQJ3LlzB507d0afPn1w+fJl3LlzBwKBAOvWrYODgwO0tLSgoaEBBwcHrF+/HgKBAJGRkQgLC0O/fv3KJW5CCCFl56tNdlFRUeUSRFFREdauXYtnz55BSUkJDRo0wIYNG7jOCgsXLsTUqVPx7t07dOjQAUuXLuWOnT9/PmbNmoWOHTtCT08PCxYsgJmZWbnETQghpOyUej0kWdDX18eRI0e++HqfPn2+ONhVT08PGzdulFVohBBCyonCPkMihBBSuVBCIoQQohAoIRFCCFEIlJAIIYQoBEpIhBBCFAIlJEIIIQqBEhIhhBCFQAmpkhIKRfIOodQqYsyEEOkpxMBYUv6UlPg4EBot7zBKZbCDpbxDIITIENWQCCGEKARKSIQQQhQCJSRCCCEKgRISIYQQhUAJiRBCiEKghEQIIUQhUEIihBCiECghEUIIUQiUkAghhCgESkiEEEIUAiUkQgghCoESEiGEEIVACYkQQohCoIRECCFEIVBCIoQQohAoIRFCCFEIP0VCevfuHX7//XdYWVnB1tYWJ0+elHdIhBBCSumnWDHW19cXKioquHHjBh4/foxx48bBwsICZmZm8g6NEEKIlCp8DUkgEODChQuYNGkSNDU1
0bp1a9jZ2eH48ePyDo0QQkgpVPga0osXL6CkpIT69etz2ywsLBARESGxX3Z2NrKzsyW2JScnAwBSUlI+Oe+b1LcyiFZ2klQLSn1M5ts3MohEdpKSkkq1//vsCvYZlrJ8HwRZMopENkpbvsLCDzKKRDZKWz4AYBDKIBLZ+biM4munUFg25ajwCUkgEEBLS0tim7a2NnJzcyW27dq1C4GBgZ89x7Bhw2QWHyk7/vIOQMYO7ZB3BLJlHx4s7xBkyt7evvQHVbA2qi+VMS0tDXXr1v3h81f4hKShoYGcnByJbTk5OdDU1JTY5ubmBhcXF4ltBQUFSExMRL169aCkpCTzWFNSUjBs2DDs27cPBgYGMn+/8kblq/h+9jJS+cqWUChEWloaLC0ty+R8FT4h1atXD0KhEC9evEC9evUAADExMWjYsKHEfjo6OtDR0fnk+AYNGpRHmBIMDAxgZGRU7u9bXqh8Fd/PXkYqX9kpi5qRWAWrMH5KQ0MDDg4OWL9+PQQCASIjIxEWFoZ+/frJOzRCCCGlUOETEgDMnz8feXl56NixI/744w8sWLCAunwTQkgFU+Gb7ABAT08PGzdulHcYhBBCfsBPUUOqKHR0dODh4fHZZ1k/Aypfxfezl5HKp9h4jDEm7yAIIYQQqiERQghRCJSQCCGEKARKSISUM2olJx8r+Z0oq2l4KiJKSBWEeCokkUgk50ikRxdeSWFhYcjOzgaPx6PfDZHA4/EAAIGBgdi0adMn825WFpSQKoDw8HAMGTIEAMDnK/5H9ujRI/j4+CAqKkreoSiM7OxsbN26FYsWLZJ3KArv4xrCz5i8P1emyMhIBAYG4tatW4iOjpZDVN9HfJNcVFT0w5+V4l/dCAwNDaGiooKdO3cCUNxa0ocPH+Dj44Phw4fD2NgYLVu2lHdICkNDQwNeXl6IjIzEw4cPwePxFPZzlBfxxUw8r6R4Zmlx7eFnwRj7bJksLCxQp04d5OTk4Pbt23j9+rUcopNefn4+tm3bht27dwMAlJWVwePxJG4oSpugKCEpoGvXruHWrVsoKioCAJiYmOB///sf9u3bh6ysLPD5fIW7a9y8eTOsra1x69YtHD9+HBMnTpR3SApB/DkpKyujZcuW6NKlC9asWQOgYtR2ZenOnTu4cOEC97P4In3t2jU4OTlh6tSpmD9/Pq5cuQJAcW/EpCWOn8fjITc3F4sWLcLZs2eRn58PoPg70q9fP1hZWeHOnTu4d++ewv2dlyQUCpGeno67d+/i7dvipV7mz58PHx8fHD58GEDpbyYq91+EArp79y7GjBmD8ePHY8aMGUhPT0eVKlXQvXt3mJiYcBczRXHv3j306NEDV65cwejRo/HLL78gKSmpwl88yor4D/L48eNwd3dHUVER/vnnH1y6dAlAxb/I/oj79+9jypQpEs9LHjx4AH9/f/z+++9YtWoVLCws4OPjw92IVUTipCKO/8WLFzhw4AD27duHwMBA7m+ax+MhOTkZTk5OsLKywoULF/D8+XO5xf01jDFoaGjA0dERysrK2LFjB2bNmoXU1FQYGRlh48aN2L59O4DSfccr5if8kxGJRNiyZQsAoGXLlujbty86dOiAFy9ewMfHB5s2bYKuri7c3Nxw/fp1hWjyef78OVasWIG1a9eiTZs22L9/PyZMmABtbW2cPXuWW/yQAOfPn0dAQAB+//132Nraol27dvD3L17dqaJeZH+USCSCu7s76tati6CgIG7706dPUb9+ffTs2ROGhoZ49uwZMjIy8OTJEzlG+30YYxCJRNxNyZkzZ9C8eXOcOHECtra26NWrF9q3b4/Lly9j69atyMzMROPGjXHkyBGMHTsWr169wt9//428vDw5l+Q/H5fJysoKzZo1w7///ou8vDxs2rQJXl5emDFjBvz9/fH+/ftStehUzr8GBSMQCLBmzRps3boVANCvXz8UFBRg2LBhGDp0KA4dOoSFCxciJycHTk5OWLt2LQD5XMyEQiEWLFiAX3/9FQUFBeDxeEhJScH58+cBFK87FRcXh4iICK7J8Wf18Q1BYWHhZ/d79uwZmjdvjo4dO8LOzg5z585FlSpVuAUjK1stiTHGfXe9vb2xb98+ribw77//olq1ajh+/Dg6deqExMREXLt2DW3btoVAIJBn2KUivmjz+XzExcXB3d0dK1euxIoVK+Dl5YX69evDysoKRUVFcHZ2RpUqVeDh4YFq1apBVVUVurq6sLe3x9WrV/H48WN5FwfAf8+++Hw+8vLyuERpZ2cHdXV1vHjxgtu3R48esLKywoIFC7hjpUEJSU5KNlNoaWlh2bJl2LhxIwoLC9GpUycYGRnh5s2baNq0KXbs2IFatWphwYIFePfuHR4+fIjLly8DKP8eSHPmzEFoaChOnDiBOXPmwNfXFzweD//++y+EQiGaNWuGVq1aITw8HLGxseUaW3n5uAnG398f9+/fh4qKCoDiO+GrV69yC0empaWhatWq3PGGhobo3r07QkJCkJGRoZDPBGWJx+Ph7t27cHJyQkxMDIqKirjk7OTkhN27d2Pbtm3YuHEjNm3ahBo1auDQoUO4ceOGnCP/tpLfDYFAgBkzZmDIkCG4ceMGRowYgR49enA3auIbz8zMTPTr1w+dO3fGkiVLcP/+fQCAq6srXr9+jdu3b6OgoEBuZRIT14oWLVqEoUOHYsqUKfjnn39gYmICFxcXVK1aFWfOnOH2nz17Ns6ePYt79+5JffNMCamcFRQUYP78+ejZsyf27NnDfYGdnZ1Rv359zJs3DwAwatQovHr1CmfPnkXNmjUxfvx4zJo1C+/fv0d2djYOHToEoPx7IE2aNAnVqlXj7laNjY2Rn58PExMTrneUm5sbMjMzcfPmTe6i/DNccBljEAqF3O/80qVLcHBwwNmzZ1G1alU8ePAAvXr1wu7duxEYGIi5c+fi7t276NevH0JCQpCQkACRSARVVVUUFBQgOTmZu4P82XqSlfTxZ5+amorVq1dj0KBBGDp0KP744w+cOXMGV65cQcuWLdG9e3fUrVsXampqSExMxJgxY7B7927UqlVLTiX4NsaYRO+5vLw8TJ48GWlpabhz5w727t2LgIAAJCcnQ1lZGSKRCLq6urCzs0NiYiKuX78OLy8vrF69Gh4eHigsLISamhr8/PwwcuRIqKqqyrmExd/3Cxcu4PXr11iyZAl0dHSwbds27N+/H7a2tmjYsCHCwsKQmZkJoLjXYO/evSU6rnwTI+VKKBQyHx8f1qlTJ9auXTs2ffp0dvnyZcYYYxEREczc3JzFxsYyxhgLCgpiY8aMYXfu3JE4x+nTp8s7bAnLly9nfn5+7OnTp2zixInMxcWFi7moqIgxxtj+/fvZiBEj2NWrV+UZqkzExcWxUaNGMXNzc3bq1Clu+8qVK9maNWu4n/38/FifPn0YY4yNGzeOTZgwgZ06dYolJCSw33//nZ04cYLdvHmz3OMvL+Lvwsf+/fdf1qlTJ/b27Vtu26JFi5ijoyNjjLHc3Fy2fPly5unpyQYMGMB8fX2ZUCgsl5i/h0gk4v5/5co
VNmvWLBYeHs4KCgok9hs4cCCbMWPGJ8csXryYzZw5k8XFxTHG2GfLWp7lF4lEn3x2BQUFzMbGhllZWbEbN24wxhgTCARs7969zNXVlaWkpLCIiAjm6enJtm7dyh1XWFhYqvemGlI5ePPmDRhjKCgoAJ/PR//+/dGuXTsMHjwYhoaGWLRoEbZv3w5jY2O4ublh5syZAIqr7IwxXLx4ketWCQC9evWSV1EAAF5eXjh37hwGDRqE2rVr4+jRo9yS8eKq+ZAhQ6ChoVHh7/xZibt7kUgEHx8fDBgwAMbGxmjevDkyMjIAFN/1X7p0CcOGDQMAzJs3D0ePHkX//v0BAH5+fmjQoAH27duHoUOHwtTUFH369EGHDh3Kv1BljP3/g+6Pt4lrzOfPn8fOnTsRHR0NgUAAVVVVWFhY4NmzZ9z+v//+O5KSknDw4EFoaGhg+vTpWLZsGYKDgzF37lzw+XyFnVKHx+MhLS0Nv/32G2bNmoUGDRpAWVmZqzWJm9uWLFmCY8eO4f79+xKdkpycnJCRkYHTp08D+Pyz4fJ6XixuAVBSUkJBQQE3Q4yKigqWLFmCgoICrvlZXV0drVq1gqamJp49e4ZmzZrB2NgYMTEx3CMJZWVl7rzSoOUnZEg88lpVVRVv375F/fr1MXv2bOjp6WH9+vVITEzE6NGj8e7dO+7B7sSJE+Ht7Y3g4GB06tQJBw8eRHR0NKZOnapQa5xcuHABu3fvxt69ewEUj9Iu+eUTf6EVoamhLMTExODdu3eIiorCoEGDUK1aNZw5cwbLly/HwYMHUatWLbi6ukJZWRlxcXFo0qQJ/Pz8UK1aNTx+/Bh169aFhoYG0tLSoK6uDi0tLQBfHiRZEb169Qo6Ojpc2TIzMzFz5kwkJiaiUaNGePHiBSwtLbFw4UKMHj0arVq1wpAhQ1C1alXEx8dj9OjRSElJwa1bt6Cnp8f9bsQdBBT597R06VJkZ2dj6dKln31d/Dfh4+OD58+f46+//pJ4ffPmzTA1NUW3bt3KI9xvCggIQFhYGKpWrYohQ4bA2toampqa+O2336CiooLNmzdz+9rZ2WHevHno2rUrXr16hZo1a3LXglL7zlod+YqMjAzm6enJ2rZtyzZu3MiioqLYn3/+yRwdHZmrqytLSEhgr1+/Zh4eHmzdunUsPz+fMVbcxDNhwgRmbm7OOnToIOdSfJ1IJGL9+vVj+/fv5+L/WV26dIk5Ozuz0NBQbptQKGRv375lEydOZNOnT2eMMfbnn3+yFi1asPDwcG6/kydPsunTp7Pnz58zxv5rqikqKpJotqnodu/ezczNzdnRo0e5ZpqrV6+y4cOHc/s8evSItW/fnoWHh7MbN26w0aNHMy8vLxYXF8emTJnCTp48Kffm6K/53OclEonY27dv2ZgxYyS+H4wxlpSUxD58+MAYY1zzXV5eHmvRogU7fvw4Y+y/Zk15fRdEIpFEc+CbN2/Yb7/9xgYMGMAiIiLYmjVrmIeHB9u/fz9jjLHY2FjWuHFjtnbtWvbq1St2/vx55uLiwp4+fSpx3u9tYvwpljBXJOLu2mpqarh8+TI0NDQAFPfX79ixIzw8PPDXX39h2rRpsLGxQXh4OK5evYpu3bphxowZSE1NRWFhIQwNDbkqvyKOVeHxeFiyZAl8fHzQqlUr/PLLL/IOqUzl5ORg27ZtmDx5MmxtbXHq1CncvHkTFhYWMDIyAp/Ph76+PlxdXTFt2jQ8ePAAAwYMwP379xEUFIRnz57h5s2biImJgY+PD+rVqwfgv84L4uasioZ9oUaXlpYGoPjBd8uWLVG3bl3ExsZCV1cXHz58gJqaGho3bowJEyZg3bp1OHr0KDQ1NREcHIyZM2fC0NAQPXr04HoqKqKS5RaJRODz+eDxeNDX18f79+/x999/4/Xr1ygqKsLevXuhpqYGbW1tLFmyBA0aNEBRURGqVKmCsWPHYtasWXBwcIC6uvon55aVjz+7kmV4+vQpnj59ytXS/ve//wEANDU14enpiffv36Nly5YwNzeHu7s7goKCwBjDhQsX4OXlBTMzM4n3+t5rluJd6SookUgEoVAIVVVVODs7Q0tLC9nZ2RITD5qYmKBfv364evUqEhIS4OjoCF1dXfzzzz9ISUmBsrIyDA0NERQUhAULFnB9/hVV48aNYWhoqPBzbn2P1NRUHDx4kGuSHDJkCB4/fozIyEiJ9vBmzZrBwcEBK1euhLKyMpYuXYoBAwbgw4cPaNWqFa5duyb3Z35lSXxB+/gzt7OzQ8+ePfHo0SOcPXsWAKCrq4uEhASJnolGRkZQV1dHamoqmjdvDn9/f2zfvh1r166FioqKQvfGzMvLg6+vLwBwXfXF3wUPDw+8efMGly9fRnh4OEaNGoV58+ZBSUkJc+fOBfDf7278+PFYu3Ytl4xkreSURSWJn8tduHABEyZMwNu3b1G3bl10794dRUVFmDdvHkaNGoWOHTtCQ0MD586dAwCMGTMG+vr60NbWxpkzZ+Do6Fh2wX5XvYp8U79+/dj69etZXl4eY+y/qnleXh5r2rQpO3v2LGOsuMfc0KFD2fnz5+UW64/4Uk+qiq6wsJDt27ePOTg4MIFAwBgrblKdPHkyi4mJkdg3NjaW9e7dm+3Zs+eL5/pZfPjwgXl5ebEmTZqwv/76i/vdnD17lq1Zs4YdP36c2dracr8jW1tbtmHDBpaamsoYY+yvv/5iU6ZM+eS8HzcdKaLCwkLWunVrtnr1asbY57/7aWlpEs1vERERzMrKiqWlpTHGyr9pruT7HTlyhK1fv57r2ZmZmcl8fHyYs7OzRG9Rxhg7deoUGzVqFEtPT2eMMebi4sIcHR3ZpUuXGGOM7du3j7Vv357bv6w+O8W9/a4AWIm7OcYYgoKCcPDgQQDA2LFjcerUKW7KE3ETjVAohKGhIdc7q1evXvD29kb37t3LOfqyUVGbnj7XK6wkZWVldOvWDQYGBli9ejWA4l6PaWlpuHXrlsSsAXXr1oWjoyNu3779yfswxr7/Aa8CUlNTg4qKCgwMDHDixAmuxtC2bVucPHkSDg4OaNKkCYKDgwEAixcvxtWrVzF58mT88ccfWLNmzWe/64raGiD+nohEIigrK2PVqlXYvn07MjIyoKSk9Mn3qHr16hI1kbCwMPTp0wf6+voAyn+8GY/Hw82bN9GjRw8cOXIE2trauHPnDgoLC6Gnp4dmzZrh7du3SE1NBfBfeUNDQ2FkZIRq1arh2rVrMDIywogRI2BhYQEA+N///gcNDQ34+PgAKMNegGWS1iqZnJycT+508vPz2fr165mDgwN3t+Du7s5mzZolMd7i2rVrbODAgSwlJaVcYybFSt7JZWZmsmvXrn1xX5FIxC5evMg6d+7MjbPatm0bGz16NIuIiJDYV1xT+BmJa3jiGsGjR4/Y8OHD2d69e5mbmxtbvXo1i4qKYitXrmRXr15lMTExzNramhuD9vLlS3bixAm2efNmlpmZKa9ilNqXav+jRo1iEydOZIxJfp+EQiF7//49u3btGgsLC2MuLi6sd+/e7MGDB+US7+f888
8/rG/fvmz37t2fff39+/ds+vTpzNfXl71584bbHhISwuzt7ZmLiwvr0KGDREcd8bUvMjKSnTt3rkzjVbxbEgUWHh6O0aNHIzAwEJcuXUJiYiKOHz8OAFBVVcWvv/6KGjVqcHfUU6ZMwa1btxAXFweg+K5j2bJlsLOzQ7Vq1RS6vfxnwz4z3Y+1tTXi4+O/eAyPx0Pbtm3RoUMHrFy5EgAwYsQI8Hg8nDt3TmJsmPh5gKKOlSmNrKws+Pj44OTJkwAkx5IwxtC4cWOYm5vjyZMnGDx4MIyNjeHp6YmoqCgUFhbC3Nwc9vb22LhxI9LS0mBiYoI+ffpg7Nix0NPTU/jfkTg+JSUlZGVlYdmyZdi9ezfu3bsHAPDx8cGlS5dw9+5d8Pl8bpwRn8+HSCRCeHg4goOD0adPH5w6dQpNmzaVV1Fw4sQJNGvWDCNGjOC2McawevVqHDx4EFpaWnBwcMCrV69w/fp1bh9nZ2cEBgZi/PjxuHnzJrp06cIdK67ltWzZEj169CjTeCkhSeHdu3cYP3485s6di169esHe3h4mJib466+/cPbsWW5l1Jo1a2LYsGE4deoUEhMT0bhxY9jY2GDFihUYNGgQfH19MW7cOIwfP55bzIrIXsnZiS9cuAA7OztcuXIFZ8+ehZub21eP1dbWxrBhwxAXF4eLFy9CVVUVTk5O0NDQgKam5if7V9QmzJKKioqgo6PD3Wylp6fD3d0d8+fPx/79+wEAo0ePxvPnz5GcnIyBAwfC3d0dr1+/5nrbeXt7gzHGrfUjxkoMmJW3L90QiuOLiorCkCFD8Pr1a9y5cwdTpkzBmTNnYGZmhqFDh2LOnDkAwI21mzdvHnbs2AFXV1fs2rULo0aNAiC/m5TMzEw8ffoU7du3B1Bc3rt376JLly44evQogoODkZSUxF3P7ty5g6dPn3LHW1hYcM2r4vn3ZH7NKtP61k9q//79zNvbm2uWEVfT4+Pjmbe3N1u3bh33WkZGBhs1ahTz8fFhjDGWnZ3NnJyc2Nq1a+UTfCVWsjnlyZMnrH///qxTp06sS5cu7MiRI9xr3+qYkZ+fz/z9/Vm/fv1kFarCiYqKYr/99htbuXIlmzFjBluwYAHbvn07Mzc3ZxcvXmSMMbZnzx7m6urKoqKiGGPFD/Tz8/MrREeXkt+NuLg49ubNG248XX5+PpsxYwYbMWIE27JlC2Os+Duyd+9e1qpVK5aXl8fev3/P2rZty06fPs3Cw8NZu3btmKOjI3vy5Mln30PWPjfdD2PFnUo2btzI/ZySksLu37/PGGNsxIgRbNGiRYwxxu7cucOGDx/OjY+SF0pI3yAQCNjgwYNZQEAAY+y/9nRxO+qePXuYh4cH1/tEJBKx4cOHM3Nzc3blyhXGGPvpB44qmpIXAoFAwM6dO8e6devG9Y46ePAgc3d3Z7dv35b6nC9evGCDBw9mBw4c4N5D0XuFlYZQKJR4Lvr+/Xu2Z88e5ujoyBYvXsxtDwoKYl26dGGFhYVMKBSyCRMmsJUrV3K96D6myMkpNjaWDR48mPXr1485OTkxNzc37m/25MmTrF27dmzVqlWMsf/+3nv16sWCgoIYY8V/++bm5qxt27bsxIkT3Hnl2ZMuMzOTPX36lGVlZTHGGPP392e2trbcNajk53Hq1ClmZ2fHPTt6/PhxOUb9edRk9w25ublISUlBx44dAfxXZWX/X93/9ddfoaenh2PHjiE+Ph4PHz5EgwYN4OPjA0NDQwD4aabPqSjEz4k2bdoEGxsb7NmzB+rq6lwPIfG4oBs3bnC9Hdk3nufVqlULrVq1wv379yEUCsHn8xWyV9j3YP8/+JrH4+Ht27dgjEFLSwudO3dG9erVJVYtHT9+PEQiEYKCgsDn8+Hi4oLLly9/cWVTRWme+9iNGzcwfvx4tGvXDvv378esWbPQsGFDTJs2DRcvXoSTkxM6dOiA7OxsJCcnc3/3HTp0wOvXryESiTBo0CBs2LABt2/fRp8+fQBAYsyVrIm/s+L3W7duHXr27ImlS5di8ODBuH//Puzs7KCjo4OFCxcCkPw8njx5Ant7e26qJ/Hfx7f+FmTp5/iLkqHq1atDX18fx44dk9guvhipq6ujUaNGqFWrFsaOHYtx48ahbdu2cHNzg6mpqRwiJunp6ejevTvOnDmDvXv3Ijg4GNbW1ti7dy/i4+OhqamJAQMGIDIykntQ/bmLiHiwM1Dc3Tk9PR3GxsYKe5H9XjweD9nZ2Rg9ejTc3d0xadIkXLlyBXXr1sX//vc/pKWl4e7du9z+CxYswMaNG5GYmAgHBwf4+fmhXbt2ciyB9MQX2/DwcHTq1Ane3t7Q0NBAhw4dMGfOHHTo0AEHDhxAamoqRowYgSdPnuDChQvc4otxcXFo3rw5+Hw+VFVVYW9vD+C/Zyzl8d2Ij4+HQCCQ+M7evXsXERER+OuvvxAYGAhra2usWrUKeXl5+OOPP3Ds2DF4eXkhODgYO3bsgLW1NZ4+fQp3d/dPBujK89k2JSQpDBo0CBcvXsTLly8/+cKdPXsWeXl5mDVrFvz9/XHr1i307t1bTpESANDX10deXh48PT1hbm6OKlWqwMrKCgKBANra2gAAR0dH1KlTB+Hh4UhMTPzkHEVFReDz+VBSUkJ8fDy3yFpFufBKQ3xxPnbsGEJCQmBsbIyAgADo6upiy5YtCA0NRffu3dGsWTPs2rWLO87Ozg7NmjXjknmLFi0kzqfIeDwecnJyEB4ejs6dOwMortWIE8r48ePx/Plz3LlzBy1btkTTpk2xbt06bpZ3gUDw2Rnay2us2Z07dzBs2DCkpaXh4cOH2LhxI7Kzs3Ht2jUA4CbxnT17NlRVVXHz5k107twZW7duRe3atfHkyRP8/fff8PX1xaZNm1CrVi3F+tzk1VZYkWRkZDB3d3fm5OTEEhMTudkXQkJCWP/+/VlYWJicIyQfi4iIYIMGDWJZWVlMJBKxWbNmsSlTpkg8dH/48CEbPnw4O3z4MDf5Zcn2+MLCQjZ37lzWokULtnTpUrmUoyx9/JyIseJJP83NzVmnTp3Ys2fPGGOMZWVlsS1btrABAwYwoVDIbt++zYYPH16qjiCK5ONYi4qKmIuLC7d2lfh3In4m6OrqyqZNm8YYYywhIYENHjyYbdiwQWIsTnkrOdvHmDFjWLdu3ZilpSU3O0hISAjz9PTkZlZgrHhmBnt7e4nzfPw8W9E+R6ohSaFq1apYtWoVNDU14erqCi8vLzg7O2PDhg2YOnUq7Ozs5B0i+Ujr1q2hpaWFKVOmwNraGu/fv8ecOXOgqqrK1XItLS1hZmaGCxcucN2Vxc0Ve/fuRfv27ZGeno7Q0FBujaqKSvzcS/ycqKCgAIwxqKioYNGiRXj37h03samOjg633EB0dDQaN24MS0tLREZGciP5lZSUuMl/FZU4NvHnXfI5V8uWLfHkyROkpqZyS1yIy6anpwdtbW2IRCIYGxujS5cuePDgAbdirbg2VZ7ENTDGGAQCAdLT07Fs2TIMHz4cQPG8gQBw8eJF7hgtLS00btwYBQUFXNnEz7NLj
rVSKHJNhxXMu3fvWFRUFDt//nyZj1AmZe/NmzfMysqK6xnH2H93hOK74czMTBYfHy9x3L1795inp2epeuFVBO/fv2dTpkxhTk5ObPTo0SwkJIR7zdbWli1btoz7+dWrV8zBwYFbxVQ8F5siEtdwLl68yMLCwj656z948CCztbVlQ4YMYd7e3uzx48csMjKSDR06lC1fvlxi37y8PDZkyBCJlY5zcnLYb7/9xlasWCEx64osfVyTTU9PZ8OHD2evX79mjDG2YsUKNnr0aJaQkMAYK65BBQUFcTW/06dPMzs7O7Zjx45yibes/DyTbJUDXV1dWFlZyTsMIqUaNWpgyJAhePr0Kd69ewc9PT3ujlDcKUVPT49bDA4oriE1b94c69evl1vcZYF9tNRAfHw8vLy8YGFhgcDAQJw8eRLnzp0Dn89H3759MX/+fEyYMAFVqlTBsGHDcOTIEW5GZ6C4cw/w30JziiQvLw/q6uq4evUqoqOjYWFhgTp16gAA/v77bxw+fBirV69G/fr1cezYMbi5ueHcuXNwdHREYGAgCgoK0L59e6irq2Px4sUwMTGBpaUlgOLyampqwtHREdHR0eW2PMbHHQt4PB40NDSwePFiBAQEYNy4cRg4cCCuXr0KFxcXaGhoYNy4cTA1NcWlS5cQGRmJKVOmVLzn2fLOiITI0ocPH5iDgwO7fPnyT7Ug3teULGdoaCg7evQoe/jwocRYmZCQENahQwc2YsQI9u7dO8YYYx4eHszCwoKtWLGCDRo06JNF1xTRzp07mZeXF2Os+HmYk5MTCw4O5hbG27x5MzdIPSUlhU2dOpU1adKEJSYmMqFQyEJDQ5m7uztzdXVlTk5ObNeuXXIri5hIJGIikYiFhIRwi/6JRCJ29+5d1rVrV26s1M6dO5mzszOLi4tjhYWF7Ny5c5+dU7EijZejZ0jkp6ampoZx48Zhy5YtyM7Olnc4ZarklDTibslA8d10ZmYm9uzZA39/fxgYGKBu3bro1q0b3rx5A3d3d6xfv56bNunAgQMAwM3c3Lp1axw4cABmZmafzGataOzt7bm5I1VUVODm5oajR4/ixYsXAICUlBTUq1cP69atQ58+faCqqoqoqCgYGBhAKBSiW7du2LZtG9avX4/jx4/D1dUVQPlO93P48GGcPn1aYt2inJwcXLhwATdv3uSec5mbm6N3795Yt24dAMDNzQ01atTA0qVL0axZM0REREj09hOXoUKNl5N3RiRE1kQiEYuOjpZ3GGWmZA2ooKCABQYGsjVr1rB79+4xkUjEkpKS2LRp05i9vT03rY9YUFAQ8/b25n62t7dnXbt25dYv8vX1Zf3792dFRUUK1wPra0rO2j506FC2cOFCxljxOk3m5ubMzc2NW0aeMcYCAgI+O4N1eZc5Li6OWVhYsDZt2rDly5dzNTvGimu3v//+Ozt06BC37fHjx8zGxobr8ZiSksJu3Ljx03y/K1DqJOT78Hg8NGnSRN5hlJmSPQE7d+6MZ8+eoUaNGtyMAoaGhmjevDlycnK43oNCoRACgQCRkZFo0KABAGDPnj3o2LEjvL29uVlF5s6di0ePHmHfvn0K9awoIyODqwWWrA0Cxav7zps3j3vuN2nSJISHh+PWrVtwdHREhw4dYGZmhoSEBNy/fx/9+/fHzZs3YWxszJ1DXkvLm5qaomHDhmjcuDGSkpIwceJE7jMTr8cVGRmJmJgYAP+tVOvn54e8vDzUqlULHTt2RJMmTSR6ClZUPMYUuN8mIeSzTp06he3bt2PatGmfHaiZkpICf39/VKlSBfPnz+cutP7+/oiKiuKS2IoVK2BmZgYAKCgogKqqKsLCwmBkZARzc/NyLdPH2P93zHjw4AEmT56MNWvWcJ2KPnz4gKysLBgYGAAADh06hC1btuDAgQPQ19fH1KlTkZOTgw0bNiAtLQ2nTp1CdHQ0UlJSYGtri3HjxsmxZMVEIhH4fD4WLlyIDx8+YPLkyRg+fDgaNWqE//3vf+jYsSPu3buHnTt3QklJCbNnz8b27duhq6uL2rVrw8nJ6ZPOKxUdJSRCKpiCggL89ttvsLW1hbu7O3dRSk5OxsaNG9GqVSv0798fZ86cwYkTJ9C3b19u/j6hUIiEhASkpKRwiYx9NCeaIhozZgwMDQ0xY8YM7N27F7t27ULNmjXRv39/ODs7Q0lJCR4eHqhZsyaWLl2KlJQUuLm5YcKECejXrx94PB4EAgFUVFS4nnKy7DFYmkTh6+uLzMxM+Pv7IykpCbt27cKJEyewfv16tGvXDhEREdi2bRtiYmJgamqKVatWcSvQ/myoyY6QCiY9PR3v37/nmiF5PB5OnjyJIUOG4ObNmwgLC0NCQgJsbGzQoEEDhIWFcYsJKikpoX79+lwyEk8GqojJKDk5GbNmzUJ8fDymTJmCq1ev4tSpU3j48CH27t2Lvn374vr16zh8+DDU1dUxatQo3LhxA/fv34eBgQEcHBywefNmvHnzBkDxvJMqKioSg3tloeT6Wx8+fADw+WmVxHE0b94cd+7cAVC8OGJERAQKCgqwbt06rF69Gm3atEFAQAD27t2L7du3Q19fX6EHJP8ISkiEKBBpLjRVq1ZFQkIC3r9/z20zMzPD2bNncfnyZbx//x4nT56EpqYm2rVrh3fv3iE6Ovqz51KU50SfK7eamhouXryIpKQkNGrUCNbW1li5ciUaNmyIevXqYejQoWjTpg1u376NmJgYWFtbw9bWFkuXLgUATJ06Fb6+vtwMC+IkIateZ+IEw+fzkZGRgT/++AMHDhyQSFAlieMwNTWFmpoafv31V0yYMAG9e/fGxYsXMXDgQBw7dgwJCQlQVVWFsbExGGPlOqN4eaOERIiCkObOWiQSQV1dHd26dcOmTZu4fSwsLLhlBMTLZBQWFqJDhw5YsmQJtwS1ouLxeBAKhdizZw9ycnIAANWqVUPnzp2xb98+AMWr0Orp6UFdXZ173tWhQwfo6Ohwy60PHjwYWVlZiIuLAwC0adNG5rGLPyNxglmzZg0cHR3B4/EwfPjwbyZALS0tZGdn45dffsGhQ4cwZswYVKtWDS4uLrhy5QpMTEy4fXk8nsLcRMgCJSRC5Kw0d9bii9uoUaPw7NkzbNmyRSJpZWZmIiEhAe3atYOysjJUVVW5GoKiN/Ps2bMHixcvxoYNG/Du3TsAQM+ePZGRkYH4+HhUrVoVAwYMwPnz55GUlAQAaNy4MVq3bo179+4hPDwcjRs3xvHjx9GwYUOZx8sYk/iMUlJSMGHCBBw4cACnT5/GqlWrpJoFvF69eqhVqxaMjIxQq1YtiTFQfD5fbkugywMlJELk5EfurBs3boxZs2Zhx44dGDlyJHbv3o2FCxfCwcEB+vr6GDly5CfHK2IzT2ZmJvd/BwcHVKtWDU+fPsXRo0eRkJCA+vXrQ0dHBy9fvgQAjB07FiKRCGfOnEFubi4AoH379nBwcMAvv/wCoHgCUVl3fxY3m/H5fERHR2Pq1Kl48uQJTExMYGtrCzU1NW5fcfL8kvz8fNStW5eb8PbjGtDPXCP6GCUkQspZ
Wd1ZDxw4EAEBAWjUqBFevnyJnJwcHDx4ELNnz4aSkpJCj0lhjOHMmTP49ddfuaRkaGgIFxcXmJmZgcfjwd/fH6ampvjw4QOePXvGHevp6YmjR4/i33//BQAYGxtj5MiR3Px1gOyfEykpKeHDhw+YOXMmRo4cCSMjI3Tp0gUODg7Iy8vD8ePHAQDTp0+Ho6MjN3PE51SpUgXjxo2Dt7e3Qt40lCeaXJWQciTuaszj8RAdHY2dO3eiT58+MDExga6u7id31kZGRp89j7hbcevWrdG6dWuJLswVYYl1Ho+HVq1awcTEBH/88QfmzZuHevXqQUdHB5qamujVqxeuXbuGLVu2oEuXLjh+/DhGjx4NALC1tcWBAweQlZUlcc7yGJMj/p1u2rQJW7duhbKyMk6cOMElQysrK9y7dw9Hjx7FihUr4OzsjOvXr0NPT++z5yssLISKiorEZK6VqUb0McX9xhLyEynrO2vxhffjLswikYhLePL2rWdWtWrVQmBgIAoLC7F582YkJiaiQYMGuH37NqpWrYolS5bgn3/+QUJCAjIzM7kVagFgw4YN6Natm8T5yqPMSUlJaNGiBa5fv465c+dyPR7FzW3KysqwtbWFiYkJunXrBl9fX+jp6X0yu4S4t5x4TNT58+exatUqrjNLZUUJiZByUPLO2traGpcvX8aJEycwefJkAMV31paWljh69CiaNWsGNTU1XL9+HfXq1fvs+b40caai1Iqk6TEoFAq5RRQZY5gzZw66dOmChw8fIjo6GgYGBhg/fjy3IN2rV6+4Y+W1QGCdOnXg5+eHvXv3wtnZGe3atcO2bduQmZnJldfU1BTW1tYoKChAaGgoF2/Jcot7y7148QKurq6YP38+fvnlF66nZGWlGN9eQn5yZX1nLb7AnT9/HqtXr+a6SstbaXoMisvQokULTJ48GampqVi6dCnat2+Pq1evAiieeXzGjBlYt24dN9uEmDwG9PL5fPTu3Zu7Ifjjjz+QkJCAy5cvS6wk27lzZxgaGiI0NBTv37+XuFFQUlJCYWEh5s2bh/79+6NZs2b4+++/0bdv33ItiyKihERIOZDlnbWZmZnc76x/dCyOgYEBNm3ahAcPHuD69et4+fIl14uudu3a6NGjh8T7yJuSkhKEQiF0dHQwdOhQHDx4kOsJCBQ3R1pZWeHt27dc54uSJk2ahOfPnyM0NBRTp04tz9AVGs1lR0g5EtdusrOzMWDAAPz2228YMGAA16suNTUV27dvR2ZmJubOncut2CpWWFiIRYsW4dSpUxg6dKjcL2biZjNxwklJScHChQtx9+5dnDp1CjVq1JDqPOKJRv/++2+sWLEC5ubm8PX1LbcVWr9HyU4UAwcORKdOnTBu3Dioq6sDKF7JNisrixsHBvz3+X/48IHbj/yHakiElKOf6c66LMfiiC/s7du3R3BwMJYuXarQyQj4b3YJAPDw8MDx48fx4MED7nU1NTXUqlVL4lmXuMZLyejzKCERUs7EtYmRI0dCSUkJJ0+elOhdZWtriyVLlqBdu3bcNvGFb/Xq1dizZw+qVatWvkGXIIuxODwejztv1apVAZTvqq3fS5xgunTpgsaNG3/SFR2Qz7Ouioqa7AiRA3HTzZUrV7BgwQIsW7ZMIgEBir8sRMmxOCEhIdxYnKKiIuzcuRNnzpxBXFwcnJ2dMWXKlC+OxanoY2/E8Vf0cigCqiERIgcV+c66svQYlJY4fnl1Rf+ZUEIiRE7ETVLr169H9+7d5RyN9H72HoM/QlFvIioKSkiEyElFvbOmsThEVighEaIAKuKd9c/UY5AoBurUQAj5bjQWh5QlqiERQr4bjcUhZYmWnyCE/BBpewwS8i3UZEcI+WE0FoeUBUpIhJAypegDeonioiY7QkiZokREvhd1aiCEEKIQKCERQghRCJSQCCGEKARKSIQQQhQCJSRCCCEKgRISIYQQhUAJiRBCiEKghEQIIUQh/B/Qlq8xxEOQIAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set_theme(style='ticks', color_codes=True)\n", + "df = pd.read_table('for_plot.tsv', \n", + " header=0,\n", + " sep='\\t',\n", + " usecols=['Test Name', 'Throughput'])\n", + "\n", + "ax = sns.barplot(\n", + " x='Test Name',\n", + " y='Throughput',\n", + " data=df,\n", + " palette='ch:start=.2,rot=-.2')\n", + "\n", + "ax.xaxis.label.set_visible(False)\n", + "ax.set_xticklabels(ax.get_xticklabels(), rotation=30)\n", + "ax.yaxis.grid(True, clip_on=False)\n", + "\n", + "ax.set_ylabel('Throughput(seq/sec)', fontsize=12)\n", + "ax.tick_params(labelsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "fig = ax.get_figure()\n", + "fig.show()\n", + "fig.savefig('./stacked_lstm_impl.pdf', dpi=500, bbox_inches='tight')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "ffc07b31ae6529d1a9a758fd197adbf7471a1c3d37ee83e817ab97b7225508a4" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf new file mode 100644 index 000000000..60d66a18f Binary files /dev/null and b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf differ diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_results.tsv b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_results.tsv new file mode 100644 index 000000000..2901b0d1d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_results.tsv @@ -0,0 +1,19 @@ +Test Name Average Time (s) Elapsed Time(s) Throughput(seq per sec) +pt_cudnn_lstm 0.0251 0.7545 2544.7697 +pt_finegrained_op_v1_cuda:0 0.4980 14.9410 128.5051 +pt_finegrained_op_v2_cuda:0 0.2123 6.3702 301.4025 +pt_finegrained_op_v1_cuda:0_JIT 0.1387 4.1602 461.5115 +pt_finegrained_op_v2_cuda:0_JIT 0.0505 1.5161 1266.4127 +tf_graph_cudnnlstm 0.0374 1.1219 1711.4016 +tf_graph_fine_grained_op_lstm_v1_gpu 0.1267 3.8025 504.9329 +tf_graph_fine_grained_op_lstm_v2_gpu 0.0743 2.2293 861.2472 +tf_graph_static_lstm_cell_gpu 0.0823 2.4683 777.8715 +tf_graph_whileOpLstm_gpu 0.1068 3.2042 599.2203 +tf_eager_cudnnlstm 0.0751 2.2541 851.7819 +tf_eager_fine_grained_op_lstm_v1_gpu 4.5640 136.9190 14.0229 +tf_eager_fine_grained_op_lstm_v2_gpu 3.2110 96.3287 19.9318 +tf_eager_static_lstm_cell_gpu 2.3811 71.4322 26.8786 +tf_autograph_cudnnlstm 0.0328 0.9851 1949.1312 +tf_autograph_fine_grained_op_lstm_v1_gpu 0.0857 2.5710 746.7896 +tf_autograph_fine_grained_op_lstm_v2_gpu 0.0501 1.5038 1276.7482 +tf_autograph_static_lstm_cell_gpu 0.0503 1.5080 1273.2241 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/lstm_cell_pytorch.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/lstm_cell_pytorch.py new file mode 100644 index 000000000..59dac7bc0 --- /dev/null +++ 
b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/lstm_cell_pytorch.py @@ -0,0 +1,113 @@ +import os +from time import time +from collections import namedtuple +import sys + +import torch +from torch.profiler import profiler, record_function, ProfilerActivity + +import pt_model as model +torch.manual_seed(1234) + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def GetTestName(cell_type): + if cell_type == 'cudnn_lstm': + return 'CuDNN' + elif cell_type == 'v2': + return 'PT_JITed' + + +def run_lstm_cell_pytorch_cudnn(batch_size, seq_len, hidden, depth, cell_type): + input_shape = [seq_len, batch_size, hidden] + torch.backends.cudnn.enabled = True + device = 'cuda:0' + + x = torch.randn(*input_shape, device=device) + + m = model.small_model( + batch_size=batch_size, + cell_type=cell_type, + max_seq_length=seq_len, + hidden_size=hidden, + num_layers=depth).to(device) + m = torch.jit.script(m) + m.eval() + + torch.cuda.synchronize() + for i in range(10): # warmup + output = m(x) + + iter_count = 1000 + + torch.cuda.synchronize() + start = time() + for i in range(iter_count): + output = m(x) + total_time = time() - start + return total_time / iter_count # count in seconds + + +def run_lstm_cell_pytorch_cudnn_profiler(batch_size, seq_len, hidden, depth, cell_type): + input_shape = [seq_len, batch_size, hidden] + torch.backends.cudnn.enabled = True + device = 'cuda:0' + + x = torch.randn(*input_shape, device=device) + m = model.small_model( + batch_size=batch_size, + cell_type=cell_type, + max_seq_length=seq_len, + hidden_size=hidden, + num_layers=depth).to(device) + m = torch.jit.script(m) + m.eval() + + warmup = 10 + for _ in range(warmup): + m(x) + torch.cuda.synchronize() + + with torch.profiler.profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] + ) as prof: + with torch.profiler.record_function("lstm_cell"): + m(x) + + print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) + event_list = prof.key_averages() + cuda_time = None + + for event in event_list: + if event.key == "lstm_cell": + cuda_time = event.cuda_time_total + break + + return cuda_time + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: python lstm_cell_pytorch.py [tsv_file_path]") + sys.exit(1) + + tsv = sys.argv[1] + + max_depth = 20 + print("Pytorch LstmCell Benchmark......") + + hidden_sizes = [128, 256, 512, 1024] + batch_sizes = [32, 64, 128, 256] + + depth = 1 + seq_length = 1 + with open(tsv, 'w') as f: + f.write( + "[depth, seq_length, batch_size, hidden_size]\tTestName\tAvgTime(ms)\n") + for hidden_size in hidden_sizes: + for batch_size in batch_sizes: + t = run_lstm_cell_pytorch_cudnn(batch_size, seq_length, + hidden_size, depth, 'cudnn_lstm') + f.write('[%d, %d, %d, %d]\t%s\t%.5f\n' % (depth, seq_length, + batch_size, hidden_size, 'CuDNN', t * 1000)) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/performance_with_increased_depths.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/performance_with_increased_depths.py new file mode 100644 index 000000000..d7c494afa --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/performance_with_increased_depths.py @@ -0,0 +1,179 @@ +from tf_model.rnn import WhileOpLstmNet +from tf_model.rnn import FineGrainedOpLstmNet +from tf_model.rnn import StaticRNN +import tensorflow as tf +import test_utils as tu +import math +import gc +import logging +import sys +import os +from time import time +from collections import namedtuple + +import torch 
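+# This script sweeps the stack depth from 1 to max_depth for the PyTorch
+# (JIT and cuDNN) and TensorFlow baselines, then logs per-depth average time,
+# throughput, and the slowdown ratio relative to depth 1 to
+# figures/perf_with_increased_depth.tsv via the logger configured below.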
+import pt_model as model +torch.manual_seed(1234) + + +# supress tensorflow deprecation warning. +tf.get_logger().setLevel('ERROR') + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +ITERS = 30 +BATCH_SIZE = 32 +HIDDEN = 128 +SEQ_LEN = 30 + +logger = logging.getLogger() + +logging.basicConfig( + level=logging.INFO, + filename='figures/perf_with_increased_depth.tsv', + filemode='w', + format='%(message)s') +logger.info(('Depth\tTestName\tAvgTime\tThroughput\tRatio')) + + +def GetTestName(cell_type): + if cell_type == 'cudnn_lstm': + return 'CuDNN' + elif cell_type == 'v2': + return 'PT_JITed' + + +def RunPyTorchTest(batch_size, seq_len, hidden, depth, cell_type): + input_shape = [seq_len, batch_size, hidden] + torch.backends.cudnn.enabled = True + device = 'cuda:0' + + x = torch.randn(*input_shape, device=device) + + m = model.small_model( + batch_size=batch_size, + cell_type=cell_type, + max_seq_length=seq_len, + hidden_size=hidden, + num_layers=depth).to(device) + m = torch.jit.script(m) + m.eval() + + torch.cuda.synchronize() + for i in range(10): # warmup + output = m(x) + + torch.cuda.synchronize() + start = time() + for i in range(ITERS): + output = m(x) + return time() - start # count in seconds + + +def RunTensorFlowGraphTest(model, batch_size, seq_len, hidden, depth): + dev = 'gpu' + stddev = 1.0 / math.sqrt(hidden) + + with tf.device(tu.device(dev)): + data = tf.random.uniform( + (seq_len, batch_size, hidden), minval=-stddev, maxval=stddev) + + output = model(data) + + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + + for _ in range(5): # warmup + sess.run(output) + + start = time() + for _ in range(ITERS): + sess.run(output) + return time() - start + + +def RunTensorFlowEagerAutoGraphTest(model, batch_size, seq_len, hidden, depth): + dev = 'gpu' + stddev = 1.0 / math.sqrt(hidden) + + with tf.device(tu.device(dev)): + data = tf.random.uniform( + (seq_len, batch_size, hidden), minval=-stddev, maxval=stddev) + + for i in range(5): # warmup + y = model(data) + gc.collect() + + start = time() + for i in range(ITERS): + y = model(data) + return time() - start + + +def report(test_name, total_times): + throughputs = [BATCH_SIZE * ITERS / t for t in total_times] + + base = total_times[0] + raitos = [] + for t in total_times: + raitos.append(t / base) + + for i, (time, throughput, ratio) in enumerate( + zip(total_times, throughputs, raitos)): + logger.info('%d\t%s\t%.5f\t%.5f\t%.5f' % + (i + 1, test_name, time / ITERS * 1000, throughput, ratio)) + + +if __name__ == '__main__': + max_depth = 20 + + for cell_type in [ + 'v2', + 'cudnn_lstm', + ]: + total_times = [] + for depth in range(1, max_depth + 1): + print(f'{GetTestName(cell_type)}, depth = {depth}') + t = RunPyTorchTest(BATCH_SIZE, SEQ_LEN, HIDDEN, depth, cell_type) + total_times.append(t) + + report(GetTestName(cell_type), total_times) + + tf.compat.v1.disable_eager_execution() + + total_times = [] + for depth in range(1, max_depth + 1): + print(f'depth = {depth}') + model = WhileOpLstmNet(HIDDEN, HIDDEN, depth) + t = RunTensorFlowGraphTest(model, BATCH_SIZE, SEQ_LEN, HIDDEN, depth) + total_times.append(t) + report('TF_WhileOpLSTM', total_times) + + total_times = [] + for depth in range(1, max_depth + 1): + print(f'depth = {depth}') + model = FineGrainedOpLstmNet(HIDDEN, HIDDEN, depth, 'v2') + t = RunTensorFlowGraphTest(model, BATCH_SIZE, SEQ_LEN, HIDDEN, depth) + total_times.append(t) + report('TF_GraphMode', total_times) + + total_times = [] + for depth in range(1, max_depth + 1): + 
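+        # Rebuild the model at every depth so the constructed TensorFlow
+        # graph always matches the current number of stacked layers.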
model = StaticRNN( + hidden_size=HIDDEN, num_layers=depth, use_cudnn_rnn=False) + t = RunTensorFlowGraphTest(model, BATCH_SIZE, SEQ_LEN, HIDDEN, depth) + total_times.append(t) + report('TF_StaticLSTMCell', total_times) + + tf.compat.v1.enable_eager_execution() + total_times = [] + + # for depth in range(1, max_depth + 1): + # print(f'depth = {depth}') + # # model = FineGrainedOpLstmNet(HIDDEN, HIDDEN, depth, 'v2') + # model = StaticRNN( + # hidden_size=HIDDEN, num_layers=depth, use_cudnn_rnn=False) + # t = RunTensorFlowEagerAutoGraphTest(model, BATCH_SIZE, SEQ_LEN, HIDDEN, + # depth) + # total_times.append(t) + # report('TF_AutoGraph', total_times) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/pt_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/pt_model/__init__.py new file mode 100644 index 000000000..7ec7faad7 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/pt_model/__init__.py @@ -0,0 +1,10 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from .rnn import small_model + +__all__ = [ + "small_model", +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/pt_model/rnn.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/pt_model/rnn.py new file mode 100644 index 000000000..3881cd009 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/pt_model/rnn.py @@ -0,0 +1,226 @@ +from typing import Tuple +from typing import List + +import torch.jit as jit +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +__all__ = [ + 'small_model', +] + +import torch +from torch import nn +from torch import Tensor +from typing import Tuple + + +def init(param): + return nn.init.kaiming_normal_( + param, mode='fan_out', nonlinearity='sigmoid') + + +class FineGrainedOpLSTMCell_v1(nn.Module): + def __init__(self, input_size: int, hidden_size: int, dtype=torch.float16): + super(FineGrainedOpLSTMCell_v1, self).__init__() + # learnable parameters for input gate. + self.Wi = nn.Parameter( + init(torch.Tensor(input_size, hidden_size).half())) + self.Ui = nn.Parameter( + init(torch.Tensor(hidden_size, hidden_size).half())) + self.bi = nn.Parameter(torch.ones(hidden_size, dtype=torch.float16)) + + # learnable parameters for forget gate. + self.Wf = nn.Parameter( + init(torch.Tensor(input_size, hidden_size).half())) + self.Uf = nn.Parameter( + init(torch.Tensor(hidden_size, hidden_size).half())) + self.bf = nn.Parameter(torch.ones(hidden_size, dtype=torch.float16)) + + # learnable parameters for cell candidate. + self.Wg = nn.Parameter( + init(torch.Tensor(input_size, hidden_size).half())) + self.Ug = nn.Parameter( + init(torch.Tensor(hidden_size, hidden_size).half())) + self.bg = nn.Parameter(torch.ones(hidden_size, dtype=torch.float16)) + + # learnable parameters for output gate. 
+ self.Wo = nn.Parameter( + init(torch.Tensor(input_size, hidden_size).half())) + self.Uo = nn.Parameter( + init(torch.Tensor(hidden_size, hidden_size).half())) + self.bo = nn.Parameter(torch.ones(hidden_size, dtype=torch.float16)) + + def forward(self, input: Tensor, + state_prev: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]: + h_prev, c_prev = state_prev + + ig = torch.sigmoid(input @ self.Wi + h_prev @ self.Ui + self.bi) + fg = torch.sigmoid(input @ self.Wf + h_prev @ self.Uf + self.bf) + og = torch.sigmoid(input @ self.Wo + h_prev @ self.Uo + self.bo) + c_candidate = torch.tanh(input @ self.Wg + h_prev @ self.Ug + self.bg) + + c = fg * c_prev + ig * c_candidate + h = og * torch.tanh(c) + return h, c + + +class FineGrainedOpLSTMCell_v2(nn.Module): + def __init__(self, input_size: int, hidden_size: int, dtype=torch.float16): + super(FineGrainedOpLSTMCell_v2, self).__init__() + # learnable parameters for four gates. + self.W = nn.Parameter( + init( + torch.zeros( + (input_size, hidden_size * 4), dtype=torch.float16))) + self.U = nn.Parameter( + init( + torch.zeros( + (hidden_size, hidden_size * 4), dtype=torch.float16))) + self.b = nn.Parameter(torch.ones(hidden_size * 4, dtype=torch.float16)) + + self.hidden_size = hidden_size + + def forward(self, input: Tensor, + state_prev: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]: + h_prev, c_prev = state_prev + + g = input @ self.W + h_prev @ self.U + self.b + ig, fg, og, c_candidate = g.chunk(4, 1) + + ig = torch.sigmoid(ig) + fg = torch.sigmoid(fg) + og = torch.sigmoid(og) + c_candidate = torch.tanh(c_candidate) + + c = fg * c_prev + ig * c_candidate + h = og * torch.tanh(c) + return h, c + + +class CuDNNLSTM(nn.Module): + def __init__(self, hidden_size: int, num_layers: int, dtype): + super(CuDNNLSTM, self).__init__() + + # The model uses the nn.RNN module (and its sister modules nn.GRU + # and nn.LSTM) which will automatically use the cuDNN backend + # if run on CUDA with cuDNN installed. + self.rnn_net = nn.LSTM( + hidden_size, + hidden_size, + num_layers, + dropout=0., + batch_first=False, # layout is: [length, batch, hidden] + bidirectional=False, + dtype=dtype) + + def forward(self, input): + hiddens, _ = self.rnn_net(input) + return hiddens + + +class StackedLSTM(nn.Module): + '''Container module for a stacked RNN language model. + + Args: + cell_type: str, the recurrent net to compute sentence representation. + vocab_size: int, the size of word vocabulary. + embedding_dim: int, the dimension of word embedding. + rnn_hidden_dim: int, the dimension of RNN cell's hidden state. + num_layers: int, the number of stacked RNN network. + Returns: + A Tuple of Tensor. The first element is the final output of the + model before loss computation with a shape of + [batch_size, seq_len, vocab_size]. + The second element is the hidden states of the RNN network with a shape + [batch_size, seq_len, rnn_hidden_dim]. 
+ ''' + + def __init__(self, + batch_size: int, + max_seq_length: int, + cell_type: str, + hidden_size: int, + num_layers: int, + dtype=torch.float16): + super(StackedLSTM, self).__init__() + self.max_seq_length = max_seq_length + + self.register_buffer( + 'init_state', torch.zeros((batch_size, hidden_size), dtype=dtype)) + + if cell_type == 'v1': + self.cells = nn.ModuleList([ + FineGrainedOpLSTMCell_v1( + hidden_size, hidden_size, dtype=dtype) + for i in range(num_layers) + ]) + elif cell_type == 'v2': + self.cells = nn.ModuleList([ + FineGrainedOpLSTMCell_v2( + hidden_size, hidden_size, dtype=dtype) + for i in range(num_layers) + ]) + else: + raise ValueError(f'Unknown cell type {cell_type}.') + + def forward(self, input): + '''Define forward computations of the RNNLM. + + Args: + input: Tensor, its shape is [batch_size, seq_len], dtype is float16. + + Returns: + A Tensor with a shape of [batch_size, seq_len, rnn_hidden_dim], dtype is float16. + ''' + + xs = input + + hiddens = torch.jit.annotate(List[Tensor], []) + cells = torch.jit.annotate(List[Tensor], []) + for rnn in self.cells: + h: Tensor = self.init_state + c: Tensor = self.init_state + + hs = torch.jit.annotate(List[Tensor], []) + cs = torch.jit.annotate(List[Tensor], []) + + inputs = xs.unbind(0) + for i in range(self.max_seq_length): + h, c = rnn(inputs[i], (h, c)) + + hs.append(h.half()) # Ensure the tensor is float16 + cs.append(c.half()) # Ensure the tensor is float16 + + hs = torch.stack(hs) + cs = torch.stack(cs) + xs = hs + + hiddens.append(hs) + cells.append(cs) + return hiddens, cells + + +def small_model(cell_type, + batch_size, + max_seq_length, + hidden_size, + num_layers, + dtype, + states=False): + if cell_type == 'cudnn_lstm': + return CuDNNLSTM( + hidden_size=hidden_size, num_layers=num_layers, dtype=dtype) + elif cell_type == 'script_lstm': + return ScriptLSTM(hidden_size, hidden_size, num_layers, states) + else: + return StackedLSTM( + batch_size=batch_size, + cell_type=cell_type, + max_seq_length=max_seq_length, + hidden_size=hidden_size, + num_layers=num_layers, + dtype=dtype) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_PyTorch.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_PyTorch.py new file mode 100644 index 000000000..752cf0e44 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_PyTorch.py @@ -0,0 +1,253 @@ +from collections import namedtuple +import torch +from time import time +import argparse +import unittest +import logging +import sys +import os + +import pt_model as model +from torch.profiler import profile +from torch.profiler import record_function +from torch.profiler import ProfilerActivity + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=32) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=256) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=8) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + 
type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +class PytorchStackedLSTM(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + dtype = torch.float16 + PROFILER_ENABLE = False + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN = cmd_args.hidden_size + NUM_LAYERS = cmd_args.depth + + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tPyTorch(ms)\n") + + def setUp(self): + torch.manual_seed(1234) + + self.shape = (PytorchStackedLSTM.SEQ_LEN, + PytorchStackedLSTM.BATCH_SIZE, PytorchStackedLSTM.HIDDEN) + + def _report(self, test_name, test_case, elapsed): + seq_len, batch_size, hidden, num_layers = test_case + torch.cuda.synchronize() + # elapsed_time = time() - start + # average_time = elapsed_time / TritonStackedLSTM.ITERS + # seq_per_sec = (TritonStackedLSTM.ITERS * + # TritonStackedLSTM.BATCH_SIZE) / elapsed_time + + print( + f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + f"hidden_size: {hidden}, PyTorch(ms): {elapsed}ms") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {batch_size}, {hidden}]\t" + f"{elapsed}\n") + + def _apply_forward(self, test_name, test_case, x, model): + model.eval() + for i in range(PytorchStackedLSTM.WARM_UP): + output = model(x) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() + + if PytorchStackedLSTM.PROFILER_ENABLE: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + profile_memory=True, + record_shapes=True) as prof: + with record_function('model_inference'): + for i in range(PytorchStackedLSTM.ITERS): + if i >= 5: + break + output = model(x) + + print(prof.key_averages().table( + sort_by='cuda_time_total', row_limit=20)) + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by='cuda_time_total', row_limit=20)) + # prof.export_chrome_trace('trace_' + test_name + '.json') + else: + for i in range(PytorchStackedLSTM.ITERS): + output = model(x) + + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time( + end_event) / PytorchStackedLSTM.ITERS + self._report(test_name, test_case, elapsed) + + def test_fine_grained_op_lstm_forward(self): + if not self.DEFAULT_TEST: + for enable_jit in [ + # False, + True, + ]: + for device in [ + # 'cpu', + 'cuda:0', + ]: + for cell_type in [ + # 'v1', + 'v2', + ]: + x = torch.randn( + *self.shape, device=device, dtype=self.dtype) + + m = model.small_model( + batch_size=PytorchStackedLSTM.BATCH_SIZE, + cell_type=cell_type, + max_seq_length=PytorchStackedLSTM.SEQ_LEN, + hidden_size=PytorchStackedLSTM.HIDDEN, + num_layers=PytorchStackedLSTM.NUM_LAYERS, + dtype=self.dtype).to(device) + + if enable_jit: + m = torch.jit.script(m) + + test_name = f'pt_finegrained_op_{cell_type}_{device}' + ( + '_JIT' if enable_jit else '') + test_case = [ + PytorchStackedLSTM.SEQ_LEN, + PytorchStackedLSTM.BATCH_SIZE, + PytorchStackedLSTM.HIDDEN, + PytorchStackedLSTM.NUM_LAYERS + ] + self._apply_forward(test_name, test_case, x, m) + + def test_default_data(self): + if self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'cuda:0', + ]: + test_name = f'pt_finegrained_op_v2_JIT{device}' + print("default test:", test_name) + + def 
build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + x = torch.randn( + (seq_len, batch_size, hidden), + device=device, + dtype=torch.float16) + m = model.small_model( + batch_size=batch_size, + cell_type='v2', + max_seq_length=seq_len, + hidden_size=hidden, + num_layers=num_layers, + dtype=torch.float16).to(device) + + m = torch.jit.script(m) + return x, m + + test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [64, 256, 256, 32], + [64, 256, 512, 32], + [64, 256, 1024, 32], + # scale with depth + [64, 256, 256, 1], + [64, 256, 256, 2], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 16], + [64, 256, 256, 32], + [64, 256, 1024, 1], + [64, 256, 1024, 2], + [64, 256, 1024, 4], + [64, 256, 1024, 8], + [64, 256, 1024, 16], + [64, 256, 1024, 32], + # scale with length + [32, 256, 256, 32], + [64, 256, 256, 32], + [128, 256, 256, 32], + [32, 256, 1024, 32], + [64, 256, 1024, 32], + [128, 256, 1024, 32], + # figure 2 + [64, 256, 256, 1], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 12], + [64, 256, 256, 16], + [64, 256, 256, 20], + ] + + for test_case in test_cases: + x, m = build_data(test_case) + self._apply_forward(test_name, test_case, x, m) + del x + del m + torch.cuda.empty_cache() + + # def test_cudnn_lstm_forward(self): + # torch.backends.cudnn.enabled = True + # device = 'cuda:0' + + # x = torch.randn(*self.shape, device=device, dtype=self.dtype) + + # m = model.small_model( + # batch_size=PytorchStackedLSTM.BATCH_SIZE, + # cell_type='cudnn_lstm', + # max_seq_length=PytorchStackedLSTM.SEQ_LEN, + # hidden_size=PytorchStackedLSTM.HIDDEN, + # num_layers=PytorchStackedLSTM.NUM_LAYERS, + # dtype=self.dtype).to(device) + + # self._apply_forward('pt_cudnn_lstm', x, m) + + +if __name__ == '__main__': + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py new file mode 100644 index 000000000..54bb47a2b --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py @@ -0,0 +1,201 @@ +import gc +import unittest +import time +import sys +import math +import os +import logging +import datetime +import argparse + +import test_utils as tu +import tensorflow as tf + +from tf_model.rnn2 import StaticRNN +from tf_model.rnn2 import FineGrainedOpLstmNet + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Only print error information. 
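+# Pin the benchmark to a single GPU so the eager-mode numbers are directly
+# comparable with the other stacked-LSTM baselines.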
+os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def force_gpu_sync(): + tf.constant(1).gpu().cpu() + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=32) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=256) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=8) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +class TFEagerStackedLSTM(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN = cmd_args.hidden_size + NUM_LAYERS = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + PROFILER_ENABLE = False + + def setUp(self): + tf.compat.v2.random.set_seed(1234) + + self.stddev = 1.0 / math.sqrt(TFEagerStackedLSTM.HIDDEN) + self.shape = (TFEagerStackedLSTM.BATCH_SIZE, + TFEagerStackedLSTM.SEQ_LEN, TFEagerStackedLSTM.HIDDEN) + + def _report(self, test_name, test_case, start): + ''' + Args: + test_name (String): Name of the test. + start (String): Timestamp of the start time. + ''' + seq_len, batch_size, hidden, num_layers = test_case + elapsed_time = time.time() - start + average_time = elapsed_time / TFEagerStackedLSTM.ITERS + seq_per_sec = (TFEagerStackedLSTM.ITERS * + TFEagerStackedLSTM.BATCH_SIZE) / elapsed_time\ + + print( + f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + f"hidden_size: {hidden}, Tensorflow(ms): {average_time * 1000}ms") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {batch_size}, {hidden}]\t" + f"{average_time * 1000}\n") + + def _apply_forward(self, test_case, dev, test_name, model): + '''Only Test the forward computation. + Args: + dev, String: Device that on which the test is running. cpu or gpu. + test_name, String: Name of the test. + model, Callable: The tested model. It should be a callable object. 
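+            test_case, Tuple: (seq_len, batch_size, hidden_size, num_layers);
+                used to build the random input tensor and to label the report.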
+ ''' + shape = (test_case[0], test_case[1], test_case[2]) + with tf.device(tu.device(dev)): + data = tf.random.uniform( + shape, minval=-self.stddev, maxval=self.stddev) + + for i in range(TFEagerStackedLSTM.WARM_UP): + y = model(data) + gc.collect() + + if TFEagerStackedLSTM.PROFILER_ENABLE: + log_dir = 'logs/' + datetime.datetime.now().strftime( + '%Y%m%d-%H%M%S') + '_' + test_name + tf.profiler.experimental.start(log_dir) + + start = time.time() + for i in range(TFEagerStackedLSTM.ITERS): + y = model(data) + self._report(test_name, test_case, start) + + if TFEagerStackedLSTM.PROFILER_ENABLE: + profiler_result = tf.python.eager.profiler.stop() + tf.profiler.experimental.stop() + + def test_fine_grained_op_lstm_forward(self): + if not self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'gpu', + ]: + for cell_type in [ + # 'v1', + 'v2', + ]: + model = FineGrainedOpLstmNet( + input_size=TFEagerStackedLSTM.HIDDEN, + hidden_size=TFEagerStackedLSTM.HIDDEN, + num_layers=TFEagerStackedLSTM.NUM_LAYERS, + cell_type=cell_type) + test_case = (self.SEQ_LEN, self.BATCH_SIZE, self.HIDDEN, + self.NUM_LAYERS) + self._apply_forward( + test_case, device, + f'tf_eager_fine_grained_op_lstm_{cell_type}_{device}', + model) + + def test_staticlstm_forward(self): + if not self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'gpu', + ]: + model = StaticRNN( + hidden_size=TFEagerStackedLSTM.HIDDEN, + num_layers=TFEagerStackedLSTM.NUM_LAYERS, + use_cudnn_rnn=False) + test_case = (self.SEQ_LEN, self.BATCH_SIZE, self.HIDDEN, + self.NUM_LAYERS) + self._apply_forward(test_case, device, + f'tf_eager_static_lstm_cell_{device}', + model) + + def test_default_data(self): + if self.DEFAULT_TEST: + + def build_model(test_case): + seq_len, batch_size, hidden, num_layers = test_case + GraphModeModel = StaticRNN( + hidden_size=hidden, + num_layers=num_layers, + use_cudnn_rnn=False) + return GraphModeModel + + test_cases = [ + [64, 256, 256, 1], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 12], + [64, 256, 256, 16], + [64, 256, 256, 20], + ] + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tTensorflow-autograph(ms)\n" + ) + print('default-tf_autograph_gpu') + for test_case in test_cases: + model = build_model(test_case) + self._apply_forward(test_case, 'gpu', f'tf_autograph_gpu', + model) + + +if __name__ == '__main__': + tf.compat.v1.enable_eager_execution(tu.get_config()) + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py new file mode 100644 index 000000000..54fd0d5f2 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py @@ -0,0 +1,230 @@ +import time +import sys +import math +import unittest +import os +import logging +import datetime +import argparse + +import test_utils as tu +import tensorflow as tf +from tf_model.rnn import StaticRNN +from tf_model.rnn import FineGrainedOpLstmNet +from tf_model.rnn import WhileOpLstmNet + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Only print error information. 
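+# This baseline builds the stacked LSTM once as a static graph and runs it
+# repeatedly inside a tf.compat.v1.Session (see _apply_forward below), in
+# contrast to stacked_lstm_tensorflow_eager.py, which executes op-by-op.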
+os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=32) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=256) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=8) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +class TFGraphStackedLSTM(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN = cmd_args.hidden_size + NUM_LAYERS = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + PROFILER_ENABLE = False + + def setUp(self): + tf.compat.v2.random.set_seed(1234) + + self.stddev = 1.0 / math.sqrt(TFGraphStackedLSTM.HIDDEN) + # the layout of input tensor if batch-major: [length, batch, hidden] + self.shape = (TFGraphStackedLSTM.SEQ_LEN, + TFGraphStackedLSTM.BATCH_SIZE, TFGraphStackedLSTM.HIDDEN) + + def _report(self, test_name, test_case, start): + ''' + Args: + test_name (String): Name of the test. + start (String): Timestamp of the start time. + ''' + seq_len, batch_size, hidden, num_layers = test_case + elapsed_time = time.time() - start + average_time = elapsed_time / TFGraphStackedLSTM.ITERS + seq_per_sec = (TFGraphStackedLSTM.ITERS * + TFGraphStackedLSTM.BATCH_SIZE) / elapsed_time\ + + print( + f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + f"hidden_size: {hidden}, Tensorflow(ms): {average_time * 1000}ms") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {batch_size}, {hidden}]\t" + f"{average_time * 1000}\n") + + def _apply_forward(self, test_case, dev, test_name, model): + '''Only Test the forward computation. + Args: + dev, String: Device that on which the test is running. cpu or gpu. + test_name, String: Name of the test. + model, Callable: The tested model. It should be a callable object. 
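+            test_case, Tuple/List: (seq_len, batch_size, hidden_size, num_layers).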
+ ''' + shape = (test_case[0], test_case[1], test_case[2]) + with tf.device(tu.device(dev)): + data = tf.random.uniform( + shape, minval=-self.stddev, maxval=self.stddev) + output = model(data) + + with tf.compat.v1.Session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + + for _ in range(TFGraphStackedLSTM.WARM_UP): + sess.run(output) + + if TFGraphStackedLSTM.PROFILER_ENABLE: + log_dir = 'logs/' + datetime.datetime.now().strftime( + '%Y%m%d-%H%M%S') + '_' + test_name + tf.profiler.experimental.start(log_dir) + + start = time.time() + for _ in range(TFGraphStackedLSTM.ITERS): + sess.run(output) + + if TFGraphStackedLSTM.PROFILER_ENABLE: + tf.profiler.experimental.stop() + self._report(test_name, test_case, start) + + def test_whileOpLstm_forward(self): + if not self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'gpu', + ]: + model = WhileOpLstmNet(TFGraphStackedLSTM.HIDDEN, + TFGraphStackedLSTM.HIDDEN, + TFGraphStackedLSTM.NUM_LAYERS) + self._apply_forward(self.shape, device, + f'tf_graph_whileOpLstm_{device}', model) + + def test_fine_grained_op_lstm_forward(self): + if not self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'gpu', + ]: + for cell_type in [ + # 'v1', + 'v2', + ]: + model = FineGrainedOpLstmNet( + input_size=TFGraphStackedLSTM.HIDDEN, + hidden_size=TFGraphStackedLSTM.HIDDEN, + num_layers=TFGraphStackedLSTM.NUM_LAYERS, + cell_type=cell_type) + test_case = (self.SEQ_LEN, self.BATCH_SIZE, self.HIDDEN, + self.NUM_LAYERS) + self._apply_forward( + test_case, device, + f'tf_graph_fine_grained_op_lstm_{cell_type}_{device}', + model) + + def test_staticlstm_forward(self): + if not self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'gpu', + ]: + model = StaticRNN( + hidden_size=TFGraphStackedLSTM.HIDDEN, + num_layers=TFGraphStackedLSTM.NUM_LAYERS, + use_cudnn_rnn=False) + test_case = (self.SEQ_LEN, self.BATCH_SIZE, self.HIDDEN, + self.NUM_LAYERS) + self._apply_forward(test_case, device, + f'tf_graph_static_lstm_cell_{device}', + model) + + # def test_cudnnlstm_forward(self): + # model = StaticRNN( + # hidden_size=TFGraphStackedLSTM.HIDDEN, + # num_layers=TFGraphStackedLSTM.NUM_LAYERS, + # use_cudnn_rnn=True) + # self._apply_forward('gpu', 'tf_graph_cudnnlstm', model) + + def test_default_data(self): + if self.DEFAULT_TEST: + + def build_model(test_case): + seq_len, batch_size, hidden, num_layers = test_case + whileOpModel = WhileOpLstmNet(hidden, hidden, num_layers) + return whileOpModel + + def build_model2(test_case): + seq_len, batch_size, hidden, num_layers = test_case + GraphModeModel = StaticRNN( + hidden, num_layers, use_cudnn_rnn=False) + return GraphModeModel + + test_cases = [ + [64, 256, 256, 1], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 12], + [64, 256, 256, 16], + [64, 256, 256, 20], + ] + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tTensorflow-whileop(ms)\n" + ) + print('default-tf_graph_whileOpLstm_gpu') + for test_case in test_cases: + model = build_model(test_case) + self._apply_forward(test_case, 'gpu', + f'tf_graph_whileOpLstm_gpu', model) + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tTensorflow-graphmode(ms)\n" + ) + print('default-tf_graphmode_gpu') + for test_case in test_cases: + model = build_model2(test_case) + self._apply_forward(test_case, 'gpu', f'tf_graphmode_gpu', + model) + + +if __name__ == '__main__': + tf.compat.v1.disable_eager_execution() + 
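+    # Eager execution must be disabled before any graphs, ops, or tensors are
+    # created, i.e. before unittest.main() instantiates the test models.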
unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_triton.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_triton.py new file mode 100644 index 000000000..eef056f8f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_triton.py @@ -0,0 +1,238 @@ +from collections import namedtuple +import torch +from time import time +import argparse +import unittest +import logging +import sys +import os + +import triton_model as model +from torch.profiler import profile +from torch.profiler import record_function +from torch.profiler import ProfilerActivity + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def parse_test_args(): + parser = argparse.ArgumentParser(description='Girdlstm') + parser.add_argument( + '--seq_len', type=int, help='Sequence length', default=32) + parser.add_argument( + '--batch_size', type=int, help='Batch size', default=256) + parser.add_argument( + '--hidden_size', type=int, help='Hidden size', default=256) + parser.add_argument('--depth', type=int, help='Depth size', default=8) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) + return parser.parse_args() + + +class TritonStackedLSTM(unittest.TestCase): + WARM_UP = 5 + ITERS = 10 + PROFILER_ENABLE = False + dtype = torch.float16 + + cmd_args = parse_test_args() + SEQ_LEN = cmd_args.seq_len + BATCH_SIZE = cmd_args.batch_size + HIDDEN = cmd_args.hidden_size + NUM_LAYERS = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + if OUTPUT_FILE: + with open(OUTPUT_FILE, 'w') as fout: + fout.write( + "depth\t[seq_length, batch_size, hidden_size]\tTriton(ms)\n") + + def setUp(self): + torch.manual_seed(1234) + + # def _report(self, test_name, test_case, start): + # seq_len, batch_size, hidden, num_layers = test_case + # torch.cuda.synchronize() + # elapsed_time = time() - start + # average_time = elapsed_time / TritonStackedLSTM.ITERS + # seq_per_sec = (TritonStackedLSTM.ITERS * + # TritonStackedLSTM.BATCH_SIZE) / elapsed_time + + # print( + # f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + # f"hidden_size: {hidden}, Triton(ms): {average_time * 1000}ms") + + # if self.OUTPUT_FILE: + # with open(self.OUTPUT_FILE, 'a') as fout: + # fout.write( + # f"{num_layers}\t[{seq_len}, {batch_size}, {hidden}]\t" + # f"{average_time * 1000}\n") + + def _report(self, test_name, test_case, elapsed): + seq_len, batch_size, hidden, num_layers = test_case + torch.cuda.synchronize() + # elapsed_time = time() - start + # average_time = elapsed_time / TritonStackedLSTM.ITERS + # seq_per_sec = (TritonStackedLSTM.ITERS * + # TritonStackedLSTM.BATCH_SIZE) / elapsed_time + + print( + f"depth: {num_layers}, seq_length: {seq_len}, batch_size: {batch_size}, " + f"hidden_size: {hidden}, Triton(ms): {elapsed}ms") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {batch_size}, {hidden}]\t" + f"{elapsed}\n") + + def _apply_forward(self, test_name, test_case, x, model): + model.eval() + for i in range(TritonStackedLSTM.WARM_UP): + 
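+            # Warm-up iterations are excluded from timing; the first calls
+            # trigger Triton autotuning and kernel compilation for the cell.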
output = model(x) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() + + if TritonStackedLSTM.PROFILER_ENABLE: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + profile_memory=True, + record_shapes=True) as prof: + with record_function('model_inference'): + for i in range(TritonStackedLSTM.ITERS): + if i >= 5: + break + output = model(x) + + print(prof.key_averages().table( + sort_by='cuda_time_total', row_limit=20)) + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by='cuda_time_total', row_limit=20)) + else: + for i in range(TritonStackedLSTM.ITERS): + output = model(x) + + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time( + end_event) / TritonStackedLSTM.ITERS + self._report(test_name, test_case, elapsed) + + def test_fine_grained_op_lstm_forward(self): + if not self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'cuda:0', + ]: + x = torch.randn( + (TritonStackedLSTM.SEQ_LEN, TritonStackedLSTM.BATCH_SIZE, + TritonStackedLSTM.HIDDEN), + device=device, + dtype=TritonStackedLSTM.dtype) + + m = model.StackedLSTM( + batch_size=TritonStackedLSTM.BATCH_SIZE, + max_seq_length=TritonStackedLSTM.SEQ_LEN, + hidden_size=TritonStackedLSTM.HIDDEN, + num_layers=TritonStackedLSTM.NUM_LAYERS, + device=device, + dtype=TritonStackedLSTM.dtype) + + test_name = f'triton_finegrained_op_{device}' + test_case = [ + TritonStackedLSTM.SEQ_LEN, TritonStackedLSTM.BATCH_SIZE, + TritonStackedLSTM.HIDDEN, TritonStackedLSTM.NUM_LAYERS + ] + print("Triton for Stacked LSTM:", test_name) + self._apply_forward(test_name, test_case, x, m) + + def test_default_data(self): + if self.DEFAULT_TEST: + for device in [ + # 'cpu', + 'cuda:0', + ]: + test_name = f'triton_finegrained_op_{device}' + print("default test:", test_name) + + def build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + x = torch.randn( + (seq_len, batch_size, hidden), + device=device, + dtype=torch.float16) + m = model.StackedLSTM( + batch_size=batch_size, + max_seq_length=seq_len, + hidden_size=hidden, + num_layers=num_layers, + device=device, + dtype=torch.float16) + return x, m + + test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [64, 256, 256, 32], + [64, 256, 512, 32], + [64, 256, 1024, 32], + # scale with depth + [64, 256, 256, 1], + [64, 256, 256, 2], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 16], + [64, 256, 256, 32], + [64, 256, 1024, 1], + [64, 256, 1024, 2], + [64, 256, 1024, 4], + [64, 256, 1024, 8], + [64, 256, 1024, 16], + [64, 256, 1024, 32], + # scale with length + [32, 256, 256, 32], + [64, 256, 256, 32], + [128, 256, 256, 32], + [32, 256, 1024, 32], + [64, 256, 1024, 32], + [128, 256, 1024, 32], + # figure 2 + [64, 256, 256, 1], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 12], + [64, 256, 256, 16], + [64, 256, 256, 20], + ] + + for test_case in test_cases: + x, m = build_data(test_case) + self._apply_forward(test_name, test_case, x, m) + del x + del m + torch.cuda.empty_cache() + + +if __name__ == '__main__': + unittest.main(argv=['first-arg-is-ignored']) diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/test_utils.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/test_utils.py new file mode 100644 index 000000000..90759911f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/test_utils.py @@ -0,0 +1,34 @@ 
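+# Shared helpers for the TensorFlow baselines: get_config() returns a session
+# config with GPU memory growth enabled and a small per-process memory
+# fraction, and device() maps 'cpu'/'gpu' to TensorFlow device strings.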
+import tensorflow as tf + + +def get_config(): + config = tf.compat.v1.ConfigProto( + gpu_options=tf.compat.v1.GPUOptions( + allow_growth=True, per_process_gpu_memory_fraction=0.2)) + + config.log_device_placement = False + config.allow_soft_placement = True + + config.intra_op_parallelism_threads = 0 + config.inter_op_parallelism_threads = 56 + + return config + + +def device(dtype='cpu'): + '''Return the TF device string. + + Args: + dtype: String, 'cpu' or 'gpu'. + + Raises: + ValueError: if dtype is an unknown device. + ''' + + if dtype == 'cpu': + return '/device:CPU:0' + elif dtype == 'gpu': + assert len(tf.config.list_physical_devices('GPU')) + return '/device:GPU:0' + else: + raise ValueError('Unknown device type. Should be cpu or gpu.') diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/tf_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/tf_model/__init__.py new file mode 100644 index 000000000..347a90c28 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/tf_model/__init__.py @@ -0,0 +1,7 @@ +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from . import rnn +from . import rnn2 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/tf_model/rnn.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/tf_model/rnn.py new file mode 100644 index 000000000..16609165d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/tf_model/rnn.py @@ -0,0 +1,355 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import List +import math +import tensorflow as tf + +layers = tf.keras.layers + + +class FineGrainedOpLstmCellV1(layers.Layer): + def __init__(self, input_size, hidden_size): + super(FineGrainedOpLstmCellV1, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + + def build(self, input_shape): + stddev = 1.0 / math.sqrt(self.hidden_size) + self.igx = tf.Variable( + tf.random.uniform( + [self.input_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.igu = tf.Variable( + tf.random.uniform( + [self.hidden_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.ib = tf.Variable( + tf.random.uniform( + [self.hidden_size], minval=-stddev, maxval=stddev)) + + self.fgx = tf.Variable( + tf.random.uniform( + [self.input_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.fgu = tf.Variable( + tf.random.uniform( + [self.hidden_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.fb = tf.Variable( + tf.random.uniform( + [self.hidden_size], minval=-stddev, maxval=stddev)) + + self.ogx = tf.Variable( + tf.random.uniform( + [self.input_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.ogu = tf.Variable( + tf.random.uniform( + [self.hidden_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.ob = tf.Variable( + tf.random.uniform( + [self.hidden_size], minval=-stddev, maxval=stddev)) + + self.cgx = tf.Variable( + tf.random.uniform( + [self.input_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.cgu = tf.Variable( + tf.random.uniform( + [self.hidden_size, self.hidden_size], + minval=-stddev, + maxval=stddev)) + self.cb = tf.Variable( + tf.random.uniform( + [self.hidden_size], minval=-stddev, maxval=stddev)) + + # uncomment the following line to enable auto-graph. 
+ # @tf.function + def call(self, x, h_prev, c_prev): + ig = tf.sigmoid(x @ self.igx + h_prev @ self.igu + self.ib) + fg = tf.sigmoid(x @ self.fgx + h_prev @ self.fgu + self.fb) + og = tf.sigmoid(x @ self.ogx + h_prev @ self.ogu + self.ob) + c_candidate = tf.tanh(x @ self.cgx + h_prev @ self.cgu + self.cb) + + c = fg * c_prev + ig * c_candidate + h = og * tf.tanh(c) + return h, c + + +class FineGrainedOpLstmCellV2(layers.Layer): + def __init__(self, input_size, hidden_size): + super(FineGrainedOpLstmCellV2, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + + def build(self, input_shape): + stddev = 1.0 / math.sqrt(self.hidden_size) + self.w = tf.Variable( + tf.random.uniform( + [self.input_size, self.hidden_size * 4], + minval=-stddev, + maxval=stddev)) + self.u = tf.Variable( + tf.random.uniform( + [self.hidden_size, self.hidden_size * 4], + minval=-stddev, + maxval=stddev)) + self.b = tf.Variable( + tf.random.uniform( + [self.hidden_size * 4], minval=-stddev, maxval=stddev)) + + # uncomment the following line to enable auto-graph. + # @tf.function + def call(self, x, h_prev, c_prev): + g = x @ self.w + h_prev @ self.u + self.b + g_act = tf.sigmoid(g[:, :self.hidden_size * 3]) + c_candidate = tf.tanh(g[:, self.hidden_size * 3:]) + + ig, fg, og = ( + g_act[:, :self.hidden_size], # input + g_act[:, self.hidden_size:self.hidden_size * 2], # forget + g_act[:, self.hidden_size * 2:], # output + ) + + c = fg * c_prev + ig * c_candidate + h = og * tf.tanh(c) + return h, c + + +class FineGrainedOpLstmNet(tf.keras.Model): + def __init__(self, input_size, hidden_size, num_layers, cell_type): + super(FineGrainedOpLstmNet, self).__init__() + self.hidden_size = hidden_size + + if cell_type == 'v1': + self.cells = [ + FineGrainedOpLstmCellV1(input_size if i == 0 else hidden_size, + hidden_size) for i in range(num_layers) + ] + elif cell_type == 'v2': + self.cells = [ + FineGrainedOpLstmCellV2(input_size if i == 0 else hidden_size, + hidden_size) for i in range(num_layers) + ] + else: + raise ValueError('Unknow cell type.') + + # uncomment the following line to enable auto-graph. + # @tf.function + def call(self, input_seq): + batch_size = int(input_seq.shape[1]) + + for rnncell in self.cells: # iterate over depth + outputs = [] + input_seq = tf.unstack( + input_seq, num=int(input_seq.shape[0]), axis=0) + h = tf.zeros((batch_size, self.hidden_size)) + c = tf.zeros((batch_size, self.hidden_size)) + for inp in input_seq: # iterate over time step + h, c = rnncell(inp, h, c) + outputs.append(h) + + input_seq = tf.stack(outputs, axis=0) + + return [input_seq] + + +class WhileOpLstmLayer(tf.keras.Model): + """Lstm implemented in fine-grained operators via symbolic while-ops. + Only works in graph-mode. 
+ """ + + def __init__(self, input_size: int, hidden_size: int): + """ + Args: + input_size: int, + hidden_size: int, + Return: + A Tensor with a shape [batch_size, sequence_length, hidden_dim] + """ + super(WhileOpLstmLayer, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + stddev = 1.0 / math.sqrt(self.hidden_size) + + self.w = tf.Variable( + tf.random.uniform( + [self.input_size, self.hidden_size * 4], + minval=-stddev, + maxval=stddev)) + + self.u = tf.Variable( + tf.random.uniform( + [self.hidden_size, self.hidden_size * 4], + minval=-stddev, + maxval=stddev)) + self.bias = tf.Variable( + tf.random.uniform( + [self.hidden_size * 4], minval=-stddev, maxval=stddev)) + + def _while_op_lstm(self, input): + shape = tf.shape(input) + seq_len = shape[0] + batch_size = shape[1] + + def body(t, step): + """The Lstm cell. + For some TF implementation constrains, we cannot reuse LstmCell + defined in utils.py, but implement in the body function. + """ + h_prev, c_prev, output_array = step + + x_t = input[t, :] + g = x_t @ self.w + h_prev @ self.u + self.bias + g_act = tf.sigmoid(g[:, :self.hidden_size * 3]) + c_candidate = tf.tanh(g[:, self.hidden_size * 3:]) + + ig, fg, og = ( + g_act[:, :self.hidden_size], # input + g_act[:, self.hidden_size:self.hidden_size * 2], # forget + g_act[:, self.hidden_size * 2:], # output + ) + + c = fg * c_prev + ig * c_candidate + h = og * tf.tanh(c) + + return t + 1, (h, c, output_array.write(t, h)) + + init_h = tf.zeros([batch_size, self.hidden_size]) + init_c = tf.zeros([batch_size, self.hidden_size]) + + init_t = tf.constant(0) + output_array = tf.TensorArray( + dtype=tf.float32, size=seq_len, dynamic_size=False) + cond = lambda i, _: tf.less(i, seq_len) + _, step = tf.while_loop( + cond=cond, + body=body, + loop_vars=(init_t, (init_h, init_c, output_array))) + _, _, output_array = step + + return output_array.stack() + + def __call__(self, input_seq): + """Stacked Lstm network implemented by TF's symbolic while loop operator. + Args: + input_seq, Tensor, input sequence batch. The layout must be + batch_size major: [seq_len, batch_size, input_dim]. + """ + return self._while_op_lstm(input_seq) + + +class WhileOpLstmNet(tf.keras.Model): + def __init__(self, input_size, hidden_size, num_layers): + super(WhileOpLstmNet, self).__init__() + self.hidden_size = hidden_size + + self.rnns = [ + WhileOpLstmLayer(input_size + if i == 0 else hidden_size, hidden_size) + for i in range(num_layers) + ] + + def call(self, input_seq): + outputs = [] + xs = input_seq + for rnn in self.rnns: # iterate over depth + xs = rnn(xs) + outputs.append(xs) + return outputs + + +class StaticRNN(tf.keras.Model): + """A static RNN. + """ + + def __init__(self, hidden_size, num_layers, use_cudnn_rnn=True): + """ + hidden_size: Int, hidden dimension of the RNN unit. + num_layers: Int, the number of stacked RNN unit, namely depth of the RNN + network. + """ + super(StaticRNN, self).__init__() + + if use_cudnn_rnn: + self.cells = [ + tf.compat.v1.keras.layers.CuDNNLSTM( + hidden_size, return_state=True, return_sequences=True) + for _ in range(num_layers) + ] + else: + # About layers.LstmCell's `implementation` argument, either 1 or 2. + # Mode 1 will structure its operations as a larger number of smaller + # dot products and additions, whereas mode 2 will batch them into + # fewer, larger operations. These modes will have different + # performance profiles on different hardware and for different + # applications. 
+ self.cells = [ + layers.LSTMCell( + units=hidden_size, + activation='tanh', + recurrent_activation='sigmoid', + use_bias=True, + kernel_initializer='glorot_uniform', + recurrent_initializer='orthogonal', + bias_initializer='zeros', + unit_forget_bias=True, + dropout=0.0, + recurrent_dropout=0.0, + implementation=2) for _ in range(num_layers) + ] + + self.hidden_size = hidden_size + self.use_cudnn_rnn = use_cudnn_rnn + + def _cudnn_lstm_call(self, input_seq): + # A workaround to stack CuDNNLstm in TF 2.0. + # https://stackoverflow.com/questions/55324307/how-to-implement-a-stacked-rnns-in-tensorflow + x = input_seq + for rnn in self.cells: + x = rnn(x) + return x + + # uncomment the following line to enable auto-graph. + # @tf.function + def call(self, input_seq): + """Define computations in a single time step. + + input_seq: Tensor, the layout is + [batch_size, max_sequence_length, embedding_dim]. + """ + if self.use_cudnn_rnn: + return self._cudnn_lstm_call(input_seq) + + batch_size = int(input_seq.shape[1]) + + hiddens = [] + for cell in self.cells: # iterate over depth + state = (tf.zeros((batch_size, self.hidden_size)), + tf.zeros((batch_size, self.hidden_size))) + + # unpack the input 3D tensors along the `max_sequence_length` axis + # to get input tensors for each time step. + input_seq = tf.unstack( + input_seq, num=int(input_seq.shape[0]), axis=0) + outputs = [] + for inp in input_seq: # iterate over time step + output, state = cell(inp, state) + outputs.append(output) + + input_seq = tf.stack(outputs, axis=0) + hiddens.append(input_seq) + + return hiddens diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/__init__.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/__init__.py new file mode 100644 index 000000000..4166af503 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/__init__.py @@ -0,0 +1,9 @@ +from .rnn import StackedLSTM +import os +import sys +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +__all__ = [ + "StackedLSTM", +] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/op.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/op.py new file mode 100644 index 000000000..9f86e399b --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/op.py @@ -0,0 +1,467 @@ +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +import triton +import triton.language as tl + +from time import time + +import os +__all__ = ['LSTMscan'] + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 
'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 16 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 16 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 128 + }, + num_stages=3, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 64, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 16, + 'BLOCK_SIZE_K': 128 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 128, + 'BLOCK_SIZE_K': 128 + }, + num_stages=2, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_B': 32, + 'BLOCK_SIZE_H': 256, + 'BLOCK_SIZE_K': 128 + }, + num_stages=2, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_B': 16, + 'BLOCK_SIZE_H': 32, + 'BLOCK_SIZE_K': 128 + }, + num_stages=2, + num_warps=2), + ], + key=['hidden_size', 'batch_size'], +) +@triton.jit +def LSTMscan_kernel( + Wi_ptr, + Ui_ptr, + bi_ptr, + Wf_ptr, + Uf_ptr, + bf_ptr, + Wo_ptr, + Uo_ptr, + bo_ptr, + Wg_ptr, + Ug_ptr, + bg_ptr, + h_prev_ptr, + c_prev_ptr, + input_ptr, + h_ptr, + c_ptr, + input_size, + hidden_size, + batch_size, + stride_hm, + stride_hk, + stride_wk, + stride_wn, + BLOCK_SIZE_B: tl.constexpr, + BLOCK_SIZE_H: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_h = tl.program_id(1) + + Wi_block_ptr = tl.make_block_ptr( + base=Wi_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Wf_block_ptr = tl.make_block_ptr( + base=Wf_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Wo_block_ptr = tl.make_block_ptr( + base=Wo_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Wg_block_ptr = tl.make_block_ptr( + base=Wg_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + 
order=(1, 0), + ) + Ui_block_ptr = tl.make_block_ptr( + base=Ui_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Uf_block_ptr = tl.make_block_ptr( + base=Uf_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Uo_block_ptr = tl.make_block_ptr( + base=Uo_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + Ug_block_ptr = tl.make_block_ptr( + base=Ug_ptr, + shape=(hidden_size, hidden_size), + strides=(stride_wk, stride_wn), + offsets=(0, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_H), + order=(1, 0), + ) + h_prev_block_ptr = tl.make_block_ptr( + base=h_prev_ptr, + shape=(batch_size, hidden_size), + strides=(stride_hm, stride_hk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + input_block_ptr = tl.make_block_ptr( + base=input_ptr, + shape=(batch_size, hidden_size), + strides=(stride_hm, stride_hk), + offsets=(pid_m * BLOCK_SIZE_B, 0), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_K), + order=(1, 0), + ) + c_prev_block_ptr = tl.make_block_ptr( + base=c_prev_ptr, + shape=(batch_size, hidden_size), + strides=(stride_hm, stride_hk), + offsets=(pid_m * BLOCK_SIZE_B, pid_h * BLOCK_SIZE_H), + block_shape=(BLOCK_SIZE_B, BLOCK_SIZE_H), + order=(1, 0), + ) + offset_batch = ( + pid_m * BLOCK_SIZE_B + tl.arange(0, BLOCK_SIZE_B)) % batch_size + offset_hidden = ( + pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)) % hidden_size + bi_ptrs = bi_ptr + offset_hidden[None, :] + bf_ptrs = bf_ptr + offset_hidden[None, :] + bo_ptrs = bo_ptr + offset_hidden[None, :] + bg_ptrs = bg_ptr + offset_hidden[None, :] + bi, bf, bo, bg = tl.load(bi_ptrs), tl.load(bf_ptrs), tl.load( + bo_ptrs), tl.load(bg_ptrs) + bi_ = tl.broadcast_to(bi, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + bf_ = tl.broadcast_to(bf, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + bo_ = tl.broadcast_to(bo, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + bg_ = tl.broadcast_to(bg, (BLOCK_SIZE_B, BLOCK_SIZE_H)) + + ig_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + fg_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + og_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + c_candidate_ = tl.zeros([BLOCK_SIZE_B, BLOCK_SIZE_H], dtype=tl.float32) + for k in range(hidden_size // BLOCK_SIZE_K): + input = tl.load(input_block_ptr) + h_prew = tl.load(h_prev_block_ptr) + Wi, Wf, Wo, Wg = tl.load(Wi_block_ptr), tl.load(Wf_block_ptr), tl.load( + Wo_block_ptr), tl.load(Wg_block_ptr) + Ui, Uf, Uo, Ug = tl.load(Ui_block_ptr), tl.load(Ui_block_ptr), tl.load( + Uo_block_ptr), tl.load(Ug_block_ptr) + + ig_ += tl.dot(input, Wi) + tl.dot(h_prew, Ui) + fg_ += tl.dot(input, Wf) + tl.dot(h_prew, Uf) + og_ += tl.dot(input, Wo) + tl.dot(h_prew, Uo) + c_candidate_ += tl.dot(input, Wg) + tl.dot(h_prew, Ug) + + Wi_block_ptr = tl.advance(Wi_block_ptr, (BLOCK_SIZE_K, 0)) + Wf_block_ptr = tl.advance(Wf_block_ptr, (BLOCK_SIZE_K, 0)) + Wo_block_ptr = tl.advance(Wo_block_ptr, (BLOCK_SIZE_K, 0)) + Wg_block_ptr = tl.advance(Wg_block_ptr, (BLOCK_SIZE_K, 0)) + + Ui_block_ptr = tl.advance(Ui_block_ptr, (BLOCK_SIZE_K, 0)) + Uf_block_ptr = tl.advance(Uf_block_ptr, (BLOCK_SIZE_K, 0)) + Uo_block_ptr = tl.advance(Uo_block_ptr, (BLOCK_SIZE_K, 0)) + Ug_block_ptr = 
tl.advance(Ug_block_ptr, (BLOCK_SIZE_K, 0)) + + input_block_ptr = tl.advance(input_block_ptr, (0, BLOCK_SIZE_K)) + h_prev_block_ptr = tl.advance(h_prev_block_ptr, (0, BLOCK_SIZE_K)) + + ig = ig_ + bi_ + fg = fg_ + bf_ + og = og_ + bo_ + c_candidate = c_candidate_ + bg_ + + ig = _sigmoid(ig) + fg = _sigmoid(fg) + og = _sigmoid(og) + c_candidate = _tanh(c_candidate) + + c_prev = tl.load(c_prev_block_ptr) + c = fg * c_prev + ig * c_candidate + + c_ptrs = c_ptr + offset_batch[:, None] * \ + stride_hm + offset_hidden[None, :] * stride_hk + tl.store(c_ptrs, c) + + c = _tanh(c) + h = og * c + h_ptrs = h_ptr + offset_batch[:, None] * \ + stride_hm + offset_hidden[None, :] * stride_hk + tl.store(h_ptrs, h) + + +@triton.jit +def _dot(a, b): + return tl.sum(a[:, :, None] * b[None, :, :], axis=1) + + +@triton.jit +def _sigmoid(x): + # \sigma(x) = \frac{1}{1 + 2^{-x \cdot \log_2(e)}} + log2_e = 1.4426950408889634 # log2(e) + neg_log2_e_x = -x * log2_e + exp_neg_log2_e_x = tl.math.exp2(neg_log2_e_x) + return 1 / (1 + exp_neg_log2_e_x) + + +@triton.jit +def _tanh(x): + return 2 * _sigmoid(2 * x) - 1 + + +def LSTMscan(input_, + weight_, + blas_, + state_, + resident_, + size_, + device_='cuda', + dtype_=torch.float16): + Wi, Wf, Wo, Wg, Ui, Uf, Uo, Ug = weight_ + bi, bf, bo, bg = blas_ + h_prew, c_prew = state_ + input_size, hidden_size, batch_size = size_ + h_resident, c_resident = resident_ + + def grid(META): + return ( + triton.cdiv(batch_size, META['BLOCK_SIZE_B']), + triton.cdiv(hidden_size, META['BLOCK_SIZE_H']), + ) + + LSTMscan_kernel[grid]( + Wi_ptr=Wi, + Ui_ptr=Ui, + bi_ptr=bi, + Wf_ptr=Wf, + Uf_ptr=Uf, + bf_ptr=bf, + Wo_ptr=Wo, + Uo_ptr=Uo, + bo_ptr=bo, + Wg_ptr=Wg, + Ug_ptr=Ug, + bg_ptr=bg, + h_prev_ptr=h_prew, + c_prev_ptr=c_prew, + input_ptr=input_, + h_ptr=h_resident, + c_ptr=c_resident, + input_size=input_size, + hidden_size=hidden_size, + batch_size=batch_size, + stride_hm=h_resident.stride(0), + stride_hk=h_resident.stride(1), + stride_wk=Wi.stride(0), + stride_wn=Wi.stride(1)) + return h_resident, c_resident diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/rnn.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/rnn.py new file mode 100644 index 000000000..ce85e7db9 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/triton_model/rnn.py @@ -0,0 +1,140 @@ +from typing import Tuple +from typing import List + +import torch.jit as jit +import torch +import torch.nn as nn +from torch.nn import Parameter +from torch.nn.init import xavier_normal_ as init +from torch import Tensor + +from time import time + +from .op import * + + +class LSTMCell(nn.Module): + def __init__(self, + input_size: int, + hidden_size: int, + batch_size: int, + device: str, + dtype=torch.float16): + super(LSTMCell, self).__init__() + self.device = device + self.dtype = dtype + self.size = (input_size, hidden_size, batch_size) + self.Wi = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + self.Wf = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + self.Wo = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + self.Wg = init( + nn.Parameter( + torch.empty( + [input_size, hidden_size], device=device, dtype=dtype))) + + self.Ui = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + self.Uf = init( + nn.Parameter( + torch.empty( + [hidden_size, 
hidden_size], device=device, dtype=dtype))) + self.Uo = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + self.Ug = init( + nn.Parameter( + torch.empty( + [hidden_size, hidden_size], device=device, dtype=dtype))) + + self.bi = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + self.bf = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + self.bo = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + self.bg = nn.Parameter( + torch.ones([hidden_size], device=device, dtype=dtype)) + + def forward( + self, + input: Tensor, + state_prev: Tuple[Tensor, Tensor], + state_now: Tuple[Tensor, Tensor], + ) -> Tuple[Tensor, Tensor]: + + h, c = LSTMscan(input, (self.Wi, self.Wf, self.Wo, self.Wg, self.Ui, + self.Uf, self.Uo, self.Ug), + (self.bi, self.bf, self.bo, self.bg), state_prev, + state_now, self.size, self.device, self.dtype) + + return h, c + + +class StackedLSTM(nn.Module): + def __init__(self, + batch_size: int, + max_seq_length: int, + hidden_size: int, + num_layers: int, + device: str, + dtype=torch.float16): + super(StackedLSTM, self).__init__() + self.max_seq_length = max_seq_length + self.device = device + self.dtype = dtype + self.size = (batch_size, hidden_size) + self.cells = torch.nn.ModuleList([ + LSTMCell(hidden_size, hidden_size, batch_size, device, dtype) + for i in range(num_layers) + ]) + + def forward(self, input): + xs = input + batch_size, hidden_size = self.size + h_resident = torch.empty( + [batch_size, hidden_size], device=self.device, dtype=self.dtype) + c_resident = torch.empty( + [batch_size, hidden_size], device=self.device, dtype=self.dtype) + hiddens = [] + cells = [] + for rnn in self.cells: + + h = torch.zeros( + (batch_size, hidden_size), + device=self.device, + dtype=self.dtype) + c = torch.zeros( + (batch_size, hidden_size), + device=self.device, + dtype=self.dtype) + + hs = [] + cs = [] + + inputs = xs.unbind(0) + for i in range(self.max_seq_length): + h, c = rnn(inputs[i], (h, c), (h_resident, c_resident)) + + hs.append(h) + cs.append(c) + + hs = torch.stack(hs) + cs = torch.stack(cs) + xs = hs + + hiddens.append(hs) + cells.append(cs) + + return hiddens, cells diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/CMakeLists.txt b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/CMakeLists.txt new file mode 100644 index 000000000..1f1074303 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 3.0) +project(benchmarks CXX C) + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/../../../cmake/Modules/") + +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(CUDA QUIET REQUIRED) +find_package(CuDNN QUIET REQUIRED) + +set(CMAKE_BUILD_TYPE Release) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") +set(CMAKE_CXX_FLAGS_DEBUG + "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") +set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") + +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") + +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + 
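+# The -gencode flags below target compute capability 7.5 (Turing GPUs such as
+# the RTX 2080 Ti or T4); adjust them when benchmarking on other architectures.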
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w -gencode arch=compute_75,code=sm_75) +set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w -gencode + arch=compute_75,code=sm_75) +set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 -gencode + arch=compute_75,code=sm_75) + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_DIRS}) + +cuda_add_executable(cudnn_lstm main.cu) +target_link_libraries(cudnn_lstm ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARIES}) + +cuda_add_executable(lstm_cell_cudnn lstm_cell_cudnn.cu) +target_link_libraries(lstm_cell_cudnn ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARIES}) + +cuda_add_executable(stacked_lstm_cudnn stacked_lstm_cudnn.cu) +target_link_libraries(stacked_lstm_cudnn ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARIES}) diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/Makefile b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/Makefile new file mode 100644 index 000000000..3b042d99d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/Makefile @@ -0,0 +1,16 @@ +BENCH_NAME ?= lstm_cell_cudnn +BUILD_DIR := build +OUTPUT_FILE ?= ../c_cudnn_lstm_cell_bench.tsv + +.PHONY: build bench clean + +build: + @mkdir -p build && cd build && cmake .. && make -j + +$(BUILD_DIR)/$(BENCH_NAME): build + +bench: $(BUILD_DIR)/$(BENCH_NAME) + @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) + +clean: + @rm -rf build diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/README.md b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/README.md new file mode 100644 index 000000000..aa6dd9d4d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/README.md @@ -0,0 +1,123 @@ +The second column `Shape` stands for `[batch_size, hidden_size, length, depth]`. 
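+
+As a quick cross-reference, the Python baselines under
+`benchmarks/rnn/baselines/` describe the same configurations in a different
+field order; the small sketch below (illustrative only, not part of the
+benchmark code) shows the mapping:
+
+```python
+# Shape column in this README: [batch_size, hidden_size, length, depth]
+batch_size, hidden_size, seq_length, depth = [256, 256, 100, 10]
+
+# The Python baselines lay out their test cases as
+# [seq_len, batch_size, hidden, num_layers], so the equivalent entry is:
+test_case = [seq_length, batch_size, hidden_size, depth]
+```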
+ +Variable sequence length in batch: normal distribution (`mean`=`seq_length/2`, `stddev`=`seq_length/8`) + +## Fixed sequence length in batch(256) + +### Vary in depth + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[256, 256, 100, 2]|||||7.70831| +|CuDNN|[256, 256, 100, 4]|||||13.3942| +|CuDNN|[256, 256, 100, 6]|||||18.7314| +|CuDNN|[256, 256, 100, 8]|||||28.6231| +|CuDNN|[256, 256, 100, 10]|||||36.1529| +|CuDNN|[256, 256, 100, 12]|||||45.2314| +|CuDNN|[256, 256, 100, 14]|||||55.5315| +|CuDNN|[256, 256, 100, 16]|||||64.4784| +|CuDNN|[256, 256, 100, 18]|||||73.7362| +|CuDNN|[256, 256, 100, 20]|||||77.9639| +|CuDNN|[256, 256, 100, 22]|||||87.4949| + +### Vary in sequence length + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[256, 256, 50, 10]|||||17.9963| +|CuDNN|[256, 256, 75, 10]|||||28.2362| +|CuDNN|[256, 256, 100, 10]|||||36.75| +|CuDNN|[256, 256, 125, 10]|||||44.5773| +|CuDNN|[256, 256, 150, 10]|||||50.1302| +|CuDNN|[256, 256, 175, 10]|||||59.9653| +|CuDNN|[256, 256, 200, 10]|||||68.3289| + +## Variable sequence length in batch(256) + +### Vary in depth + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[256, 256, 100, 2]|||||5.72042| +|CuDNN|[256, 256, 100, 4]|||||9.48156| +|CuDNN|[256, 256, 100, 6]|||||16.9558| +|CuDNN|[256, 256, 100, 8]|||||24.1965| +|CuDNN|[256, 256, 100, 10]|||||32.0736| +|CuDNN|[256, 256, 100, 12]|||||38.5646| +|CuDNN|[256, 256, 100, 14]|||||46.5616| +|CuDNN|[256, 256, 100, 16]|||||56.7443| +|CuDNN|[256, 256, 100, 18]|||||63.9878| +|CuDNN|[256, 256, 100, 20]|||||68.5947| +|CuDNN|[256, 256, 100, 22]|||||76.7222| + +### Vary in sequence length + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[256, 256, 50, 10]|||||16.0103| +|CuDNN|[256, 256, 75, 10]|||||25.2088| +|CuDNN|[256, 256, 100, 10]|||||33.9192| +|CuDNN|[256, 256, 125, 10]|||||40.1468| +|CuDNN|[256, 256, 150, 10]|||||45.2206| +|CuDNN|[256, 256, 175, 10]|||||54.8865| +|CuDNN|[256, 256, 200, 10]|||||61.0839| + +## Fixed sequence length in batch(64) + +### Vary in depth + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[64, 256, 100, 2]|||||3.64703| +|CuDNN|[64, 256, 100, 4]|||||5.62296| +|CuDNN|[64, 256, 100, 6]|||||10.4035| +|CuDNN|[64, 256, 100, 8]|||||13.8126| +|CuDNN|[64, 256, 100, 10]|||||17.115| +|CuDNN|[64, 256, 100, 12]|||||22.0281| +|CuDNN|[64, 256, 100, 14]|||||26.3998| +|CuDNN|[64, 256, 100, 16]|||||29.0647| +|CuDNN|[64, 256, 100, 18]|||||36.4702| +|CuDNN|[64, 256, 100, 20]|||||41.028| +|CuDNN|[64, 256, 100, 22]|||||42.3192| + +### Vary in sequence length + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[64, 256, 50, 10]|||||8.34408| +|CuDNN|[64, 256, 75, 10]|||||13.3728| +|CuDNN|[64, 256, 100, 10]|||||16.3981| +|CuDNN|[64, 256, 125, 10]|||||22.0347| +|CuDNN|[64, 256, 150, 10]|||||26.8824| +|CuDNN|[64, 256, 175, 10]|||||30.6395| +|CuDNN|[64, 256, 200, 10]|||||31.9782| + +## Variable sequence length in batch(64) + +### Vary in depth + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[64, 256, 100, 2]|||||2.89689| +|CuDNN|[64, 256, 100, 4]|||||5.9606| +|CuDNN|[64, 256, 
100, 6]|||||9.18842| +|CuDNN|[64, 256, 100, 8]|||||12.3405| +|CuDNN|[64, 256, 100, 10]|||||19.093| +|CuDNN|[64, 256, 100, 12]|||||23.3191| +|CuDNN|[64, 256, 100, 14]|||||22.5437| +|CuDNN|[64, 256, 100, 16]|||||27.4492| +|CuDNN|[64, 256, 100, 18]|||||38.8575| +|CuDNN|[64, 256, 100, 20]|||||40.549| +|CuDNN|[64, 256, 100, 22]|||||45.7558| + +### Vary in sequence length + +||Shape|Gather(ms)|Cell-GEMM(ms)|Cell-Elementwise(ms)|Scatter(ms)|Total(ms)| +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|CuDNN|[64, 256, 50, 10]|||||8.22669| +|CuDNN|[64, 256, 75, 10]|||||11.778| +|CuDNN|[64, 256, 100, 10]|||||15.9492| +|CuDNN|[64, 256, 125, 10]|||||19.4848| +|CuDNN|[64, 256, 150, 10]|||||28.9545| +|CuDNN|[64, 256, 175, 10]|||||33.4161| +|CuDNN|[64, 256, 200, 10]|||||37.8032| diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/RNN_example.h b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/RNN_example.h new file mode 100644 index 000000000..719a8ed0d --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/RNN_example.h @@ -0,0 +1,1182 @@ +#pragma once + +#include "fp16_emu.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +std::vector seqs; + +#define COUNTOF(arr) int(sizeof(arr) / sizeof(arr[0])) + +static size_t getDeviceMemory(void) { + struct cudaDeviceProp properties; + int device; + cudaError_t error; + + error = cudaGetDevice(&device); + if (error != cudaSuccess) { + fprintf(stderr, "failed to get device cudaError=%d\n", error); + return 0; + } + + error = cudaGetDeviceProperties(&properties, device); + if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) { + fprintf(stderr, "failed to get properties cudaError=%d\n", error); + return 0; + } + return properties.totalGlobalMem; +} + +template +void printWeightAsMatrix(const T_ELEM* wDev, const int nRows, const int nCols) { + T_ELEM* wHost = (T_ELEM*)malloc(nRows * nCols * sizeof(T_ELEM)); + + cudaMemcpy(wHost, wDev, nRows * nCols * sizeof(T_ELEM), + cudaMemcpyDeviceToHost); + + printf("[DEBUG] Printing the weight matrix %dx%d:\n", nRows, nCols); + fflush(0); + for (int i = 0; i < nRows; i++) { + for (int j = 0; j < nCols; j++) { + printf("%1.6f ", (float)wHost[i * nCols + j]); + } + printf("\n"); + fflush(0); + } + + free(wHost); +} + +// Templated functions to get cudnnDataType_t from a templated type +template +__inline__ cudnnDataType_t getDataType(); +template <> +__inline__ cudnnDataType_t getDataType() { + return CUDNN_DATA_DOUBLE; +} +template <> +__inline__ cudnnDataType_t getDataType() { + return CUDNN_DATA_FLOAT; +} +template <> +__inline__ cudnnDataType_t getDataType() { + return CUDNN_DATA_HALF; +} + +// Define some error checking macros. 
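+// cudaErrCheck / cudnnErrCheck wrap raw CUDA and cuDNN calls and abort with
+// the failing file and line number whenever a call does not return success.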
+#define cudaErrCheck(stat) \ + { cudaErrCheck_((stat), __FILE__, __LINE__); } +void cudaErrCheck_(cudaError_t stat, const char* file, int line) { + if (stat != cudaSuccess) { + fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), + file, line); + exit(-1); + } +} + +#define cudnnErrCheck(stat) \ + { cudnnErrCheck_((stat), __FILE__, __LINE__); } +void cudnnErrCheck_(cudnnStatus_t stat, const char* file, int line) { + if (stat != CUDNN_STATUS_SUCCESS) { + fprintf(stderr, "cuDNN Error: %s %s %d\n", cudnnGetErrorString(stat), + file, line); + exit(-1); + } +} + +// Kernel and launcher to initialize GPU data to some constant value +template +__global__ void initGPUData_ker(T_ELEM* data, int numElements, T_ELEM value) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < numElements) { + data[tid] = value; + } +} + +template +void initGPUData(T_ELEM* data, int numElements, T_ELEM value) { + dim3 gridDim; + dim3 blockDim; + + blockDim.x = 1024; + gridDim.x = (numElements + blockDim.x - 1) / blockDim.x; + + initGPUData_ker<<>>(data, numElements, value); +} + +struct RNNSampleOptions { + int dataType; + int seqLength; // Specify sequence length + int numLayers; // Specify number of layers + int inputSize; // Specify input vector size + int hiddenSize; // Specify hidden size + int projSize; // Specify LSTM cell output size after the recurrent + // projection + int miniBatch; // Specify max miniBatch size + int inputMode; // Specify how the input to the RNN model is processed by + // the first layer (skip or linear input) + int dirMode; // Specify the recurrence pattern (bidirectional and + // unidirectional) + int cellMode; // Specify cell type (RELU, TANH, LSTM, GRU) + int biasMode; // Specify bias type (no bias, single inp bias, single rec + // bias, double bias) + int algorithm; // Specify recurrence algorithm (standard, persist dynamic, + // persist static) + int mathPrecision; // Specify math precision (half, float of double) + int mathType; // Specify math type (default, tensor op math or tensor op + // math with conversion) + float dropout; + int printWeights; + + RNNSampleOptions() { memset(this, 0, sizeof(*this)); }; +}; + +template +class RNNSample { + public: + cudnnHandle_t cudnnHandle; + + cudnnRNNDataDescriptor_t xDesc; + cudnnRNNDataDescriptor_t yDesc; + + cudnnTensorDescriptor_t hDesc; + cudnnTensorDescriptor_t cDesc; + + cudnnRNNDescriptor_t rnnDesc; + + cudnnDropoutDescriptor_t dropoutDesc; + + void* x; + void* hx; + void* cx; + + void* dx; + void* dhx; + void* dcx; + + void* y; + void* hy; + void* cy; + + void* dy; + void* dhy; + void* dcy; + + int* seqLengthArray; + int* devSeqLengthArray; + + void* weightSpace; + void* dweightSpace; + void* workSpace; + void* reserveSpace; + + size_t weightSpaceSize; + size_t workSpaceSize; + size_t reserveSpaceSize; + + cudnnRNNAlgo_t algorithm; + cudnnRNNMode_t cellMode; + cudnnRNNBiasMode_t biasMode; + cudnnDirectionMode_t dirMode; + cudnnRNNInputMode_t inputMode; + cudnnDataType_t dataType; + cudnnDataType_t mathPrecision; + cudnnMathType_t mathType; + + int inputSize; + int hiddenSize; + int projSize; + int numLayers; + int seqLength; + int miniBatch; + + // Local parameters + int bidirectionalScale; + int inputTensorSize; + int devinputTensorSize; + int outputTensorSize; + int hiddenTensorSize; + int numLinearLayers; + + double paddingFill; + + // Dimensions for hidden state tensors + int dimHidden[3]; + int strideHidden[3]; + + // Dropout descriptor parameters + unsigned long long seed; + size_t stateSize; + 
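+    // Opaque RNG state buffer used by the dropout descriptor; its size is
+    // recorded in stateSize above.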
void* states; + float dropout; + + // Profiling parameters + int printWeights; + cudaEvent_t start; + cudaEvent_t stop; + float timeForward; + float timeBackwardData; + float timeBackwardWeights; + long long int flopCount; + long long deviceMemoryAvailable; + long long totalMemoryConsumption; + + RNNSample() + : seqLengthArray(NULL), + devSeqLengthArray(NULL), + x(NULL), + hx(NULL), + cx(NULL), + dx(NULL), + dhx(NULL), + dcx(NULL), + y(NULL), + hy(NULL), + cy(NULL), + dy(NULL), + dhy(NULL), + dcy(NULL), + states(NULL), + weightSpace(NULL), + dweightSpace(NULL), + workSpace(NULL), + reserveSpace(NULL){}; + + void setup(RNNSampleOptions& options); + + void run(); + + void testgen(); +}; + +static char* baseFile(char* fname) { + char* base; + for (base = fname; *fname != '\0'; fname++) { + if (*fname == '/' || *fname == '\\') { + base = fname + 1; + } + } + return base; +} + +static void parseRNNSampleParameters(int argc, char** argv, + RNNSampleOptions* options) { + struct cmdParams { + const char* name; + const char* format; + size_t offset; + const char* description; + } param[] = { + {"dataType", "%d", offsetof(RNNSampleOptions, dataType), + "selects data format (0-FP16, 1-FP32, 2-FP64)"}, + {"seqLength", "%d", offsetof(RNNSampleOptions, seqLength), + "sequence length"}, + {"numLayers", "%d", offsetof(RNNSampleOptions, numLayers), + "number of layers"}, + {"inputSize", "%d", offsetof(RNNSampleOptions, inputSize), + "input vector size"}, + {"hiddenSize", "%d", offsetof(RNNSampleOptions, hiddenSize), + "hidden size"}, + {"projSize", "%d", offsetof(RNNSampleOptions, projSize), + "LSTM cell output size"}, + {"miniBatch", "%d", offsetof(RNNSampleOptions, miniBatch), + "miniBatch size"}, + {"inputMode", "%d", offsetof(RNNSampleOptions, inputMode), + "input to the RNN model (0-skip input, 1-linear input)"}, + {"dirMode", "%d", offsetof(RNNSampleOptions, dirMode), + "recurrence pattern (0-unidirectional, 1-bidirectional)"}, + {"cellMode", "%d", offsetof(RNNSampleOptions, cellMode), + "cell type (0-RELU, 1-TANH, 2-LSTM, 3-GRU)"}, + {"biasMode", "%d", offsetof(RNNSampleOptions, biasMode), + "bias type (0-no bias, 1-inp bias, 2-rec bias, 3-double bias"}, + {"algorithm", "%d", offsetof(RNNSampleOptions, algorithm), + "recurrence algorithm (0-standard, 1-persist static, 2-persist " + "dynamic"}, + {"mathPrecision", "%d", offsetof(RNNSampleOptions, mathPrecision), + "math precision (0-FP16, 1-FP32, 2-FP64)"}, + {"mathType", "%d", offsetof(RNNSampleOptions, mathType), + "math type (0-default, 1-tensor op math, 2-tensor op math with " + "conversion)"}, + {"dropout", "%g", offsetof(RNNSampleOptions, dropout), "dropout rate"}, + {"printWeights", "%d", offsetof(RNNSampleOptions, printWeights), + "Print weights"}}; + + if (argc == 1) { + printf("This is the cuDNN RNN API sample.\n\n"); + printf("Usage: ./%s [OPTIONS]\n\nProgram options:\n\n", + baseFile(*argv)); + + for (int i = 0; i < COUNTOF(param); i++) { + char buf[64]; + sprintf(buf, "-%s<%s>", param[i].name, param[i].format); + printf("%-20s - %s\n", buf, param[i].description); + } + printf("[INFO] Default RNN sample parameters will be used!\n"); + + // Default RNN options + options->dataType = 1; // CUDNN_DATA_FLOAT + options->seqLength = 20; + options->numLayers = 2; + options->inputSize = 512; + options->hiddenSize = 512; + options->projSize = 512; + options->miniBatch = 64; + options->inputMode = 1; // CUDNN_LINEAR_INPUT + options->dirMode = 0; // CUDNN_UNIDIRECTIONAL + options->cellMode = 0; // CUDNN_RNN_RELU + options->biasMode = 3; // 
CUDNN_RNN_DOUBLE_BIAS + options->algorithm = 0; // CUDNN_RNN_ALGO_STANDARD + options->mathPrecision = 1; // CUDNN_DATA_FLOAT + options->mathType = 0; // CUDNN_DEFAULT_MATH + options->dropout = 0.; + options->printWeights = 0; + } + + while (argc > 1) { + argc--; + argv++; + + for (int i = 0; i < COUNTOF(param); i++) { + const char* pname = param[i].name; + size_t plen = strlen(pname); + if (strncmp(*argv + 1, pname, plen) == 0) { + int count = sscanf(*argv + plen + 1, param[i].format, + (char*)options + param[i].offset); + if (count != 1) { + fprintf( + stderr, + "ERROR: missing numerical argument in option '%s'\n\n", + *argv); + exit(-1); + } + break; + } + } + } +} + +template +void RNNSample::setup(RNNSampleOptions& options) { + char projSizeUsage[48]; + char inputModeEnumValue[48]; + char dirModeEnumValue[48]; + char cellModeEnumValue[48]; + char biasModeEnumValue[48]; + char algorithmEnumValue[48]; + char mathPrecisionEnumValue[48]; + char mathTypeEnumValue[48]; + char dataTypeEnumValue[48]; + + // Convert options to the sample parameters + switch (options.inputMode) { + case 0: + inputMode = CUDNN_SKIP_INPUT; + snprintf(inputModeEnumValue, sizeof(inputModeEnumValue), + "CUDNN_SKIP_INPUT"); + break; + case 1: + inputMode = CUDNN_LINEAR_INPUT; + snprintf(inputModeEnumValue, sizeof(inputModeEnumValue), + "CUDNN_LINEAR_INPUT"); + break; + default: + printf("[ERROR] Wrong parameter for the inputMode!\n"); + fflush(0); + exit(-1); + } + + switch (options.dirMode) { + case 0: + dirMode = CUDNN_UNIDIRECTIONAL; + snprintf(dirModeEnumValue, sizeof(dirModeEnumValue), + "CUDNN_UNIDIRECTIONAL"); + break; + case 1: + dirMode = CUDNN_BIDIRECTIONAL; + snprintf(dirModeEnumValue, sizeof(dirModeEnumValue), + "CUDNN_BIDIRECTIONAL"); + break; + default: + printf("[ERROR] Wrong parameter for the dirMode!\n"); + fflush(0); + exit(-1); + } + + switch (options.cellMode) { + case 0: + cellMode = CUDNN_RNN_RELU; + snprintf(cellModeEnumValue, sizeof(cellModeEnumValue), + "CUDNN_RNN_RELU"); + break; + case 1: + cellMode = CUDNN_RNN_TANH; + snprintf(cellModeEnumValue, sizeof(cellModeEnumValue), + "CUDNN_RNN_TANH"); + break; + case 2: + cellMode = CUDNN_LSTM; + snprintf(cellModeEnumValue, sizeof(cellModeEnumValue), + "CUDNN_LSTM"); + break; + case 3: + cellMode = CUDNN_GRU; + snprintf(cellModeEnumValue, sizeof(cellModeEnumValue), "CUDNN_GRU"); + break; + default: + printf("[ERROR] Wrong parameter for the cellMode!\n"); + fflush(0); + exit(-1); + } + + switch (options.biasMode) { + case 0: + biasMode = CUDNN_RNN_NO_BIAS; + snprintf(biasModeEnumValue, sizeof(biasModeEnumValue), + "CUDNN_RNN_NO_BIAS"); + break; + case 1: + biasMode = CUDNN_RNN_SINGLE_INP_BIAS; + snprintf(biasModeEnumValue, sizeof(biasModeEnumValue), + "CUDNN_RNN_SINGLE_INP_BIAS"); + break; + case 2: + biasMode = CUDNN_RNN_SINGLE_REC_BIAS; + snprintf(biasModeEnumValue, sizeof(biasModeEnumValue), + "CUDNN_RNN_SINGLE_REC_BIAS"); + break; + case 3: + biasMode = CUDNN_RNN_DOUBLE_BIAS; + snprintf(biasModeEnumValue, sizeof(biasModeEnumValue), + "CUDNN_RNN_DOUBLE_BIAS"); + break; + default: + printf("[ERROR] Wrong parameter for the biasMode!\n"); + fflush(0); + exit(-1); + } + + switch (options.algorithm) { + case 0: + algorithm = CUDNN_RNN_ALGO_STANDARD; + snprintf(algorithmEnumValue, sizeof(algorithmEnumValue), + "CUDNN_RNN_ALGO_STANDARD"); + break; + case 1: + algorithm = CUDNN_RNN_ALGO_PERSIST_STATIC; + snprintf(algorithmEnumValue, sizeof(algorithmEnumValue), + "CUDNN_RNN_ALGO_PERSIST_STATIC"); + break; + case 2: + algorithm = 
CUDNN_RNN_ALGO_PERSIST_DYNAMIC; + snprintf(algorithmEnumValue, sizeof(algorithmEnumValue), + "CUDNN_RNN_ALGO_PERSIST_DYNAMIC"); + break; + default: + printf("[ERROR] Wrong parameter for the algorithm!\n"); + fflush(0); + exit(-1); + } + + switch (options.mathPrecision) { + case 0: + mathPrecision = CUDNN_DATA_HALF; + snprintf(mathPrecisionEnumValue, sizeof(mathPrecisionEnumValue), + "CUDNN_DATA_HALF"); + break; + case 1: + mathPrecision = CUDNN_DATA_FLOAT; + snprintf(mathPrecisionEnumValue, sizeof(mathPrecisionEnumValue), + "CUDNN_DATA_FLOAT"); + break; + case 2: + mathPrecision = CUDNN_DATA_DOUBLE; + snprintf(mathPrecisionEnumValue, sizeof(mathPrecisionEnumValue), + "CUDNN_DATA_DOUBLE"); + break; + default: + printf("[ERROR] Wrong parameter for the mathPrecision!\n"); + fflush(0); + exit(-1); + } + + switch (options.mathType) { + case 0: + mathType = CUDNN_DEFAULT_MATH; + snprintf(mathTypeEnumValue, sizeof(mathTypeEnumValue), + "CUDNN_DEFAULT_MATH"); + break; + case 1: + mathType = CUDNN_TENSOR_OP_MATH; + snprintf(mathTypeEnumValue, sizeof(mathTypeEnumValue), + "CUDNN_TENSOR_OP_MATH"); + break; + case 2: + mathType = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + snprintf(mathTypeEnumValue, sizeof(mathTypeEnumValue), + "CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION"); + break; + default: + printf("[ERROR] Wrong parameter for the mathType!\n"); + fflush(0); + exit(-1); + } + + switch (options.dataType) { + case 0: + dataType = CUDNN_DATA_HALF; + snprintf(dataTypeEnumValue, sizeof(dataTypeEnumValue), + "CUDNN_DATA_HALF"); + break; + case 1: + dataType = CUDNN_DATA_FLOAT; + snprintf(dataTypeEnumValue, sizeof(dataTypeEnumValue), + "CUDNN_DATA_FLOAT"); + break; + case 2: + dataType = CUDNN_DATA_DOUBLE; + snprintf(dataTypeEnumValue, sizeof(dataTypeEnumValue), + "CUDNN_DATA_DOUBLE"); + break; + default: + printf("[ERROR] Wrong parameter for the dataType!\n"); + fflush(0); + exit(-1); + } + + snprintf(projSizeUsage, sizeof(projSizeUsage), + (cellMode == CUDNN_LSTM) ? "enabled" : "disabled"); + + // Sizes + inputSize = options.inputSize; + hiddenSize = options.hiddenSize; + projSize = options.projSize; + numLayers = options.numLayers; + seqLength = options.seqLength; + miniBatch = options.miniBatch; + dropout = options.dropout; + printWeights = options.printWeights; + + // Compute local parameters + bidirectionalScale = (dirMode == CUDNN_BIDIRECTIONAL ? 2 : 1); + + // Calculating total elements per each tensor + inputTensorSize = seqLength * miniBatch * inputSize; + int devseqLength = 0; + for (auto i : seqs) { + devseqLength += i; + } + devinputTensorSize = devseqLength * inputSize; + // devinputTensorSize = inputTensorSize; + outputTensorSize = seqLength * miniBatch * hiddenSize * bidirectionalScale; + hiddenTensorSize = numLayers * miniBatch * hiddenSize * bidirectionalScale; + + // Dimensions for hidden state tensors + dimHidden[0] = numLayers * bidirectionalScale; + dimHidden[1] = miniBatch; + dimHidden[2] = hiddenSize; + + strideHidden[0] = dimHidden[1] * dimHidden[2]; + strideHidden[1] = dimHidden[2]; + strideHidden[2] = 1; + + // Compute number of linear layers + numLinearLayers = 0; + if (cellMode == CUDNN_RNN_RELU || cellMode == CUDNN_RNN_TANH) { + numLinearLayers = 2; + } else if (cellMode == CUDNN_LSTM) { + numLinearLayers = 8; + } else if (cellMode == CUDNN_GRU) { + numLinearLayers = 6; + } + + // Pick a seed. 
(required by dropout descriptor) + seed = 1337ull; + + paddingFill = 0.0; + + flopCount = numLinearLayers * 2ull * bidirectionalScale * hiddenSize * + hiddenSize * seqLength * miniBatch * numLayers; + + deviceMemoryAvailable = getDeviceMemory(); + totalMemoryConsumption = + (2 * devinputTensorSize + 2 * outputTensorSize + 8 * hiddenTensorSize) * + sizeof(T_ELEM); + + // Check consistency of parameters + if ((dataType == CUDNN_DATA_HALF && (mathPrecision != CUDNN_DATA_HALF && + mathPrecision != CUDNN_DATA_FLOAT)) || + (dataType == CUDNN_DATA_FLOAT && (mathPrecision != CUDNN_DATA_FLOAT)) || + (dataType == CUDNN_DATA_DOUBLE && + (mathPrecision != CUDNN_DATA_DOUBLE))) { + printf( + "[ERROR] Inconsistent parameter: dataType does not match " + "mathPrecision!\n"); + fflush(0); + exit(-1); + } + + if ((dataType == CUDNN_DATA_FLOAT && + (mathType != CUDNN_DEFAULT_MATH && + mathType != CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)) || + (dataType == CUDNN_DATA_DOUBLE && (mathType != CUDNN_DEFAULT_MATH))) { + printf( + "[ERROR] Inconsistent parameter: dataType does not match " + "mathType!\n"); + fflush(0); + exit(-1); + } + + if (inputMode == CUDNN_SKIP_INPUT && inputSize != hiddenSize) { + printf( + "[ERROR] Inconsistent parameter: inputSize does not match " + "hiddenSize!\n"); + fflush(0); + exit(-1); + } + + if (projSize > hiddenSize) { + printf( + "[ERROR] Inconsistent parameter: projSize is larger than " + "hiddenSize!\n"); + fflush(0); + exit(-1); + } + +#ifdef DEBUG_INFO + printf("[INFO] RNN sample parameters:\n"); + printf("[INFO] RNN seqLength = %5d\n", seqLength); + printf("[INFO] RNN numLayers = %5d\n", numLayers); + printf("[INFO] RNN inputSize = %5d\n", inputSize); + printf("[INFO] RNN hiddenSize = %5d\n", hiddenSize); + printf("[INFO] RNN projSize = %5d (%s)\n", projSize, projSizeUsage); + printf("[INFO] RNN miniBatch = %5d\n", miniBatch); + printf("[INFO] RNN inputMode = %5d (%s)\n", inputMode, + inputModeEnumValue); + printf("[INFO] RNN dirMode = %5d (%s)\n", dirMode, + dirModeEnumValue); + printf("[INFO] RNN cellMode = %5d (%s)\n", cellMode, + cellModeEnumValue); + printf("[INFO] RNN biasMode = %5d (%s)\n", biasMode, + biasModeEnumValue); + printf("[INFO] RNN algorithm = %5d (%s)\n", algorithm, + algorithmEnumValue); + printf("[INFO] RNN mathPrecision = %5d (%s)\n", mathPrecision, + mathPrecisionEnumValue); + printf("[INFO] RNN mathType = %5d (%s)\n", mathType, + mathTypeEnumValue); + printf("[INFO] RNN dataType = %5d (%s)\n", dataType, + dataTypeEnumValue); + printf("[INFO] RNN dropout = %5g\n", dropout); +#endif +} + +template +void RNNSample::testgen() { + // Initialise weights and inputs + // We initialise to something simple. + // Matrices are initialised to 1 / matrixSize, biases to 1, data is 1. 
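+    // Per-layer weight and bias pointers are obtained below from
+    // cudnnGetRNNWeightParams and filled directly in device memory.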
+ + // Initialize inputs + initGPUData((T_ELEM*)x, devinputTensorSize, 1.0); + if (hx != NULL) initGPUData((T_ELEM*)hx, hiddenTensorSize, 1.0); + if (cx != NULL) initGPUData((T_ELEM*)cx, hiddenTensorSize, 1.0); + + initGPUData((T_ELEM*)dy, outputTensorSize, 1.0); + if (dhy != NULL) initGPUData((T_ELEM*)dhy, hiddenTensorSize, 1.0); + if (dcy != NULL) initGPUData((T_ELEM*)dcy, hiddenTensorSize, 1.0); + + // Initialize Weights + cudnnTensorDescriptor_t wDesc; + cudnnTensorDescriptor_t bDesc; + + cudnnErrCheck(cudnnCreateTensorDescriptor(&wDesc)); + cudnnErrCheck(cudnnCreateTensorDescriptor(&bDesc)); + + for (int layer = 0; layer < numLayers * bidirectionalScale; layer++) { + for (int linLayerID = 0; linLayerID < numLinearLayers; linLayerID++) { + cudnnDataType_t dataTypeTemp; + int nbDims = 0; + int dim[3], stride[3]; + T_ELEM* linLayerMat = NULL; + T_ELEM* linLayerBias = NULL; + + cudnnErrCheck(cudnnGetRNNWeightParams( + cudnnHandle, rnnDesc, layer, weightSpaceSize, weightSpace, + linLayerID, wDesc, (void**)&linLayerMat, bDesc, + (void**)&linLayerBias)); + + if (linLayerMat) { + cudnnErrCheck(cudnnGetTensorNdDescriptor( + wDesc, 3, &dataTypeTemp, &nbDims, dim, stride)); + initGPUData(linLayerMat, dim[0] * dim[1] * dim[2], + 1.0 / (dim[0] * dim[1] * dim[2])); + if (printWeights) { + printWeightAsMatrix(linLayerMat, dim[1], dim[2]); + } + } + + if (linLayerBias) { + cudnnErrCheck(cudnnGetTensorNdDescriptor( + bDesc, 3, &dataTypeTemp, &nbDims, dim, stride)); + initGPUData(linLayerBias, dim[0] * dim[1] * dim[2], + 1.0); + } + } + } + + cudnnDestroyTensorDescriptor(wDesc); + cudnnDestroyTensorDescriptor(bDesc); +} + +template +void RNNSample::run() { +#ifdef DEBUG_INFO + FILE* fp = NULL; + fp = fopen("result.txt", "w"); + + if (fp == NULL) { + printf("[ERROR] Cannot open output file!\n"); + exit(-1); + } +#endif + + // Create cudnn context + cudnnErrCheck(cudnnCreate(&cudnnHandle)); + + // Memory allocation. hx, cx, dhx, dcx, hy, cy, dhy and dcy can be NULL. 
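+    // Note: x is allocated with devinputTensorSize (packed, variable-length
+    // sequences), while dx keeps the padded layout of
+    // inputTensorSize = seqLength * miniBatch * inputSize elements.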
+ cudaErrCheck(cudaMalloc((void**)&x, devinputTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&y, outputTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&dx, inputTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&dy, outputTensorSize * sizeof(T_ELEM))); + + cudaErrCheck(cudaMalloc((void**)&hx, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&cx, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&hy, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&cy, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&dhx, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&dcx, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&dhy, hiddenTensorSize * sizeof(T_ELEM))); + cudaErrCheck(cudaMalloc((void**)&dcy, hiddenTensorSize * sizeof(T_ELEM))); + + // Memory allocation for seqLengthArray on the host and device + seqLengthArray = (int*)malloc(miniBatch * sizeof(int)); + + cudaErrCheck( + cudaMalloc((void**)&devSeqLengthArray, miniBatch * sizeof(int))); + totalMemoryConsumption += miniBatch * sizeof(int); + + for (int i = 0; i < miniBatch; i++) { + seqLengthArray[i] = seqs[i]; + } + + cudaErrCheck(cudaMemcpy(devSeqLengthArray, seqLengthArray, + miniBatch * sizeof(int), cudaMemcpyHostToDevice)); + + // Create RNN Data descriptors + cudnnErrCheck(cudnnCreateRNNDataDescriptor(&xDesc)); + cudnnErrCheck(cudnnCreateRNNDataDescriptor(&yDesc)); + + // Set RNN Data descriptors + cudnnErrCheck(cudnnSetRNNDataDescriptor( + xDesc, dataType, CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, seqLength, + miniBatch, inputSize, seqLengthArray, &paddingFill)); + + cudnnErrCheck(cudnnSetRNNDataDescriptor( + yDesc, dataType, CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, seqLength, + miniBatch, hiddenSize * bidirectionalScale, seqLengthArray, + &paddingFill)); + + cudnnErrCheck(cudnnCreateTensorDescriptor(&hDesc)); + cudnnErrCheck(cudnnCreateTensorDescriptor(&cDesc)); + + cudnnErrCheck(cudnnSetTensorNdDescriptor(hDesc, dataType, 3, dimHidden, + strideHidden)); + cudnnErrCheck(cudnnSetTensorNdDescriptor(cDesc, dataType, 3, dimHidden, + strideHidden)); + + // Set up the dropout descriptor (needed for the RNN descriptor) + cudnnErrCheck(cudnnCreateDropoutDescriptor(&dropoutDesc)); + + // How much memory does dropout need for states? 
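+    // (Queried from cudnnDropoutGetStatesSize below.)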
+ // These states are used to generate random numbers internally + // and should not be freed until the RNN descriptor is no longer used + cudnnErrCheck(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); + + cudaErrCheck(cudaMalloc(&states, stateSize)); + totalMemoryConsumption += stateSize; + + cudnnErrCheck(cudnnSetDropoutDescriptor(dropoutDesc, cudnnHandle, dropout, + states, stateSize, seed)); + + // Set up the RNN descriptor + cudnnErrCheck(cudnnCreateRNNDescriptor(&rnnDesc)); + + cudnnErrCheck(cudnnSetRNNDescriptor_v8( + rnnDesc, algorithm, cellMode, biasMode, dirMode, inputMode, dataType, + mathPrecision, mathType, inputSize, hiddenSize, projSize, numLayers, + dropoutDesc, 0)); + + // Set up weights and bias parameters + cudnnErrCheck( + cudnnGetRNNWeightSpaceSize(cudnnHandle, rnnDesc, &weightSpaceSize)); + + cudaErrCheck(cudaMalloc((void**)&weightSpace, weightSpaceSize)); + cudaErrCheck(cudaMalloc((void**)&dweightSpace, weightSpaceSize)); + totalMemoryConsumption += (2 * weightSpaceSize); + + // Set up work space and reserved memory + cudnnErrCheck(cudnnGetRNNTempSpaceSizes(cudnnHandle, rnnDesc, + CUDNN_FWD_MODE_TRAINING, xDesc, + &workSpaceSize, &reserveSpaceSize)); + + cudaErrCheck(cudaMalloc((void**)&workSpace, workSpaceSize)); + cudaErrCheck(cudaMalloc((void**)&reserveSpace, reserveSpaceSize)); + totalMemoryConsumption += (workSpaceSize + reserveSpaceSize); + +#ifdef DEBUG_INFO + printf("[INFO] weightSpaceSize : %g MiB\n", + weightSpaceSize / 1024.0 / 1024.0); + printf("[INFO] workSpaceSize : %g MiB\n", + workSpaceSize / 1024.0 / 1024.0); + printf("[INFO] reserveSpaceSize: %g MiB\n", + reserveSpaceSize / 1024.0 / 1024.0); + printf("\n"); + printf("[INFO] Total required memory : %g MiB\n", + totalMemoryConsumption / 1024.0 / 1024.0); + printf("[INFO] Total available device memory: %g MiB\n", + deviceMemoryAvailable / 1024.0 / 1024.0); + fflush(0); +#endif + + // Initialize all the data + testgen(); + + // Dynamic persistent RNN plan + if (algorithm == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) { + // Note: This step is expensive. Once completed the plan can be reused + // so long as the descriptor + // minibatch or datatype don't change. + cudnnErrCheck(cudnnBuildRNNDynamic(cudnnHandle, rnnDesc, miniBatch)); + } + + // ********************************************************************************************************* + // At this point all of the setup is done. We now need to pass through the + // RNN. 
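+    // Three warm-up inference calls run first; the following five calls are
+    // timed with CUDA events, and timeForward stores the average per-call
+    // latency in milliseconds.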
+ // ********************************************************************************************************* + + // Warm up + for (int i = 0; i < 3; ++i) { + cudnnErrCheck(cudnnRNNForward( + cudnnHandle, rnnDesc, CUDNN_FWD_MODE_INFERENCE, devSeqLengthArray, + xDesc, x, yDesc, y, hDesc, hx, hy, cDesc, cx, cy, weightSpaceSize, + weightSpace, workSpaceSize, workSpace, reserveSpaceSize, + reserveSpace)); + } + + cudaErrCheck(cudaDeviceSynchronize()); + cudaErrCheck(cudaEventCreate(&start)); + cudaErrCheck(cudaEventCreate(&stop)); + + cudaErrCheck(cudaEventRecord(start)); + + for (int i = 0; i < 5; ++i) { + cudnnErrCheck(cudnnRNNForward( + cudnnHandle, rnnDesc, CUDNN_FWD_MODE_INFERENCE, devSeqLengthArray, + xDesc, x, yDesc, y, hDesc, hx, hy, cDesc, cx, cy, weightSpaceSize, + weightSpace, workSpaceSize, workSpace, reserveSpaceSize, + reserveSpace)); + } + + cudaErrCheck(cudaEventRecord(stop)); + cudaErrCheck(cudaEventSynchronize(stop)); + cudaErrCheck(cudaEventElapsedTime(&timeForward, start, stop)); + timeForward = timeForward / 5; + /* + cudaErrCheck(cudaEventRecord(start)); + + cudnnErrCheck(cudnnRNNBackwardData_v8(cudnnHandle, + rnnDesc, + devSeqLengthArray, + yDesc, + y, + dy, + xDesc, + dx, + hDesc, + hx, + dhy, + dhx, + cDesc, + cx, + dcy, + dcx, + weightSpaceSize, + weightSpace, + workSpaceSize, + workSpace, + reserveSpaceSize, + reserveSpace)); + + cudaErrCheck(cudaEventRecord(stop)); + cudaErrCheck(cudaEventSynchronize(stop)); + cudaErrCheck(cudaEventElapsedTime(&timeBackwardData, start, stop)); + + // cudnnRNNBackwardWeights adds to the data in dw. + cudaErrCheck(cudaEventRecord(start)); + + cudaErrCheck(cudaMemset(dweightSpace, 0, weightSpaceSize)); + + cudnnErrCheck(cudnnRNNBackwardWeights_v8(cudnnHandle, + rnnDesc, + CUDNN_WGRAD_MODE_ADD, + devSeqLengthArray, + xDesc, + x, + hDesc, + hx, + yDesc, + y, + weightSpaceSize, + dweightSpace, + workSpaceSize, + workSpace, + reserveSpaceSize, + reserveSpace)); + + cudaErrCheck(cudaEventRecord(stop)); + cudaErrCheck(cudaEventSynchronize(stop)); + cudaErrCheck(cudaEventElapsedTime(&timeBackwardWeights, start, stop)); + */ + + // Report the performanc +#ifdef DEBUG_INFO + printf("[INFO] timeForward : %2.5f ms\n", timeForward); + // printf("[INFO] timeBackwardData : %2.5f ms\n",timeBackwardData); + // printf("[INFO] timeBackwardWeights: %2.5f ms\n",timeBackwardWeights); + + printf("[INFO] Forward: %3.0f GFLOPS\n", flopCount / (1e6 * timeForward)); + // printf("[INFO] Backward: %3.0f GFLOPS, ", 2ull * flopCount / (1e6 * + // (timeBackwardData + timeBackwardWeights))); printf("(%3.0f GFLOPS), ", + // flopCount / (1e6 * timeBackwardData)); printf("(%3.0f GFLOPS)\n", + // flopCount / (1e6 * timeBackwardWeights)); + fflush(0); + + // Save FLOPS to file + fprintf(fp, "timeForward : %2.5f ms\n", timeForward); + fprintf(fp, "Forward: %3.0f GFLOPS\n", flopCount / (1e6 * timeForward)); + // fprintf(fp, "Backward: %3.0f GFLOPS, ", 2ull * flopCount / (1e6 * + // (timeBackwardData + timeBackwardWeights))); fprintf(fp, "(%3.0f GFLOPS), + // ", flopCount / (1e6 * timeBackwardData)); fprintf(fp, "(%3.0f GFLOPS)\n", + // flopCount / (1e6 * timeBackwardWeights)); +#endif + cudaDeviceSynchronize(); + + // ********************************************************************************************************* + // Print checksums. 
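+    // (y, hy and, for LSTM cells, cy are copied back to the host and reduced
+    // to per-mini-batch sums.)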
+ // ********************************************************************************************************* + { + T_ELEM* testOutputy; + T_ELEM* testOutputhy; + T_ELEM* testOutputcy; + + testOutputy = (T_ELEM*)malloc(outputTensorSize * sizeof(T_ELEM)); + testOutputhy = (T_ELEM*)malloc(hiddenTensorSize * sizeof(T_ELEM)); + testOutputcy = (T_ELEM*)malloc(hiddenTensorSize * sizeof(T_ELEM)); + + cudaErrCheck(cudaMemcpy(testOutputy, y, + outputTensorSize * sizeof(T_ELEM), + cudaMemcpyDeviceToHost)); + if (hy != NULL) { + cudaErrCheck(cudaMemcpy(testOutputhy, hy, + hiddenTensorSize * sizeof(T_ELEM), + cudaMemcpyDeviceToHost)); + } + if (cy != NULL && cellMode == CUDNN_LSTM) { + cudaErrCheck(cudaMemcpy(testOutputcy, cy, + hiddenTensorSize * sizeof(T_ELEM), + cudaMemcpyDeviceToHost)); + } + + double checksumy = 0.f; + double checksumhy = 0.f; + double checksumcy = 0.f; + + for (int m = 0; m < miniBatch; m++) { + double localSumi = 0; + double localSumh = 0; + double localSumc = 0; + + for (int j = 0; j < seqLength; j++) { + for (int i = 0; i < hiddenSize * bidirectionalScale; i++) { + localSumi += (double) + testOutputy[j * miniBatch * hiddenSize * + bidirectionalScale + + m * hiddenSize * bidirectionalScale + i]; + } + } + for (int j = 0; j < numLayers * bidirectionalScale; j++) { + for (int i = 0; i < hiddenSize; i++) { + if (hy != NULL) { + localSumh += + (double)testOutputhy[j * hiddenSize * miniBatch + + m * hiddenSize + i]; + } + if ((cy != NULL) && (cellMode == CUDNN_LSTM)) { + localSumc += + (double)testOutputcy[j * hiddenSize * miniBatch + + m * hiddenSize + i]; + } + } + } + + checksumy += localSumi; + checksumhy += localSumh; + checksumcy += localSumc; + } + +#ifdef DEBUG_INFO + printf("y checksum %E ", checksumy); + fprintf(fp, "y checksum %E ", checksumy); + if (cellMode == CUDNN_LSTM) { + printf("cy checksum %E ", checksumcy); + fprintf(fp, "cy checksum %E ", checksumcy); + } + printf("hy checksum %E\n", checksumhy); + fprintf(fp, "hy checksum %E\n", checksumhy); +#endif + + free(testOutputy); + free(testOutputcy); + free(testOutputhy); + } + /* + { + T_ELEM *testOutputdx; + T_ELEM *testOutputdhx; + T_ELEM *testOutputdcx; + + testOutputdx = (T_ELEM *)malloc(inputTensorSize * sizeof(T_ELEM)); + testOutputdhx = (T_ELEM *)malloc(hiddenTensorSize * sizeof(T_ELEM)); + testOutputdcx = (T_ELEM *)malloc(hiddenTensorSize * sizeof(T_ELEM)); + + cudaErrCheck(cudaMemcpy(testOutputdx, dx, inputTensorSize * + sizeof(T_ELEM), cudaMemcpyDeviceToHost)); if (dhx != NULL) { + cudaErrCheck(cudaMemcpy(testOutputdhx, dhx, hiddenTensorSize * + sizeof(T_ELEM), cudaMemcpyDeviceToHost)); + } + if ((dcx != NULL) && (cellMode == CUDNN_LSTM)) { + cudaErrCheck(cudaMemcpy(testOutputdcx, dcx, hiddenTensorSize * + sizeof(T_ELEM), cudaMemcpyDeviceToHost)); + } + + double checksumdx = 0.f; + double checksumdhx = 0.f; + double checksumdcx = 0.f; + + for (int m = 0; m < miniBatch; m++) { + double localSumdx = 0; + double localSumdhx = 0; + double localSumdcx = 0; + + for (int j = 0; j < seqLength; j++) { + for (int i = 0; i < inputSize; i++) { + localSumdx += (double) testOutputdx[j * miniBatch * inputSize + + m * inputSize + i]; + } + } + + for (int j = 0; j < numLayers * bidirectionalScale; j++) { + for (int i = 0; i < hiddenSize; i++) { + localSumdhx += (double) testOutputdhx[j * hiddenSize * + miniBatch + m * hiddenSize + i]; if (cellMode == CUDNN_LSTM) { + localSumdcx + += (double) testOutputdcx[j * hiddenSize * miniBatch + m * hiddenSize + + i]; + } + } + } + + checksumdx += localSumdx; + checksumdhx += 
localSumdhx; + checksumdcx += localSumdcx; + } + + printf("dx checksum %E ", checksumdx); + fprintf(fp, "dx checksum %E ", checksumdx); + if (cellMode == CUDNN_LSTM) { + printf("dcx checksum %E ", checksumdcx); + fprintf(fp, "dcx checksum %E ", checksumdcx); + } + printf("dhx checksum %E\n", checksumdhx); + fprintf(fp, "dhx checksum %E\n", checksumdhx); + + free(testOutputdx); + free(testOutputdhx); + free(testOutputdcx); + } + + { + T_ELEM *testOutputdw; + testOutputdw = (T_ELEM *)malloc(weightSpaceSize); + + cudaErrCheck(cudaMemcpy(testOutputdw, dweightSpace, weightSpaceSize, + cudaMemcpyDeviceToHost)); + + double checksumdw = 0.; + + for (int i = 0; i < weightSpaceSize / sizeof(T_ELEM); i++) { + checksumdw += (double) testOutputdw[i]; + } + + printf("dw checksum %E\n", checksumdw); + fprintf(fp, "dw checksum %E\n", checksumdw); + + free(testOutputdw); + } + */ + // Free all previously allocated memory, destroy all created cudnn + // descriptors + free(seqLengthArray); + + cudaFree(x); + cudaFree(hx); + cudaFree(cx); + cudaFree(y); + cudaFree(hy); + cudaFree(cy); + cudaFree(dx); + cudaFree(dhx); + cudaFree(dcx); + cudaFree(dy); + cudaFree(dhy); + cudaFree(dcy); + cudaFree(workSpace); + cudaFree(reserveSpace); + cudaFree(weightSpace); + cudaFree(dweightSpace); + cudaFree(states); + cudaFree(devSeqLengthArray); + + cudnnDestroyRNNDataDescriptor(xDesc); + cudnnDestroyRNNDataDescriptor(yDesc); + + cudnnDestroyTensorDescriptor(hDesc); + cudnnDestroyTensorDescriptor(cDesc); + + cudnnDestroyDropoutDescriptor(dropoutDesc); + cudnnDestroyRNNDescriptor(rnnDesc); + + cudnnDestroy(cudnnHandle); + +#ifdef DEBUG_INFO + fclose(fp); +#endif +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/fp16_emu.h b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/fp16_emu.h new file mode 100644 index 000000000..fcdebb810 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/fp16_emu.h @@ -0,0 +1,97 @@ +// Conversion from/to 16-bit floating point (half-precision). + +#if !defined(_FP16_EMU_H_) +#define _FP16_EMU_H_ + +#include +#include + +// Necessary to ensure visibility of CUDART_VERSION macro +#include + +// Definition of '__half_raw' was not provided before CUDA 9.0. +// '__half_raw' is our type where the unsigned 16-bit integer +// data member 'x' can be accessed in both CUDA 9.0 and 8.0. +#if CUDART_VERSION < 9000 +typedef __half __half_raw; +#endif + +// Internally, in CUDNN we use half1 struct as the FP16 type. +typedef __half half1; + +#define HLF_EPSILON 4.887581E-04 +#define HLF_MIN 6.103516E-05 +#define HLF_MAX 6.550400E+04 + +half1 cpu_float2half_rn(float f); + +float cpu_half2float(half1 h); + +static __inline__ __device__ __host__ half1 habs(half1 h) { + __half_raw hr = reinterpret_cast<__half_raw&>(h); + hr.x &= 0x7fffU; + return reinterpret_cast(hr); +} + +static __inline__ __device__ __host__ half1 hneg(half1 h) { + __half_raw hr = reinterpret_cast<__half_raw&>(h); + hr.x ^= 0x8000U; + return reinterpret_cast(hr); +} + +static __inline__ __device__ __host__ int ishnan(half1 h) { + // When input is NaN, exponent is all ones and mantissa is non-zero. + __half_raw hr = reinterpret_cast<__half_raw&>(h); + return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0; +} + +static __inline__ __device__ __host__ int ishinf(half1 h) { + // When input is +/- inf, exponent is all ones and mantissa is zero. 
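+    // 0x7c00U masks the 5 exponent bits of an FP16 value; 0x03ffU masks the
+    // 10 mantissa bits.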
+ __half_raw hr = reinterpret_cast<__half_raw&>(h); + return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0; +} + +static __inline__ __device__ __host__ int ishequ(half1 x, half1 y) { + __half_raw xr = reinterpret_cast<__half_raw&>(x); + __half_raw yr = reinterpret_cast<__half_raw&>(y); + return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x; +} + +// Returns 0.0000 in FP16 binary form +static __inline__ __device__ __host__ half1 hzero() { + __half_raw hr; + hr.x = 0x0000U; + return reinterpret_cast(hr); +} + +// Returns 1.0000 in FP16 binary form +static __inline__ __device__ __host__ half1 hone() { + __half_raw hr; + hr.x = 0x3c00U; + return reinterpret_cast(hr); +} + +// Returns quiet NaN, the most significant fraction bit #9 is set +static __inline__ __device__ __host__ half1 hnan() { + __half_raw hr; + hr.x = 0x7e00U; + return reinterpret_cast(hr); +} + +// Largest positive FP16 value, corresponds to 6.5504e+04 +static __inline__ __device__ __host__ half1 hmax() { + // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff) + __half_raw hr; + hr.x = 0x7bffU; + return reinterpret_cast(hr); +} + +// Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05 +static __inline__ __device__ __host__ half1 hmin() { + // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits) + __half_raw hr; + hr.x = 0x0400U; + return reinterpret_cast(hr); +} + +#endif // _FP16_EMU_H_ diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/lstm_cell_cudnn.cu b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/lstm_cell_cudnn.cu new file mode 100644 index 000000000..0d94c0c33 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/lstm_cell_cudnn.cu @@ -0,0 +1,40 @@ +#include "utils.h" + +#include + +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + assert(argc == 2); + const char* filename = argv[1]; + + std::ofstream fout; + fout.setf(std::ios::fixed); + fout.precision(4); + + fout.open(filename, std::ios::out); + + srand(1234); + constexpr std::array hidden_sizes = {128, 256, 512, 1024}; + constexpr std::array batch_sizes = {32, 64, 128, 256}; + const int seq_length = 1; + const int depth = 1; + + fout << "[depth, seq_length, batch_size, hidden_size]\tAvgTime(ms)" + << std::endl; + + for (auto hidden_size : hidden_sizes) { + for (auto batch_size : batch_sizes) { + genSeqs(batch_size, seq_length, false); + float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, + seq_length, depth, hidden_size); + + fout << "[" << depth << ", " << seq_length << ", " << batch_size + << ", " << hidden_size << "]\t" << cudnn_time << std::endl; + } + } +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/main.cu b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/main.cu new file mode 100644 index 000000000..87d78ea7f --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/main.cu @@ -0,0 +1,69 @@ +#include "utils.h" + +int main(int argc, char* argv[]) { + srand(1234); + int batch_size = 64; + int hidden_size = 256; + int seq_length = 100; + int depth = 10; + + int input_size = hidden_size; + + genSeqs(batch_size, seq_length, false); + + for (auto depth : {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}) { + float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, + depth, input_size); + + std::stringstream ss; + ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length + << ", " << depth << "]|"; + std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" + << std::endl; + } + + std::cout << std::endl; + + for 
(auto seq_length : {50, 75, 100, 125, 150, 175, 200}) { + genSeqs(batch_size, seq_length, false); + float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, + depth, input_size); + + std::stringstream ss; + ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length + << ", " << depth << "]|"; + std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" + << std::endl; + } + + std::cout << std::endl; + + genSeqs(batch_size, seq_length, true); + + for (auto depth : {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}) { + float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, + depth, input_size); + + std::stringstream ss; + ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length + << ", " << depth << "]|"; + std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" + << std::endl; + } + + std::cout << std::endl; + + for (auto seq_length : {50, 75, 100, 125, 150, 175, 200}) { + genSeqs(batch_size, seq_length, true); + float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, + depth, input_size); + + std::stringstream ss; + ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length + << ", " << depth << "]|"; + std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" + << std::endl; + } + + return 0; +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/result.txt b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/result.txt new file mode 100644 index 000000000..e69de29bb diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/stacked_lstm_cudnn.cu b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/stacked_lstm_cudnn.cu new file mode 100644 index 000000000..c510c04f9 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/stacked_lstm_cudnn.cu @@ -0,0 +1,42 @@ +#include "utils.h" + +#include + +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + assert(argc == 2); + const char* filename = argv[1]; + + std::ofstream fout; + fout.setf(std::ios::fixed); + fout.precision(4); + + fout.open(filename, std::ios::out); + + srand(1234); + constexpr std::array hidden_sizes = {64, 128, 256, 512, 1024}; + constexpr std::array batch_sizes = {32, 64}; + constexpr size_t seq_length = 16; + constexpr std::array depths = {1, 2, 4, 8, 16, 32}; + + fout << "[depth, seq_length, batch_size, hidden_size]\tAvgTime(ms)" + << std::endl; + + for (auto depth : depths) { + for (auto hidden_size : hidden_sizes) { + for (auto batch_size : batch_sizes) { + genSeqs(batch_size, seq_length, false); + float cudnn_time = TestCuDNNLSTM( + batch_size, hidden_size, seq_length, depth, hidden_size); + + fout << "[" << depth << ", " << seq_length << ", " << batch_size + << ", " << hidden_size << "]\t" << cudnn_time << std::endl; + } + } + } +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/cuDNN/utils.h b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/utils.h new file mode 100644 index 000000000..80e9f5916 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/cuDNN/utils.h @@ -0,0 +1,59 @@ +#include "RNN_example.h" + +template +float runRNNSample(RNNSampleOptions& options) { + RNNSample sample; + sample.setup(options); + sample.run(); + return sample.timeForward; +} + +float TestCuDNNLSTM(int mini_batch, int hidden_size, int seq_length, + int num_layers, int input_size) { + RNNSampleOptions options; + + options.dataType = 1; // CUDNN_DATA_FLOAT + // options.dataType = 0; + options.seqLength = seq_length; + options.numLayers = num_layers; + options.inputSize = input_size; + options.hiddenSize = 
hidden_size; + options.projSize = hidden_size; + options.miniBatch = mini_batch; + options.inputMode = 1; // CUDNN_LINEAR_INPUT + options.dirMode = 0; // CUDNN_UNIDIRECTIONAL + options.cellMode = 2; // CUDNN_LSTM + options.biasMode = 3; // CUDNN_RNN_DOUBLE_BIAS + options.algorithm = 0; // CUDNN_RNN_ALGO_STANDARD + options.mathPrecision = 1; // CUDNN_DATA_FLOAT + // options.mathPrecision = 0; + options.mathType = 0; // CUDNN_DEFAULT_MATH + // options.mathType = 1; // CUDNN_TENSOR_OP_MATH + options.dropout = 0.; + options.printWeights = 0; + + return runRNNSample(options); + // return runRNNSample<__half>(options); +} + +int getRand(int min, int max) { return (rand() % (max - min)) + min + 1; } + +void genSeqs(int batch_size, int seq_length, bool random) { + std::vector temp(batch_size, seq_length); + + std::default_random_engine e; + e.seed(1234); + std::normal_distribution distribution(seq_length / 2, + seq_length / 8); + + for (int i = 1; i < batch_size; ++i) { + if (random) { + temp[i] = (int)distribution(e); + } else { + temp[i] = seq_length; + } + } + sort(temp.begin(), temp.end()); + reverse(temp.begin(), temp.end()); + seqs = temp; +} diff --git a/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/README.md b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/README.md new file mode 100644 index 000000000..f6a09ed22 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/README.md @@ -0,0 +1 @@ +[TBD] diff --git a/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/CMakeLists.txt b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/CMakeLists.txt new file mode 100644 index 000000000..08bb8909e --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/CMakeLists.txt @@ -0,0 +1,80 @@ +cmake_minimum_required(VERSION 3.18) +project(cute_stacked_lstm CXX C) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../cmake") +list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} + "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +find_package(CUDA QUIET REQUIRED) +find_package(CuDNN QUIET REQUIRED) + +cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") +message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +if(CUTLASS_NATIVE_CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) +else() + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) +endif() + +set(CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS} -w + ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS_DEBUG} + -w ${ARCH_FLAGS}) +set(CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS} + ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") +set(CMAKE_CXX_FLAGS_DEBUG + "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") +set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall + -Wno-sign-compare") +set(CMAKE_CXX_LINK_EXECUTABLE + 
"${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_DIRS}) + +include_directories("../../") # include cuDNN implementation for comparsion + +# FIXME(ying): this requires to build the main project first. +include_directories( + "../../../../build/third_party/cutlass/src/extern_cutlass/include") +include_directories( + "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") +include_directories("../../../../build/third_party/install/glog/include") +include_directories( + "../../../../build/third_party/gflags/src/extern_gflags-build/include") +include_directories("../../../../") + +link_directories("../../../../build/kaleido/core") +link_directories("../../../../build/kaleido/core/operators") +link_directories("../../../../build/kaleido/core/device") + +cuda_add_executable(dilated_lstm dilated_lstm.cu) + +target_link_libraries( + dilated_lstm + fill_op + print_op + expect_eq_op + fractaltensor_core + ${CUDA_LIBRARIES} + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARIES}) diff --git a/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/Makefile b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/Makefile new file mode 100644 index 000000000..40d48b8e0 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/Makefile @@ -0,0 +1,16 @@ +BENCH_NAME ?= dilated_lstm +BUILD_DIR := build +OUTPUT_FILE ?= ../../dilated_lstm_bench.tsv + +.PHONY: build bench clean + +build: + @mkdir -p build && cd build && cmake .. && make -j + +$(BUILD_DIR)/$(BENCH_NAME): build + +bench: $(BUILD_DIR)/$(BENCH_NAME) + @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) + +clean: + @rm -rf build diff --git a/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/dilated_lstm.cu b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/dilated_lstm.cu new file mode 100644 index 000000000..e07058cb6 --- /dev/null +++ b/artifacts/FractalTensor/benchmarks/rnn/fractaltensor/cute_dilated_lstm/dilated_lstm.cu @@ -0,0 +1,298 @@ +#include "cuDNN/utils.h" +#include "kaleido/core/cuda_allocator.h" +#include "kaleido/core/device/cuda_timer.h" +#include "kaleido/core/device/cuda_utils.h" +#include "kaleido/core/device/gpu_context.h" +#include "kaleido/core/device/kernels/cutlass_wmma.h" +#include "kaleido/core/device/kernels/fill.h" +#include "kaleido/core/device/kernels/lstm.h" +#include "kaleido/core/device/kernels/lstm/dilated_lstm/region1.h" +#include "kaleido/core/device/kernels/lstm/dilated_lstm/region2.h" +#include "kaleido/core/operators/fill_op.h" +#include "kaleido/core/operators/print_op.h" +#include "kaleido/core/place.h" +#include "kaleido/core/tensor.h" + +#include + +#include +#include +#include +#include +#include +#include + +using namespace kaleido::core; + +float bench_cudnn_lstm_half(int mini_batch, int hidden_size, int seq_length, + int num_layers, int input_size) { + RNNSampleOptions options; + + options.dataType = 0; // 1 for float, 0 for half + options.seqLength = seq_length; + options.numLayers = num_layers; + options.inputSize = input_size; + options.hiddenSize = hidden_size; + options.projSize = hidden_size; + options.miniBatch = mini_batch; + options.inputMode = 1; // CUDNN_LINEAR_INPUT + options.dirMode = 0; // CUDNN_UNIDIRECTIONAL + options.cellMode = 2; // CUDNN_LSTM + options.biasMode = 3; // CUDNN_RNN_DOUBLE_BIAS + options.algorithm = 0; // CUDNN_RNN_ALGO_STANDARD + + 
options.mathPrecision = 0; // 1 for float, 0 for half + + options.mathType = 1; // CUDNN_TENSOR_OP_MATH + options.dropout = 0.; + options.printWeights = 0; + + return runRNNSample<__half>(options); +} + +template , + typename ValueMnk = TileShape<1, 2, 1>> +float run_dilated_region1(const int depth, const int seq_length, + const int batch_size, const int hidden_size, + std::ofstream& fout) { + size_t numel = depth * 4 * hidden_size * hidden_size; + Element* dWs; + CudaCheck(cudaMalloc(&dWs, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(dWs, numel); + + numel = seq_length * batch_size * hidden_size; + Element* dXs; + CudaCheck(cudaMalloc(&dXs, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(dXs, numel); + + numel = depth * 4 * hidden_size * hidden_size; + Element* dUs; + CudaCheck(cudaMalloc(&dUs, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(dUs, numel); + + numel = depth * seq_length * batch_size * hidden_size; + Element* dCsss; + CudaCheck(cudaMalloc(&dCsss, numel * sizeof(Element))); + CudaCheck(cudaMemset(dCsss, 0, numel * sizeof(Element))); + + Element* dHsss; + CudaCheck(cudaMalloc(&dHsss, numel * sizeof(Element))); + CudaCheck(cudaMemset(dHsss, 0, numel * sizeof(Element))); + + numel = seq_length * batch_size * hidden_size; + Element* init; + CudaCheck(cudaMalloc(&init, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(init, numel); + + const int repeat = 10; + + float elapsed_time = 0.0f; + + for (auto i = 0; i < repeat; i++) { + elapsed_time += + cuda_kernel::DilatedLstmRegion1(dHsss, dCsss, dXs, dWs, + dUs, init, seq_length); + } + + CudaCheck(cudaFree(dWs)); + CudaCheck(cudaFree(dXs)); + CudaCheck(cudaFree(dUs)); + CudaCheck(cudaFree(dCsss)); + CudaCheck(cudaFree(dHsss)); + CudaCheck(cudaFree(init)); + + float avg_time = elapsed_time / repeat; + + return avg_time; +} + +template , + typename ValueMnk = TileShape<1, 2, 1>> +float run_dilated_lstm_cell(const int depth, const int seq_length, + const int batch_size, const int hidden_size, + int iter_count, std::ofstream& fout) { + size_t numel = depth * 4 * hidden_size * hidden_size; + Element* dWs; + CudaCheck(cudaMalloc(&dWs, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(dWs, numel); + + numel = seq_length * batch_size * hidden_size; + Element* dXs; + CudaCheck(cudaMalloc(&dXs, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(dXs, numel); + + numel = depth * 4 * hidden_size * hidden_size; + Element* dUs; + CudaCheck(cudaMalloc(&dUs, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(dUs, numel); + + numel = depth * seq_length * batch_size * hidden_size; + Element* dCsss; + CudaCheck(cudaMalloc(&dCsss, numel * sizeof(Element))); + CudaCheck(cudaMemset(dCsss, 0, numel * sizeof(Element))); + + Element* dHsss; + CudaCheck(cudaMalloc(&dHsss, numel * sizeof(Element))); + CudaCheck(cudaMemset(dHsss, 0, numel * sizeof(Element))); + + numel = seq_length * batch_size * hidden_size; + Element* init; + CudaCheck(cudaMalloc(&init, numel * sizeof(Element))); + cuda_kernel::FillRandomValue(init, numel); + + numel = 4 * 256 * batch_size * hidden_size; + Element* c_out; + CudaCheck(cudaMalloc(&c_out, numel * sizeof(Element))); + + Element* h_out; + CudaCheck(cudaMalloc(&h_out, numel * sizeof(Element))); + + Element* t; + CudaCheck(cudaMalloc(&t, numel * sizeof(Element))); + + cuda_kernel::CuteLSTMCell + lstm_cell; + float elapsed_time = 0.0f; + + for (int i = 0; i < iter_count; i++) { + elapsed_time += lstm_cell(dWs, dXs, dUs, dCsss, dHsss, c_out, h_out, t); + } + + 
CudaCheck(cudaFree(dWs)); + CudaCheck(cudaFree(dXs)); + CudaCheck(cudaFree(dUs)); + CudaCheck(cudaFree(dCsss)); + CudaCheck(cudaFree(dHsss)); + CudaCheck(cudaFree(init)); + CudaCheck(cudaFree(c_out)); + CudaCheck(cudaFree(h_out)); + CudaCheck(cudaFree(t)); + + return elapsed_time; +} + +template