Testcases: Down adjust L2/L3 cache size for GPU targets

halide · Sep 6, 2024 · de0a195 · de0a195
1 parent 06a6219
commit de0a195
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 4 deletions.
diff --git a/apps/bilateral_grid/CMakeLists.txt b/apps/bilateral_grid/CMakeLists.txt
@@ -25,7 +25,11 @@ add_halide_library(bilateral_grid_auto_schedule FROM bilateral_grid.generator
                    GENERATOR bilateral_grid
                    STMT bilateral_grid_auto_schedule_STMT
                    SCHEDULE bilateral_grid_auto_schedule_SCHEDULE
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block to avoid gpu kernel launch failure.
+                   PARAMS autoscheduler.last_level_cache_size=20000
+                   )
 
 # Main executable
 add_executable(bilateral_grid_process filter.cpp)

diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
@@ -20,7 +20,11 @@ add_halide_generator(local_laplacian.generator
 add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    GENERATOR local_laplacian
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block to avoid gpu kernel launch failure.
+                   PARAMS autoscheduler.last_level_cache_size=30000
+                   )
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)

diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
@@ -18,7 +18,11 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block to avoid gpu kernel launch failure.
+                   PARAMS autoscheduler.last_level_cache_size=15000
+                   )
 
 # Main executable
 add_executable(stencil_chain_process process.cpp)

diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -42,7 +42,7 @@ struct ArchParams {
      * CACHE_SIZE to 48 KB.
      */
     constexpr ArchParams(bool has_gpu_feature)
-        : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 48 * 1024 : 16 * 1024 * 1024),
+        : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 35 * 1024 : 16 * 1024 * 1024),
           balance(has_gpu_feature ? 20 : 40) {
     }
 };