From 3ac78fc34996a2f123827435316e42f71120c29a Mon Sep 17 00:00:00 2001 From: Alberto <103119829+itzpr3d4t0r@users.noreply.github.com> Date: Sun, 12 Nov 2023 09:05:09 +0100 Subject: [PATCH] AVX Surface.fill() setup, AVX BLEND_ADD (#2382) * optimize the BLEND_ADD flag when used in surface.fill through AVX2 --- buildconfig/Setup.Android.SDL2.in | 2 +- buildconfig/Setup.Emscripten.SDL2.in | 2 +- buildconfig/Setup.SDL2.in | 2 +- setup.py | 2 +- src_c/simd_fill.h | 13 +++ src_c/simd_surface_fill_avx2.c | 121 +++++++++++++++++++++++++++ src_c/surface_fill.c | 20 ++++- 7 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 src_c/simd_fill.h create mode 100644 src_c/simd_surface_fill_avx2.c diff --git a/buildconfig/Setup.Android.SDL2.in b/buildconfig/Setup.Android.SDL2.in index 7cfbc89f9d..2ee11fad3d 100644 --- a/buildconfig/Setup.Android.SDL2.in +++ b/buildconfig/Setup.Android.SDL2.in @@ -50,7 +50,7 @@ key src_c/key.c $(SDL) $(DEBUG) mouse src_c/mouse.c $(SDL) $(DEBUG) rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG) rwobject src_c/rwobject.c $(SDL) $(DEBUG) -surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG) +surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) surflock src_c/surflock.c $(SDL) $(DEBUG) time src_c/time.c $(SDL) $(DEBUG) joystick src_c/joystick.c $(SDL) $(DEBUG) diff --git a/buildconfig/Setup.Emscripten.SDL2.in b/buildconfig/Setup.Emscripten.SDL2.in index cea1aefa16..429d5c51d0 100644 --- a/buildconfig/Setup.Emscripten.SDL2.in +++ b/buildconfig/Setup.Emscripten.SDL2.in @@ -69,7 +69,7 @@ _sdl2.controller_old src_c/void.c #_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c _sdl2.touch src_c/void.c -#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64 +#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64 transform src_c/void.c diff --git a/buildconfig/Setup.SDL2.in b/buildconfig/Setup.SDL2.in index fafbd9c8fb..f6ca951259 100644 --- a/buildconfig/Setup.SDL2.in +++ b/buildconfig/Setup.SDL2.in @@ -60,7 +60,7 @@ key src_c/key.c $(SDL) $(DEBUG) mouse src_c/mouse.c $(SDL) $(DEBUG) rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG) rwobject src_c/rwobject.c $(SDL) $(DEBUG) -surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG) +surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) surflock src_c/surflock.c $(SDL) $(DEBUG) time src_c/time.c $(SDL) $(DEBUG) joystick src_c/joystick.c $(SDL) $(DEBUG) diff --git a/setup.py b/setup.py index 341ff9ccfe..e87beb2f99 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,7 @@ import distutils.ccompiler -avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2'] +avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2', 'simd_surface_fill_avx2'] compiler_options = { 'unix': ('-mavx2',), diff --git a/src_c/simd_fill.h b/src_c/simd_fill.h new file mode 100644 index 0000000000..05c30d095a --- /dev/null +++ b/src_c/simd_fill.h @@ -0,0 +1,13 @@ +#define NO_PYGAME_C_API +#include "_surface.h" + +int +_pg_has_avx2(); + +// AVX2 functions +int +surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color); +int +surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color); diff --git a/src_c/simd_surface_fill_avx2.c b/src_c/simd_surface_fill_avx2.c new file mode 100644 index 0000000000..101a20a2a0 --- /dev/null +++ b/src_c/simd_surface_fill_avx2.c @@ -0,0 +1,121 @@ +#include "simd_fill.h" + +#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) +#include +#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */ + +#define BAD_AVX2_FUNCTION_CALL \ + printf( \ + "Fatal Error: Attempted calling an AVX2 function when both compile " \ + "time and runtime support is missing. If you are seeing this " \ + "message, you have stumbled across a pygame bug, please report it " \ + "to the devs!"); \ + PG_EXIT(1) + +/* helper function that does a runtime check for AVX2. It has the added + * functionality of also returning 0 if compile time support is missing */ +int +_pg_has_avx2() +{ +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) + return SDL_HasAVX2(); +#else + return 0; +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ +} + +#define SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \ + /* initialize surface data */ \ + int width = rect->w, height = rect->h; \ + int skip = surface->pitch / 4 - width; \ + /* indicates the number of pixels that can't be processed in 8-pixel \ + * blocks */ \ + int pxl_excess = width % 8; \ + /* indicates the number of 8-pixel blocks that can be processed */ \ + int n_iters_8 = width / 8; \ + int i; \ + /* load pixel data */ \ + Uint32 *pixels = \ + (Uint32 *)surface->pixels + rect->y * (surface->pitch / 4) + rect->x; \ + \ + __m256i mm256_dst; \ + __m256i mask = \ + _mm256_set_epi32(0, pxl_excess > 6 ? -1 : 0, pxl_excess > 5 ? -1 : 0, \ + pxl_excess > 4 ? -1 : 0, pxl_excess > 3 ? -1 : 0, \ + pxl_excess > 2 ? -1 : 0, pxl_excess > 1 ? -1 : 0, \ + pxl_excess > 0 ? -1 : 0); \ + /* prep and load the color */ \ + Uint32 amask = surface->format->Amask; \ + if (amask) { \ + { \ + COLOR_PROCESS_CODE \ + } \ + } \ + __m256i mm256_color = _mm256_set1_epi32(color); + +#define RUN_AVX2_FILLER(FILL_CODE) \ + while (height--) { \ + for (i = 0; i < n_iters_8; i++) { \ + /* load 8 pixels */ \ + mm256_dst = _mm256_loadu_si256((__m256i *)pixels); \ + \ + {FILL_CODE} \ + \ + /* store 8 pixels */ \ + _mm256_storeu_si256((__m256i *)pixels, mm256_dst); \ + \ + pixels += 8; \ + } \ + \ + if (pxl_excess) { \ + /* load up to 7 pixels */ \ + mm256_dst = _mm256_maskload_epi32((int *)pixels, mask); \ + \ + {FILL_CODE} \ + \ + /* store up to 7 pixels */ \ + _mm256_maskstore_epi32((int *)pixels, mask, mm256_dst); \ + \ + pixels += pxl_excess; \ + } \ + \ + pixels += skip; \ + } + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +int +surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color) +{ + SETUP_AVX2_FILLER({ color &= ~amask; }) + RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); }); + return 0; +} + +int +surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color) +{ + SETUP_AVX2_FILLER({}) + RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); }); + return 0; +} +#else +int +surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color) +{ + BAD_AVX2_FUNCTION_CALL; + return -1; +} + +int +surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color) +{ + BAD_AVX2_FUNCTION_CALL; + return -1; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ diff --git a/src_c/surface_fill.c b/src_c/surface_fill.c index 3c28fbbe66..a5fb5adf0b 100644 --- a/src_c/surface_fill.c +++ b/src_c/surface_fill.c @@ -18,7 +18,8 @@ */ #define NO_PYGAME_C_API -#include "_surface.h" + +#include "simd_fill.h" /* * Changes SDL_Rect to respect any clipping rect defined on the surface. @@ -866,6 +867,14 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color, switch (blendargs) { case PYGAME_BLEND_ADD: { +#if !defined(__EMSCRIPTEN__) +#if SDL_BYTEORDER == SDL_LIL_ENDIAN + if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) { + result = surface_fill_blend_add_avx2(surface, rect, color); + break; + } +#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ +#endif /* __EMSCRIPTEN__ */ result = surface_fill_blend_add(surface, rect, color); break; } @@ -887,6 +896,15 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color, } case PYGAME_BLEND_RGBA_ADD: { +#if !defined(__EMSCRIPTEN__) +#if SDL_BYTEORDER == SDL_LIL_ENDIAN + if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) { + result = + surface_fill_blend_rgba_add_avx2(surface, rect, color); + break; + } +#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ +#endif /* __EMSCRIPTEN__ */ result = surface_fill_blend_rgba_add(surface, rect, color); break; }