Skip to content

Commit

Permalink
AVX Surface.fill() setup, AVX BLEND_ADD (#2382)
Browse files Browse the repository at this point in the history
* optimize the BLEND_ADD flag when used in surface.fill through AVX2
  • Loading branch information
itzpr3d4t0r authored Nov 12, 2023
1 parent 12be64b commit 3ac78fc
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 5 deletions.
2 changes: 1 addition & 1 deletion buildconfig/Setup.Android.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion buildconfig/Setup.Emscripten.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ _sdl2.controller_old src_c/void.c
#_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c
_sdl2.touch src_c/void.c

#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
transform src_c/void.c


2 changes: 1 addition & 1 deletion buildconfig/Setup.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@

import distutils.ccompiler

avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2']
avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2', 'simd_surface_fill_avx2']

compiler_options = {
'unix': ('-mavx2',),
Expand Down
13 changes: 13 additions & 0 deletions src_c/simd_fill.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#define NO_PYGAME_C_API
#include "_surface.h"

int
_pg_has_avx2();

// AVX2 functions
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
121 changes: 121 additions & 0 deletions src_c/simd_surface_fill_avx2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#include "simd_fill.h"

#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H)
#include <immintrin.h>
#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */

#define BAD_AVX2_FUNCTION_CALL \
printf( \
"Fatal Error: Attempted calling an AVX2 function when both compile " \
"time and runtime support is missing. If you are seeing this " \
"message, you have stumbled across a pygame bug, please report it " \
"to the devs!"); \
PG_EXIT(1)

/* helper function that does a runtime check for AVX2. It has the added
* functionality of also returning 0 if compile time support is missing */
int
_pg_has_avx2()
{
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
return SDL_HasAVX2();
#else
return 0;
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
}

#define SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \
/* initialize surface data */ \
int width = rect->w, height = rect->h; \
int skip = surface->pitch / 4 - width; \
/* indicates the number of pixels that can't be processed in 8-pixel \
* blocks */ \
int pxl_excess = width % 8; \
/* indicates the number of 8-pixel blocks that can be processed */ \
int n_iters_8 = width / 8; \
int i; \
/* load pixel data */ \
Uint32 *pixels = \
(Uint32 *)surface->pixels + rect->y * (surface->pitch / 4) + rect->x; \
\
__m256i mm256_dst; \
__m256i mask = \
_mm256_set_epi32(0, pxl_excess > 6 ? -1 : 0, pxl_excess > 5 ? -1 : 0, \
pxl_excess > 4 ? -1 : 0, pxl_excess > 3 ? -1 : 0, \
pxl_excess > 2 ? -1 : 0, pxl_excess > 1 ? -1 : 0, \
pxl_excess > 0 ? -1 : 0); \
/* prep and load the color */ \
Uint32 amask = surface->format->Amask; \
if (amask) { \
{ \
COLOR_PROCESS_CODE \
} \
} \
__m256i mm256_color = _mm256_set1_epi32(color);

#define RUN_AVX2_FILLER(FILL_CODE) \
while (height--) { \
for (i = 0; i < n_iters_8; i++) { \
/* load 8 pixels */ \
mm256_dst = _mm256_loadu_si256((__m256i *)pixels); \
\
{FILL_CODE} \
\
/* store 8 pixels */ \
_mm256_storeu_si256((__m256i *)pixels, mm256_dst); \
\
pixels += 8; \
} \
\
if (pxl_excess) { \
/* load up to 7 pixels */ \
mm256_dst = _mm256_maskload_epi32((int *)pixels, mask); \
\
{FILL_CODE} \
\
/* store up to 7 pixels */ \
_mm256_maskstore_epi32((int *)pixels, mask, mm256_dst); \
\
pixels += pxl_excess; \
} \
\
pixels += skip; \
}

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
{
SETUP_AVX2_FILLER({ color &= ~amask; })
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
return 0;
}

int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color)
{
SETUP_AVX2_FILLER({})
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
return 0;
}
#else
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
{
BAD_AVX2_FUNCTION_CALL;
return -1;
}

int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color)
{
BAD_AVX2_FUNCTION_CALL;
return -1;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
20 changes: 19 additions & 1 deletion src_c/surface_fill.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
*/

#define NO_PYGAME_C_API
#include "_surface.h"

#include "simd_fill.h"

/*
* Changes SDL_Rect to respect any clipping rect defined on the surface.
Expand Down Expand Up @@ -866,6 +867,14 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,

switch (blendargs) {
case PYGAME_BLEND_ADD: {
#if !defined(__EMSCRIPTEN__)
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
result = surface_fill_blend_add_avx2(surface, rect, color);
break;
}
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
#endif /* __EMSCRIPTEN__ */
result = surface_fill_blend_add(surface, rect, color);
break;
}
Expand All @@ -887,6 +896,15 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
}

case PYGAME_BLEND_RGBA_ADD: {
#if !defined(__EMSCRIPTEN__)
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
result =
surface_fill_blend_rgba_add_avx2(surface, rect, color);
break;
}
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
#endif /* __EMSCRIPTEN__ */
result = surface_fill_blend_rgba_add(surface, rect, color);
break;
}
Expand Down

0 comments on commit 3ac78fc

Please sign in to comment.