Skip to content

Commit 3ac78fc

Browse files
authored
AVX Surface.fill() setup, AVX BLEND_ADD (#2382)
* optimize the BLEND_ADD flag when used in surface.fill through AVX2
1 parent 12be64b commit 3ac78fc

File tree

7 files changed

+157
-5
lines changed

7 files changed

+157
-5
lines changed

buildconfig/Setup.Android.SDL2.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ key src_c/key.c $(SDL) $(DEBUG)
5050
mouse src_c/mouse.c $(SDL) $(DEBUG)
5151
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
5252
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
53-
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
53+
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
5454
surflock src_c/surflock.c $(SDL) $(DEBUG)
5555
time src_c/time.c $(SDL) $(DEBUG)
5656
joystick src_c/joystick.c $(SDL) $(DEBUG)

buildconfig/Setup.Emscripten.SDL2.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ _sdl2.controller_old src_c/void.c
6969
#_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c
7070
_sdl2.touch src_c/void.c
7171

72-
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
72+
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
7373
transform src_c/void.c
7474

7575

buildconfig/Setup.SDL2.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ key src_c/key.c $(SDL) $(DEBUG)
6060
mouse src_c/mouse.c $(SDL) $(DEBUG)
6161
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
6262
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
63-
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
63+
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
6464
surflock src_c/surflock.c $(SDL) $(DEBUG)
6565
time src_c/time.c $(SDL) $(DEBUG)
6666
joystick src_c/joystick.c $(SDL) $(DEBUG)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272

7373
import distutils.ccompiler
7474

75-
avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2']
75+
avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2', 'simd_surface_fill_avx2']
7676

7777
compiler_options = {
7878
'unix': ('-mavx2',),

src_c/simd_fill.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#ifndef PG_SIMD_FILL_H
#define PG_SIMD_FILL_H

/* Declarations for the AVX2 accelerated Surface.fill() blend routines
 * implemented in simd_surface_fill_avx2.c. The include guard makes it safe
 * to pull this header in more than once per translation unit. */
#define NO_PYGAME_C_API
#include "_surface.h"

/* Returns nonzero when the AVX2 fill routines may be called: the result of
 * SDL_HasAVX2() when AVX2 support was compiled in, and 0 otherwise. */
int
_pg_has_avx2(void);

// AVX2 functions

/* Saturating per-byte add of `color` to the pixels of `rect`; alpha bits
 * are removed from `color` first when the surface format has an alpha
 * mask. Returns 0 on AVX2 builds. */
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
                            Uint32 color);

/* Saturating per-byte add of `color` (all four bytes, unmodified) to the
 * pixels of `rect`. Returns 0 on AVX2 builds. */
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
                                 Uint32 color);

#endif /* PG_SIMD_FILL_H */

src_c/simd_surface_fill_avx2.c

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#include "simd_fill.h"
2+
3+
#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H)
4+
#include <immintrin.h>
5+
#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */
6+
7+
/* Prints a diagnostic and terminates via PG_EXIT(1). Used by the stub
 * implementations that exist only so the symbols link when AVX2 support is
 * not compiled in.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement; without the wrapper, `if (cond) BAD_AVX2_FUNCTION_CALL;` would
 * make only the printf conditional and always run PG_EXIT. Callers supply
 * the trailing semicolon. */
#define BAD_AVX2_FUNCTION_CALL                                                \
    do {                                                                      \
        printf(                                                               \
            "Fatal Error: Attempted calling an AVX2 function when both compile " \
            "time and runtime support is missing. If you are seeing this "    \
            "message, you have stumbled across a pygame bug, please report it " \
            "to the devs!");                                                  \
        PG_EXIT(1);                                                           \
    } while (0)
14+
15+
/* Helper that reports whether the AVX2 fill routines can be used.
 *
 * When this file is compiled with AVX2 enabled (__AVX2__) and immintrin.h is
 * available and not disabled, the answer is the runtime check
 * SDL_HasAVX2(). When compile time support is missing it always returns 0,
 * so callers can gate on this single function.
 *
 * Declared with (void) so the definition is a proper prototype rather than
 * an old-style empty parameter list. */
int
_pg_has_avx2(void)
{
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
    !defined(SDL_DISABLE_IMMINTRIN_H)
    return SDL_HasAVX2();
#else
    return 0;
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
          !defined(SDL_DISABLE_IMMINTRIN_H) */
}
28+
29+
/* Declares and initializes the locals used by RUN_AVX2_FILLER. Must be
 * expanded inside a function that has `surface` (SDL_Surface *), `rect`
 * (SDL_Rect *) and `color` (Uint32) in scope; pixels are addressed as
 * Uint32, i.e. 4 bytes each.
 *
 * COLOR_PROCESS_CODE is executed only when the surface format has a nonzero
 * alpha mask. It may read `amask` and modify `color` before `color` is
 * broadcast into mm256_color. */
#define SETUP_AVX2_FILLER(COLOR_PROCESS_CODE)                                 \
    /* size of the area to fill, in pixels */                                 \
    int width = rect->w, height = rect->h;                                    \
    /* pixels between the end of one rect row and the start of the next */    \
    int skip = surface->pitch / 4 - width;                                    \
    /* leftover pixels per row that do not fill a whole 8-pixel block */      \
    int pxl_excess = width % 8;                                               \
    /* number of whole 8-pixel blocks per row */                              \
    int n_iters_8 = width / 8;                                                \
    int i;                                                                    \
    /* address of the first pixel inside the rect */                          \
    Uint32 *pixels = (Uint32 *)surface->pixels +                              \
                     rect->y * (surface->pitch / 4) + rect->x;                \
                                                                              \
    __m256i mm256_dst;                                                        \
    /* lane k (counting from the lowest) is enabled when k < pxl_excess;      \
     * the highest lane is never enabled because pxl_excess is at most 7 */   \
    __m256i mask = _mm256_set_epi32(0,                                        \
                                    pxl_excess >= 7 ? -1 : 0,                 \
                                    pxl_excess >= 6 ? -1 : 0,                 \
                                    pxl_excess >= 5 ? -1 : 0,                 \
                                    pxl_excess >= 4 ? -1 : 0,                 \
                                    pxl_excess >= 3 ? -1 : 0,                 \
                                    pxl_excess >= 2 ? -1 : 0,                 \
                                    pxl_excess >= 1 ? -1 : 0);                \
    /* let the caller adjust the color, then splat it over all 8 lanes */     \
    Uint32 amask = surface->format->Amask;                                    \
    if (amask != 0) {                                                         \
        COLOR_PROCESS_CODE                                                    \
    }                                                                         \
    __m256i mm256_color = _mm256_set1_epi32(color);
57+
58+
/* Walks the rect row by row using the locals created by SETUP_AVX2_FILLER.
 * For each group of pixels it loads them into mm256_dst, runs FILL_CODE
 * (which is expected to update mm256_dst, typically from mm256_color), and
 * writes mm256_dst back. */
#define RUN_AVX2_FILLER(FILL_CODE)                                            \
    while (height--) {                                                        \
        /* whole 8-pixel blocks: unaligned load, modify, unaligned store */   \
        for (i = 0; i < n_iters_8; i++) {                                     \
            mm256_dst = _mm256_loadu_si256((__m256i *)pixels);                \
            {FILL_CODE}                                                       \
            _mm256_storeu_si256((__m256i *)pixels, mm256_dst);                \
            pixels += 8;                                                      \
        }                                                                     \
                                                                              \
        /* remaining 1..7 pixels: only the lanes enabled in mask are read     \
         * and written */                                                     \
        if (pxl_excess != 0) {                                                \
            mm256_dst = _mm256_maskload_epi32((int *)pixels, mask);           \
            {FILL_CODE}                                                       \
            _mm256_maskstore_epi32((int *)pixels, mask, mm256_dst);           \
            pixels += pxl_excess;                                             \
        }                                                                     \
                                                                              \
        /* advance to the first rect pixel of the next row */                 \
        pixels += skip;                                                       \
    }
86+
87+
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
88+
!defined(SDL_DISABLE_IMMINTRIN_H)
89+
int
90+
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
91+
{
92+
SETUP_AVX2_FILLER({ color &= ~amask; })
93+
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
94+
return 0;
95+
}
96+
97+
int
98+
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
99+
Uint32 color)
100+
{
101+
SETUP_AVX2_FILLER({})
102+
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
103+
return 0;
104+
}
105+
#else
106+
int
107+
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
108+
{
109+
BAD_AVX2_FUNCTION_CALL;
110+
return -1;
111+
}
112+
113+
int
114+
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
115+
Uint32 color)
116+
{
117+
BAD_AVX2_FUNCTION_CALL;
118+
return -1;
119+
}
120+
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
121+
!defined(SDL_DISABLE_IMMINTRIN_H) */

src_c/surface_fill.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
*/
1919

2020
#define NO_PYGAME_C_API
21-
#include "_surface.h"
21+
22+
#include "simd_fill.h"
2223

2324
/*
2425
* Changes SDL_Rect to respect any clipping rect defined on the surface.
@@ -866,6 +867,14 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
866867

867868
switch (blendargs) {
868869
case PYGAME_BLEND_ADD: {
870+
#if !defined(__EMSCRIPTEN__)
871+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
872+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
873+
result = surface_fill_blend_add_avx2(surface, rect, color);
874+
break;
875+
}
876+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
877+
#endif /* __EMSCRIPTEN__ */
869878
result = surface_fill_blend_add(surface, rect, color);
870879
break;
871880
}
@@ -887,6 +896,15 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
887896
}
888897

889898
case PYGAME_BLEND_RGBA_ADD: {
899+
#if !defined(__EMSCRIPTEN__)
900+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
901+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
902+
result =
903+
surface_fill_blend_rgba_add_avx2(surface, rect, color);
904+
break;
905+
}
906+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
907+
#endif /* __EMSCRIPTEN__ */
890908
result = surface_fill_blend_rgba_add(surface, rect, color);
891909
break;
892910
}

0 commit comments

Comments
 (0)