Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX Surface.fill() setup, AVX BLEND_ADD #2382

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion buildconfig/Setup.Android.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion buildconfig/Setup.Emscripten.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ _sdl2.controller_old src_c/void.c
#_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c
_sdl2.touch src_c/void.c

#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
transform src_c/void.c


2 changes: 1 addition & 1 deletion buildconfig/Setup.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@

import distutils.ccompiler

avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2']
avx2_filenames = ['simd_blitters_avx2', 'simd_transform_avx2', 'simd_surface_fill_avx2']

compiler_options = {
'unix': ('-mavx2',),
Expand Down
13 changes: 13 additions & 0 deletions src_c/simd_fill.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#define NO_PYGAME_C_API
#include "_surface.h"

int
_pg_has_avx2();

// AVX2 functions
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
121 changes: 121 additions & 0 deletions src_c/simd_surface_fill_avx2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#include "simd_fill.h"

#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H)
#include <immintrin.h>
#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */

#define BAD_AVX2_FUNCTION_CALL \
printf( \
"Fatal Error: Attempted calling an AVX2 function when both compile " \
"time and runtime support is missing. If you are seeing this " \
"message, you have stumbled across a pygame bug, please report it " \
"to the devs!"); \
PG_EXIT(1)

/* helper function that does a runtime check for AVX2. It has the added
* functionality of also returning 0 if compile time support is missing */
int
_pg_has_avx2()
{
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
return SDL_HasAVX2();
#else
return 0;
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
}

#define SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \
/* initialize surface data */ \
int width = rect->w, height = rect->h; \
int skip = surface->pitch / 4 - width; \
/* indicates the number of pixels that can't be processed in 8-pixel \
* blocks */ \
int pxl_excess = width % 8; \
/* indicates the number of 8-pixel blocks that can be processed */ \
int n_iters_8 = width / 8; \
int i; \
/* load pixel data */ \
Uint32 *pixels = \
(Uint32 *)surface->pixels + rect->y * (surface->pitch / 4) + rect->x; \
\
__m256i mm256_dst; \
__m256i mask = \
_mm256_set_epi32(0, pxl_excess > 6 ? -1 : 0, pxl_excess > 5 ? -1 : 0, \
pxl_excess > 4 ? -1 : 0, pxl_excess > 3 ? -1 : 0, \
pxl_excess > 2 ? -1 : 0, pxl_excess > 1 ? -1 : 0, \
pxl_excess > 0 ? -1 : 0); \
/* prep and load the color */ \
Uint32 amask = surface->format->Amask; \
if (amask) { \
{ \
COLOR_PROCESS_CODE \
} \
} \
__m256i mm256_color = _mm256_set1_epi32(color);

#define RUN_AVX2_FILLER(FILL_CODE) \
while (height--) { \
for (i = 0; i < n_iters_8; i++) { \
/* load 8 pixels */ \
mm256_dst = _mm256_loadu_si256((__m256i *)pixels); \
\
{FILL_CODE} \
\
/* store 8 pixels */ \
_mm256_storeu_si256((__m256i *)pixels, mm256_dst); \
\
pixels += 8; \
} \
\
if (pxl_excess) { \
/* load up to 7 pixels */ \
mm256_dst = _mm256_maskload_epi32((int *)pixels, mask); \
\
{FILL_CODE} \
\
/* store up to 7 pixels */ \
_mm256_maskstore_epi32((int *)pixels, mask, mm256_dst); \
\
pixels += pxl_excess; \
} \
\
pixels += skip; \
}

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
{
SETUP_AVX2_FILLER({ color &= ~amask; })
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
return 0;
}

int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color)
{
SETUP_AVX2_FILLER({})
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
return 0;
}
#else
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
{
BAD_AVX2_FUNCTION_CALL;
return -1;
}

int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color)
{
BAD_AVX2_FUNCTION_CALL;
return -1;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
20 changes: 19 additions & 1 deletion src_c/surface_fill.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
*/

#define NO_PYGAME_C_API
#include "_surface.h"

#include "simd_fill.h"

/*
* Changes SDL_Rect to respect any clipping rect defined on the surface.
Expand Down Expand Up @@ -866,6 +867,14 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,

switch (blendargs) {
case PYGAME_BLEND_ADD: {
#if !defined(__EMSCRIPTEN__)
dr0id marked this conversation as resolved.
Show resolved Hide resolved
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
result = surface_fill_blend_add_avx2(surface, rect, color);
break;
}
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
#endif /* __EMSCRIPTEN__ */
result = surface_fill_blend_add(surface, rect, color);
break;
}
Expand All @@ -887,6 +896,15 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
}

case PYGAME_BLEND_RGBA_ADD: {
#if !defined(__EMSCRIPTEN__)
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
result =
surface_fill_blend_rgba_add_avx2(surface, rect, color);
break;
}
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
#endif /* __EMSCRIPTEN__ */
result = surface_fill_blend_rgba_add(surface, rect, color);
break;
}
Expand Down
Loading