diff --git a/docs/reST/ref/transform.rst b/docs/reST/ref/transform.rst index 9823f42bf1..81002f8c90 100644 --- a/docs/reST/ref/transform.rst +++ b/docs/reST/ref/transform.rst @@ -334,6 +334,9 @@ Instead, always begin with the original image and scale to the desired size.) .. versionadded:: 2.1.4 + .. versionchanged:: 2.4.0 Adjusted formula slightly to support performance optimisation. It may return very slightly + different pixels than before, but should run seven to eleven times faster on most systems. + .. ## pygame.transform.grayscale ## .. function:: threshold diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h index 4f25905873..2d4951e488 100644 --- a/src_c/simd_transform.h +++ b/src_c/simd_transform.h @@ -1,6 +1,21 @@ #define NO_PYGAME_C_API #include "_surface.h" +/** + * MACRO borrowed from SSE2NEON - useful for making the shuffling family of + * intrinsics easier to understand by indicating clearly what will go where. + * + * SSE2Neon description follows... + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be placed in fp1 of result. + * fp0 is the same for fp0 of result. 
+ */ +#define _PG_SIMD_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + #if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__) // arm64 has neon optimisations enabled by default, even when fpu=neon is not // passed @@ -10,6 +25,8 @@ // SSE2 functions #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +void +grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf); // smoothscale filters void filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, @@ -27,3 +44,5 @@ filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */ // AVX2 functions +void +grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf); diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index f1889f1bb7..5be6f9863f 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -42,3 +42,183 @@ pg_avx2_at_runtime_but_uncompiled() } return 0; } + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +void +grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + /* See the SSE2 code for a simpler overview of this algorithm + * Current AVX2 process + * ------------------ + * - pre loop: Load weights into register x8 + * - in loop: + * 1. Load 8 pixels into register + * 2. remove the alpha channel for every pixel and save it. + * 3. multiply weights by pixels using standard shuffle to 2x 16bit + * register, mul + 255 then left shift. See multiply blitter mode + * for this operation in isolation. + * 4. pack pixels back together from A & B while adding with a + * horizontal add (e.g. adds A+R and G+B in a ARGB layout) + * 5. shift and add to make final grey pixel colour in 0th + * 8Bit channel in each 'pixel' + * 6. shuffle again to push the grey from the 0th channel into every + * channel of every pixel. + * 7. add the alpha channel back in. 
+ */ + int s_row_skip = (src->pitch - src->w * 4) / 4; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + + int remaining_pixels = pixel_batch_length % 8; + int perfect_8_pixels = pixel_batch_length / 8; + + int perfect_8_pixels_batch_counter = perfect_8_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 amask = src->format->Amask; + Uint32 rgbmask = ~amask; + + int rgb_weights = + ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) | + (0x1D << src->format->Bshift)); + + __m256i *srcp256 = (__m256i *)src->pixels; + __m256i *dstp256 = (__m256i *)newsurf->pixels; + + __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA, + mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B, + mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray, + mm256_alpha, mm256_rgb_mask, mm256_alpha_mask, + mm256_shuffled_weights_A, mm256_shuffled_weights_B; + + mm256_shuff_mask_A = + _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80, + 18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5, + 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0); + mm256_shuff_mask_B = + _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80, + 26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13, + 0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8); + + mm256_shuff_mask_gray = _mm256_set_epi8( + 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, + 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + + mm256_two_five_fives = _mm256_set1_epi16(0x00FF); + mm256_rgb_weights = _mm256_set1_epi32(rgb_weights); + mm256_rgb_mask = _mm256_set1_epi32(rgbmask); + mm256_alpha_mask = _mm256_set1_epi32(amask); + + mm256_shuffled_weights_A = + _mm256_shuffle_epi8(mm256_rgb_weights, 
mm256_shuff_mask_A); + mm256_shuffled_weights_B = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B); + + __m256i _partial8_mask = _mm256_set_epi32( + 0, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0, + (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0, + (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0, + (remaining_pixels > 0) ? -1 : 0); + + while (num_batches--) { + perfect_8_pixels_batch_counter = perfect_8_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_8_pixels_batch_counter--) { + mm256_src = _mm256_loadu_si256(srcp256); + // strip out the alpha and store it + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + + // shuffle out the 8 pixels into two spaced out registers + // there are four pixels in each register with 16bits of room + // per channel. This gives us bit space for multiplication. + mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); + mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); + + // Do the 'percentage multiplications' with the weights + // with accuracy correction so values like 255 * '255' + // (here effectively 1.0) = 255 and not 254. + // For our greyscale this should mean 255 white stays 255 white + // after greyscaling. + mm256_dstA = + _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A); + mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives); + mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8); + + mm256_dstB = + _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B); + mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives); + mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8); + + // Add up weighted R+G+B into the first channel of each of the 8 + // pixels. This is the grey value we want in all our colour + // channels. 
+ mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB); + mm256_dst = + _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); + // Shuffle the grey value from the first channel of each pixel + // into every channel of each pixel + mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); + + // Add the alpha back + mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask); + mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); + + _mm256_storeu_si256(dstp256, mm256_dst); + + srcp256++; + dstp256++; + } + srcp = (Uint32 *)srcp256; + dstp = (Uint32 *)dstp256; + if (remaining_pixels_batch_counter > 0) { + mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + + mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); + mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); + + mm256_dstA = + _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A); + mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives); + mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8); + + mm256_dstB = + _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B); + mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives); + mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8); + + mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB); + mm256_dst = + _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); + mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); + + mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask); + mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); + + _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); + + srcp += remaining_pixels_batch_counter; + dstp += remaining_pixels_batch_counter; + } + srcp += s_row_skip; + srcp256 = (__m256i *)srcp; + } +} +#else +void +grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + BAD_AVX2_FUNCTION_CALL; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + 
!defined(SDL_DISABLE_IMMINTRIN_H) */ diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index e8f8b76696..35689ac72d 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -43,6 +43,15 @@ pg_neon_at_runtime_but_uncompiled() #define _pg_storeu_si32(p, a) (void)(*(int *)(p) = _mm_cvtsi128_si32((a))) #define _pg_storeu_si64(p, a) (_mm_storel_epi64((__m128i *)(p), (a))) +#if defined(ENV64BIT) +#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) +#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(reg) +#else +#define LOAD_64_INTO_M128(num, reg) \ + *reg = _mm_loadl_epi64((const __m128i *)num) +#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, reg) +#endif + void filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth) @@ -413,4 +422,192 @@ filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, } } +void +grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) +{ + /* For the SSE2 SIMD version of grayscale we do one pixel at a time + * Thus we can calculate the number of loops (and pixels) by multiplying + * the width of the surface to be grayscaled, by the height of that + * surface. + * + * We also need to calculate a 'skip value' in case our surface's rows are + * not contiguous in memory. For surfaces, a single row's worth of pixel + * data is always contiguous (i.e. each pixel is next to each other). + * However, a surface's rows may be separated from one another in memory, + * most commonly this happens with sub surfaces. + * The vast majority of surfaces used in applications will probably also + * have contiguous rows as that is what happens when you create a standard + * 32bit surface with pygame.Surface. SIMD Transform algorithms, + * should treat this 'most normal' case as the critical path to maximise + * performance. 
+ */ + int s_row_skip = (src->pitch - src->w * 4) / 4; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + int remaining_pixels = pixel_batch_length % 2; + int perfect_2_pixels = pixel_batch_length / 2; + + int perfect_2_pixels_batch_counter = perfect_2_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint64 amask64 = ((Uint64)src->format->Amask) | src->format->Amask; + Uint64 rgbmask64 = ~amask64; + + Uint64 rgb_weights = + ((Uint64)((0x4C << src->format->Rshift) | + (0x96 << src->format->Gshift) | + (0x1D << src->format->Bshift)) + << 32) | + ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) | + (0x1D << src->format->Bshift)); + + Uint64 *srcp64 = (Uint64 *)src->pixels; + Uint64 *dstp64 = (Uint64 *)newsurf->pixels; + + __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives, + mm_rgb_weights, mm_alpha_mask, mm_rgb_mask; + + mm_zero = _mm_setzero_si128(); + LOAD_64_INTO_M128(&amask64, &mm_alpha_mask); + LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask); + mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF); + + LOAD_64_INTO_M128(&rgb_weights, &mm_rgb_weights); + mm_rgb_weights = _mm_unpacklo_epi8(mm_rgb_weights, mm_zero); + + while (num_batches--) { + perfect_2_pixels_batch_counter = perfect_2_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_2_pixels_batch_counter--) { + LOAD_64_INTO_M128(srcp64, &mm_src); + /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 channels + empty for the rest of the calculation */ + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); + /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ + + /* This is where we do the efficient 8bit 'floating point multiply' + operation of 
each channel by the weights - using a 16bit integer + multiply, an add and a bitshift. We use this trick repeatedly + for multiplication by a 0 to 1 value in SIMD code. + */ + mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); + /*mm_src = 0x000000RR00GG00BB000000RR00GG00BB*/ + mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); + /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/ + mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); + /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/ + mm_dst = _mm_srli_epi16(mm_dst, 8); + /*mm_dst = 0x000000RR00GG00BB000000RR00GG00BB*/ + + /* now we have the multiplied channels we 'shuffle them out' one + * at a time so there are four copies of red, four copies of green, + * four copies of blue etc. Then we add all these together + * so each of channels contains R+G+B. + */ + mm_dst = _mm_adds_epu8( + _mm_adds_epu8(_mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)), + _PG_SIMD_SHUFFLE(0, 0, 0, 0)), + _mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1)), + _PG_SIMD_SHUFFLE(1, 1, 1, 1))), + _mm_adds_epu8(_mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)), + _PG_SIMD_SHUFFLE(2, 2, 2, 2)), + _mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(3, 3, 3, 3)), + _PG_SIMD_SHUFFLE(3, 3, 3, 3)))); + /* Gr here stands for 'Gray' as we've now added all the channels + * back together after multiplying them above. 
+ * mm_dst = 0x0000GrGr00GrGr00GrGr00GrGr0000GrGr00GrGr00GrGr00GrGr + */ + + /* The rest is just packing the grayscale back to the original + * 8bit pixel layout and adding the alpha we removed earlier back + * in again + */ + mm_dst = _mm_packus_epi16(mm_dst, mm_dst); + /*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/ + mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask); + mm_dst = _mm_or_si128(mm_dst, mm_alpha); + /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ + STORE_M128_INTO_64(mm_dst, dstp64); + /*dstp = 0xAARRGGBB*/ + srcp64++; + dstp64++; + } + srcp = (Uint32 *)srcp64; + dstp = (Uint32 *)dstp64; + if (remaining_pixels_batch_counter > 0) { + mm_src = _mm_cvtsi32_si128(*srcp); + /*mm_src = 0x000000000000000000000000AARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 channels + empty for the rest of the calculation */ + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); + /*mm_src = 0x00000000000000000000000000RRGGBB*/ + + /* This is where we do the efficient 8bit 'floating point multiply' + operation of each channel by the weights - using a 16bit integer + multiply, an add and a bitshift. We use this trick repeatedly + for multiplication by a 0 to 1 value in SIMD code. + */ + mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); + /*mm_src = 0x0000000000000000000000RR00GG00BB*/ + mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); + /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ + mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); + /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ + mm_dst = _mm_srli_epi16(mm_dst, 8); + /*mm_dst = 0x0000000000000000000000RR00GG00BB*/ + + /* now we have the multiplied channels we 'shuffle them out' one + * at a time so there are four copies of red, four copies of green, + * four copies of blue etc. Then we add all these together + * so each of channels contains R+G+B. 
+ */ + mm_dst = _mm_adds_epu8( + _mm_adds_epu8( + _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)), + _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1))), + _mm_adds_epu8( + _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)), + _mm_shufflelo_epi16(mm_dst, + _PG_SIMD_SHUFFLE(3, 3, 3, 3)))); + /* Gr here stands for 'Gray' as we've now added all the channels + * back together after multiplying them above. + * mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr + */ + + /* The rest is just packing the grayscale back to the original + * 8bit pixel layout and adding the alpha we removed earlier back + * in again + */ + mm_dst = _mm_packus_epi16(mm_dst, mm_dst); + /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/ + mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask); + mm_dst = _mm_or_si128(mm_dst, mm_alpha); + /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + *dstp = _mm_cvtsi128_si32(mm_dst); + /*dstp = 0xAARRGGBB*/ + srcp++; + dstp++; + } + srcp += s_row_skip; + srcp64 = (Uint64 *)srcp; + } +} #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ diff --git a/src_c/transform.c b/src_c/transform.c index b16554ab0a..f4fcf51405 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -2059,6 +2059,36 @@ clamp_4 #endif +void +grayscale_non_simd(SDL_Surface *src, SDL_Surface *newsurf) +{ + int x, y; + for (y = 0; y < src->h; y++) { + for (x = 0; x < src->w; x++) { + Uint32 pixel; + Uint8 *pix; + SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, + pix); + Uint8 r, g, b, a; + SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); + + /* RGBA to GRAY formula used by OpenCV + * We are using a bitshift and integer addition to align the + * calculation with what is fastest for SIMD operations. + * Results are almost identical to floating point multiplication. 
+ */ + Uint8 grayscale_pixel = + (Uint8)((((76 * r) + 255) >> 8) + (((150 * g) + 255) >> 8) + + (((29 * b) + 255) >> 8)); + Uint32 new_pixel = + SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel, + grayscale_pixel, a); + SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, + newsurf->format, pix); + } + } +} + SDL_Surface * grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) { @@ -2085,26 +2115,30 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) PyExc_ValueError, "Source and destination surfaces need the same format.")); } - - int x, y; - for (y = 0; y < src->h; y++) { - for (x = 0; x < src->w; x++) { - Uint32 pixel; - Uint8 *pix; - SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, - pix); - Uint8 r, g, b, a; - SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); - - // RGBA to GRAY formula used by OpenCV - Uint8 grayscale_pixel = (Uint8)(0.299 * r + 0.587 * g + 0.114 * b); - Uint32 new_pixel = - SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel, - grayscale_pixel, a); - SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, - newsurf->format, pix); +#if defined(__EMSCRIPTEN__) + grayscale_non_simd(src, newsurf); +#else // !defined(__EMSCRIPTEN__) + if (src->format->BytesPerPixel == 4 && + src->format->Rmask == newsurf->format->Rmask && + src->format->Gmask == newsurf->format->Gmask && + src->format->Bmask == newsurf->format->Bmask && + (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) { + if (pg_has_avx2()) { + grayscale_avx2(src, newsurf); } +#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else if (pg_HasSSE_NEON()) { + grayscale_sse2(src, newsurf); + } +#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else { + grayscale_non_simd(src, newsurf); + } + } + else { + grayscale_non_simd(src, newsurf); } +#endif // !defined(__EMSCRIPTEN__) SDL_UnlockSurface(newsurf); diff --git a/test/transform_test.py b/test/transform_test.py index 982343ae9d..d25cc4449a 100644 
--- a/test/transform_test.py +++ b/test/transform_test.py @@ -174,10 +174,24 @@ def test_grayscale(self): s = pygame.Surface((32, 32)) s.fill((255, 0, 0)) - s2 = pygame.transform.grayscale(s) - self.assertEqual(pygame.transform.average_color(s2)[0], 76) - self.assertEqual(pygame.transform.average_color(s2)[1], 76) - self.assertEqual(pygame.transform.average_color(s2)[2], 76) + gray_red = pygame.transform.grayscale(s) + self.assertEqual(pygame.transform.average_color(gray_red)[0], 76) + self.assertEqual(pygame.transform.average_color(gray_red)[1], 76) + self.assertEqual(pygame.transform.average_color(gray_red)[2], 76) + + green_surf = pygame.Surface((32, 32)) + green_surf.fill((0, 255, 0)) + gray_green = pygame.transform.grayscale(green_surf) + self.assertEqual(pygame.transform.average_color(gray_green)[0], 150) + self.assertEqual(pygame.transform.average_color(gray_green)[1], 150) + self.assertEqual(pygame.transform.average_color(gray_green)[2], 150) + + blue_surf = pygame.Surface((32, 32)) + blue_surf.fill((0, 0, 255)) + blue_green = pygame.transform.grayscale(blue_surf) + self.assertEqual(pygame.transform.average_color(blue_green)[0], 29) + self.assertEqual(pygame.transform.average_color(blue_green)[1], 29) + self.assertEqual(pygame.transform.average_color(blue_green)[2], 29) dest = pygame.Surface((32, 32), depth=32) pygame.transform.grayscale(s, dest) @@ -188,16 +202,16 @@ def test_grayscale(self): dest = pygame.Surface((32, 32), depth=32) s.fill((34, 12, 65)) pygame.transform.grayscale(s, dest) - self.assertEqual(pygame.transform.average_color(dest)[0], 24) - self.assertEqual(pygame.transform.average_color(dest)[1], 24) - self.assertEqual(pygame.transform.average_color(dest)[2], 24) + self.assertEqual(pygame.transform.average_color(dest)[0], 27) + self.assertEqual(pygame.transform.average_color(dest)[1], 27) + self.assertEqual(pygame.transform.average_color(dest)[2], 27) dest = pygame.Surface((32, 32), depth=32) s.fill((123, 123, 123)) 
pygame.transform.grayscale(s, dest) - self.assertIn(pygame.transform.average_color(dest)[0], [123, 122]) - self.assertIn(pygame.transform.average_color(dest)[1], [123, 122]) - self.assertIn(pygame.transform.average_color(dest)[2], [123, 122]) + self.assertIn(pygame.transform.average_color(dest)[0], [124, 122]) + self.assertIn(pygame.transform.average_color(dest)[1], [124, 122]) + self.assertIn(pygame.transform.average_color(dest)[2], [124, 122]) s = pygame.Surface((32, 32), depth=24) s.fill((255, 0, 0)) @@ -215,6 +229,26 @@ self.assertEqual(pygame.transform.average_color(dest)[1], 76) self.assertEqual(pygame.transform.average_color(dest)[2], 72) + super_surf = pygame.Surface((64, 64), depth=32) + super_surf.fill((255, 255, 255)) + super_surf.fill((255, 0, 0), pygame.Rect(0, 0, 32, 32)) + sub_surf = super_surf.subsurface(pygame.Rect(0, 0, 32, 32)) + + grey_sub_surf = pygame.transform.grayscale(sub_surf) + self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76) + self.assertEqual(pygame.transform.average_color(grey_sub_surf)[1], 76) + self.assertEqual(pygame.transform.average_color(grey_sub_surf)[2], 76) + + def test_grayscale_simd_assumptions(self): + # The grayscale SIMD algorithm relies on the destination surface pitch + # being exactly width * 4 (4 bytes per pixel), for maximum speed. + # This test is here to make sure that assumption is always true. + widths = [1, 5, 6, 23, 54, 233] + for width in widths: + self.assertEqual( + pygame.Surface((width, 1), depth=32).get_pitch(), width * 4 + ) + def test_threshold__honors_third_surface(self): # __doc__ for threshold as of Tue 07/15/2008