Add SIMD versions of the greyscale transform (attempt #2) #2432

Merged (19 commits) on Nov 12, 2023
3 changes: 3 additions & 0 deletions docs/reST/ref/transform.rst
@@ -334,6 +334,9 @@ Instead, always begin with the original image and scale to the desired size.)

.. versionadded:: 2.1.4

.. versionchanged:: 2.4.0 Adjusted the formula slightly to support a performance optimisation. It may return
very slightly different pixel values than before, but should run seven to eleven times faster on most systems.

.. ## pygame.transform.grayscale ##

.. function:: threshold
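As a point of reference, the adjusted formula works out to the per-pixel computation below (a scalar sketch inferred from the SIMD weights in this patch; grayscale_pixel is an illustrative name, not code from the diff):

static Uint8
grayscale_pixel(Uint8 r, Uint8 g, Uint8 b)
{
    /* 8-bit fixed-point luma weights (0x4C + 0x96 + 0x1D == 0xFF); the
     * +255 before each shift keeps pure white mapping to exactly 255 */
    return (Uint8)(((r * 0x4C + 255) >> 8) + ((g * 0x96 + 255) >> 8) +
                   ((b * 0x1D + 255) >> 8));
}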
19 changes: 19 additions & 0 deletions src_c/simd_transform.h
@@ -1,6 +1,21 @@
#define NO_PYGAME_C_API
#include "_surface.h"

/**
* MACRO borrowed from SSE2NEON - useful for making the shuffling family of
* intrinsics easier to understand by indicating clearly what will go where.
*
* SSE2Neon description follows...
* MACRO for shuffle parameter for _mm_shuffle_ps().
* Argument fp3 is a digit[0123] that represents the fp from argument "b"
* of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
* for fp2 in result. fp1 is a digit[0123] that represents the fp from
* argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
* fp0 is the same for fp0 of result.
*/
#define _PG_SIMD_SHUFFLE(fp3, fp2, fp1, fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
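
/* For example (illustrative, not part of the patch):
 * _PG_SIMD_SHUFFLE(3, 2, 1, 0) == 0xE4 leaves all four lanes in place,
 * while _PG_SIMD_SHUFFLE(0, 1, 2, 3) == 0x1B reverses them. */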

#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__)
// arm64 has neon optimisations enabled by default, even when fpu=neon is not
// passed
@@ -10,6 +25,8 @@
// SSE2 functions
#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)

void
grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf);
// smoothscale filters
void
filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
@@ -27,3 +44,5 @@ filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
#endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */

// AVX2 functions
void
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf);
180 changes: 180 additions & 0 deletions src_c/simd_transform_avx2.c
@@ -42,3 +42,183 @@ pg_avx2_at_runtime_but_uncompiled()
}
return 0;
}

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
void
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
/* See the SSE2 code for a simpler overview of this algorithm
* Current AVX2 process
* ------------------
* - pre loop: Load weights into register x8
* - in loop:
* 1. Load 8 pixels into register
* 2. remove the alpha channel for every pixel and save it.
* 3. multiply the weights by the pixels using a standard shuffle to 2x
* 16-bit registers, mul, + 255, then right shift. See the multiply
* blitter mode for this operation in isolation.
* 4. pack pixels back together from A & B while adding with a
* horizontal add (e.g. adds A+R and G+B in an ARGB layout)
* 5. shift and add to make the final grey pixel colour in the 0th
* 8-bit channel of each 'pixel'
* 6. shuffle again to push the grey from the 0th channel into every
* channel of every pixel.
* 7. add the alpha channel back in.
*/
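// how many padding pixels sit at the end of each row (pitch is in
// bytes, four bytes per 32-bit pixel)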
int s_row_skip = (src->pitch - src->w * 4) / 4;

// compute how many batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}

int remaining_pixels = pixel_batch_length % 8;
int perfect_8_pixels = pixel_batch_length / 8;

int perfect_8_pixels_batch_counter = perfect_8_pixels;
int remaining_pixels_batch_counter = remaining_pixels;

Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;

Uint32 amask = src->format->Amask;
Uint32 rgbmask = ~amask;

int rgb_weights =
((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
(0x1D << src->format->Bshift));
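// 0x4C + 0x96 + 0x1D == 0xFF: the BT.601 luma weights (~0.299 R,
// ~0.587 G, ~0.114 B) in 8-bit fixed point, packed into each pixel's
// R, G and B byte positions (the alpha byte stays zero).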

__m256i *srcp256 = (__m256i *)src->pixels;
__m256i *dstp256 = (__m256i *)newsurf->pixels;

__m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA,
mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B,
mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray,
mm256_alpha, mm256_rgb_mask, mm256_alpha_mask,
mm256_shuffled_weights_A, mm256_shuffled_weights_B;

mm256_shuff_mask_A =
_mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
mm256_shuff_mask_B =
_mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
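// Masks A and B interleave a zero byte (0x80 selects zero) above each
// colour byte, zero-extending the 8-bit channels to 16 bits: A expands
// the low two pixels of each 128-bit half, B the high two.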

mm256_shuff_mask_gray = _mm256_set_epi8(
28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12,
12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

mm256_two_five_fives = _mm256_set1_epi16(0x00FF);
mm256_rgb_weights = _mm256_set1_epi32(rgb_weights);
mm256_rgb_mask = _mm256_set1_epi32(rgbmask);
mm256_alpha_mask = _mm256_set1_epi32(amask);

mm256_shuffled_weights_A =
_mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
mm256_shuffled_weights_B =
_mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);

__m256i _partial8_mask = _mm256_set_epi32(
0, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0,
(remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0,
(remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0,
(remaining_pixels > 0) ? -1 : 0);
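// -1 lanes mark the trailing (pixel_batch_length % 8) pixels, so the
// masked load/store below only touches pixels that actually exist.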

while (num_batches--) {
perfect_8_pixels_batch_counter = perfect_8_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_8_pixels_batch_counter--) {
mm256_src = _mm256_loadu_si256(srcp256);
// strip out the alpha channel and store it
mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);

// shuffle out the 8 pixels into two spaced out registers
// there are four pixels in each register with 16 bits of room
// per channel. This gives us bit space for multiplication.
mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);

// Do the 'percentage multiplications' with the weights
// with accuracy correction so values like 255 * '255'
// (here effectively 1.0) = 255 and not 254.
// For our greyscale this should mean 255 white stays 255 white
// after greyscaling.
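// e.g. (255 * 255 + 255) >> 8 == 255, whereas a plain
// (255 * 255) >> 8 would truncate to 254.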
mm256_dstA =
_mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A);
mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);

mm256_dstB =
_mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B);
mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);

// Add up weighted R+G+B into the first channel of each of the 8
// pixels. This is the grey value we want in all our colour
// channels.
mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
mm256_dst =
_mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
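// each 32-bit lane now holds (c0+c1)+(c2+c3) in its low 16 bits:
// the full weighted R+G+B sum (the alpha byte's weight is zero).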
// Shuffle the grey value from the first channel of each pixel
// into every channel of each pixel
mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);

// Add the alpha back
mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);

_mm256_storeu_si256(dstp256, mm256_dst);

srcp256++;
dstp256++;
}
srcp = (Uint32 *)srcp256;
dstp = (Uint32 *)dstp256;
if (remaining_pixels_batch_counter > 0) {
mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);
mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);

mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);

mm256_dstA =
_mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A);
mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);

mm256_dstB =
_mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B);
mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);

mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
mm256_dst =
_mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);

mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);

_mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);

srcp += remaining_pixels_batch_counter;
dstp += remaining_pixels_batch_counter;
}
srcp += s_row_skip;
srcp256 = (__m256i *)srcp;
}
}
#else
void
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
BAD_AVX2_FUNCTION_CALL;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
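
For context, a caller selects between these variants at runtime. A minimal dispatch sketch, assuming SDL's CPU-feature queries (pygame's real selection logic lives in the transform module and may differ):

/* Hypothetical dispatcher - illustrative only, not part of this diff. */
static void
grayscale_dispatch(SDL_Surface *src, SDL_Surface *newsurf)
{
    if (SDL_HasAVX2()) {
        grayscale_avx2(src, newsurf);
    }
    else if (SDL_HasSSE2()) {
        grayscale_sse2(src, newsurf);
    }
    else {
        /* scalar fallback (not shown) */
    }
}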