diff --git a/docs/reST/ref/transform.rst b/docs/reST/ref/transform.rst
index 9823f42bf1..81002f8c90 100644
--- a/docs/reST/ref/transform.rst
+++ b/docs/reST/ref/transform.rst
@@ -334,6 +334,9 @@ Instead, always begin with the original image and scale to the desired size.)
    
    .. versionadded:: 2.1.4
 
+   .. versionchanged:: 2.4.0 Adjusted formula slightly to support performance optimisation. It may return very slightly
+                       different pixels than before, but should run seven to eleven times faster on most systems.
+
    .. ## pygame.transform.grayscale ##
 
 .. function:: threshold
diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h
index 4f25905873..2d4951e488 100644
--- a/src_c/simd_transform.h
+++ b/src_c/simd_transform.h
@@ -1,6 +1,21 @@
 #define NO_PYGAME_C_API
 #include "_surface.h"
 
+/**
+ * MACRO borrowed from SSE2NEON - useful for making the shuffling family of
+ * intrinsics easier to understand by indicating clearly what will go where.
+ *
+ * SSE2Neon description follows...
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _PG_SIMD_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
 #if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__)
 // arm64 has neon optimisations enabled by default, even when fpu=neon is not
 // passed
@@ -10,6 +25,8 @@
 // SSE2 functions
 #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
 
+void
+grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf);
 // smoothscale filters
 void
 filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
@@ -27,3 +44,5 @@ filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
 #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */
 
 // AVX2 functions
+void
+grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf);
diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index f1889f1bb7..5be6f9863f 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -42,3 +42,183 @@ pg_avx2_at_runtime_but_uncompiled()
     }
     return 0;
 }
+
+#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+    !defined(SDL_DISABLE_IMMINTRIN_H)
+void
+grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    /* See the SSE2 code for a simpler overview of this algorithm
+     * Current AVX2 process
+     * ------------------
+     * - pre loop: Load weights into register x8
+     * - in loop:
+     *     1. Load 8 pixels into register
+     *     2. remove the alpha channel for every pixel and save it.
+     *     3. multiply weights by pixels using standard shuffle to 2x 16bit
+     *        register, mul + 255 then left shift. See multiply blitter mode
+     *        for this operation in isolation.
+     *     4. pack pixels back together from A & B while adding with a
+     *        horizontal add (e.g. adds A+R and G+B in a ARGB layout)
+     *     5. shift and add to make final grey pixel colour in 0th
+     *        8Bit channel in each 'pixel'
+     *     6. shuffle again to push the grey from the 0th channel into every
+     *        channel of every pixel.
+     *     7. add the alpha channel back in.
+     */
+    int s_row_skip = (src->pitch - src->w * 4) / 4;
+
+    // generate number of batches of pixels we need to loop through
+    int pixel_batch_length = src->w * src->h;
+    int num_batches = 1;
+    if (s_row_skip > 0) {
+        pixel_batch_length = src->w;
+        num_batches = src->h;
+    }
+
+    int remaining_pixels = pixel_batch_length % 8;
+    int perfect_8_pixels = pixel_batch_length / 8;
+
+    int perfect_8_pixels_batch_counter = perfect_8_pixels;
+    int remaining_pixels_batch_counter = remaining_pixels;
+
+    Uint32 *srcp = (Uint32 *)src->pixels;
+    Uint32 *dstp = (Uint32 *)newsurf->pixels;
+
+    Uint32 amask = src->format->Amask;
+    Uint32 rgbmask = ~amask;
+
+    int rgb_weights =
+        ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
+         (0x1D << src->format->Bshift));
+
+    __m256i *srcp256 = (__m256i *)src->pixels;
+    __m256i *dstp256 = (__m256i *)newsurf->pixels;
+
+    __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA,
+        mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B,
+        mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray,
+        mm256_alpha, mm256_rgb_mask, mm256_alpha_mask,
+        mm256_shuffled_weights_A, mm256_shuffled_weights_B;
+
+    mm256_shuff_mask_A =
+        _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
+                        18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
+                        0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
+    mm256_shuff_mask_B =
+        _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
+                        26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
+                        0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
+
+    mm256_shuff_mask_gray = _mm256_set_epi8(
+        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12,
+        12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
+
+    mm256_two_five_fives = _mm256_set1_epi16(0x00FF);
+    mm256_rgb_weights = _mm256_set1_epi32(rgb_weights);
+    mm256_rgb_mask = _mm256_set1_epi32(rgbmask);
+    mm256_alpha_mask = _mm256_set1_epi32(amask);
+
+    mm256_shuffled_weights_A =
+        _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
+    mm256_shuffled_weights_B =
+        _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);
+
+    __m256i _partial8_mask = _mm256_set_epi32(
+        0, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0,
+        (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0,
+        (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0,
+        (remaining_pixels > 0) ? -1 : 0);
+
+    while (num_batches--) {
+        perfect_8_pixels_batch_counter = perfect_8_pixels;
+        remaining_pixels_batch_counter = remaining_pixels;
+        while (perfect_8_pixels_batch_counter--) {
+            mm256_src = _mm256_loadu_si256(srcp256);
+            // strip out the alpha and store it
+            mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
+
+            // shuffle out the 8 pixels into two spaced out registers
+            // there are four pixels in each register with 16bits of room
+            // per channel. This gives us bit space for multiplication.
+            mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+            mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
+
+            // Do the 'percentage multiplications' with the weights
+            // with accuracy correction so values like 255 * '255'
+            // (here effectively 1.0) = 255 and not 254.
+            // For our greyscale this should mean 255 white stays 255 white
+            // after greyscaling.
+            mm256_dstA =
+                _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A);
+            mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
+            mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+
+            mm256_dstB =
+                _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B);
+            mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
+            mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
+
+            // Add up weighted R+G+B into the first channel of each of the 8
+            // pixels. This is the grey value we want in all our colour
+            // channels.
+            mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
+            mm256_dst =
+                _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
+            // Shuffle the grey value from the first channel of each pixel
+            // into every channel of each pixel
+            mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
+
+            // Add the alpha back
+            mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
+            mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
+
+            _mm256_storeu_si256(dstp256, mm256_dst);
+
+            srcp256++;
+            dstp256++;
+        }
+        srcp = (Uint32 *)srcp256;
+        dstp = (Uint32 *)dstp256;
+        if (remaining_pixels_batch_counter > 0) {
+            mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);
+            mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
+
+            mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+            mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
+
+            mm256_dstA =
+                _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A);
+            mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
+            mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+
+            mm256_dstB =
+                _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B);
+            mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
+            mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
+
+            mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
+            mm256_dst =
+                _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
+            mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
+
+            mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
+            mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
+
+            _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);
+
+            srcp += remaining_pixels_batch_counter;
+            dstp += remaining_pixels_batch_counter;
+        }
+        srcp += s_row_skip;
+        srcp256 = (__m256i *)srcp;
+    }
+}
+#else
+void
+grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    BAD_AVX2_FUNCTION_CALL;
+}
+#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+          !defined(SDL_DISABLE_IMMINTRIN_H) */
diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index e8f8b76696..35689ac72d 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -43,6 +43,15 @@ pg_neon_at_runtime_but_uncompiled()
 #define _pg_storeu_si32(p, a) (void)(*(int *)(p) = _mm_cvtsi128_si32((a)))
 #define _pg_storeu_si64(p, a) (_mm_storel_epi64((__m128i *)(p), (a)))
 
+#if defined(ENV64BIT)
+#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num)
+#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(reg)
+#else
+#define LOAD_64_INTO_M128(num, reg) \
+    *reg = _mm_loadl_epi64((const __m128i *)num)
+#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, reg)
+#endif
+
 void
 filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
                      int dstpitch, int srcwidth, int dstwidth)
@@ -413,4 +422,192 @@ filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     }
 }
 
+void
+grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    /* For the SSE2 SIMD version of grayscale we do one pixel at a time
+     * Thus we can calculate the number of loops (and pixels) by multiplying
+     * the width of the surface to be grayscaled, by the height of that
+     * surface.
+     *
+     * We also need to calculate a 'skip value' in case our surface's rows are
+     * not contiguous in memory. For surfaces, a single row's worth of pixel
+     * data is always contiguous (i.e. each pixel is next to each other).
+     * However, a surface's rows may be separated from one another in memory,
+     * most commonly this happens with sub surfaces.
+     * The vast majority of surfaces used in applications will probably also
+     * have contiguous rows as that is what happens when you create a standard
+     * 32bit surface with pygame.Surface. SIMD Transform algorithms,
+     * should treat this 'most normal' case as the critical path to maximise
+     * performance.
+     */
+    int s_row_skip = (src->pitch - src->w * 4) / 4;
+
+    // generate number of batches of pixels we need to loop through
+    int pixel_batch_length = src->w * src->h;
+    int num_batches = 1;
+    if (s_row_skip > 0) {
+        pixel_batch_length = src->w;
+        num_batches = src->h;
+    }
+    int remaining_pixels = pixel_batch_length % 2;
+    int perfect_2_pixels = pixel_batch_length / 2;
+
+    int perfect_2_pixels_batch_counter = perfect_2_pixels;
+    int remaining_pixels_batch_counter = remaining_pixels;
+
+    Uint32 *srcp = (Uint32 *)src->pixels;
+    Uint32 *dstp = (Uint32 *)newsurf->pixels;
+
+    Uint64 amask64 = ((Uint64)src->format->Amask << 32) | src->format->Amask;
+    Uint64 rgbmask64 = ~amask64;
+
+    Uint64 rgb_weights =
+        ((Uint64)((0x4C << src->format->Rshift) |
+                  (0x96 << src->format->Gshift) |
+                  (0x1D << src->format->Bshift))
+         << 32) |
+        ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
+         (0x1D << src->format->Bshift));
+
+    Uint64 *srcp64 = (Uint64 *)src->pixels;
+    Uint64 *dstp64 = (Uint64 *)newsurf->pixels;
+
+    __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives,
+        mm_rgb_weights, mm_alpha_mask, mm_rgb_mask;
+
+    mm_zero = _mm_setzero_si128();
+    LOAD_64_INTO_M128(&amask64, &mm_alpha_mask);
+    LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask);
+    mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF);
+
+    LOAD_64_INTO_M128(&rgb_weights, &mm_rgb_weights);
+    mm_rgb_weights = _mm_unpacklo_epi8(mm_rgb_weights, mm_zero);
+
+    while (num_batches--) {
+        perfect_2_pixels_batch_counter = perfect_2_pixels;
+        remaining_pixels_batch_counter = remaining_pixels;
+        while (perfect_2_pixels_batch_counter--) {
+            LOAD_64_INTO_M128(srcp64, &mm_src);
+            /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/
+            /* First we strip out the alpha so we have one of our 4 channels
+               empty for the rest of the calculation */
+            mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
+            /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/
+
+            /* This is where we do the efficient 8bit 'floating point multiply'
+               operation of each channel by the weights - using a 16bit integer
+               multiply, an add and a bitshift. We use this trick repeatedly
+               for multiplication by a 0 to 1 value in SIMD code.
+            */
+            mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+            /*mm_src = 0x000000RR00GG00BB000000RR00GG00BB*/
+            mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
+            /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/
+            mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
+            /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/
+            mm_dst = _mm_srli_epi16(mm_dst, 8);
+            /*mm_dst = 0x000000RR00GG00BB000000RR00GG00BB*/
+
+            /* now we have the multiplied channels we 'shuffle them out' one
+             * at a time so there are four copies of red, four copies of green,
+             * four copies of blue etc. Then we add all these together
+             * so each of channels contains R+G+B.
+             */
+            mm_dst = _mm_adds_epu8(
+                _mm_adds_epu8(_mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
+                                  _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
+                              _mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1)),
+                                  _PG_SIMD_SHUFFLE(1, 1, 1, 1))),
+                _mm_adds_epu8(_mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
+                                  _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
+                              _mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(3, 3, 3, 3)),
+                                  _PG_SIMD_SHUFFLE(3, 3, 3, 3))));
+            /* Gr here stands for 'Gray' as we've now added all the channels
+             * back together after multiplying them above.
+             * mm_dst = 0x0000GrGr00GrGr00GrGr00GrGr0000GrGr00GrGr00GrGr00GrGr
+             */
+
+            /* The rest is just packing the grayscale back to the original
+             * 8bit pixel layout and adding the alpha we removed earlier back
+             * in again
+             */
+            mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
+            /*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/
+            mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
+            mm_dst = _mm_or_si128(mm_dst, mm_alpha);
+            /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/
+            STORE_M128_INTO_64(mm_dst, dstp64);
+            /*dstp = 0xAARRGGBB*/
+            srcp64++;
+            dstp64++;
+        }
+        srcp = (Uint32 *)srcp64;
+        dstp = (Uint32 *)dstp64;
+        if (remaining_pixels_batch_counter > 0) {
+            mm_src = _mm_cvtsi32_si128(*srcp);
+            /*mm_src = 0x000000000000000000000000AARRGGBB*/
+            /* First we strip out the alpha so we have one of our 4 channels
+               empty for the rest of the calculation */
+            mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
+            /*mm_src = 0x00000000000000000000000000RRGGBB*/
+
+            /* This is where we do the efficient 8bit 'floating point multiply'
+               operation of each channel by the weights - using a 16bit integer
+               multiply, an add and a bitshift. We use this trick repeatedly
+               for multiplication by a 0 to 1 value in SIMD code.
+            */
+            mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+            /*mm_src = 0x0000000000000000000000RR00GG00BB*/
+            mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
+            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
+            mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
+            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
+            mm_dst = _mm_srli_epi16(mm_dst, 8);
+            /*mm_dst = 0x0000000000000000000000RR00GG00BB*/
+
+            /* now we have the multiplied channels we 'shuffle them out' one
+             * at a time so there are four copies of red, four copies of green,
+             * four copies of blue etc. Then we add all these together
+             * so each of channels contains R+G+B.
+             */
+            mm_dst = _mm_adds_epu8(
+                _mm_adds_epu8(
+                    _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
+                    _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1))),
+                _mm_adds_epu8(
+                    _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
+                    _mm_shufflelo_epi16(mm_dst,
+                                        _PG_SIMD_SHUFFLE(3, 3, 3, 3))));
+            /* Gr here stands for 'Gray' as we've now added all the channels
+             * back together after multiplying them above.
+             * mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr
+             */
+
+            /* The rest is just packing the grayscale back to the original
+             * 8bit pixel layout and adding the alpha we removed earlier back
+             * in again
+             */
+            mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
+            /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
+            mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
+            mm_dst = _mm_or_si128(mm_dst, mm_alpha);
+            /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/
+            *dstp = _mm_cvtsi128_si32(mm_dst);
+            /*dstp = 0xAARRGGBB*/
+            srcp++;
+            dstp++;
+        }
+        srcp += s_row_skip;
+        srcp64 = (Uint64 *)srcp;
+    }
+}
 #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
diff --git a/src_c/transform.c b/src_c/transform.c
index b16554ab0a..f4fcf51405 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -2059,6 +2059,36 @@ clamp_4
 
 #endif
 
+void
+grayscale_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    int x, y;
+    for (y = 0; y < src->h; y++) {
+        for (x = 0; x < src->w; x++) {
+            Uint32 pixel;
+            Uint8 *pix;
+            SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
+                        pix);
+            Uint8 r, g, b, a;
+            SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
+
+            /* RGBA to GRAY formula used by OpenCV
+             * We are using a bitshift and integer addition to align the
+             * calculation with what is fastest for SIMD operations.
+             * Results are almost identical to floating point multiplication.
+             */
+            Uint8 grayscale_pixel =
+                (Uint8)((((76 * r) + 255) >> 8) + (((150 * g) + 255) >> 8) +
+                        (((29 * b) + 255) >> 8));
+            Uint32 new_pixel =
+                SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel,
+                            grayscale_pixel, a);
+            SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
+                        newsurf->format, pix);
+        }
+    }
+}
+
 SDL_Surface *
 grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
 {
@@ -2085,26 +2115,30 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
             PyExc_ValueError,
             "Source and destination surfaces need the same format."));
     }
-
-    int x, y;
-    for (y = 0; y < src->h; y++) {
-        for (x = 0; x < src->w; x++) {
-            Uint32 pixel;
-            Uint8 *pix;
-            SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
-                        pix);
-            Uint8 r, g, b, a;
-            SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
-
-            // RGBA to GRAY formula used by OpenCV
-            Uint8 grayscale_pixel = (Uint8)(0.299 * r + 0.587 * g + 0.114 * b);
-            Uint32 new_pixel =
-                SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel,
-                            grayscale_pixel, a);
-            SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
-                        newsurf->format, pix);
+#if defined(__EMSCRIPTEN__)
+    grayscale_non_simd(src, newsurf);
+#else  // !defined(__EMSCRIPTEN__)
+    if (src->format->BytesPerPixel == 4 &&
+        src->format->Rmask == newsurf->format->Rmask &&
+        src->format->Gmask == newsurf->format->Gmask &&
+        src->format->Bmask == newsurf->format->Bmask &&
+        (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) {
+        if (pg_has_avx2()) {
+            grayscale_avx2(src, newsurf);
         }
+#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+        else if (pg_HasSSE_NEON()) {
+            grayscale_sse2(src, newsurf);
+        }
+#endif  // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+        else {
+            grayscale_non_simd(src, newsurf);
+        }
+    }
+    else {
+        grayscale_non_simd(src, newsurf);
     }
+#endif  // !defined(__EMSCRIPTEN__)
 
     SDL_UnlockSurface(newsurf);
 
diff --git a/test/transform_test.py b/test/transform_test.py
index 982343ae9d..d25cc4449a 100644
--- a/test/transform_test.py
+++ b/test/transform_test.py
@@ -174,10 +174,24 @@ def test_grayscale(self):
         s = pygame.Surface((32, 32))
         s.fill((255, 0, 0))
 
-        s2 = pygame.transform.grayscale(s)
-        self.assertEqual(pygame.transform.average_color(s2)[0], 76)
-        self.assertEqual(pygame.transform.average_color(s2)[1], 76)
-        self.assertEqual(pygame.transform.average_color(s2)[2], 76)
+        gray_red = pygame.transform.grayscale(s)
+        self.assertEqual(pygame.transform.average_color(gray_red)[0], 76)
+        self.assertEqual(pygame.transform.average_color(gray_red)[1], 76)
+        self.assertEqual(pygame.transform.average_color(gray_red)[2], 76)
+
+        green_surf = pygame.Surface((32, 32))
+        green_surf.fill((0, 255, 0))
+        gray_green = pygame.transform.grayscale(green_surf)
+        self.assertEqual(pygame.transform.average_color(gray_green)[0], 150)
+        self.assertEqual(pygame.transform.average_color(gray_green)[1], 150)
+        self.assertEqual(pygame.transform.average_color(gray_green)[2], 150)
+
+        blue_surf = pygame.Surface((32, 32))
+        blue_surf.fill((0, 0, 255))
+        gray_blue = pygame.transform.grayscale(blue_surf)
+        self.assertEqual(pygame.transform.average_color(gray_blue)[0], 29)
+        self.assertEqual(pygame.transform.average_color(gray_blue)[1], 29)
+        self.assertEqual(pygame.transform.average_color(gray_blue)[2], 29)
 
         dest = pygame.Surface((32, 32), depth=32)
         pygame.transform.grayscale(s, dest)
@@ -188,16 +202,16 @@ def test_grayscale(self):
         dest = pygame.Surface((32, 32), depth=32)
         s.fill((34, 12, 65))
         pygame.transform.grayscale(s, dest)
-        self.assertEqual(pygame.transform.average_color(dest)[0], 24)
-        self.assertEqual(pygame.transform.average_color(dest)[1], 24)
-        self.assertEqual(pygame.transform.average_color(dest)[2], 24)
+        self.assertEqual(pygame.transform.average_color(dest)[0], 27)
+        self.assertEqual(pygame.transform.average_color(dest)[1], 27)
+        self.assertEqual(pygame.transform.average_color(dest)[2], 27)
 
         dest = pygame.Surface((32, 32), depth=32)
         s.fill((123, 123, 123))
         pygame.transform.grayscale(s, dest)
-        self.assertIn(pygame.transform.average_color(dest)[0], [123, 122])
-        self.assertIn(pygame.transform.average_color(dest)[1], [123, 122])
-        self.assertIn(pygame.transform.average_color(dest)[2], [123, 122])
+        self.assertIn(pygame.transform.average_color(dest)[0], [124, 122])
+        self.assertIn(pygame.transform.average_color(dest)[1], [124, 122])
+        self.assertIn(pygame.transform.average_color(dest)[2], [124, 122])
 
         s = pygame.Surface((32, 32), depth=24)
         s.fill((255, 0, 0))
@@ -215,6 +229,26 @@ def test_grayscale(self):
         self.assertEqual(pygame.transform.average_color(dest)[1], 76)
         self.assertEqual(pygame.transform.average_color(dest)[2], 72)
 
+        super_surf = pygame.Surface((64, 64), depth=32)
+        super_surf.fill((255, 255, 255))
+        super_surf.fill((255, 0, 0), pygame.Rect(0, 0, 32, 32))
+        sub_surf = super_surf.subsurface(pygame.Rect(0, 0, 32, 32))
+
+        grey_sub_surf = pygame.transform.grayscale(sub_surf)
+        self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76)
+        self.assertEqual(pygame.transform.average_color(grey_sub_surf)[1], 76)
+        self.assertEqual(pygame.transform.average_color(grey_sub_surf)[2], 76)
+
+    def test_grayscale_simd_assumptions(self):
+        # The grayscale SIMD algorithm relies on the destination surface pitch
+        # being exactly width * 4 (4 bytes per pixel), for maximum speed.
+        # This test is here to make sure that assumption is always true.
+        widths = [1, 5, 6, 23, 54, 233]
+        for width in widths:
+            self.assertEqual(
+                pygame.Surface((width, 1), depth=32).get_pitch(), width * 4
+            )
+
     def test_threshold__honors_third_surface(self):
         # __doc__ for threshold as of Tue 07/15/2008