Skip to content

Commit

Permalink
Merge pull request hrydgard#19753 from hrydgard/more-simd
Browse files Browse the repository at this point in the history
Implement fp64's color conversion function in ConvertRGBA8888ToRGBA5551
  • Loading branch information
hrydgard authored Dec 20, 2024
2 parents 7b3640e + 6481fc9 commit bff2498
Showing 1 changed file with 19 additions and 28 deletions.
47 changes: 19 additions & 28 deletions Common/Data/Convert/ColorConv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,38 +65,29 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
}

#if PPSSPP_ARCH(SSE2)
// fp64's improved version, see #19751
//
// Converts 32-bit RGBA8888 pixels to 16-bit RGBA5551 using only SSE2.
//
// dstp:      destination vectors; one is written for every two source vectors.
// srcp:      source vectors, four 32-bit pixels each.
// sseChunks: number of source vectors to process.
//
// Both pointers are accessed with aligned loads/stores (_mm_load_si128 /
// _mm_store_si128), so they must be 16-byte aligned.
// NOTE(review): the loop consumes srcp[i] and srcp[i + 1] per iteration, so
// sseChunks is presumably always even - confirm against the caller.
static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
	// Select the top 5 bits of R and B, and the top 5 bits of G plus the top bit of A.
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
	// Per-16-bit-lane multipliers for _mm_madd_epi16: the low half is kept as is
	// (x1) while the high half is moved into position (x0x400 == << 10 for B,
	// x0x40 == << 6 for A) and both products are summed into one 32-bit lane.
	const __m128i mulRB = _mm_set1_epi32(0x04000001);
	const __m128i mulGA = _mm_set1_epi32(0x00400001);

	for (u32 i = 0; i < sseChunks; i += 2) {
		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
		__m128i c1 = _mm_load_si128(&srcp[i + 1]);

		__m128i rb0 = _mm_and_si128(c0, maskRB);               // 00000000bbbbb00000000000rrrrr000
		__m128i rb1 = _mm_and_si128(c1, maskRB);               // 00000000bbbbb00000000000rrrrr000
		__m128i ga0 = _mm_and_si128(c0, maskGA);               // a000000000000000ggggg00000000000
		__m128i ga1 = _mm_and_si128(c1, maskGA);               // a000000000000000ggggg00000000000
		rb0 = _mm_madd_epi16(_mm_srli_epi32(rb0, 3), mulRB);   // 00000000000000000bbbbb00000rrrrr
		rb1 = _mm_madd_epi16(_mm_srli_epi32(rb1, 3), mulRB);   // 00000000000000000bbbbb00000rrrrr
		ga0 = _mm_madd_epi16(_mm_srli_epi32(ga0, 11), mulGA);  // 000000000000000000000a00000ggggg
		ga1 = _mm_madd_epi16(_mm_srli_epi32(ga1, 11), mulGA);  // 000000000000000000000a00000ggggg

		// Every intermediate value fits in 15 bits, so the signed-saturating
		// SSE2 pack is lossless and no SSE4.1 _mm_packus_epi32 is required.
		__m128i rb = _mm_packs_epi32(rb0, rb1);
		// After packing, each 16-bit value is at most 0x41F, so shifting the
		// 32-bit lanes left by 5 cannot carry into the neighbouring pixel.
		__m128i ga = _mm_slli_epi32(_mm_packs_epi32(ga0, ga1), 5);

		_mm_store_si128(&dstp[i / 2], _mm_or_si128(ga, rb));
	}
}
#endif
Expand Down

0 comments on commit bff2498

Please sign in to comment.