8-bit AVX2 MC HV* Optimisations #234

frankplow · 2024-06-17T13:38:09Z

The functions ff_h2656_put_(uni_)?{4,8}tap_hv32_8_avx2 are defined in libavcodec/x86/h26x/h2656_inter.asm. They are used directly in the HEVC decoder, and also indirectly to define optimisations for larger sizes ff_h2656_put_(uni_)?{4,8}tap_hv{64,128}_8_avx2, using the helper mc_rep_func.

None of these functions are currently used in the VVC decoder. As HEVC does not have size-128 CUs, the size-128 functions are not used anywhere. The patch below updates the VVC decoder to use these optimisations.

 libavcodec/x86/vvc/vvcdsp_init.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 4b4a2aa937..b341e2e85a 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -162,7 +162,10 @@ FW_PUT_SSE4(12)
     FW_PUT(n ## tap_h128,  bitd, avx2)  \
     FW_PUT(n ## tap_v32,   bitd, avx2)  \
     FW_PUT(n ## tap_v64,   bitd, avx2)  \
-    FW_PUT(n ## tap_v128,  bitd, avx2)
+    FW_PUT(n ## tap_v128,  bitd, avx2)  \
+    FW_PUT(n ## tap_hv32,  bitd, avx2)  \
+    FW_PUT(n ## tap_hv64,  bitd, avx2)  \
+    FW_PUT(n ## tap_hv128, bitd, avx2)
 
 #define FW_PUT_AVX2(bitd) \
     FW_PUT(pixels32,  bitd, avx2) \
@@ -178,10 +181,7 @@ FW_PUT_AVX2(12)
 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
     FW_PUT(n ## tap_h16,   bitd, avx2) \
     FW_PUT(n ## tap_v16,   bitd, avx2) \
-    FW_PUT(n ## tap_hv16,  bitd, avx2) \
-    FW_PUT(n ## tap_hv32,  bitd, avx2) \
-    FW_PUT(n ## tap_hv64,  bitd, avx2) \
-    FW_PUT(n ## tap_hv128, bitd, avx2)
+    FW_PUT(n ## tap_hv16,  bitd, avx2)
 
 #define FW_PUT_16BPC_AVX2(bitd)     \
     FW_PUT(pixels16, bitd, avx2)    \
@@ -281,6 +281,9 @@ ALF_FUNCS(16, 12, avx2)
         PEL_LINK(c->inter.put, C, 4, 1, 0, tap##tap_v32,  bd, avx2)  \
         PEL_LINK(c->inter.put, C, 5, 1, 0, tap##tap_v64,  bd, avx2)  \
         PEL_LINK(c->inter.put, C, 6, 1, 0, tap##tap_v128, bd, avx2)  \
+        PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32,  bd, avx2) \
+        PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64,  bd, avx2) \
+        PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
     } while (0)
 
 #define MC_LINKS_AVX2(bd)                                            \
@@ -292,9 +295,6 @@ ALF_FUNCS(16, 12, avx2)
         PEL_LINK(c->inter.put, C, 3, 0, 1, tap##tap_h16, bd, avx2)   \
         PEL_LINK(c->inter.put, C, 3, 1, 0, tap##tap_v16, bd, avx2)   \
         PEL_LINK(c->inter.put, C, 3, 1, 1, tap##tap_hv16, bd, avx2)  \
-        PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32, bd, avx2)  \
-        PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64, bd, avx2)  \
-        PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
     } while (0)
 
 #define MC_LINKS_16BPC_AVX2(bd)                                      \

Unfortunately, applying this patch results in checkasm and the conformance bitstreams failing. It appears that there is some difference between HEVC and VVC that is not being accounted for. The problem is specific to 8-bit HV; 8-bit H and V functions and HV functions for >= 10-bit are already in use and work correctly.

The text was updated successfully, but these errors were encountered:

frankplow · 2024-06-17T13:41:14Z

Also, the HEVC decoder uses AVX2 for the 4-tap HV32 but not for the 8-tap; perhaps we'd want to add the patch below?

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 2c0fca303e..ca9ee4d60c 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -184,6 +184,7 @@ FW_EPEL_HV(16, 10, avx2)
 FW_QPEL(32,  8, avx2)
 FW_QPEL(16, 10, avx2)
 
+FW_QPEL_HV(32, 8, avx2)
 FW_QPEL_HV(16, 10, avx2)
 
 #endif

nuomi2021 · 2024-06-20T12:50:54Z

Hi @frankplow , thank you for the issue.
@QSXW, could you help with this?

thank you

QSXW · 2024-06-25T19:46:40Z

Hi. What is the error shown when applying this patch?

frankplow · 2024-06-26T22:30:28Z

@QSXW

Hi. What is the error shown when applying this patch?

There are no error messages; the patch causes mismatches in checkasm and the conformance suite. In other words, these optimisations do not perform the same operation as their C equivalents.

QSXW · 2024-06-27T13:58:55Z

Sorry, my bad. I meant: what errors does checkasm show? Can you paste them here?

frankplow · 2024-06-28T18:10:31Z

@QSXW

Sorry my bad. I mean what is the error shown by checkasm? Can you paste them here?

It only shows that there is a mismatch for these functions:

frank@desk ~/dev/ffmpeg (master *) 
> ./tests/checkasm/checkasm --test=vvc_mc
checkasm: using random seed 837724631
SSE4.1:
 - vvc_mc.put_luma       [OK]
 - vvc_mc.put_uni_luma   [OK]
 - vvc_mc.put_chroma     [OK]
 - vvc_mc.put_uni_chroma [OK]
AVX2:
 - vvc_mc.sad            [OK]
   put_luma_hv_8_32x4_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x4_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x4_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x8_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x8_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x8_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x16_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x16_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x16_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x32_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x32_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x32_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x64_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x64_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x64_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x128_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x128_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x128_avx2 (vvc_mc.c:103)
 - vvc_mc.put_luma       [FAILED]
   put_uni_hv_luma_8_32x4_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x4_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x4_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x8_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x8_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x8_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x16_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x16_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x16_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x32_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x32_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x32_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x64_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x64_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x64_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x128_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x128_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x128_avx2 (vvc_mc.c:154)
 - vvc_mc.put_uni_luma   [FAILED]
 - vvc_mc.put_chroma     [OK]
 - vvc_mc.put_uni_chroma [OK]
 - vvc_mc.avg            [OK]
checkasm: 36 of 3281 tests have failed

nuomi2021 · 2024-07-13T04:21:56Z

@QSXW could you help check?
thank you

QSXW · 2024-07-18T17:21:16Z

@QSXW could you help check? thank you

Sorry, I recently got stuck with some personal errands. I'm back now and will check it within two days.

QSXW · 2024-07-20T18:06:17Z

FW_QPEL_HV(32, 8, avx2)

Hi @nuomi2021 @frankplow

   put_hevc_qpel_hv32_8_avx2 (tests/checkasm/hevc_pel.c:116)
 - hevc_pel.qpel                         [FAILED]

The HEVC decoder doesn't actually use hv32; there may be some issue that led to its use being disabled, which means it cannot be used directly by VVC either.

The hv32 AVX2 algorithm doesn't exist. The symbol is only there so that #define MC_REP_FUNCS_AVX2(fname) does not break the compilation.

frankplow · 2024-07-22T16:32:38Z

@QSXW

FW_QPEL_HV(32, 8, avx2)
Hi @nuomi2021 @frankplow
   put_hevc_qpel_hv32_8_avx2 (tests/checkasm/hevc_pel.c:116)
 - hevc_pel.qpel                         [FAILED]
The hevc actually doesn't use the hv32, where there may have some issue to make them disable the usage of hv32, which means it cannot be used by vvc as well directly.

The hv32 avx2 algorithm doesn't exist. It is only used to make that #define MC_REP_FUNCS_AVX2(fname) will not break the compilation.

I'm not sure I understand entirely how the HV32 algorithm doesn't exist? What is the code run by, for example, put_uni_hv_luma_8_32x4_avx2 then?

In any case, if it is not trivial to fix/write assembly optimisations for these functions, would you agree the best way forward is to modify MC_REP_FUNCS_AVX2 and any other macros which declare/define these functions? That would avoid polluting the namespace with unused symbols and would reduce binary size.

QSXW · 2024-07-22T17:44:59Z

@QSXW
FW_QPEL_HV(32, 8, avx2)
Hi @nuomi2021 @frankplow
   put_hevc_qpel_hv32_8_avx2 (tests/checkasm/hevc_pel.c:116)
 - hevc_pel.qpel                         [FAILED]
The hevc actually doesn't use the hv32, where there may have some issue to make them disable the usage of hv32, which means it cannot be used by vvc as well directly.
The hv32 avx2 algorithm doesn't exist. It is only used to make that #define MC_REP_FUNCS_AVX2(fname) will not break the compilation.
I'm not sure I understand entirely how the HV32 algorithm doesn't exist? What is the code run by, for example, put_uni_hv_luma_8_32x4_avx2 then?

The author of the HEVC asm simply didn't write working hv32 assembly code. The hv32_8_avx2 symbols are only there to make the compilation succeed.

In any case, if it is not trivial to fix/write assembly optimisations for these functions, would you agree the best way forward is to modify MC_REP_FUNCS_AVX2 and any other macros which declare/define these functions? So as not to pollute the namespace with unused symbols and reduce binary size.

I think it would be difficult to modify MC_REP_FUNCS_AVX2, so the previous author added a fake hv32 entry there. Also, I think the compiler will strip the unused symbols.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

8-bit AVX2 MC HV* Optimisations #234

8-bit AVX2 MC HV* Optimisations #234

frankplow commented Jun 17, 2024 •

edited

Loading

frankplow commented Jun 17, 2024

nuomi2021 commented Jun 20, 2024

QSXW commented Jun 25, 2024

frankplow commented Jun 26, 2024

QSXW commented Jun 27, 2024

frankplow commented Jun 28, 2024

nuomi2021 commented Jul 13, 2024

QSXW commented Jul 18, 2024

QSXW commented Jul 20, 2024 •

edited

Loading

frankplow commented Jul 22, 2024

QSXW commented Jul 22, 2024 •

edited

Loading

8-bit AVX2 MC HV* Optimisations #234

8-bit AVX2 MC HV* Optimisations #234

Comments

frankplow commented Jun 17, 2024 • edited Loading

frankplow commented Jun 17, 2024

nuomi2021 commented Jun 20, 2024

QSXW commented Jun 25, 2024

frankplow commented Jun 26, 2024

QSXW commented Jun 27, 2024

frankplow commented Jun 28, 2024

nuomi2021 commented Jul 13, 2024

QSXW commented Jul 18, 2024

QSXW commented Jul 20, 2024 • edited Loading

frankplow commented Jul 22, 2024

QSXW commented Jul 22, 2024 • edited Loading

frankplow commented Jun 17, 2024 •

edited

Loading

QSXW commented Jul 20, 2024 •

edited

Loading

QSXW commented Jul 22, 2024 •

edited

Loading