[FFmpeg-devel] [PATCH 1/2] lavc/vc1dsp: match C block content in inv_trans_8x4_rvv

Tue Jun 11 17:55:03 EEST 2024

This applies the shift to the mid-point block state of the transform
(after the horizontal pass, before the vertical pass) so that it
matches the C code. This forces shifting 8
vectors of 4 elements instead of 4 vectors of 8 elements and is thus
slightly slower.
---
 libavcodec/riscv/vc1dsp_rvv.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4b7ab33307..7e1fb84b0c 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -257,6 +257,9 @@ func ff_vc1_inv_trans_8x4_rvv, zve32x
         vsetivli    zero, 4, e16, mf2, ta, ma
         vlseg8e16.v v0, (a2)
         jal         t0, ff_vc1_inv_trans_8_rvv
+        .irp    n,0,1,2,3,4,5,6,7
+        vssra.vi    v\n, v\n, 3
+        .endr
         vsseg8e16.v v0, (a2)
         addi        a3, a2, 1 * 8 * 2
         vsetivli    zero, 8, e16, m1, ta, ma
@@ -266,10 +269,6 @@ func ff_vc1_inv_trans_8x4_rvv, zve32x
         addi        a5, a2, 3 * 8 * 2
         vle16.v     v2, (a4)
         vle16.v     v3, (a5)
-        .irp    n,0,1,2,3
-        # shift 4 vectors of 8 elems after transpose instead of 8 of 4
-        vssra.vi    v\n, v\n, 3
-        .endr
         li          t1, 7
         jal         t0, ff_vc1_inv_trans_4_rvv
         add         a3, a1, a0
-- 
2.45.1