[FFmpeg-devel] [PATCH] lavc/h264dsp: R-V V intra loop filter
Rémi Denis-Courmont
remi at remlab.net
Tue Dec 3 21:50:53 EET 2024
As with the inter loop filter, performance metrics seem to be biased in
favour of the C implementation because checkasm inputs almost always
fall in the no-op case.
h264_h_loop_filter_chroma_intra_8bpp_c: 82.8 ( 1.00x)
h264_h_loop_filter_chroma_intra_8bpp_rvv_i32: 72.6 ( 1.14x)
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 41.1 ( 1.00x)
h264_h_loop_filter_chroma_mbaff_intra_8bpp_rvv_i32: 72.6 ( 0.57x)
h264_h_loop_filter_luma_intra_8bpp_c: 166.1 ( 1.00x)
h264_h_loop_filter_luma_intra_8bpp_rvv_i32: 395.4 ( 0.42x)
h264_h_loop_filter_luma_mbaff_intra_8bpp_c: 93.3 ( 1.00x)
h264_h_loop_filter_luma_mbaff_intra_8bpp_rvv_i32: 395.4 ( 0.24x)
h264_v_loop_filter_chroma_intra_8bpp_c: 134.8 ( 1.00x)
h264_v_loop_filter_chroma_intra_8bpp_rvv_i32: 51.6 ( 2.61x)
h264_v_loop_filter_luma_intra_8bpp_c: 468.1 ( 1.00x)
h264_v_loop_filter_luma_intra_8bpp_rvv_i32: 134.8 ( 3.47x)
---
libavcodec/riscv/h264dsp_init.c | 26 +++++
libavcodec/riscv/h264dsp_rvv.S | 163 ++++++++++++++++++++++++++++++++
2 files changed, 189 insertions(+)
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 30dd272d6e..f214486bbe 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -40,6 +40,12 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_luma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv(uint8_t *pix, ptrdiff_t s,
+ int a, int b);
void ff_h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@@ -47,6 +53,13 @@ void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
void ff_h264_h_loop_filter_chroma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta,
int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *pix,
+ ptrdiff_t stride,
+ int alpha, int beta);
#define IDCT_DEPTH(depth) \
void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
@@ -125,13 +138,26 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff =
ff_h264_h_loop_filter_luma_mbaff_8_rvv;
+ dsp->h264_v_loop_filter_luma_intra =
+ ff_h264_v_loop_filter_luma_intra_8_rvv;
+ dsp->h264_h_loop_filter_luma_intra =
+ ff_h264_h_loop_filter_luma_intra_8_rvv;
+ dsp->h264_h_loop_filter_luma_mbaff_intra =
+ ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv;
dsp->h264_v_loop_filter_chroma =
ff_h264_v_loop_filter_chroma_8_rvv;
+ dsp->h264_v_loop_filter_chroma_intra =
+ ff_h264_v_loop_filter_chroma_intra_8_rvv;
+
if (chroma_format_idc <= 1) {
dsp->h264_h_loop_filter_chroma =
ff_h264_h_loop_filter_chroma_8_rvv;
dsp->h264_h_loop_filter_chroma_mbaff =
ff_h264_h_loop_filter_chroma_mbaff_8_rvv;
+ dsp->h264_h_loop_filter_chroma_intra =
+ ff_h264_h_loop_filter_chroma_intra_8_rvv;
+ dsp->h264_h_loop_filter_chroma_mbaff_intra =
+ ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv;
}
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 5423b716ca..60015a7020 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -308,6 +308,125 @@ func ff_h264_v_loop_filter_\type\()_8_rvv, zve32x
vse8.v v11, (a0)
ret
endfunc
+
+ .variant_cc ff_h264_loop_filter_\type\()_intra_8_rvv
+func ff_h264_loop_filter_\type\()_intra_8_rvv, zve32x
+ # p3: v8, p2: v9, p1: v10, p0: v11, q0: v12, q1: v13, q2: v14, q3: v15 (filtered in place; chroma callers load only p1..q1)
+ # alpha: a2, beta: a3; entered with jal t0 and returns with jr t0; scratch: a4, vxrm, v0-v7, v16-v31
+ csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up) for the vnclipu narrowing shifts
+ srai a4, a2, 2 # a4 = alpha >> 2
+ vwsubu.vv v16, v11, v12 # p0 - q0, widened to 16 bits
+ addi a4, a4, 2 # a4 = (alpha >> 2) + 2
+ vwsubu.vv v18, v12, v11 # q0 - p0
+ vwsubu.vv v20, v10, v11 # p1 - p0
+ vwsubu.vv v22, v11, v10 # p0 - p1
+ vwsubu.vv v24, v13, v12 # q1 - q0
+ vwsubu.vv v26, v12, v13 # q0 - q1
+ vwsubu.vv v28, v11, v9 # p0 - p2; NOTE(review): also executed for chroma, but only consumed under .ifc luma
+ vwsubu.vv v30, v9, v11 # p2 - p0 (same remark)
+ vwsubu.vv v4, v14, v12 # q2 - q0 (same remark)
+ vwsubu.vv v6, v12, v14 # q0 - q2 (same remark)
+ vsetvli zero, zero, e16, \e16mul, ta, ma # keep vl, switch to 16-bit elements for the differences
+ vmax.vv v16, v16, v18 # abs(p0 - q0): signed max of the two opposite differences
+ vmax.vv v20, v20, v22 # abs(p1 - p0)
+ vmslt.vx v18, v16, a2 # abs(p0 - q0) < alpha
+ vmax.vv v24, v24, v26 # abs(q1 - q0)
+ vmslt.vx v22, v20, a3 # abs(p1 - p0) < beta
+.ifc \type, luma
+ vmax.vv v28, v28, v30 # abs(p2 - p0)
+.endif
+ vmand.mm v18, v18, v22 # alpha test && p-side beta test
+ vmslt.vx v23, v24, a3 # abs(q1 - q0) < beta
+.ifc \type, luma
+ vmax.vv v4, v4, v6 # abs(q2 - q0)
+ vmand.mm v1, v18, v23 # v1 = filter enable: abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
+ vmslt.vx v3, v16, a4 # abs(p0 - q0) < (alpha >> 2) + 2
+ vmslt.vx v6, v28, a3 # abs(p2 - p0) < beta
+ vmslt.vx v7, v4, a3 # abs(q2 - q0) < beta
+ vmand.mm v2, v3, v6 # v2: p side gets the three-pixel update (p0, p1, p2)
+ vmand.mm v3, v3, v7 # v3: q side gets the three-pixel update (q0, q1, q2)
+.else
+ vmand.mm v0, v18, v23 # chroma: v0 = filter enable mask, used directly by the masked writes below
+.endif
+ vsetvli zero, zero, e8, \e8mul, ta, mu # back to 8-bit; mask-undisturbed so unselected pixels keep their value
+ vwaddu.vv v22, v11, v13 # p0 + q1
+ vwaddu.vv v30, v10, v12 # p1 + q0
+ vwaddu.wv v22, v22, v10 # p1 + p0 + q1
+ vwaddu.wv v30, v30, v13 # p1 + q0 + q1
+ vwaddu.wv v22, v22, v10 # 2p1 + p0 + q1
+ vwaddu.wv v30, v30, v13 # p1 + q0 + 2q1
+.ifc \type, luma
+ vwaddu.vv v16, v10, v11 # p1 + p0
+ vwaddu.vv v20, v8, v9 # p3 + p2
+ vwaddu.wv v16, v16, v12 # p1 + p0 + q0
+ vwaddu.vv v24, v11, v12 # p0 + q0
+ vwaddu.vv v28, v14, v15 # q2 + q3
+ vwaddu.wv v24, v24, v13 # p0 + q0 + q1
+ vwaddu.wv v18, v16, v9 # p2 + p1 + p0 + q0
+ vwaddu.wv v16, v16, v13 # p1 + p0 + q0 + q1
+ vwaddu.wv v26, v24, v14 # p0 + q0 + q1 + q2
+ vwaddu.wv v24, v24, v10 # p1 + p0 + q0 + q1
+ vsetvli zero, zero, e16, \e16mul, ta, ma # 16-bit arithmetic on the widened sums
+ vsll.vi v20, v20, 1 # 2p3 +2p2
+ vadd.vv v16, v16, v18 # p2 +2p1 +2p0 +2q0 + q1
+ vadd.vv v20, v18, v20 # 2p3 +3p2 + p1 + p0 + q0
+ vsll.vi v28, v28, 1 # 2q2 +2q3
+ vadd.vv v24, v24, v26 # p1 +2p0 +2q0 +2q1 + q2
+ vadd.vv v28, v26, v28 # p0 + q0 + q1 +3q2 +2q3
+ vsetvli zero, zero, e8, \e8mul, ta, mu # 8-bit again for the masked narrowing writes
+ vmand.mm v0, v1, v2 # filter enabled and p-side condition holds
+ vnclipu.wi v11, v16, 3, v0.t # p0'
+ vnclipu.wi v10, v18, 2, v0.t # p1'
+ vnclipu.wi v9, v20, 3, v0.t # p2'
+ vmandn.mm v0, v1, v2 # filter enabled but p-side condition fails: only p0 changes
+.endif
+ vnclipu.wi v11, v22, 2, v0.t # p0'
+.ifc \type, luma
+ vmand.mm v0, v1, v3 # filter enabled and q-side condition holds
+ vnclipu.wi v12, v24, 3, v0.t # q0'
+ vnclipu.wi v13, v26, 2, v0.t # q1'
+ vnclipu.wi v14, v28, 3, v0.t # q2'
+ vmandn.mm v0, v1, v3 # filter enabled but q-side condition fails: only q0 changes
+.endif
+ vnclipu.wi v12, v30, 2, v0.t # q0'
+ jr t0 # return to the wrapper that entered with jal t0
+endfunc
+
+func ff_h264_v_loop_filter_\type\()_intra_8_rvv, zve32x
+ lpad 0
+ sub t3, a0, a1 # t3 = pix - stride: row holding p0
+ vsetivli zero, 4 * \inners, e8, \e8mul, ta, ma # vl = 4 * inners pixels along the edge, 8-bit elements
+ vle8.v v12, (a0) # q0 row
+ sub t2, t3, a1 # t2 = pix - 2 * stride
+ vle8.v v11, (t3) # p0 row
+ add t4, a0, a1 # t4 = pix + stride
+ vle8.v v10, (t2) # p1 row
+.ifc \type, luma
+ sub t1, t2, a1 # t1 = pix - 3 * stride
+.endif
+ vle8.v v13, (t4) # q1 row
+.ifc \type, luma
+ sub t0, t1, a1 # t0 = pix - 4 * stride (only needed for the p3 load)
+ vle8.v v9, (t1) # p2 row
+ add t5, t4, a1 # t5 = pix + 2 * stride
+ vle8.v v8, (t0) # p3 row
+ add t6, t5, a1 # t6 = pix + 3 * stride
+ vle8.v v14, (t5) # q2 row
+ vle8.v v15, (t6) # q3 row
+.endif
+ jal t0, ff_h264_loop_filter_\type\()_intra_8_rvv # t0 becomes the link register; p3 is never stored, so its address can be lost
+.ifc \type, luma
+ vse8.v v9, (t1) # write back filtered p2
+ vse8.v v10, (t2) # write back filtered p1
+.endif
+ vse8.v v11, (t3) # write back filtered p0
+ vse8.v v12, (a0) # write back filtered q0
+.ifc \type, luma
+ vse8.v v13, (t4) # write back filtered q1
+ vse8.v v14, (t5) # write back filtered q2
+.endif
+ ret
+endfunc
.endm
loop_filter luma, 4, m1, m2
@@ -373,3 +492,47 @@ func ff_h264_h_loop_filter_chroma_mbaff_8_rvv, zve32x
vssseg2e8.v v10, (a0), a1
ret
endfunc
+
+func ff_h264_h_loop_filter_luma_intra_8_rvv, zve32x
+ lpad 0
+ addi a0, a0, -4 # a0 = pix - 4: first of the 8 pixels straddling the vertical edge
+ vsetivli zero, 16, e8, m1, ta, ma # 16 rows, one vector element per row
+ vlsseg8e8.v v8, (a0), a1 # strided 8-field segment load: the 8 pixels of each row go to v8..v15 (p3..q3)
+ addi a0, a0, 1 # a0 = pix - 3: position of p2, the first pixel written back
+ jal t0, ff_h264_loop_filter_luma_intra_8_rvv # filter in registers; helper returns through t0
+ vssseg6e8.v v9, (a0), a1 # strided 6-field segment store: v9..v14 (p2..q2) back to each row
+ ret
+endfunc
+
+func ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv, zve32x
+ lpad 0
+ addi a0, a0, -4 # a0 = pix - 4: first of the 8 pixels straddling the vertical edge
+ vsetivli zero, 8, e8, m1, ta, ma # 8 rows here (the non-MBAFF variant uses 16)
+ vlsseg8e8.v v8, (a0), a1 # strided 8-field segment load: the 8 pixels of each row go to v8..v15 (p3..q3)
+ addi a0, a0, 1 # a0 = pix - 3: position of p2, the first pixel written back
+ jal t0, ff_h264_loop_filter_luma_intra_8_rvv # same shared luma helper; it works on whatever vl is set
+ vssseg6e8.v v9, (a0), a1 # strided 6-field segment store: v9..v14 (p2..q2) back to each row
+ ret
+endfunc
+
+func ff_h264_h_loop_filter_chroma_intra_8_rvv, zve32x
+ lpad 0
+ addi a0, a0, -2 # a0 = pix - 2: first of the 4 pixels straddling the vertical edge
+ vsetivli zero, 8, e8, mf2, ta, ma # 8 rows, one vector element per row
+ vlsseg4e8.v v10, (a0), a1 # strided 4-field segment load: the 4 pixels of each row go to v10..v13 (p1, p0, q0, q1)
+ addi a0, a0, 1 # a0 = pix - 1: position of p0, the first pixel written back
+ jal t0, ff_h264_loop_filter_chroma_intra_8_rvv # filter in registers; helper returns through t0
+ vssseg2e8.v v11, (a0), a1 # strided 2-field segment store: v11, v12 (p0, q0) back to each row
+ ret
+endfunc
+
+func ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv, zve32x
+ lpad 0
+ addi a0, a0, -2 # a0 = pix - 2: first of the 4 pixels straddling the vertical edge
+ vsetivli zero, 4, e8, mf2, ta, ma # 4 rows here (the non-MBAFF variant uses 8)
+ vlsseg4e8.v v10, (a0), a1 # strided 4-field segment load: the 4 pixels of each row go to v10..v13 (p1, p0, q0, q1)
+ addi a0, a0, 1 # a0 = pix - 1: position of p0, the first pixel written back
+ jal t0, ff_h264_loop_filter_chroma_intra_8_rvv # same shared chroma helper; it works on whatever vl is set
+ vssseg2e8.v v11, (a0), a1 # strided 2-field segment store: v11, v12 (p0, q0) back to each row
+ ret
+endfunc
--
2.45.2
More information about the ffmpeg-devel
mailing list