[FFmpeg-devel] [PATCH v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
Arnie Chang
arnie.chang at sifive.com
Tue Jul 25 06:37:14 EEST 2023
It appears that all the issues raised during the review have been fixed,
and there have been no additional comments for over a month.
Could I kindly request assistance in pushing the patch?
On Mon, Jun 19, 2023 at 9:06 PM Arnie Chang <arnie.chang at sifive.com> wrote:
> Optimize the put and avg filtering for 4xH and 2xH blocks
>
> Signed-off-by: Arnie Chang <arnie.chang at sifive.com>
> ---
> V2:
> 1. Change the \width macro parameter to a run-time argument
> 2. Call an internal function instead of instantiating similar code three
>    times (roughly as sketched below)
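>
> Roughly, the v2 structure corresponds to the following C sketch (a model
> of the assembly only, not code from the patch; the _common helper name is
> hypothetical):
>
>     #include <stddef.h>
>     #include <stdint.h>
>
>     /* hypothetical shared worker: the block width is a run-time argument
>      * instead of a macro parameter, so the loop body exists only once */
>     static void h264_put_chroma_mc_common(uint8_t *dst, const uint8_t *src,
>                                           ptrdiff_t stride, int h,
>                                           int x, int y, int width)
>     {
>         /* width-agnostic put filtering loop (elided) */
>     }
>
>     /* each public mcW entry point only selects its width and delegates */
>     void h264_put_chroma_mc4_rvv(uint8_t *dst, const uint8_t *src,
>                                  ptrdiff_t stride, int h, int x, int y)
>     {
>         h264_put_chroma_mc_common(dst, src, stride, h, x, y, 4);
>     }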
>
> RVVi32:
> - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
>
> libavcodec/riscv/h264_chroma_init_riscv.c | 8 +
> libavcodec/riscv/h264_mc_chroma.S | 237 ++++++++++++++--------
> 2 files changed, 160 insertions(+), 85 deletions(-)
>
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
> index 7c905edfcd..9f95150ea3 100644
> --- a/libavcodec/riscv/h264_chroma_init_riscv.c
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -27,6 +27,10 @@
>
> void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
>
> av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> {
> @@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_get_rv_vlenb() >= 16) {
> c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
> + c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
> + c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
> + c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
> }
> #endif
> }
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> index 364bc3156e..ce99bda44d 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
> */
> #include "libavutil/riscv/asm.S"
>
> -.macro h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro do_chroma_mc type unroll
> csrw vxrm, zero
> slli t2, a5, 3
> mul t1, a5, a4
> @@ -30,94 +29,100 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> sub a7, a4, t1
> addi a6, a5, 64
> sub t0, t2, t1
> - vsetivli t3, 8, e8, m1, ta, mu
> + vsetvli t3, t6, e8, m1, ta, mu
> beqz t1, 2f
> blez a3, 8f
> li t4, 0
> li t2, 0
> li t5, 1
> addi a5, t3, 1
> - slli t3, a2, 2
> + slli t3, a2, (1 + \unroll)
> 1: # if (xy != 0)
> add a4, a1, t4
> vsetvli zero, a5, e8, m1, ta, ma
> + .ifc \unroll,1
> addi t2, t2, 4
> + .else
> + addi t2, t2, 2
> + .endif
> vle8.v v10, (a4)
> add a4, a4, a2
> vslide1down.vx v11, v10, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v8, v10, a6
> vwmaccu.vx v8, a7, v11
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v12, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> add a4, a4, a2
> vwmaccu.vx v8, t0, v12
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v13, v12, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v10, v12, a6
> vwmaccu.vx v8, t1, v13
> vwmaccu.vx v10, a7, v13
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v14, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> add a4, a4, a2
> vwmaccu.vx v10, t0, v14
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v15, v14, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v14, a6
> vwmaccu.vx v10, t1, v15
> vwmaccu.vx v12, a7, v15
> + vnclipu.wi v15, v8, 6
> + .ifc \type,avg
> + vle8.v v9, (a0)
> + vaaddu.vv v15, v15, v9
> + .endif
> + vse8.v v15, (a0)
> + add a0, a0, a2
> + vnclipu.wi v8, v10, 6
> + .ifc \type,avg
> + vle8.v v9, (a0)
> + vaaddu.vv v8, v8, v9
> + .endif
> + add t4, t4, t3
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .ifc \unroll,1
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v14, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> add a4, a4, a2
> vwmaccu.vx v12, t0, v14
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v15, v14, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v16, v14, a6
> vwmaccu.vx v12, t1, v15
> vwmaccu.vx v16, a7, v15
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v14, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> - add a4, a0, t4
> - add t4, t4, t3
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmaccu.vx v16, t0, v14
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v14, v14, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> - vnclipu.wi v15, v8, 6
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmaccu.vx v16, t1, v14
> - .ifc \type,avg
> - vle8.v v9, (a4)
> - vaaddu.vv v15, v15, v9
> - .endif
> - vse8.v v15, (a4)
> - add a4, a4, a2
> - vnclipu.wi v8, v10, 6
> - .ifc \type,avg
> - vle8.v v9, (a4)
> - vaaddu.vv v8, v8, v9
> - .endif
> - vse8.v v8, (a4)
> - add a4, a4, a2
> vnclipu.wi v8, v12, 6
> .ifc \type,avg
> - vle8.v v9, (a4)
> + vle8.v v9, (a0)
> vaaddu.vv v8, v8, v9
> .endif
> - vse8.v v8, (a4)
> - add a4, a4, a2
> + vse8.v v8, (a0)
> + add a0, a0, a2
> vnclipu.wi v8, v16, 6
> .ifc \type,avg
> - vle8.v v9, (a4)
> + vle8.v v9, (a0)
> vaaddu.vv v8, v8, v9
> .endif
> - vse8.v v8, (a4)
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .endif
> blt t2, a3, 1b
> j 8f
> 2:
> @@ -126,11 +131,15 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> blez a3, 8f
> li a4, 0
> li t1, 0
> - slli a7, a2, 2
> + slli a7, a2, (1 + \unroll)
> 3: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> add a5, a1, a4
> vsetvli zero, zero, e8, m1, ta, ma
> + .ifc \unroll,1
> addi t1, t1, 4
> + .else
> + addi t1, t1, 2
> + .endif
> vle8.v v8, (a5)
> add a5, a5, a2
> add t2, a5, a2
> @@ -141,42 +150,44 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> add t2, t2, a2
> add a5, t2, a2
> vwmaccu.vx v10, t0, v8
> - vle8.v v8, (t2)
> - vle8.v v14, (a5)
> - add a5, a0, a4
> add a4, a4, a7
> vwmaccu.vx v12, t0, v9
> vnclipu.wi v15, v10, 6
> vwmulu.vx v10, v9, a6
> + vnclipu.wi v9, v12, 6
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v15, v15, v16
> .endif
> - vse8.v v15, (a5)
> - add a5, a5, a2
> - vnclipu.wi v9, v12, 6
> - vwmaccu.vx v10, t0, v8
> - vwmulu.vx v12, v8, a6
> + vse8.v v15, (a0)
> + add a0, a0, a2
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v9, v9, v16
> .endif
> - vse8.v v9, (a5)
> - add a5, a5, a2
> + vse8.v v9, (a0)
> + add a0, a0, a2
> + .ifc \unroll,1
> + vle8.v v8, (t2)
> + vle8.v v14, (a5)
> + vwmaccu.vx v10, t0, v8
> + vwmulu.vx v12, v8, a6
> vnclipu.wi v8, v10, 6
> vwmaccu.vx v12, t0, v14
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v8, v8, v16
> .endif
> - vse8.v v8, (a5)
> - add a5, a5, a2
> + vse8.v v8, (a0)
> + add a0, a0, a2
> vnclipu.wi v8, v12, 6
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v8, v8, v16
> .endif
> - vse8.v v8, (a5)
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .endif
> blt t1, a3, 3b
> j 8f
> 4:
> @@ -186,87 +197,95 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> li a4, 0
> li t2, 0
> addi t0, t3, 1
> - slli t1, a2, 2
> + slli t1, a2, (1 + \unroll)
> 5: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> add a5, a1, a4
> vsetvli zero, t0, e8, m1, ta, ma
> + .ifc \unroll,1
> addi t2, t2, 4
> + .else
> + addi t2, t2, 2
> + .endif
> vle8.v v8, (a5)
> add a5, a5, a2
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v10, v8, a6
> vwmaccu.vx v10, a7, v9
> vsetvli zero, t0, e8, m1, ta, ma
> vle8.v v8, (a5)
> add a5, a5, a2
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v8, a6
> vwmaccu.vx v12, a7, v9
> + vnclipu.wi v16, v10, 6
> + .ifc \type,avg
> + vle8.v v18, (a0)
> + vaaddu.vv v16, v16, v18
> + .endif
> + vse8.v v16, (a0)
> + add a0, a0, a2
> + vnclipu.wi v10, v12, 6
> + .ifc \type,avg
> + vle8.v v18, (a0)
> + vaaddu.vv v10, v10, v18
> + .endif
> + add a4, a4, t1
> + vse8.v v10, (a0)
> + add a0, a0, a2
> + .ifc \unroll,1
> vsetvli zero, t0, e8, m1, ta, ma
> vle8.v v8, (a5)
> add a5, a5, a2
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v14, v8, a6
> vwmaccu.vx v14, a7, v9
> vsetvli zero, t0, e8, m1, ta, ma
> vle8.v v8, (a5)
> - add a5, a0, a4
> - add a4, a4, t1
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> - vnclipu.wi v16, v10, 6
> - .ifc \type,avg
> - vle8.v v18, (a5)
> - vaaddu.vv v16, v16, v18
> - .endif
> - vse8.v v16, (a5)
> - add a5, a5, a2
> - vnclipu.wi v10, v12, 6
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v8, a6
> - .ifc \type,avg
> - vle8.v v18, (a5)
> - vaaddu.vv v10, v10, v18
> - .endif
> - vse8.v v10, (a5)
> - add a5, a5, a2
> vnclipu.wi v8, v14, 6
> vwmaccu.vx v12, a7, v9
> .ifc \type,avg
> - vle8.v v18, (a5)
> + vle8.v v18, (a0)
> vaaddu.vv v8, v8, v18
> .endif
> - vse8.v v8, (a5)
> - add a5, a5, a2
> + vse8.v v8, (a0)
> + add a0, a0, a2
> vnclipu.wi v8, v12, 6
> .ifc \type,avg
> - vle8.v v18, (a5)
> + vle8.v v18, (a0)
> vaaddu.vv v8, v8, v18
> .endif
> - vse8.v v8, (a5)
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .endif
> blt t2, a3, 5b
> j 8f
> 6:
> blez a3, 8f
> li a4, 0
> li t2, 0
> - slli a7, a2, 2
> + slli a7, a2, (1 + \unroll)
> 7: # the final else, none of the above conditions are met
> add t0, a1, a4
> vsetvli zero, zero, e8, m1, ta, ma
> add a5, a0, a4
> add a4, a4, a7
> + .ifc \unroll,1
> addi t2, t2, 4
> + .else
> + addi t2, t2, 2
> + .endif
> vle8.v v8, (t0)
> add t0, t0, a2
> add t1, t0, a2
> vwmulu.vx v10, v8, a6
> vle8.v v8, (t0)
> add t0, t1, a2
> - vle8.v v9, (t1)
> - vle8.v v12, (t0)
> vnclipu.wi v13, v10, 6
> vwmulu.vx v10, v8, a6
> .ifc \type,avg
> @@ -276,13 +295,16 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> vse8.v v13, (a5)
> add a5, a5, a2
> vnclipu.wi v8, v10, 6
> - vwmulu.vx v10, v9, a6
> .ifc \type,avg
> vle8.v v18, (a5)
> vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> add a5, a5, a2
> + .ifc \unroll,1
> + vle8.v v9, (t1)
> + vle8.v v12, (t0)
> + vwmulu.vx v10, v9, a6
> vnclipu.wi v8, v10, 6
> vwmulu.vx v10, v12, a6
> .ifc \type,avg
> @@ -297,11 +319,56 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> + .endif
> blt t2, a3, 7b
> 8:
> ret
> -endfunc
> .endm
>
> -h264_chroma_mc8 put
> -h264_chroma_mc8 avg
> +func h264_put_chroma_mc_rvv, zve32x
> +11:
> + li a7, 3
> + blt a3, a7, 12f
> + do_chroma_mc put 1
> +12:
> + do_chroma_mc put 0
> +endfunc
> +
> +func h264_avg_chroma_mc_rvv, zve32x
> +21:
> + li a7, 3
> + blt a3, a7, 22f
> + do_chroma_mc avg 1
> +22:
> + do_chroma_mc avg 0
> +endfunc
> +
> +func h264_put_chroma_mc8_rvv, zve32x
> + li t6, 8
> + j 11b
> +endfunc
> +
> +func h264_put_chroma_mc4_rvv, zve32x
> + li t6, 4
> + j 11b
> +endfunc
> +
> +func h264_put_chroma_mc2_rvv, zve32x
> + li t6, 2
> + j 11b
> +endfunc
> +
> +func h264_avg_chroma_mc8_rvv, zve32x
> + li t6, 8
> + j 21b
> +endfunc
> +
> +func h264_avg_chroma_mc4_rvv, zve32x
> + li t6, 4
> + j 21b
> +endfunc
> +
> +func h264_avg_chroma_mc2_rvv, zve32x
> + li t6, 2
> + j 21b
> +endfunc
> --
> 2.17.1
>
>