[FFmpeg-devel] [PATCH 7/7] avcodec/riscv: add h264 qpel

J. Dekker jdek at itanimul.li
Tue Aug 13 17:03:36 EEST 2024


From: Niklas Haas <git at haasn.dev>

checkasm: bench runs 131072 (1 << 17)
avg_h264_qpel_4_mc00_8_c:                               37.6 ( 1.00x)
avg_h264_qpel_4_mc00_8_rvv_i32:                         27.4 ( 1.37x)
avg_h264_qpel_4_mc01_8_c:                              214.6 ( 1.00x)
avg_h264_qpel_4_mc01_8_rvv_i32:                         79.3 ( 2.70x)
avg_h264_qpel_4_mc02_8_c:                              214.8 ( 1.00x)
avg_h264_qpel_4_mc02_8_rvv_i32:                         79.3 ( 2.71x)
avg_h264_qpel_4_mc03_8_c:                              214.8 ( 1.00x)
avg_h264_qpel_4_mc03_8_rvv_i32:                         79.3 ( 2.71x)
avg_h264_qpel_4_mc10_8_c:                              173.1 ( 1.00x)
avg_h264_qpel_4_mc10_8_rvv_i32:                        120.8 ( 1.43x)
avg_h264_qpel_4_mc11_8_c:                              339.9 ( 1.00x)
avg_h264_qpel_4_mc11_8_rvv_i32:                        183.3 ( 1.85x)
avg_h264_qpel_4_mc12_8_c:                              537.6 ( 1.00x)
avg_h264_qpel_4_mc12_8_rvv_i32:                        339.9 ( 1.58x)
avg_h264_qpel_4_mc13_8_c:                              339.9 ( 1.00x)
avg_h264_qpel_4_mc13_8_rvv_i32:                        194.1 ( 1.75x)
avg_h264_qpel_4_mc20_8_c:                              141.8 ( 1.00x)
avg_h264_qpel_4_mc20_8_rvv_i32:                        121.1 ( 1.17x)
avg_h264_qpel_4_mc21_8_c:                              485.6 ( 1.00x)
avg_h264_qpel_4_mc21_8_rvv_i32:                        381.4 ( 1.27x)
avg_h264_qpel_4_mc22_8_c:                              350.1 ( 1.00x)
avg_h264_qpel_4_mc22_8_rvv_i32:                        266.9 ( 1.31x)
avg_h264_qpel_4_mc23_8_c:                              485.6 ( 1.00x)
avg_h264_qpel_4_mc23_8_rvv_i32:                        381.6 ( 1.27x)
avg_h264_qpel_4_mc30_8_c:                              173.1 ( 1.00x)
avg_h264_qpel_4_mc30_8_rvv_i32:                        131.6 ( 1.32x)
avg_h264_qpel_4_mc31_8_c:                              339.9 ( 1.00x)
avg_h264_qpel_4_mc31_8_rvv_i32:                        183.3 ( 1.85x)
avg_h264_qpel_4_mc32_8_c:                              537.9 ( 1.00x)
avg_h264_qpel_4_mc32_8_rvv_i32:                        339.9 ( 1.58x)
avg_h264_qpel_4_mc33_8_c:                              339.9 ( 1.00x)
avg_h264_qpel_4_mc33_8_rvv_i32:                        193.8 ( 1.75x)
avg_h264_qpel_8_mc00_8_c:                              110.6 ( 1.00x)
avg_h264_qpel_8_mc00_8_rvv_i32:                         48.1 ( 2.30x)
avg_h264_qpel_8_mc01_8_c:                              766.9 ( 1.00x)
avg_h264_qpel_8_mc01_8_rvv_i32:                        152.1 ( 5.04x)
avg_h264_qpel_8_mc02_8_c:                              766.9 ( 1.00x)
avg_h264_qpel_8_mc02_8_rvv_i32:                        141.8 ( 5.41x)
avg_h264_qpel_8_mc03_8_c:                              777.4 ( 1.00x)
avg_h264_qpel_8_mc03_8_rvv_i32:                        152.3 ( 5.10x)
avg_h264_qpel_8_mc10_8_c:                              620.9 ( 1.00x)
avg_h264_qpel_8_mc10_8_rvv_i32:                        235.6 ( 2.64x)
avg_h264_qpel_8_mc11_8_c:                             1204.6 ( 1.00x)
avg_h264_qpel_8_mc11_8_rvv_i32:                        360.6 ( 3.34x)
avg_h264_qpel_8_mc12_8_c:                             1912.6 ( 1.00x)
avg_h264_qpel_8_mc12_8_rvv_i32:                        558.4 ( 3.43x)
avg_h264_qpel_8_mc13_8_c:                             1214.6 ( 1.00x)
avg_h264_qpel_8_mc13_8_rvv_i32:                        360.6 ( 3.37x)
avg_h264_qpel_8_mc20_8_c:                              506.4 ( 1.00x)
avg_h264_qpel_8_mc20_8_rvv_i32:                        225.1 ( 2.25x)
avg_h264_qpel_8_mc21_8_c:                             1714.8 ( 1.00x)
avg_h264_qpel_8_mc21_8_rvv_i32:                        631.6 ( 2.72x)
avg_h264_qpel_8_mc22_8_c:                             1266.8 ( 1.00x)
avg_h264_qpel_8_mc22_8_rvv_i32:                        423.1 ( 2.99x)
avg_h264_qpel_8_mc23_8_c:                             1714.6 ( 1.00x)
avg_h264_qpel_8_mc23_8_rvv_i32:                        631.4 ( 2.72x)
avg_h264_qpel_8_mc30_8_c:                              610.6 ( 1.00x)
avg_h264_qpel_8_mc30_8_rvv_i32:                        235.6 ( 2.59x)
avg_h264_qpel_8_mc31_8_c:                             1214.6 ( 1.00x)
avg_h264_qpel_8_mc31_8_rvv_i32:                        350.1 ( 3.47x)
avg_h264_qpel_8_mc32_8_c:                             1902.3 ( 1.00x)
avg_h264_qpel_8_mc32_8_rvv_i32:                        558.6 ( 3.41x)
avg_h264_qpel_8_mc33_8_c:                             1214.8 ( 1.00x)
avg_h264_qpel_8_mc33_8_rvv_i32:                        360.6 ( 3.37x)
avg_h264_qpel_16_mc00_8_c:                             423.1 ( 1.00x)
avg_h264_qpel_16_mc00_8_rvv_i32:                        68.8 ( 6.15x)
avg_h264_qpel_16_mc01_8_c:                            2850.1 ( 1.00x)
avg_h264_qpel_16_mc01_8_rvv_i32:                       298.1 ( 9.56x)
avg_h264_qpel_16_mc02_8_c:                            2954.6 ( 1.00x)
avg_h264_qpel_16_mc02_8_rvv_i32:                       277.4 (10.65x)
avg_h264_qpel_16_mc03_8_c:                            2871.1 ( 1.00x)
avg_h264_qpel_16_mc03_8_rvv_i32:                       298.1 ( 9.63x)
avg_h264_qpel_16_mc10_8_c:                            2423.1 ( 1.00x)
avg_h264_qpel_16_mc10_8_rvv_i32:                       464.9 ( 5.21x)
avg_h264_qpel_16_mc11_8_c:                            4683.6 ( 1.00x)
avg_h264_qpel_16_mc11_8_rvv_i32:                       714.6 ( 6.55x)
avg_h264_qpel_16_mc12_8_c:                            7496.4 ( 1.00x)
avg_h264_qpel_16_mc12_8_rvv_i32:                      1037.6 ( 7.22x)
avg_h264_qpel_16_mc13_8_c:                            4642.1 ( 1.00x)
avg_h264_qpel_16_mc13_8_rvv_i32:                       704.4 ( 6.59x)
avg_h264_qpel_16_mc20_8_c:                            2069.1 ( 1.00x)
avg_h264_qpel_16_mc20_8_rvv_i32:                       443.9 ( 4.66x)
avg_h264_qpel_16_mc21_8_c:                            6808.6 ( 1.00x)
avg_h264_qpel_16_mc21_8_rvv_i32:                      1204.3 ( 5.65x)
avg_h264_qpel_16_mc22_8_c:                            5048.4 ( 1.00x)
avg_h264_qpel_16_mc22_8_rvv_i32:                       777.4 ( 6.49x)
avg_h264_qpel_16_mc23_8_c:                            6819.1 ( 1.00x)
avg_h264_qpel_16_mc23_8_rvv_i32:                      1214.8 ( 5.61x)
avg_h264_qpel_16_mc30_8_c:                            2412.8 ( 1.00x)
avg_h264_qpel_16_mc30_8_rvv_i32:                       464.9 ( 5.19x)
avg_h264_qpel_16_mc31_8_c:                            4662.9 ( 1.00x)
avg_h264_qpel_16_mc31_8_rvv_i32:                       714.6 ( 6.53x)
avg_h264_qpel_16_mc32_8_c:                            7516.9 ( 1.00x)
avg_h264_qpel_16_mc32_8_rvv_i32:                      1058.6 ( 7.10x)
avg_h264_qpel_16_mc33_8_c:                            4673.4 ( 1.00x)
avg_h264_qpel_16_mc33_8_rvv_i32:                       714.9 ( 6.54x)
put_h264_qpel_4_mc00_8_c:                               27.4 ( 1.00x)
put_h264_qpel_4_mc00_8_rvv_i32:                         16.9 ( 1.62x)
put_h264_qpel_4_mc01_8_c:                              214.6 ( 1.00x)
put_h264_qpel_4_mc01_8_rvv_i32:                         79.3 ( 2.70x)
put_h264_qpel_4_mc02_8_c:                              183.3 ( 1.00x)
put_h264_qpel_4_mc02_8_rvv_i32:                         79.3 ( 2.31x)
put_h264_qpel_4_mc03_8_c:                              204.3 ( 1.00x)
put_h264_qpel_4_mc03_8_rvv_i32:                         89.6 ( 2.28x)
put_h264_qpel_4_mc10_8_c:                              173.1 ( 1.00x)
put_h264_qpel_4_mc10_8_rvv_i32:                        120.8 ( 1.43x)
put_h264_qpel_4_mc11_8_c:                              339.6 ( 1.00x)
put_h264_qpel_4_mc11_8_rvv_i32:                        183.3 ( 1.85x)
put_h264_qpel_4_mc12_8_c:                              527.4 ( 1.00x)
put_h264_qpel_4_mc12_8_rvv_i32:                        339.9 ( 1.55x)
put_h264_qpel_4_mc13_8_c:                              329.4 ( 1.00x)
put_h264_qpel_4_mc13_8_rvv_i32:                        183.6 ( 1.79x)
put_h264_qpel_4_mc20_8_c:                              121.1 ( 1.00x)
put_h264_qpel_4_mc20_8_rvv_i32:                        110.6 ( 1.09x)
put_h264_qpel_4_mc21_8_c:                              464.6 ( 1.00x)
put_h264_qpel_4_mc21_8_rvv_i32:                        371.1 ( 1.25x)
put_h264_qpel_4_mc22_8_c:                              329.4 ( 1.00x)
put_h264_qpel_4_mc22_8_rvv_i32:                        256.4 ( 1.28x)
put_h264_qpel_4_mc23_8_c:                              475.1 ( 1.00x)
put_h264_qpel_4_mc23_8_rvv_i32:                        371.1 ( 1.28x)
put_h264_qpel_4_mc30_8_c:                              162.6 ( 1.00x)
put_h264_qpel_4_mc30_8_rvv_i32:                        121.1 ( 1.34x)
put_h264_qpel_4_mc31_8_c:                              339.9 ( 1.00x)
put_h264_qpel_4_mc31_8_rvv_i32:                        183.6 ( 1.85x)
put_h264_qpel_4_mc32_8_c:                              527.1 ( 1.00x)
put_h264_qpel_4_mc32_8_rvv_i32:                        339.9 ( 1.55x)
put_h264_qpel_4_mc33_8_c:                              339.9 ( 1.00x)
put_h264_qpel_4_mc33_8_rvv_i32:                        183.3 ( 1.85x)
put_h264_qpel_8_mc00_8_c:                               89.8 ( 1.00x)
put_h264_qpel_8_mc00_8_rvv_i32:                         37.6 ( 2.39x)
put_h264_qpel_8_mc01_8_c:                              725.1 ( 1.00x)
put_h264_qpel_8_mc01_8_rvv_i32:                        141.8 ( 5.11x)
put_h264_qpel_8_mc02_8_c:                              662.9 ( 1.00x)
put_h264_qpel_8_mc02_8_rvv_i32:                        131.3 ( 5.05x)
put_h264_qpel_8_mc03_8_c:                              735.6 ( 1.00x)
put_h264_qpel_8_mc03_8_rvv_i32:                        141.8 ( 5.19x)
put_h264_qpel_8_mc10_8_c:                              600.4 ( 1.00x)
put_h264_qpel_8_mc10_8_rvv_i32:                        225.1 ( 2.67x)
put_h264_qpel_8_mc11_8_c:                             1173.1 ( 1.00x)
put_h264_qpel_8_mc11_8_rvv_i32:                        339.9 ( 3.45x)
put_h264_qpel_8_mc12_8_c:                             1871.1 ( 1.00x)
put_h264_qpel_8_mc12_8_rvv_i32:                        548.1 ( 3.41x)
put_h264_qpel_8_mc13_8_c:                             1173.1 ( 1.00x)
put_h264_qpel_8_mc13_8_rvv_i32:                        339.9 ( 3.45x)
put_h264_qpel_8_mc20_8_c:                              454.6 ( 1.00x)
put_h264_qpel_8_mc20_8_rvv_i32:                        214.8 ( 2.12x)
put_h264_qpel_8_mc21_8_c:                             1683.6 ( 1.00x)
put_h264_qpel_8_mc21_8_rvv_i32:                        621.1 ( 2.71x)
put_h264_qpel_8_mc22_8_c:                             1162.6 ( 1.00x)
put_h264_qpel_8_mc22_8_rvv_i32:                        412.9 ( 2.82x)
put_h264_qpel_8_mc23_8_c:                             1673.3 ( 1.00x)
put_h264_qpel_8_mc23_8_rvv_i32:                        631.4 ( 2.65x)
put_h264_qpel_8_mc30_8_c:                              589.9 ( 1.00x)
put_h264_qpel_8_mc30_8_rvv_i32:                        225.3 ( 2.62x)
put_h264_qpel_8_mc31_8_c:                             1173.1 ( 1.00x)
put_h264_qpel_8_mc31_8_rvv_i32:                        339.9 ( 3.45x)
put_h264_qpel_8_mc32_8_c:                             1871.1 ( 1.00x)
put_h264_qpel_8_mc32_8_rvv_i32:                        548.1 ( 3.41x)
put_h264_qpel_8_mc33_8_c:                             1162.6 ( 1.00x)
put_h264_qpel_8_mc33_8_rvv_i32:                        350.1 ( 3.32x)
put_h264_qpel_16_mc00_8_c:                             308.6 ( 1.00x)
put_h264_qpel_16_mc00_8_rvv_i32:                        48.1 ( 6.42x)
put_h264_qpel_16_mc01_8_c:                            2746.1 ( 1.00x)
put_h264_qpel_16_mc01_8_rvv_i32:                       277.4 ( 9.90x)
put_h264_qpel_16_mc02_8_c:                            2558.6 ( 1.00x)
put_h264_qpel_16_mc02_8_rvv_i32:                       266.9 ( 9.59x)
put_h264_qpel_16_mc03_8_c:                            2756.6 ( 1.00x)
put_h264_qpel_16_mc03_8_rvv_i32:                       277.4 ( 9.94x)
put_h264_qpel_16_mc10_8_c:                            2287.8 ( 1.00x)
put_h264_qpel_16_mc10_8_rvv_i32:                       443.9 ( 5.15x)
put_h264_qpel_16_mc11_8_c:                            4558.6 ( 1.00x)
put_h264_qpel_16_mc11_8_rvv_i32:                       683.4 ( 6.67x)
put_h264_qpel_16_mc12_8_c:                            7381.9 ( 1.00x)
put_h264_qpel_16_mc12_8_rvv_i32:                      1027.1 ( 7.19x)
put_h264_qpel_16_mc13_8_c:                            4548.4 ( 1.00x)
put_h264_qpel_16_mc13_8_rvv_i32:                       683.6 ( 6.65x)
put_h264_qpel_16_mc20_8_c:                            1819.1 ( 1.00x)
put_h264_qpel_16_mc20_8_rvv_i32:                       423.4 ( 4.30x)
put_h264_qpel_16_mc21_8_c:                            6704.6 ( 1.00x)
put_h264_qpel_16_mc21_8_rvv_i32:                      1183.6 ( 5.66x)
put_h264_qpel_16_mc22_8_c:                            4641.9 ( 1.00x)
put_h264_qpel_16_mc22_8_rvv_i32:                       756.4 ( 6.14x)
put_h264_qpel_16_mc23_8_c:                            6725.6 ( 1.00x)
put_h264_qpel_16_mc23_8_rvv_i32:                      1183.6 ( 5.68x)
put_h264_qpel_16_mc30_8_c:                            2308.6 ( 1.00x)
put_h264_qpel_16_mc30_8_rvv_i32:                       443.9 ( 5.20x)
put_h264_qpel_16_mc31_8_c:                            4548.4 ( 1.00x)
put_h264_qpel_16_mc31_8_rvv_i32:                       704.4 ( 6.46x)
put_h264_qpel_16_mc32_8_c:                            7412.9 ( 1.00x)
put_h264_qpel_16_mc32_8_rvv_i32:                      1037.8 ( 7.14x)
put_h264_qpel_16_mc33_8_c:                            4558.6 ( 1.00x)
put_h264_qpel_16_mc33_8_rvv_i32:                       694.1 ( 6.57x)
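
These figures come from checkasm's benchmark mode; something along the lines
of the following should reproduce them (a sketch only -- the exact flags and
bench runs count on your setup may differ):

    make tests/checkasm/checkasm
    tests/checkasm/checkasm --bench --test=h264qpel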

Signed-off-by: Niklas Haas <git at haasn.dev>
Signed-off-by: J. Dekker <jdek at itanimul.li>
---
 libavcodec/h264qpel.c            |   2 +
 libavcodec/h264qpel.h            |   1 +
 libavcodec/riscv/Makefile        |   2 +
 libavcodec/riscv/h264qpel_init.c | 113 +++++++
 libavcodec/riscv/h264qpel_rvv.S  | 554 +++++++++++++++++++++++++++++++
 5 files changed, 672 insertions(+)
 create mode 100644 libavcodec/riscv/h264qpel_init.c
 create mode 100644 libavcodec/riscv/h264qpel_rvv.S

diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index 65fef03304..faca1e8953 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
     ff_h264qpel_init_arm(c, bit_depth);
 #elif ARCH_PPC
     ff_h264qpel_init_ppc(c, bit_depth);
+#elif ARCH_RISCV
+    ff_h264qpel_init_riscv(c, bit_depth);
 #elif ARCH_X86
     ff_h264qpel_init_x86(c, bit_depth);
 #elif ARCH_MIPS
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 0259e8de23..24baf826f9 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth);
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index b3a6b588c9..d4276521f3 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -33,6 +33,8 @@ RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
 OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
 RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
                               riscv/h264idct_rvv.o
+OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
+RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
 OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h264qpel_init.c b/libavcodec/riscv/h264qpel_init.c
new file mode 100644
index 0000000000..69a1345447
--- /dev/null
+++ b/libavcodec/riscv/h264qpel_init.c
@@ -0,0 +1,113 @@
+/*
+ * RISC-V optimised DSP functions
+ * Copyright (c) 2024 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/h264qpel.h"
+
+#define DECL_QPEL_OPS(OP, SIZE, EXT)                                                                       \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
+void ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+DECL_QPEL_OPS(put, 16, rvv256)
+DECL_QPEL_OPS(put, 8,  rvv256)
+DECL_QPEL_OPS(put, 4,  rvv256)
+
+DECL_QPEL_OPS(avg, 16, rvv256)
+DECL_QPEL_OPS(avg, 8,  rvv256)
+DECL_QPEL_OPS(avg, 4,  rvv256)
+
+DECL_QPEL_OPS(put, 16, rvv)
+DECL_QPEL_OPS(put, 8,  rvv)
+DECL_QPEL_OPS(put, 4,  rvv)
+
+DECL_QPEL_OPS(avg, 16, rvv)
+DECL_QPEL_OPS(avg, 8,  rvv)
+DECL_QPEL_OPS(avg, 4,  rvv)
+
+#define SET_QPEL_FNS(OP, IDX, SIZE, EXT)                                                        \
+do {                                                                                            \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 0] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 1] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 2] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 3] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 4] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 5] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 6] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 7] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 8] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][ 9] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][10] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][11] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][12] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][13] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][14] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT; \
+    c->OP ## _h264_qpel_pixels_tab[IDX][15] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT; \
+} while (0)
+
+av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+    if (flags & AV_CPU_FLAG_RVV_I32) {
+        const int vlen = 8 * ff_get_rv_vlenb();
+
+        switch (bit_depth) {
+        case 8:
+            if (vlen >= 256) {
+                SET_QPEL_FNS(put, 0, 16, rvv256);
+                SET_QPEL_FNS(put, 1, 8,  rvv256);
+                SET_QPEL_FNS(put, 2, 4,  rvv256);
+
+                SET_QPEL_FNS(avg, 0, 16, rvv256);
+                SET_QPEL_FNS(avg, 1, 8,  rvv256);
+                SET_QPEL_FNS(avg, 2, 4,  rvv256);
+            } else if (vlen >= 128) {
+                SET_QPEL_FNS(put, 0, 16, rvv);
+                SET_QPEL_FNS(put, 1, 8,  rvv);
+                SET_QPEL_FNS(put, 2, 4,  rvv);
+
+                SET_QPEL_FNS(avg, 0, 16, rvv);
+                SET_QPEL_FNS(avg, 1, 8,  rvv);
+                SET_QPEL_FNS(avg, 2, 4,  rvv);
+            }
+            break;
+        }
+    }
+#endif
+}
diff --git a/libavcodec/riscv/h264qpel_rvv.S b/libavcodec/riscv/h264qpel_rvv.S
new file mode 100644
index 0000000000..7713372f23
--- /dev/null
+++ b/libavcodec/riscv/h264qpel_rvv.S
@@ -0,0 +1,554 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Niklas Haas
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vnclipsu.wi      shifti, lmul, lmul2, vregs:vararg
+        vsetvli         zero, zero, e16, \lmul2, ta, ma
+    .irp x, \vregs
+        vmax.vx         \x, \x, zero
+    .endr
+        vsetvli         zero, zero, e8, \lmul, ta, ma
+    .irp x, \vregs
+        vnclipu.wi      \x, \x, \shifti
+    .endr
+.endm
+
+.macro lowpass_init      lmul, sizei, size, w0, w1, backup
+        vsetivli         zero, \sizei, e8, \lmul, ta, ma
+        csrwi            vxrm, 0
+        li               \size, \sizei
+    .ifnb \w0
+        li               \w0, 20
+        li               \w1, -5
+    .endif
+.endm
+
+        /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */
+.macro lowpass_h         vdst, src, w0, w1, tmp=t3, tmp2=t4
+        addi             \tmp, \src, 3
+        lbu              \tmp2, 2(\src)
+        vle8.v           v31, (\tmp)
+        lbu              \tmp, 1(\src)
+        vslide1up.vx     v30, v31, \tmp2
+        lbu              \tmp2, 0(\src)
+        vslide1up.vx     v29, v30, \tmp
+        lbu              \tmp, -1(\src)
+        vslide1up.vx     v28, v29, \tmp2
+        lbu              \tmp2, -2(\src)
+        vslide1up.vx     v27, v28, \tmp
+        vslide1up.vx     v26, v27, \tmp2
+        vwaddu.vv        \vdst, v26, v31
+        vwmaccu.vx       \vdst, \w0, v28
+        vwmaccu.vx       \vdst, \w0, v29
+        vwmaccsu.vx      \vdst, \w1, v27
+        vwmaccsu.vx      \vdst, \w1, v30
+.endm
+
+        /* output is unclipped */
+.macro lowpass_v         w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, signed=0
+    .if \signed
+        vwadd.vv         \vdst, \vsrc0, \vsrc5
+        vwmacc.vx        \vdst, \w0, \vsrc2
+        vwmacc.vx        \vdst, \w0, \vsrc3
+        vwmacc.vx        \vdst, \w1, \vsrc1
+        vwmacc.vx        \vdst, \w1, \vsrc4
+    .else
+        vwaddu.vv        \vdst, \vsrc0, \vsrc5
+        vwmaccu.vx       \vdst, \w0, \vsrc2
+        vwmaccu.vx       \vdst, \w0, \vsrc3
+        vwmaccsu.vx      \vdst, \w1, \vsrc1
+        vwmaccsu.vx      \vdst, \w1, \vsrc4
+    .endif
+.endm
+
+.macro qpel_mc00         op, dst, src, stride, size
+func ff_\op\()_h264_qpel_pixels, zve32x
+1:
+        add              t0, \stride, \src
+        add              t1, \stride, t0
+        add              t2, \stride, t1
+        vle8.v           v0, (\src)
+        vle8.v           v1, (t0)
+        vle8.v           v2, (t1)
+        vle8.v           v3, (t2)
+        addi             \size, \size, -4
+        add              \src, \stride, t2
+        add              t0, \stride, \dst
+        add              t1, \stride, t0
+        add              t2, \stride, t1
+    .ifc \op, avg
+        vle8.v           v4, (\dst)
+        vle8.v           v5, (t0)
+        vle8.v           v6, (t1)
+        vle8.v           v7, (t2)
+        vaaddu.vv        v0, v0, v4
+        vaaddu.vv        v1, v1, v5
+        vaaddu.vv        v2, v2, v6
+        vaaddu.vv        v3, v3, v7
+    .endif
+        vse8.v           v0, (\dst)
+        vse8.v           v1, (t0)
+        vse8.v           v2, (t1)
+        vse8.v           v3, (t2)
+        add              \dst, \stride, t2
+        bnez             \size, 1b
+        ret
+endfunc
+.endm
+
+        qpel_mc00        put, a0, a1, a2, a4
+        qpel_mc00        avg, a0, a1, a2, a4
+
+.macro qpel_lowpass      op, ext, lmul, lmul2, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride
+func ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x
+1:
+        add              t0, \src_stride, \src
+        add              t1, \src_stride, t0
+        add              t2, \src_stride, t1
+        lowpass_h        v0, \src, \w0, \w1
+        lowpass_h        v2, t0,   \w0, \w1
+        lowpass_h        v4, t1,   \w0, \w1
+        lowpass_h        v6, t2,   \w0, \w1
+        add              \src, \src_stride, t2
+        addi             \size, \size, -4
+        vnclipsu.wi      5, \lmul, \lmul2, v0, v2, v4, v6
+    .ifnb \src2
+        add              t0, \src2_stride, \src2
+        add              t1, \src2_stride, t0
+        add              t2, \src2_stride, t1
+        vle8.v           v8,  (\src2)
+        vle8.v           v10, (t0)
+        vle8.v           v12, (t1)
+        vle8.v           v14, (t2)
+        add              \src2, \dst_stride, t2
+        vaaddu.vv        v0, v0, v8
+        vaaddu.vv        v2, v2, v10
+        vaaddu.vv        v4, v4, v12
+        vaaddu.vv        v6, v6, v14
+    .endif
+        add              t0, \dst_stride, \dst
+        add              t1, \dst_stride, t0
+        add              t2, \dst_stride, t1
+    .ifc \op, avg
+        vle8.v           v1, (\dst)
+        vle8.v           v3, (t0)
+        vle8.v           v5, (t1)
+        vle8.v           v7, (t2)
+        vaaddu.vv        v0, v0, v1
+        vaaddu.vv        v2, v2, v3
+        vaaddu.vv        v4, v4, v5
+        vaaddu.vv        v6, v6, v7
+    .endif
+        vse8.v           v0, (\dst)
+        vse8.v           v2, (t0)
+        vse8.v           v4, (t1)
+        vse8.v           v6, (t2)
+        add              \dst, \dst_stride, t2
+        bnez             \size, 1b
+        ret
+endfunc
+
+func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x
+        sub              t0, \src, \src_stride
+        sub              t1, t0,   \src_stride
+        vle8.v           v2, (\src)
+        vle8.v           v1, (t0)
+        vle8.v           v0, (t1)
+        add              t0, \src, \src_stride
+        add              t1,   t0, \src_stride
+        add              \src, t1, \src_stride
+        vle8.v           v3, (t0)
+        vle8.v           v4, (t1)
+1:
+        add              t0, \src_stride, \src
+        add              t1, \src_stride, t0
+        add              t2, \src_stride, t1
+        vle8.v           v5, (\src)
+        vle8.v           v6, (t0)
+        vle8.v           v7, (t1)
+        vle8.v           v8, (t2)
+        add              \src, \src_stride, t2
+        lowpass_v        \w0, \w1, v24, v0, v1, v2, v3, v4, v5
+        lowpass_v        \w0, \w1, v26, v1, v2, v3, v4, v5, v6
+        lowpass_v        \w0, \w1, v28, v2, v3, v4, v5, v6, v7
+        lowpass_v        \w0, \w1, v30, v3, v4, v5, v6, v7, v8
+        addi             \size, \size, -4
+        vnclipsu.wi      5, \lmul, \lmul2, v24, v26, v28, v30
+    .ifnb \src2
+        add              t0, \src2_stride, \src2
+        add              t1, \src2_stride, t0
+        add              t2, \src2_stride, t1
+        vle8.v           v9, (\src2)
+        vle8.v           v10, (t0)
+        vle8.v           v11, (t1)
+        vle8.v           v12, (t2)
+        add              \src2, \src2_stride, t2
+        vaaddu.vv        v24, v24, v9
+        vaaddu.vv        v26, v26, v10
+        vaaddu.vv        v28, v28, v11
+        vaaddu.vv        v30, v30, v12
+    .endif
+        add              t0, \dst_stride, \dst
+        add              t1, \dst_stride, t0
+        add              t2, \dst_stride, t1
+    .ifc \op, avg
+        vle8.v           v9, (\dst)
+        vle8.v           v10, (t0)
+        vle8.v           v11, (t1)
+        vle8.v           v12, (t2)
+        vaaddu.vv        v24, v24, v9
+        vaaddu.vv        v26, v26, v10
+        vaaddu.vv        v28, v28, v11
+        vaaddu.vv        v30, v30, v12
+    .endif
+        vse8.v           v24, (\dst)
+        vse8.v           v26, (t0)
+        vse8.v           v28, (t1)
+        vse8.v           v30, (t2)
+        add              \dst, \dst_stride, t2
+        vmv.v.v          v0, v4
+        vmv.v.v          v1, v5
+        vmv.v.v          v2, v6
+        vmv.v.v          v3, v7
+        vmv.v.v          v4, v8
+        bnez             \size, 1b
+        ret
+endfunc
+
+func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x
+        sub              t0, \src, \src_stride
+        sub              t1, t0,   \src_stride
+        lowpass_h        v4, \src, \w0, \w1
+        lowpass_h        v2, t0,   \w0, \w1
+        lowpass_h        v0, t1,   \w0, \w1
+        add              t0, \src, \src_stride
+        add              t1,   t0, \src_stride
+        add              \src, t1, \src_stride
+        lowpass_h        v6, t0,   \w0, \w1
+        lowpass_h        v8, t1,   \w0, \w1
+1:
+        add              t0, \src_stride, \src
+        add              t1, \src_stride, t0
+        add              t2, \src_stride, t1
+        lowpass_h        v10, \src, \w0, \w1
+        lowpass_h        v12, t0,   \w0, \w1
+        lowpass_h        v14, t1,   \w0, \w1
+        lowpass_h        v16, t2,   \w0, \w1
+        vsetvli          zero, zero, e16, \lmul2, ta, ma
+        addi             \size, \size, -4
+        lowpass_v        \w0, \w1, v20, v0, v2,  v4,  v6,  v8, v10, signed=1
+        lowpass_v        \w0, \w1, v24, v2, v4,  v6,  v8, v10, v12, signed=1
+        lowpass_v        \w0, \w1, v28, v4, v6,  v8, v10, v12, v14, signed=1
+        vnclip.wi        v0, v20, 10
+        lowpass_v        \w0, \w1, v20, v6, v8, v10, v12, v14, v16, signed=1
+        vnclip.wi        v2, v24, 10
+        vnclip.wi        v4, v28, 10
+        vnclip.wi        v6, v20, 10
+        vmax.vx          v18, v0, zero
+        vmax.vx          v20, v2, zero
+        vmax.vx          v22, v4, zero
+        vmax.vx          v24, v6, zero
+        vmv.v.v          v0, v8
+        vmv.v.v          v2, v10
+        vmv.v.v          v4, v12
+        vmv.v.v          v6, v14
+        vmv.v.v          v8, v16
+        add              \src, \src_stride, t2
+        vsetvli          zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi       v18, v18, 0
+        vnclipu.wi       v20, v20, 0
+        vnclipu.wi       v22, v22, 0
+        vnclipu.wi       v24, v24, 0
+    .ifnb \src2
+        add              t0, \src2_stride, \src2
+        add              t1, \src2_stride, t0
+        add              t2, \src2_stride, t1
+        vle8.v           v26, (\src2)
+        vle8.v           v27, (t0)
+        vle8.v           v28, (t1)
+        vle8.v           v29, (t2)
+        add              \src2, \src2_stride, t2
+        vaaddu.vv        v18, v18, v26
+        vaaddu.vv        v20, v20, v27
+        vaaddu.vv        v22, v22, v28
+        vaaddu.vv        v24, v24, v29
+    .endif
+        add              t0, \dst_stride, \dst
+        add              t1, \dst_stride, t0
+        add              t2, \dst_stride, t1
+    .ifc \op, avg
+        vle8.v           v26, (\dst)
+        vle8.v           v27, (t0)
+        vle8.v           v28, (t1)
+        vle8.v           v29, (t2)
+        vaaddu.vv        v18, v18, v26
+        vaaddu.vv        v20, v20, v27
+        vaaddu.vv        v22, v22, v28
+        vaaddu.vv        v24, v24, v29
+    .endif
+        vse8.v           v18, (\dst)
+        vse8.v           v20, (t0)
+        vse8.v           v22, (t1)
+        vse8.v           v24, (t2)
+        add              \dst, \dst_stride, t2
+        bnez             \size, 1b
+        ret
+endfunc
+.endm
+
+/* Note: We could possibly specialize for the width 8 / width 4 cases by
+   loading 32-bit integers, but this would make the convolutions more
+   complicated to implement, so it would not necessarily be any faster. */
+
+.macro h264_qpel         lmul, lmul2
+        qpel_lowpass     put,    , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6
+        qpel_lowpass     put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6
+        qpel_lowpass     avg,    , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6
+        qpel_lowpass     avg, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6
+.endm
+
+        h264_qpel        m1,  m2
+        h264_qpel        mf2, m1
+        h264_qpel        mf4, mf2
+        h264_qpel        mf8, mf4
+
+.macro ff_h264_qpel_fns  op, lmul, sizei, ext=rvv, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride, tmp
+func ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size,
+        j                ff_\op\()_h264_qpel_pixels
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        mv               \src2, \src
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        j                ff_\op\()_h264_qpel_h_lowpass_\lmul\()
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        addi             \src2, \src, 1
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        mv               \src2, \src
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        add              \src2, \src, \src_stride
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        addi             \src, \src, 1
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        add              \src, \src, \src_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        add              \src, \src, \src_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        addi             \src, \src, 1
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        add              \src, \src, \src_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_v_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+
+func ff_\op\()_h264_qpel\sizei\()_mc32_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        addi             \src, \src, 1
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_v_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+.endm
+
+        ff_h264_qpel_fns put, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf4, 8,  rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf8, 4,  rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+
+        ff_h264_qpel_fns avg, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf4, 8,  rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf8, 4,  rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+
+        ff_h264_qpel_fns put, m1,  16, rvv,    a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf2, 8,  rvv,    a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf4, 4,  rvv,    a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+
+        ff_h264_qpel_fns avg, m1,  16, rvv,    a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf2, 8,  rvv,    a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf4, 4,  rvv,    a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
-- 
2.45.1


