[FFmpeg-devel] [PATCH v1 6/6] avcodec/hevc: Add asm opt for the following functions
jinbo
jinbo at loongson.cn
Fri Dec 22 12:52:14 EET 2023
tests/checkasm/checkasm: C LSX LASX
put_hevc_qpel_uni_h4_8_c: 5.7 1.2
put_hevc_qpel_uni_h6_8_c: 12.2 2.7
put_hevc_qpel_uni_h8_8_c: 21.5 3.2
put_hevc_qpel_uni_h12_8_c: 47.2 9.2 7.2
put_hevc_qpel_uni_h16_8_c: 87.0 11.7 9.0
put_hevc_qpel_uni_h24_8_c: 188.2 27.5 21.0
put_hevc_qpel_uni_h32_8_c: 335.2 46.7 28.5
put_hevc_qpel_uni_h48_8_c: 772.5 104.5 65.2
put_hevc_qpel_uni_h64_8_c: 1383.2 142.2 109.0
put_hevc_epel_uni_w_v4_8_c: 5.0 1.5
put_hevc_epel_uni_w_v6_8_c: 10.7 3.5 2.5
put_hevc_epel_uni_w_v8_8_c: 18.2 3.7 3.0
put_hevc_epel_uni_w_v12_8_c: 40.2 10.7 7.5
put_hevc_epel_uni_w_v16_8_c: 70.2 13.0 9.2
put_hevc_epel_uni_w_v24_8_c: 158.2 30.2 22.5
put_hevc_epel_uni_w_v32_8_c: 281.0 52.0 36.5
put_hevc_epel_uni_w_v48_8_c: 631.7 116.7 82.7
put_hevc_epel_uni_w_v64_8_c: 1108.2 207.5 142.2
put_hevc_epel_uni_w_h4_8_c: 4.7 1.2
put_hevc_epel_uni_w_h6_8_c: 9.7 3.5 2.7
put_hevc_epel_uni_w_h8_8_c: 17.2 4.2 3.5
put_hevc_epel_uni_w_h12_8_c: 38.0 11.5 7.2
put_hevc_epel_uni_w_h16_8_c: 69.2 14.5 9.2
put_hevc_epel_uni_w_h24_8_c: 152.0 34.7 22.5
put_hevc_epel_uni_w_h32_8_c: 271.0 58.0 40.0
put_hevc_epel_uni_w_h48_8_c: 597.5 136.7 95.0
put_hevc_epel_uni_w_h64_8_c: 1074.0 252.2 168.0
put_hevc_epel_bi_h4_8_c: 4.5 0.7
put_hevc_epel_bi_h6_8_c: 9.0 1.5
put_hevc_epel_bi_h8_8_c: 15.2 1.7
put_hevc_epel_bi_h12_8_c: 33.5 4.2 3.7
put_hevc_epel_bi_h16_8_c: 59.7 5.2 4.7
put_hevc_epel_bi_h24_8_c: 132.2 11.0
put_hevc_epel_bi_h32_8_c: 232.7 20.2 13.2
put_hevc_epel_bi_h48_8_c: 521.7 45.2 31.2
put_hevc_epel_bi_h64_8_c: 949.0 71.5 51.0
After this patch, the performance of decoding H265 4K 30FPS
30Mbps on 3A6000 with 8 threads improves 1fps(55fps-->56fps).
Change-Id: I8cc1e41daa63ca478039bc55d1ee8934a7423f51
---
libavcodec/loongarch/hevc_mc.S | 1991 ++++++++++++++++-
libavcodec/loongarch/hevcdsp_init_loongarch.c | 66 +
libavcodec/loongarch/hevcdsp_lasx.h | 54 +
libavcodec/loongarch/hevcdsp_lsx.h | 36 +-
4 files changed, 2144 insertions(+), 3 deletions(-)
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
index 0b0647546b..a0e5938fbd 100644
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -1784,8 +1784,12 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
endfunc
const shufb
- .byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6
- .byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10
+ .byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6 //offset 0: mask for epel_uni_w(128-bit)
+ .byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10 //offset 16: mask for epel_uni_w(256-bit)
+ .byte 0,1,2,3, 4,5,6,7 ,1,2,3,4, 5,6,7,8 //offset 32: mask for qpel_uni_h4
+ .byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8 //offset 48: mask for qpel_uni_h/v6/8...
+ .byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6, 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10 //offset 64 (32 bytes): mask for epel_uni_w_h16/24/32/48/64
+ .byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8, 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8 //offset 96 (32 bytes): mask for bi_epel_h16/24/32/48/64
endconst
.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
@@ -2584,3 +2588,1986 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
addi.d t5, t5, -1
bnez t5, .LOOP_HV64_LASX
endfunc
+
+/*
+ * void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, intptr_t mx, intptr_t my,
+ * int width)
+ */
+function ff_hevc_put_hevc_uni_qpel_h4_8_lsx //horizontal qpel, 4 pixels wide, two rows per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1 (a5 is the 6th argument)
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr5, t1, t0 //filter
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr1, t1 //32 in every halfword: rounding term for the >> 6 below
+ la.local t1, shufb
+ vld vr2, t1, 32 //mask0 0 1: 8 source bytes for pixel 0, then 8 for pixel 1
+ vaddi.bu vr3, vr2, 2 //mask1 2 3
+.LOOP_UNI_H4:
+ vld vr18, a2, 0 //row n
+ vldx vr19, a2, a3 //row n + 1
+ alsl.d a2, a3, a2, 1 //src += 2 * srcstride
+ vshuf.b vr6, vr18, vr18, vr2 //row n, pixels 0 1
+ vshuf.b vr7, vr18, vr18, vr3 //row n, pixels 2 3
+ vshuf.b vr8, vr19, vr19, vr2 //row n + 1, pixels 0 1
+ vshuf.b vr9, vr19, vr19, vr3 //row n + 1, pixels 2 3
+ vdp2.h.bu.b vr10, vr6, vr5 //source bytes times filter taps
+ vdp2.h.bu.b vr11, vr7, vr5
+ vdp2.h.bu.b vr12, vr8, vr5
+ vdp2.h.bu.b vr13, vr9, vr5
+ vhaddw.d.h vr10 //one-operand form is not a plain instruction; presumably a helper macro defined elsewhere - confirm
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vpickev.w vr10, vr11, vr10 //collect the sums of row n
+ vpickev.w vr11, vr13, vr12 //collect the sums of row n + 1
+ vpickev.h vr10, vr11, vr10 //both rows as halfwords in one register
+ vadd.h vr10, vr10, vr1 //+ 32
+ vsrai.h vr10, vr10, 6 //>> 6
+ vssrani.bu.h vr10, vr10, 0 //saturate to unsigned bytes
+ fst.s f10, a0, 0 //4 bytes of row n
+ vbsrl.v vr10, vr10, 4 //bring row n + 1 down to the low word
+ fstx.s f10, a0, a1 //4 bytes of row n + 1
+ alsl.d a0, a1, a0, 1 //dst += 2 * dststride
+ addi.d a4, a4, -2 //two rows done
+ bnez a4, .LOOP_UNI_H4
+endfunc
+
+.macro HEVC_UNI_QPEL_H8_LSX in0, out0 //reads vr0-vr8 (taps, rounding term, masks), clobbers vr10-vr13
+ vshuf.b vr10, \in0, \in0, vr5 //source bytes selected by mask vr5
+ vshuf.b vr11, \in0, \in0, vr6 //... by mask vr6
+ vshuf.b vr12, \in0, \in0, vr7 //... by mask vr7
+ vshuf.b vr13, \in0, \in0, vr8 //... by mask vr8
+ vdp2.h.bu.b \out0, vr10, vr0 //QPEL_FILTER(src, 1)
+ vdp2add.h.bu.b \out0, vr11, vr1 //accumulate the next tap pair
+ vdp2add.h.bu.b \out0, vr12, vr2
+ vdp2add.h.bu.b \out0, vr13, vr3
+ vadd.h \out0, \out0, vr4 //+ rounding term held in vr4
+ vsrai.h \out0, \out0, 6 //>> 6
+.endm
+
+.macro HEVC_UNI_QPEL_H16_LASX in0, out0 //reads xr0-xr8 (taps, rounding term, masks), clobbers xr10-xr13
+ xvshuf.b xr10, \in0, \in0, xr5 //source bytes selected by mask xr5
+ xvshuf.b xr11, \in0, \in0, xr6 //... by mask xr6
+ xvshuf.b xr12, \in0, \in0, xr7 //... by mask xr7
+ xvshuf.b xr13, \in0, \in0, xr8 //... by mask xr8
+ xvdp2.h.bu.b \out0, xr10, xr0 //QPEL_FILTER(src, 1)
+ xvdp2add.h.bu.b \out0, xr11, xr1 //accumulate the next tap pair
+ xvdp2add.h.bu.b \out0, xr12, xr2
+ xvdp2add.h.bu.b \out0, xr13, xr3
+ xvadd.h \out0, \out0, xr4 //+ rounding term held in xr4
+ xvsrai.h \out0, \out0, 6 //>> 6
+.endm
+
+function ff_hevc_put_hevc_uni_qpel_h6_8_lsx //horizontal qpel, 6 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H6:
+ vld vr9, a2, 0 //16 source bytes of this row
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //8 filtered halfwords
+ vssrani.bu.h vr14, vr14, 0 //saturate to unsigned bytes
+ fst.s f14, a0, 0 //pixels 0..3
+ vstelm.h vr14, a0, 4, 2 //pixels 4..5
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H6
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h8_8_lsx //horizontal qpel, 8 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H8:
+ vld vr9, a2, 0 //16 source bytes of this row
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //8 filtered halfwords
+ vssrani.bu.h vr14, vr14, 0 //saturate to unsigned bytes
+ fst.d f14, a0, 0 //pixels 0..7
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H8
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h12_8_lsx //horizontal qpel, 12 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H12:
+ vld vr9, a2, 0
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //pixels 0..7
+ vld vr9, a2, 8
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr15 //pixels 8..15 (only 8..11 are stored)
+ vssrani.bu.h vr15, vr14, 0 //pack to unsigned bytes: vr14 in the low half, vr15 in the high half
+ fst.d f15, a0, 0 //pixels 0..7
+ vstelm.w vr15, a0, 8, 2 //pixels 8..11
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H12
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h12_8_lasx //horizontal qpel, 12 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ xvrepl128vei.h xr1, xr0, 1 //cd...
+ xvrepl128vei.h xr2, xr0, 2 //ef...
+ xvrepl128vei.h xr3, xr0, 3 //gh...
+ xvrepl128vei.h xr0, xr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ xvreplgr2vr.h xr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ xvreplve0.q xr5, xr5 //same mask in both lanes
+ xvaddi.bu xr6, xr5, 2 //mask advanced by 2 bytes
+ xvaddi.bu xr7, xr5, 4 //advanced by 4 bytes
+ xvaddi.bu xr8, xr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H12_LASX:
+ xvld xr9, a2, 0
+ add.d a2, a2, a3 //next source row
+ xvpermi.d xr9, xr9, 0x94 //rearrange data: low lane = bytes 0..15, high lane = bytes 8..23
+ HEVC_UNI_QPEL_H16_LASX xr9, xr14 //low lane pixels 0..7, high lane pixels 8..15
+ xvpermi.q xr15, xr14, 0x01 //high lane of xr14 -> low lane of xr15
+ vssrani.bu.h vr15, vr14, 0 //pack both halves to unsigned bytes
+ fst.d f15, a0, 0 //pixels 0..7
+ vstelm.w vr15, a0, 8, 2 //pixels 8..11
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H12_LASX
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h16_8_lsx //horizontal qpel, 16 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H16:
+ vld vr9, a2, 0
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //pixels 0..7
+ vld vr9, a2, 8
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr15 //pixels 8..15
+ vssrani.bu.h vr15, vr14, 0 //pack to unsigned bytes: vr14 in the low half, vr15 in the high half
+ vst vr15, a0, 0 //pixels 0..15
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H16
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h16_8_lasx //horizontal qpel, 16 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ xvrepl128vei.h xr1, xr0, 1 //cd...
+ xvrepl128vei.h xr2, xr0, 2 //ef...
+ xvrepl128vei.h xr3, xr0, 3 //gh...
+ xvrepl128vei.h xr0, xr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ xvreplgr2vr.h xr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ xvreplve0.q xr5, xr5 //same mask in both lanes
+ xvaddi.bu xr6, xr5, 2 //mask advanced by 2 bytes
+ xvaddi.bu xr7, xr5, 4 //advanced by 4 bytes
+ xvaddi.bu xr8, xr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H16_LASX:
+ xvld xr9, a2, 0
+ add.d a2, a2, a3 //next source row
+ xvpermi.d xr9, xr9, 0x94 //rearrange data: low lane = bytes 0..15, high lane = bytes 8..23
+ HEVC_UNI_QPEL_H16_LASX xr9, xr14 //low lane pixels 0..7, high lane pixels 8..15
+ xvpermi.q xr15, xr14, 0x01 //high lane of xr14 -> low lane of xr15
+ vssrani.bu.h vr15, vr14, 0 //pack both halves to unsigned bytes
+ vst vr15, a0, 0 //pixels 0..15
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H16_LASX
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h24_8_lsx //horizontal qpel, 24 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H24:
+ vld vr9, a2, 0
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //pixels 0..7
+ vld vr9, a2, 8
+ HEVC_UNI_QPEL_H8_LSX vr9, vr15 //pixels 8..15
+ vld vr9, a2, 16
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr16 //pixels 16..23
+ vssrani.bu.h vr15, vr14, 0 //pack pixels 0..15 to unsigned bytes
+ vssrani.bu.h vr16, vr16, 0 //pack pixels 16..23 to unsigned bytes
+ vst vr15, a0, 0 //pixels 0..15
+ fst.d f16, a0, 16 //pixels 16..23
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H24
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h24_8_lasx //horizontal qpel, 24 pixels wide: 16 via LASX plus 8 via the LSX macro
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ xvrepl128vei.h xr1, xr0, 1 //cd...
+ xvrepl128vei.h xr2, xr0, 2 //ef...
+ xvrepl128vei.h xr3, xr0, 3 //gh...
+ xvrepl128vei.h xr0, xr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ xvreplgr2vr.h xr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ xvreplve0.q xr5, xr5 //same mask in both lanes
+ xvaddi.bu xr6, xr5, 2 //mask advanced by 2 bytes
+ xvaddi.bu xr7, xr5, 4 //advanced by 4 bytes
+ xvaddi.bu xr8, xr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H24_LASX:
+ xvld xr9, a2, 0
+ xvpermi.q xr19, xr9, 0x01 //16...23: source bytes 16..31 moved to the low lane of xr19
+ add.d a2, a2, a3 //next source row
+ xvpermi.d xr9, xr9, 0x94 //rearrange data: low lane = bytes 0..15, high lane = bytes 8..23
+ HEVC_UNI_QPEL_H16_LASX xr9, xr14 //low lane pixels 0..7, high lane pixels 8..15
+ xvpermi.q xr15, xr14, 0x01 //high lane of xr14 -> low lane of xr15
+ vssrani.bu.h vr15, vr14, 0 //pack pixels 0..15 to unsigned bytes
+ vst vr15, a0, 0 //pixels 0..15
+ HEVC_UNI_QPEL_H8_LSX vr19, vr16 //pixels 16..23, using the low lanes of xr0-xr8
+ vssrani.bu.h vr16, vr16, 0 //pack to unsigned bytes
+ fst.d f16, a0, 16 //pixels 16..23
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H24_LASX
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h32_8_lsx //horizontal qpel, 32 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H32:
+ vld vr9, a2, 0
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //pixels 0..7
+ vld vr9, a2, 8
+ HEVC_UNI_QPEL_H8_LSX vr9, vr15 //pixels 8..15
+ vld vr9, a2, 16
+ HEVC_UNI_QPEL_H8_LSX vr9, vr16 //pixels 16..23
+ vld vr9, a2, 24
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr17 //pixels 24..31
+ vssrani.bu.h vr15, vr14, 0 //pack pixels 0..15 to unsigned bytes
+ vssrani.bu.h vr17, vr16, 0 //pack pixels 16..31 to unsigned bytes
+ vst vr15, a0, 0
+ vst vr17, a0, 16
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H32
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h32_8_lasx //horizontal qpel, 32 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ xvrepl128vei.h xr1, xr0, 1 //cd...
+ xvrepl128vei.h xr2, xr0, 2 //ef...
+ xvrepl128vei.h xr3, xr0, 3 //gh...
+ xvrepl128vei.h xr0, xr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ xvreplgr2vr.h xr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ xvreplve0.q xr5, xr5 //same mask in both lanes
+ xvaddi.bu xr6, xr5, 2 //mask advanced by 2 bytes
+ xvaddi.bu xr7, xr5, 4 //advanced by 4 bytes
+ xvaddi.bu xr8, xr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H32_LASX:
+ xvld xr9, a2, 0
+ xvpermi.d xr9, xr9, 0x94 //low lane = bytes 0..15, high lane = bytes 8..23 of this load
+ HEVC_UNI_QPEL_H16_LASX xr9, xr14 //pixels 0..15
+ xvld xr9, a2, 16
+ xvpermi.d xr9, xr9, 0x94
+ HEVC_UNI_QPEL_H16_LASX xr9, xr15 //pixels 16..31
+ add.d a2, a2, a3 //next source row
+ xvssrani.bu.h xr15, xr14, 0 //pack to unsigned bytes, lane by lane
+ xvpermi.d xr15, xr15, 0xd8 //reorder the 64-bit pieces as 0,2,1,3 to restore pixel order
+ xvst xr15, a0, 0 //pixels 0..31
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H32_LASX
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h48_8_lsx //horizontal qpel, 48 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ vreplvei.h vr1, vr0, 1 //cd...
+ vreplvei.h vr2, vr0, 2 //ef...
+ vreplvei.h vr3, vr0, 3 //gh...
+ vreplvei.h vr0, vr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ vreplgr2vr.h vr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr6, vr5, 2 //same mask advanced by 2 bytes
+ vaddi.bu vr7, vr5, 4 //advanced by 4 bytes
+ vaddi.bu vr8, vr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H48:
+ vld vr9, a2, 0
+ HEVC_UNI_QPEL_H8_LSX vr9, vr14 //pixels 0..7
+ vld vr9, a2, 8
+ HEVC_UNI_QPEL_H8_LSX vr9, vr15 //pixels 8..15
+ vld vr9, a2, 16
+ HEVC_UNI_QPEL_H8_LSX vr9, vr16 //pixels 16..23
+ vld vr9, a2, 24
+ HEVC_UNI_QPEL_H8_LSX vr9, vr17 //pixels 24..31
+ vld vr9, a2, 32
+ HEVC_UNI_QPEL_H8_LSX vr9, vr18 //pixels 32..39
+ vld vr9, a2, 40
+ add.d a2, a2, a3 //next source row
+ HEVC_UNI_QPEL_H8_LSX vr9, vr19 //pixels 40..47
+ vssrani.bu.h vr15, vr14, 0 //pack pixels 0..15 to unsigned bytes
+ vssrani.bu.h vr17, vr16, 0 //pack pixels 16..31
+ vssrani.bu.h vr19, vr18, 0 //pack pixels 32..47
+ vst vr15, a0, 0
+ vst vr17, a0, 16
+ vst vr19, a0, 32
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H48
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h48_8_lasx //horizontal qpel, 48 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ xvrepl128vei.h xr1, xr0, 1 //cd...
+ xvrepl128vei.h xr2, xr0, 2 //ef...
+ xvrepl128vei.h xr3, xr0, 3 //gh...
+ xvrepl128vei.h xr0, xr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ xvreplgr2vr.h xr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ xvreplve0.q xr5, xr5 //same mask in both lanes
+ xvaddi.bu xr6, xr5, 2 //mask advanced by 2 bytes
+ xvaddi.bu xr7, xr5, 4 //advanced by 4 bytes
+ xvaddi.bu xr8, xr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H48_LASX:
+ xvld xr9, a2, 0
+ xvpermi.d xr9, xr9, 0x94 //low lane = bytes 0..15, high lane = bytes 8..23 of this load
+ HEVC_UNI_QPEL_H16_LASX xr9, xr14 //pixels 0..15
+ xvld xr9, a2, 16
+ xvpermi.d xr9, xr9, 0x94
+ HEVC_UNI_QPEL_H16_LASX xr9, xr15 //pixels 16..31
+ xvld xr9, a2, 32
+ xvpermi.d xr9, xr9, 0x94
+ HEVC_UNI_QPEL_H16_LASX xr9, xr16 //pixels 32..47
+ add.d a2, a2, a3 //next source row
+ xvssrani.bu.h xr15, xr14, 0 //pack pixels 0..31 to unsigned bytes, lane by lane
+ xvpermi.d xr15, xr15, 0xd8 //reorder the 64-bit pieces as 0,2,1,3 to restore pixel order
+ xvst xr15, a0, 0 //pixels 0..31
+ xvpermi.q xr17, xr16, 0x01 //high lane of xr16 -> low lane of xr17
+ vssrani.bu.h vr17, vr16, 0 //pack pixels 32..47 to unsigned bytes
+ vst vr17, a0, 32 //pixels 32..47
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H48_LASX
+endfunc
+
+function ff_hevc_put_hevc_uni_qpel_h64_8_lasx //horizontal qpel, 64 pixels wide, one row per loop pass
+ addi.d t0, a5, -1 //t0 = mx - 1
+ slli.w t0, t0, 4 //* 16: byte offset into ff_hevc_qpel_filters
+ la.local t1, ff_hevc_qpel_filters
+ vldx vr0, t1, t0 //filter abcdefgh
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ xvrepl128vei.h xr1, xr0, 1 //cd...
+ xvrepl128vei.h xr2, xr0, 2 //ef...
+ xvrepl128vei.h xr3, xr0, 3 //gh...
+ xvrepl128vei.h xr0, xr0, 0 //ab...
+ addi.d a2, a2, -3 //src -= 3
+ addi.w t1, zero, 32
+ xvreplgr2vr.h xr4, t1 //32 in every halfword: rounding term for the >> 6
+ la.local t1, shufb
+ vld vr5, t1, 48 //byte-pair mask at shufb + 48
+ xvreplve0.q xr5, xr5 //same mask in both lanes
+ xvaddi.bu xr6, xr5, 2 //mask advanced by 2 bytes
+ xvaddi.bu xr7, xr5, 4 //advanced by 4 bytes
+ xvaddi.bu xr8, xr5, 6 //advanced by 6 bytes
+.LOOP_UNI_H64_LASX:
+ xvld xr9, a2, 0
+ xvpermi.d xr9, xr9, 0x94 //low lane = bytes 0..15, high lane = bytes 8..23 of this load
+ HEVC_UNI_QPEL_H16_LASX xr9, xr14 //pixels 0..15
+ xvld xr9, a2, 16
+ xvpermi.d xr9, xr9, 0x94
+ HEVC_UNI_QPEL_H16_LASX xr9, xr15 //pixels 16..31
+ xvld xr9, a2, 32
+ xvpermi.d xr9, xr9, 0x94
+ HEVC_UNI_QPEL_H16_LASX xr9, xr16 //pixels 32..47
+ xvld xr9, a2, 48
+ xvpermi.d xr9, xr9, 0x94
+ HEVC_UNI_QPEL_H16_LASX xr9, xr17 //pixels 48..63
+ add.d a2, a2, a3 //next source row
+ xvssrani.bu.h xr15, xr14, 0 //pack pixels 0..31 to unsigned bytes, lane by lane
+ xvpermi.d xr15, xr15, 0xd8 //reorder the 64-bit pieces as 0,2,1,3 to restore pixel order
+ xvst xr15, a0, 0 //pixels 0..31
+ xvssrani.bu.h xr17, xr16, 0 //pack pixels 32..63
+ xvpermi.d xr17, xr17, 0xd8
+ xvst xr17, a0, 32 //pixels 32..63
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_H64_LASX
+endfunc
+
+/*
+ * void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
+function ff_hevc_put_hevc_epel_uni_w_v4_8_lsx //vertical weighted epel, 4 pixels wide, two rows per loop pass
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ fld.s f6, a2, 0 //0
+ fldx.s f7, a2, a3 //1
+ fldx.s f8, a2, t0 //2
+ add.d a2, a2, t1 //src now points at row 3
+ vilvl.b vr6, vr7, vr6 //interleave rows 0 and 1
+ vilvl.b vr7, vr8, vr8 //row 2 twice; the duplicate is a placeholder overwritten in the loop
+ vilvl.h vr6, vr7, vr6 //each word: rows 0, 1, 2, 2 of one column
+ vreplvei.w vr0, vr0, 0 //the 4 filter taps in every word
+.LOOP_UNI_V4:
+ fld.s f9, a2, 0 //3
+ fldx.s f10, a2, a3 //4
+ add.d a2, a2, t0 //src += 2 * stride
+ vextrins.b vr6, vr9, 0x30 //insert the 3rd load into byte 3 of every word
+ vextrins.b vr6, vr9, 0x71
+ vextrins.b vr6, vr9, 0xb2
+ vextrins.b vr6, vr9, 0xf3
+ vbsrl.v vr7, vr6, 1 //shift by one byte so each word starts one row later
+ vextrins.b vr7, vr10, 0x30 //insert the 4th load
+ vextrins.b vr7, vr10, 0x71
+ vextrins.b vr7, vr10, 0xb2
+ vextrins.b vr7, vr10, 0xf3
+ vdp2.h.bu.b vr8, vr6, vr0 //EPEL_FILTER(src, stride)
+ vdp2.h.bu.b vr9, vr7, vr0
+ vhaddw.w.h vr10, vr8, vr8 //add the two partial sums of each column
+ vhaddw.w.h vr11, vr9, vr9
+ vmulwev.w.h vr10, vr10, vr1 //EPEL_FILTER(src, stride) * wx
+ vmulwev.w.h vr11, vr11, vr1
+ vadd.w vr10, vr10, vr2 // + offset
+ vadd.w vr11, vr11, vr2
+ vsra.w vr10, vr10, vr3 // >> shift
+ vsra.w vr11, vr11, vr3
+ vadd.w vr10, vr10, vr4 // + ox
+ vadd.w vr11, vr11, vr4
+ vssrani.h.w vr11, vr10, 0 //both rows narrowed to halfwords with saturation
+ vssrani.bu.h vr10, vr11, 0 //then to unsigned bytes; the low 8 bytes hold the two rows
+ vbsrl.v vr6, vr7, 1 //advance the row window for the next pass
+ fst.s f10, a0, 0 //first output row
+ vbsrl.v vr10, vr10, 4
+ fstx.s f10, a0, a1 //second output row
+ alsl.d a0, a1, a0, 1 //dst += 2 * dststride
+ addi.d a4, a4, -2 //two rows done
+ bnez a4, .LOOP_UNI_V4
+endfunc
+
+.macro CALC_EPEL_FILTER_LSX out0, out1 //inputs vr10/vr11 (byte pairs), taps vr0/vr5, weights vr1-vr4; clobbers vr12, vr13
+ vdp2.h.bu.b vr12, vr10, vr0 //EPEL_FILTER(src, stride)
+ vdp2add.h.bu.b vr12, vr11, vr5 //add the second tap pair
+ vexth.w.h vr13, vr12 //upper four halfwords widened to words
+ vsllwil.w.h vr12, vr12, 0 //lower four halfwords widened to words
+ vmulwev.w.h vr12, vr12, vr1 //EPEL_FILTER(src, stride) * wx
+ vmulwev.w.h vr13, vr13, vr1 //EPEL_FILTER(src, stride) * wx
+ vadd.w vr12, vr12, vr2 // + offset
+ vadd.w vr13, vr13, vr2
+ vsra.w vr12, vr12, vr3 // >> shift
+ vsra.w vr13, vr13, vr3
+ vadd.w \out0, vr12, vr4 // + ox
+ vadd.w \out1, vr13, vr4
+.endm
+
+.macro CALC_EPEL_FILTER_LASX out0 //input xr12 (4 source bytes per word), taps xr0, weights xr1-xr4; clobbers xr11, xr12
+ xvdp2.h.bu.b xr11, xr12, xr0 //EPEL_FILTER(src, stride)
+ xvhaddw.w.h xr12, xr11, xr11 //add the two partial sums of each word
+ xvmulwev.w.h xr12, xr12, xr1 //EPEL_FILTER(src, stride) * wx
+ xvadd.w xr12, xr12, xr2 // + offset
+ xvsra.w xr12, xr12, xr3 // >> shift
+ xvadd.w \out0, xr12, xr4 // + ox
+.endm
+
+//w makes the loop label unique and is also the condition of the ".if" statement.
+.macro PUT_HEVC_EPEL_UNI_W_V8_LSX w //expects t0 = stride * 2, t1 = stride * 3, taps in vr0/vr5
+ fld.d f6, a2, 0 //0
+ fldx.d f7, a2, a3 //1
+ fldx.d f8, a2, t0 //2
+ add.d a2, a2, t1 //src now points at row 3
+.LOOP_UNI_V8_\w:
+ fld.d f9, a2, 0 // 3
+ add.d a2, a2, a3
+ vilvl.b vr10, vr7, vr6 //rows 0 and 1 interleaved per column
+ vilvl.b vr11, vr9, vr8 //rows 2 and 3 interleaved per column
+ vaddi.bu vr6, vr7, 0 //back up previous value
+ vaddi.bu vr7, vr8, 0
+ vaddi.bu vr8, vr9, 0
+ CALC_EPEL_FILTER_LSX vr12, vr13
+ vssrani.h.w vr13, vr12, 0 //8 results narrowed to halfwords with saturation
+ vssrani.bu.h vr13, vr13, 0 //then to unsigned bytes
+.if \w < 8 //6-wide case: store 4 + 2 bytes
+ fst.s f13, a0, 0
+ vstelm.h vr13, a0, 4, 2
+.else
+ fst.d f13, a0, 0 //8 bytes
+.endif
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_V8_\w
+.endm
+
+//w makes the loop label unique and is also the condition of the ".if" statement.
+.macro PUT_HEVC_EPEL_UNI_W_V8_LASX w //expects t0 = stride * 2, t1 = stride * 3, the 4 taps in every word of xr0
+ fld.d f6, a2, 0 //0
+ fldx.d f7, a2, a3 //1
+ fldx.d f8, a2, t0 //2
+ add.d a2, a2, t1 //src now points at row 3
+.LOOP_UNI_V8_LASX_\w:
+ fld.d f9, a2, 0 // 3
+ add.d a2, a2, a3
+ vilvl.b vr10, vr7, vr6 //rows 0 and 1 interleaved per column
+ vilvl.b vr11, vr9, vr8 //rows 2 and 3 interleaved per column
+ xvilvl.h xr12, xr11, xr10 //low lane: rows 0..3 of columns 0..3
+ xvilvh.h xr13, xr11, xr10 //low lane: rows 0..3 of columns 4..7
+ xvpermi.q xr12, xr13, 0x02 //xr12 = columns 0..3 in the low lane, 4..7 in the high lane
+ vaddi.bu vr6, vr7, 0 //back up previous value
+ vaddi.bu vr7, vr8, 0
+ vaddi.bu vr8, vr9, 0
+ CALC_EPEL_FILTER_LASX xr12
+ xvpermi.q xr13, xr12, 0x01 //high lane of xr12 -> low lane of xr13
+ vssrani.h.w vr13, vr12, 0 //8 results narrowed to halfwords with saturation
+ vssrani.bu.h vr13, vr13, 0 //then to unsigned bytes
+.if \w < 8 //6-wide case: store 4 + 2 bytes
+ fst.s f13, a0, 0
+ vstelm.h vr13, a0, 4, 2
+.else
+ fst.d f13, a0, 0 //8 bytes
+.endif
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_V8_LASX_\w
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v6_8_lsx //vertical weighted epel, 6 pixels wide
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ PUT_HEVC_EPEL_UNI_W_V8_LSX 6 //6 < 8 selects the 4 + 2 byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v6_8_lasx //vertical weighted epel, 6 pixels wide
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //the 4 taps in every word
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ PUT_HEVC_EPEL_UNI_W_V8_LASX 6 //6 < 8 selects the 4 + 2 byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v8_8_lsx //vertical weighted epel, 8 pixels wide
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ PUT_HEVC_EPEL_UNI_W_V8_LSX 8 //full 8-byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v8_8_lasx //vertical weighted epel, 8 pixels wide
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //the 4 taps in every word
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ PUT_HEVC_EPEL_UNI_W_V8_LASX 8 //full 8-byte store
+endfunc
+
+//w makes the loop label unique and is also the condition of the ".if" statement.
+.macro PUT_HEVC_EPEL_UNI_W_V16_LSX w //expects t0 = stride * 2, t1 = stride * 3, taps in vr0/vr5
+ vld vr6, a2, 0 //0
+ vldx vr7, a2, a3 //1
+ vldx vr8, a2, t0 //2
+ add.d a2, a2, t1 //src now points at row 3
+.LOOP_UNI_V16_\w:
+ vld vr9, a2, 0 //3
+ add.d a2, a2, a3
+ vilvl.b vr10, vr7, vr6 //columns 0..7: rows 0 and 1 interleaved
+ vilvl.b vr11, vr9, vr8 //columns 0..7: rows 2 and 3 interleaved
+ CALC_EPEL_FILTER_LSX vr14, vr15
+ vilvh.b vr10, vr7, vr6 //columns 8..15: rows 0 and 1 interleaved
+ vilvh.b vr11, vr9, vr8 //columns 8..15: rows 2 and 3 interleaved
+ CALC_EPEL_FILTER_LSX vr16, vr17
+ vssrani.h.w vr15, vr14, 0 //columns 0..7 as halfwords
+ vssrani.h.w vr17, vr16, 0 //columns 8..15 as halfwords
+ vssrani.bu.h vr17, vr15, 0 //all 16 columns as unsigned bytes
+ vaddi.bu vr6, vr7, 0 //back up previous value
+ vaddi.bu vr7, vr8, 0
+ vaddi.bu vr8, vr9, 0
+.if \w < 16 //12-wide case: store 8 + 4 bytes
+ fst.d f17, a0, 0
+ vstelm.w vr17, a0, 8, 2
+.else
+ vst vr17, a0, 0 //16 bytes
+.endif
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_V16_\w
+.endm
+
+//w makes the loop label unique and is also the condition of the ".if" statement.
+.macro PUT_HEVC_EPEL_UNI_W_V16_LASX w //expects t0 = stride * 2, t1 = stride * 3, taps in xr0/xr5
+ vld vr6, a2, 0 //0
+ vldx vr7, a2, a3 //1
+ vldx vr8, a2, t0 //2
+ add.d a2, a2, t1 //src now points at row 3
+.LOOP_UNI_V16_LASX_\w:
+ vld vr9, a2, 0 //3
+ add.d a2, a2, a3
+ xvilvl.b xr10, xr7, xr6 //low lane: columns 0..7, rows 0 and 1 interleaved
+ xvilvh.b xr11, xr7, xr6 //low lane: columns 8..15, rows 0 and 1 interleaved
+ xvpermi.q xr11, xr10, 0x20 //xr11 = columns 0..7 in the low lane, 8..15 in the high lane
+ xvilvl.b xr12, xr9, xr8 //same for rows 2 and 3
+ xvilvh.b xr13, xr9, xr8
+ xvpermi.q xr13, xr12, 0x20
+ xvdp2.h.bu.b xr10, xr11, xr0 //EPEL_FILTER(src, stride)
+ xvdp2add.h.bu.b xr10, xr13, xr5 //add the second tap pair
+ xvexth.w.h xr11, xr10 //upper four halfwords of each lane widened to words
+ xvsllwil.w.h xr10, xr10, 0 //lower four halfwords of each lane widened to words
+ xvmulwev.w.h xr10, xr10, xr1 //EPEL_FILTER(src, stride) * wx
+ xvmulwev.w.h xr11, xr11, xr1
+ xvadd.w xr10, xr10, xr2 // + offset
+ xvadd.w xr11, xr11, xr2
+ xvsra.w xr10, xr10, xr3 // >> shift
+ xvsra.w xr11, xr11, xr3
+ xvadd.w xr10, xr10, xr4 // + ox
+ xvadd.w xr11, xr11, xr4
+ xvssrani.h.w xr11, xr10, 0 //narrow to halfwords with saturation, lane by lane
+ xvpermi.q xr10, xr11, 0x01 //high lane of xr11 -> low lane of xr10
+ vssrani.bu.h vr10, vr11, 0 //all 16 columns as unsigned bytes
+ vaddi.bu vr6, vr7, 0 //back up previous value
+ vaddi.bu vr7, vr8, 0
+ vaddi.bu vr8, vr9, 0
+.if \w < 16 //12-wide case: store 8 + 4 bytes
+ fst.d f10, a0, 0
+ vstelm.w vr10, a0, 8, 2
+.else
+ vst vr10, a0, 0 //16 bytes
+.endif
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_V16_LASX_\w
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v12_8_lsx //vertical weighted epel, 12 pixels wide
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 12 //12 < 16 selects the 8 + 4 byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v12_8_lasx //vertical weighted epel, 12 pixels wide
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ xvrepl128vei.h xr5, xr0, 1 //taps 2 and 3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 //taps 0 and 1 in every halfword
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 12 //12 < 16 selects the 8 + 4 byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v16_8_lsx //vertical weighted epel, 16 pixels wide
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 16 //full 16-byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v16_8_lasx //vertical weighted epel, 16 pixels wide
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ xvrepl128vei.h xr5, xr0, 1 //taps 2 and 3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 //taps 0 and 1 in every halfword
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 16 //full 16-byte store
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v24_8_lsx //vertical weighted epel, 24 pixels wide: a 16-wide pass then an 8-wide pass
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //save init
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 24 //columns 0..15
+ addi.d a0, t2, 16 //increase step
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V8_LSX 24 //columns 16..23
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v24_8_lasx //vertical weighted epel, 24 pixels wide: a 16-wide pass then an 8-wide pass
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr20, xr0 //save xr0: word-replicated taps, the layout the 8-wide macro reads
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ xvrepl128vei.h xr5, xr0, 1 //taps 2 and 3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //save init
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 24 //columns 0..15
+ addi.d a0, t2, 16 //increase step
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ xvaddi.bu xr0, xr20, 0 //xr0 = saved word-replicated taps
+ PUT_HEVC_EPEL_UNI_W_V8_LASX 24 //columns 16..23
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v32_8_lsx //vertical weighted epel, 32 pixels wide: two 16-wide passes
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //saved dst
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 32 //columns 0..15
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 33 //columns 16..31; 33 only keeps the loop label unique
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v32_8_lasx //vertical weighted epel, 32 pixels wide: two 16-wide passes
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ xvrepl128vei.h xr5, xr0, 1 //taps 2 and 3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //saved dst
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 32 //columns 0..15
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 33 //columns 16..31; 33 only keeps the loop label unique
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v48_8_lsx //vertical weighted epel, 48 pixels wide: three 16-wide passes
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //saved dst
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 48 //columns 0..15
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 49 //columns 16..31; 49 only keeps the loop label unique
+ addi.d a0, t2, 32
+ addi.d a2, t3, 32
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 50 //columns 32..47
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v48_8_lasx //vertical weighted epel, 48 pixels wide: three 16-wide passes
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ xvrepl128vei.h xr5, xr0, 1 //taps 2 and 3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //saved dst
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 48 //columns 0..15
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 49 //columns 16..31; 49 only keeps the loop label unique
+ addi.d a0, t2, 32
+ addi.d a2, t3, 32
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 50 //columns 32..47
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v64_8_lsx //vertical weighted epel, 64 pixels wide: four 16-wide passes
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //saved dst
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 64 //columns 0..15
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 65 //columns 16..31; 65 only keeps the loop label unique
+ addi.d a0, t2, 32
+ addi.d a2, t3, 32
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 66 //columns 32..47
+ addi.d a0, t2, 48
+ addi.d a2, t3, 48
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_V16_LSX 67 //columns 48..63
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v64_8_lasx //vertical weighted epel, 64 pixels wide: four 16-wide passes
+ LOAD_VAR 256 //defined outside this view; presumably sets up xr1..xr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 8 //my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.q xr0, xr0 //copy the filter into both 128-bit lanes
+ slli.d t0, a3, 1 //stride * 2
+ add.d t1, t0, a3 //stride * 3
+ sub.d a2, a2, a3 //src -= stride
+ xvrepl128vei.h xr5, xr0, 1 //taps 2 and 3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 //taps 0 and 1 in every halfword
+ addi.d t2, a0, 0 //saved dst
+ addi.d t3, a2, 0 //saved src
+ addi.d t4, a4, 0 //saved height
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 64 //columns 0..15
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0 //restore the row counter
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 65 //columns 16..31; 65 only keeps the loop label unique
+ addi.d a0, t2, 32
+ addi.d a2, t3, 32
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 66 //columns 32..47
+ addi.d a0, t2, 48
+ addi.d a2, t3, 48
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_V16_LASX 67 //columns 48..63
+endfunc
+
+/*
+ * void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
+function ff_hevc_put_hevc_epel_uni_w_h4_8_lsx //horizontal weighted epel, 4 pixels wide, one row per loop pass
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //the 4 taps in every word
+ la.local t1, shufb
+ vld vr5, t1, 0 //mask at shufb + 0: 4 consecutive source bytes per output pixel
+ slli.d t0, a3, 1 //stride * 2; NOTE(review): t0 is not read again in this function
+ add.d t1, t0, a3 //stride * 3; NOTE(review): t1 is not read again in this function
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H4:
+ fld.d f6, a2, 0 //8 source bytes of this row
+ add.d a2, a2, a3 //next source row
+ vshuf.b vr6, vr6, vr6, vr5
+ vdp2.h.bu.b vr7, vr6, vr0 //EPEL_FILTER(src, 1)
+ vhaddw.w.h vr7, vr7, vr7 //add the two partial sums of each pixel
+ vmulwev.w.h vr7, vr7, vr1 // * wx
+ vadd.w vr7, vr7, vr2 // + offset
+ vsra.w vr7, vr7, vr3 // >> shift
+ vadd.w vr7, vr7, vr4 // + ox
+ vssrani.h.w vr7, vr7, 0 //narrow to halfwords with saturation
+ vssrani.bu.h vr7, vr7, 0 //then to unsigned bytes
+ fst.s f7, a0, 0 //4 output pixels
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H4
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h6_8_lsx //horizontal weighted epel, 6 pixels wide, one row per loop pass
+ LOAD_VAR 128 //defined outside this view; presumably sets up vr1..vr4 (wx, offset, shift, ox) - confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //* 4: byte offset into ff_hevc_epel_filters
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0
+ la.local t1, shufb
+ vld vr6, t1, 48 //byte-pair mask at shufb + 48
+ vaddi.bu vr7, vr6, 2 //same mask advanced by 2 bytes
+ slli.d t0, a3, 1 //stride * 2; NOTE(review): t0 is not read again in this function
+ add.d t1, t0, a3 //stride * 3; NOTE(review): t1 is not read again in this function
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2 and 3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0 and 1 in every halfword
+.LOOP_UNI_W_H6:
+ vld vr8, a2, 0
+ add.d a2, a2, a3 //next source row
+ vshuf.b vr10, vr8, vr8, vr6 //source byte pairs for taps 0 and 1
+ vshuf.b vr11, vr8, vr8, vr7 //source byte pairs for taps 2 and 3
+ CALC_EPEL_FILTER_LSX vr14, vr15
+ vssrani.h.w vr15, vr14, 0 //8 results narrowed to halfwords with saturation
+ vssrani.bu.h vr15, vr15, 0 //then to unsigned bytes
+ fst.s f15, a0, 0 //pixels 0..3
+ vstelm.h vr15, a0, 4, 2 //pixels 4..5
+ add.d a0, a0, a1 //next dst row
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H6
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h6_8_lasx // weighted uni-pred horizontal EPEL, 6 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H6_LASX:
+ vld vr8, a2, 0 //16 source bytes
+ xvreplve0.q xr8, xr8 //duplicate them into both 128-bit lanes
+ add.d a2, a2, a3 //src += srcstride
+ xvshuf.b xr12, xr8, xr8, xr6
+ CALC_EPEL_FILTER_LASX xr14 // macro defined outside this chunk; presumably filters xr12 into weighted 32-bit results in xr14 -- TODO confirm
+ xvpermi.q xr15, xr14, 0x01 //low 128 bits of xr15 = high 128 bits of xr14
+ vssrani.h.w vr15, vr14, 0 //saturating narrow to halfwords: vr14 -> low half, vr15 -> high half
+ vssrani.bu.h vr15, vr15, 0 //saturating narrow to unsigned bytes
+ fst.s f15, a0, 0 //bytes 0..3
+ vstelm.h vr15, a0, 4, 2 //halfword element 2 -> bytes 4..5
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H6_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h8_8_lsx // weighted uni-pred horizontal EPEL, 8 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H8:
+ vld vr8, a2, 0 //16 source bytes
+ add.d a2, a2, a3 //src += srcstride
+ vshuf.b vr10, vr8, vr8, vr6
+ vshuf.b vr11, vr8, vr8, vr7
+ CALC_EPEL_FILTER_LSX vr14, vr15 // macro defined outside this chunk; presumably filters vr10/vr11 and leaves weighted 32-bit results in vr14/vr15 -- TODO confirm
+ vssrani.h.w vr15, vr14, 0 //saturating narrow to halfwords: vr14 -> low half, vr15 -> high half
+ vssrani.bu.h vr15, vr15, 0 //saturating narrow to unsigned bytes
+ fst.d f15, a0, 0 //store 8 output bytes
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H8
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h8_8_lasx // weighted uni-pred horizontal EPEL, 8 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H8_LASX:
+ vld vr8, a2, 0 //16 source bytes
+ xvreplve0.q xr8, xr8 //duplicate them into both 128-bit lanes
+ add.d a2, a2, a3 //src += srcstride
+ xvshuf.b xr12, xr8, xr8, xr6
+ CALC_EPEL_FILTER_LASX xr14 // macro defined outside this chunk; presumably filters xr12 into weighted 32-bit results in xr14 -- TODO confirm
+ xvpermi.q xr15, xr14, 0x01 //low 128 bits of xr15 = high 128 bits of xr14
+ vssrani.h.w vr15, vr14, 0 //saturating narrow to halfwords: vr14 -> low half, vr15 -> high half
+ vssrani.bu.h vr15, vr15, 0 //saturating narrow to unsigned bytes
+ fst.d f15, a0, 0 //store 8 output bytes
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H8_LASX
+endfunc
+
+.macro EPEL_UNI_W_H16_LOOP_LSX idx0, idx1, idx2 // 16 output bytes: idx0/idx1 = src offsets of the two 8-pixel halves, idx2 = dst offset
+ vld vr8, a2, \idx0 //source bytes for the first half
+ vshuf.b vr10, vr8, vr8, vr6 //vr6/vr7 are shuffle masks the caller must have loaded
+ vshuf.b vr11, vr8, vr8, vr7
+ CALC_EPEL_FILTER_LSX vr14, vr15 // macro defined outside this chunk; presumably filters vr10/vr11 into vr14/vr15 -- TODO confirm
+ vld vr8, a2, \idx1 //source bytes for the second half
+ vshuf.b vr10, vr8, vr8, vr6
+ vshuf.b vr11, vr8, vr8, vr7
+ CALC_EPEL_FILTER_LSX vr16, vr17
+ vssrani.h.w vr15, vr14, 0 //first half: saturating narrow words -> halfwords
+ vssrani.h.w vr17, vr16, 0 //second half: saturating narrow words -> halfwords
+ vssrani.bu.h vr17, vr15, 0 //halfwords -> unsigned bytes: vr15 -> low 8 bytes, vr17 -> high 8 bytes
+ vst vr17, a0, \idx2 //store 16 bytes at dst + idx2
+.endm
+
+.macro EPEL_UNI_W_H16_LOOP_LASX idx0, idx2, w // idx0 = src offset, idx2 = dst offset, w = block width (12 selects the 12-byte store)
+ xvld xr8, a2, \idx0 //32 source bytes
+ xvpermi.d xr9, xr8, 0x09 //low 128 bits of xr9 = source bytes 8..23
+ xvreplve0.q xr8, xr8 //source bytes 0..15 in both lanes
+ xvshuf.b xr12, xr8, xr8, xr6 //xr6 is a shuffle mask the caller must have loaded
+ CALC_EPEL_FILTER_LASX xr14 // macro defined outside this chunk; presumably filters xr12 into xr14 -- TODO confirm
+ xvreplve0.q xr8, xr9 //source bytes 8..23 in both lanes
+ xvshuf.b xr12, xr8, xr8, xr6
+ CALC_EPEL_FILTER_LASX xr16
+ xvssrani.h.w xr16, xr14, 0 //per lane, saturating narrow to halfwords: xr14 -> low half, xr16 -> high half
+ xvpermi.q xr17, xr16, 0x01 //low 128 bits of xr17 = high 128 bits of xr16
+ vssrani.bu.h vr17, vr16, 0 //saturating narrow to unsigned bytes: vr16 -> low 8, vr17 -> high 8
+ vpermi.w vr17, vr17, 0xd8 //reorder the four words as 0,2,1,3 -- presumably restores linear pixel order
+.if \w == 12
+ fst.d f17, a0, 0 //bytes 0..7 (this branch ignores idx2)
+ vstelm.w vr17, a0, 8, 2 //word element 2 -> bytes 8..11
+.else
+ vst vr17, a0, \idx2 //store 16 bytes at dst + idx2
+.endif
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_h12_8_lsx // weighted uni-pred horizontal EPEL, 12 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H12:
+ vld vr8, a2, 0 //source bytes for pixels 0..7
+ vshuf.b vr10, vr8, vr8, vr6
+ vshuf.b vr11, vr8, vr8, vr7
+ CALC_EPEL_FILTER_LSX vr14, vr15 // macro defined outside this chunk; presumably filters vr10/vr11 into vr14/vr15 -- TODO confirm
+ vld vr8, a2, 8 //source bytes for pixels 8 onward
+ vshuf.b vr10, vr8, vr8, vr6
+ vshuf.b vr11, vr8, vr8, vr7
+ CALC_EPEL_FILTER_LSX vr16, vr17
+ vssrani.h.w vr15, vr14, 0 //saturating narrow words -> halfwords
+ vssrani.h.w vr17, vr16, 0
+ vssrani.bu.h vr17, vr15, 0 //halfwords -> unsigned bytes: vr15 -> low 8 bytes, vr17 -> high 8 bytes
+ fst.d f17, a0, 0 //bytes 0..7
+ vstelm.w vr17, a0, 8, 2 //word element 2 -> bytes 8..11
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H12
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h12_8_lasx // weighted uni-pred horizontal EPEL, 12 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H12_LASX:
+ EPEL_UNI_W_H16_LOOP_LASX 0, 0, 12 //w == 12: the macro stores 8 + 4 bytes
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H12_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h16_8_lsx // weighted uni-pred horizontal EPEL, 16 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H16:
+ EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0 //src offsets 0 and 8 -> 16 bytes at dst + 0
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H16
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h16_8_lasx // weighted uni-pred horizontal EPEL, 16 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H16_LASX:
+ EPEL_UNI_W_H16_LOOP_LASX 0, 0, 16 //src offset 0 -> 16 bytes at dst + 0
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H16_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h24_8_lsx // weighted uni-pred horizontal EPEL, 24 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H24:
+ EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0 //first 16 output bytes
+ vld vr8, a2, 16 //source bytes for the remaining 8 pixels
+ add.d a2, a2, a3 //src += srcstride
+ vshuf.b vr10, vr8, vr8, vr6
+ vshuf.b vr11, vr8, vr8, vr7
+ CALC_EPEL_FILTER_LSX vr18, vr19 // macro defined outside this chunk; presumably filters vr10/vr11 into vr18/vr19 -- TODO confirm
+ vssrani.h.w vr19, vr18, 0 //saturating narrow words -> halfwords
+ vssrani.bu.h vr19, vr19, 0 //saturating narrow halfwords -> unsigned bytes
+ fst.d f19, a0, 16 //output bytes 16..23
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H24
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h24_8_lasx // weighted uni-pred horizontal EPEL, 24 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H24_LASX:
+ EPEL_UNI_W_H16_LOOP_LASX 0, 0, 24 //first 16 output bytes
+ vld vr8, a2, 16 //source bytes for the remaining 8 pixels
+ add.d a2, a2, a3 //src += srcstride
+ xvreplve0.q xr8, xr8 //duplicate them into both 128-bit lanes
+ xvshuf.b xr12, xr8, xr8, xr6
+ CALC_EPEL_FILTER_LASX xr14 // macro defined outside this chunk; presumably filters xr12 into xr14 -- TODO confirm
+ xvpermi.q xr15, xr14, 0x01 //low 128 bits of xr15 = high 128 bits of xr14
+ vssrani.h.w vr15, vr14, 0 //saturating narrow to halfwords: vr14 -> low half, vr15 -> high half
+ vssrani.bu.h vr15, vr15, 0 //saturating narrow to unsigned bytes
+ fst.d f15, a0, 16 //output bytes 16..23
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H24_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h32_8_lsx // weighted uni-pred horizontal EPEL, 32 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H32:
+ EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0 //output bytes 0..15
+ EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16 //output bytes 16..31
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H32
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h32_8_lasx // weighted uni-pred horizontal EPEL, 32 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H32_LASX:
+ EPEL_UNI_W_H16_LOOP_LASX 0, 0, 32 //output bytes 0..15
+ EPEL_UNI_W_H16_LOOP_LASX 16, 16, 32 //output bytes 16..31
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H32_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h48_8_lsx // weighted uni-pred horizontal EPEL, 48 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H48:
+ EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0 //output bytes 0..15
+ EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16 //output bytes 16..31
+ EPEL_UNI_W_H16_LOOP_LSX 32, 40, 32 //output bytes 32..47
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H48
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h48_8_lasx // weighted uni-pred horizontal EPEL, 48 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H48_LASX:
+ EPEL_UNI_W_H16_LOOP_LASX 0, 0, 48 //output bytes 0..15
+ EPEL_UNI_W_H16_LOOP_LASX 16, 16, 48 //output bytes 16..31
+ EPEL_UNI_W_H16_LOOP_LASX 32, 32, 48 //output bytes 32..47
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H48_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h64_8_lsx // weighted uni-pred horizontal EPEL, 64 pixels per row
+ LOAD_VAR 128 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ vreplvei.w vr0, vr0, 0 //all four taps in every word lane
+ la.local t1, shufb
+ vld vr6, t1, 48 //shuffle mask used to build vr10
+ vaddi.bu vr7, vr6, 2 //same mask with every byte index + 2, used to build vr11
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+ vreplvei.h vr5, vr0, 1 //taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 //taps 0,1 in every halfword
+.LOOP_UNI_W_H64:
+ EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0 //output bytes 0..15
+ EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16 //output bytes 16..31
+ EPEL_UNI_W_H16_LOOP_LSX 32, 40, 32 //output bytes 32..47
+ EPEL_UNI_W_H16_LOOP_LSX 48, 56, 48 //output bytes 48..63
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H64
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h64_8_lasx // weighted uni-pred horizontal EPEL, 64 pixels per row (256-bit path)
+ LOAD_VAR 256 // macro defined outside this chunk -- presumably prepares the weighting vectors; TODO confirm
+ ld.d t0, sp, 0 //mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2 //(mx - 1) * 4 = byte offset of the filter row
+ la.local t1, ff_hevc_epel_filters
+ vldx vr0, t1, t0 //filter
+ xvreplve0.w xr0, xr0 //word 0 (the four taps) in all eight word lanes
+ la.local t1, shufb
+ xvld xr6, t1, 64 //32-byte shuffle mask
+ slli.d t0, a3, 1 //stride * 2 -- NOTE(review): t0/t1 are not read by any instruction visible in this function; confirm they are needed
+ add.d t1, t0, a3 //stride * 3
+ addi.d a2, a2, -1 //src -= 1
+.LOOP_UNI_W_H64_LASX:
+ EPEL_UNI_W_H16_LOOP_LASX 0, 0, 64 //output bytes 0..15
+ EPEL_UNI_W_H16_LOOP_LASX 16, 16, 64 //output bytes 16..31
+ EPEL_UNI_W_H16_LOOP_LASX 32, 32, 64 //output bytes 32..47
+ EPEL_UNI_W_H16_LOOP_LASX 48, 48, 64 //output bytes 48..63
+ add.d a2, a2, a3 //src += srcstride
+ add.d a0, a0, a1 //dst += dststride
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_UNI_W_H64_LASX
+endfunc
+
+/*
+ * void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * const int16_t *src2, int height, intptr_t mx,
+ * intptr_t my, int width)
+ */
+function ff_hevc_put_hevc_bi_epel_h4_8_lsx // bi-pred horizontal EPEL, 4 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ vreplvei.w vr0, vr0, 0 // all four taps in every word lane
+ la.local t0, shufb
+ vld vr1, t0, 0 // mask
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H4:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // 16 source bytes
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ vshuf.b vr5, vr5, vr5, vr1 // gather bytes according to the mask
+ vdp2.h.bu.b vr6, vr5, vr0 // EPEL_FILTER(src, 1)
+ vsllwil.w.h vr4, vr4, 0 // sign-extend the low four src2 halfwords to words
+ vhaddw.w.h vr6, vr6, vr6 // add odd and even halfwords -> one 32-bit sum per pixel
+ vadd.w vr6, vr6, vr4 // src2[x]
+ vssrani.h.w vr6, vr6, 0 // saturating narrow: word -> signed halfword
+ vssrarni.bu.h vr6, vr6, 7 // rounding shift right by 7, saturate to unsigned byte
+ fst.s f6, a0, 0 // store 4 output pixels
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1 // one row done
+ bnez a5, .LOOP_BI_EPEL_H4
+endfunc
+
+.macro PUT_HEVC_BI_EPEL_H8_LSX in0, in1, in2, in3, out0 // 8 pixels: in0/in1 = source vectors, in2/in3 = shuffle masks, out0 = 16-bit result; reads vr0, vr1, vr4; clobbers vr6-vr8
+ vshuf.b vr6, \in1, \in0, \in2 // mask indices 0..15 pick from in0, 16..31 from in1
+ vshuf.b vr7, \in1, \in0, \in3
+ vdp2.h.bu.b vr8, vr6, vr0 // EPEL_FILTER(src, 1)
+ vdp2add.h.bu.b vr8, vr7, vr1 // EPEL_FILTER(src, 1)
+ vsadd.h \out0, vr8, vr4 // src2[x]
+.endm
+
+.macro PUT_HEVC_BI_EPEL_H16_LASX in0, in1, in2, in3, out0 // 256-bit variant: in0/in1 = source vectors, in2/in3 = shuffle masks, out0 = 16-bit result; reads xr0, xr1, xr4; clobbers xr6-xr8
+ xvshuf.b xr6, \in1, \in0, \in2 // per 128-bit lane: mask indices 0..15 pick from in0, 16..31 from in1
+ xvshuf.b xr7, \in1, \in0, \in3
+ xvdp2.h.bu.b xr8, xr6, xr0 // EPEL_FILTER(src, 1)
+ xvdp2add.h.bu.b xr8, xr7, xr1 // EPEL_FILTER(src, 1)
+ xvsadd.h \out0, xr8, xr4 // src2[x]
+.endm
+
+function ff_hevc_put_hevc_bi_epel_h6_8_lsx // bi-pred horizontal EPEL, 6 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ vreplvei.h vr1, vr0, 1 // taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ vld vr2, t0, 48// mask
+ vaddi.bu vr3, vr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H6:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // 16 source bytes
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr7 // filter + src2 -> 8 halfwords in vr7
+ vssrarni.bu.h vr7, vr7, 7 // rounding shift right by 7, saturate to unsigned byte
+ fst.s f7, a0, 0 // bytes 0..3
+ vstelm.h vr7, a0, 4, 2 // halfword element 2 -> bytes 4..5
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H6
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h8_8_lsx // bi-pred horizontal EPEL, 8 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ vreplvei.h vr1, vr0, 1 // taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ vld vr2, t0, 48// mask
+ vaddi.bu vr3, vr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H8:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // 16 source bytes
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr7 // filter + src2 -> 8 halfwords in vr7
+ vssrarni.bu.h vr7, vr7, 7 // rounding shift right by 7, saturate to unsigned byte
+ fst.d f7, a0, 0 // store 8 output bytes
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H8
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h12_8_lsx // bi-pred horizontal EPEL, 12 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ vreplvei.h vr1, vr0, 1 // taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ vld vr2, t0, 48// mask
+ vaddi.bu vr3, vr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H12:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // source bytes for pixels 0..7
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr11
+ vld vr5, a2, 8 // source bytes for pixels 8 onward
+ vld vr4, a4, 16 // src2[8..15]
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12
+ vssrarni.bu.h vr12, vr11, 7 // round >> 7, saturate to bytes: vr11 -> low 8, vr12 -> high 8
+ fst.d f12, a0, 0 // bytes 0..7
+ vstelm.w vr12, a0, 8, 2 // word element 2 -> bytes 8..11
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H12
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h12_8_lasx // bi-pred horizontal EPEL, 12 pixels per row (256-bit path); a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
+ xvrepl128vei.h xr1, xr0, 1 // taps 2,3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ xvld xr2, t0, 96// mask
+ xvaddi.bu xr3, xr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H12_LASX:
+ xvld xr4, a4, 0 // src2
+ xvld xr5, a2, 0 // 32 source bytes
+ xvpermi.d xr5, xr5, 0x94 // low lane = source bytes 0..15, high lane = source bytes 8..23
+ PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9 // filter + src2 -> 16 halfwords in xr9
+ xvpermi.q xr10, xr9, 0x01 // low 128 bits of xr10 = high 128 bits of xr9
+ vssrarni.bu.h vr10, vr9, 7 // round >> 7, saturate to bytes: vr9 -> low 8, vr10 -> high 8
+ fst.d f10, a0, 0 // bytes 0..7
+ vstelm.w vr10, a0, 8, 2 // word element 2 -> bytes 8..11
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H12_LASX
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h16_8_lsx // bi-pred horizontal EPEL, 16 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ vreplvei.h vr1, vr0, 1 // taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ vld vr2, t0, 48// mask
+ vaddi.bu vr3, vr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H16:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // source bytes for pixels 0..7
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr11
+ vld vr5, a2, 8 // source bytes for pixels 8..15
+ vld vr4, a4, 16 // src2[8..15]
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12
+ vssrarni.bu.h vr12, vr11, 7 // round >> 7, saturate to bytes: vr11 -> low 8, vr12 -> high 8
+ vst vr12, a0, 0 // store 16 output bytes
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H16
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h16_8_lasx // bi-pred horizontal EPEL, 16 pixels per row (256-bit path); a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
+ xvrepl128vei.h xr1, xr0, 1 // taps 2,3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ xvld xr2, t0, 96// mask
+ xvaddi.bu xr3, xr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H16_LASX:
+ xvld xr4, a4, 0 // src2
+ xvld xr5, a2, 0 // 32 source bytes
+ xvpermi.d xr5, xr5, 0x94 // low lane = source bytes 0..15, high lane = source bytes 8..23
+ PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9 // filter + src2 -> 16 halfwords in xr9
+ xvpermi.q xr10, xr9, 0x01 // low 128 bits of xr10 = high 128 bits of xr9
+ vssrarni.bu.h vr10, vr9, 7 // round >> 7, saturate to bytes: vr9 -> low 8, vr10 -> high 8
+ vst vr10, a0, 0 // store 16 output bytes
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H16_LASX
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h32_8_lasx // bi-pred horizontal EPEL, 32 pixels per row (256-bit path); a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
+ xvrepl128vei.h xr1, xr0, 1 // taps 2,3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ xvld xr2, t0, 96// mask
+ xvaddi.bu xr3, xr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H32_LASX:
+ xvld xr4, a4, 0 // src2
+ xvld xr5, a2, 0 // source bytes 0..31
+ xvpermi.d xr5, xr5, 0x94 // low lane = source bytes 0..15, high lane = source bytes 8..23
+ PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9 // pixels 0..15 as halfwords in xr9
+ xvld xr4, a4, 32 // src2[16..31]
+ xvld xr15, a2, 16 // source bytes 16..47
+ xvpermi.d xr15, xr15, 0x94 // low lane = source bytes 16..31, high lane = source bytes 24..39
+ PUT_HEVC_BI_EPEL_H16_LASX xr15, xr15, xr2, xr3, xr11 // pixels 16..31 as halfwords in xr11
+ xvssrarni.bu.h xr11, xr9, 7 // per lane: round >> 7, saturate to bytes (xr9 -> low 8, xr11 -> high 8)
+ xvpermi.d xr11, xr11, 0xd8 // reorder doublewords 0,2,1,3 -> bytes in linear pixel order
+ xvst xr11, a0, 0 // store 32 output bytes
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H32_LASX
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h48_8_lsx // bi-pred horizontal EPEL, 48 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6// filter
+ vreplvei.h vr1, vr0, 1 // taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ vld vr2, t0, 48// mask
+ vaddi.bu vr3, vr2, 2 // mask with every byte index + 2
+ vaddi.bu vr21, vr2, 8 // mask + 8: used for the group starting 8 bytes into a 16-byte chunk
+ vaddi.bu vr22, vr2, 10 // mask + 10: companion of vr21 for the second tap pair
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H48:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // source bytes 0..15
+ vld vr9, a2, 16 // source bytes 16..31
+ vld vr10, a2, 32 // source bytes 32..47
+ vld vr11, a2, 48 // source bytes 48..63
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12 // pixels 0..7
+ vld vr4, a4, 16 // src2[8..15]
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr9, vr21, vr22, vr13 // pixels 8..15 (window spans vr5 and vr9)
+ vld vr4, a4, 32 // src2[16..23]
+ PUT_HEVC_BI_EPEL_H8_LSX vr9, vr9, vr2, vr3, vr14 // pixels 16..23
+ vld vr4, a4, 48 // src2[24..31]
+ PUT_HEVC_BI_EPEL_H8_LSX vr9, vr10, vr21, vr22, vr15 // pixels 24..31
+ vld vr4, a4, 64 // src2[32..39]
+ PUT_HEVC_BI_EPEL_H8_LSX vr10, vr10, vr2, vr3, vr16 // pixels 32..39
+ vld vr4, a4, 80 // src2[40..47]
+ PUT_HEVC_BI_EPEL_H8_LSX vr10, vr11, vr21, vr22, vr17 // pixels 40..47
+ vssrarni.bu.h vr13, vr12, 7 // round >> 7, saturate to bytes: pixels 0..15
+ vssrarni.bu.h vr15, vr14, 7 // pixels 16..31
+ vssrarni.bu.h vr17, vr16, 7 // pixels 32..47
+ vst vr13, a0, 0
+ vst vr15, a0, 16
+ vst vr17, a0, 32
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H48
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h48_8_lasx // bi-pred horizontal EPEL, 48 pixels per row (256-bit path); a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
+ xvrepl128vei.h xr1, xr0, 1 // taps 2,3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ xvld xr2, t0, 96// mask
+ xvaddi.bu xr3, xr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H48_LASX:
+ xvld xr4, a4, 0 // src2
+ xvld xr5, a2, 0 // source bytes 0..31
+ xvld xr9, a2, 32 // source bytes 32..63
+ xvpermi.d xr10, xr9, 0x94 // xr10: low lane = bytes 32..47, high lane = bytes 40..55
+ xvpermi.q xr9, xr5, 0x21 // xr9 = source bytes 16..47 (high half of xr5, low half of xr9)
+ xvpermi.d xr9, xr9, 0x94 // xr9: low lane = bytes 16..31, high lane = bytes 24..39
+ xvpermi.d xr5, xr5, 0x94 // xr5: low lane = bytes 0..15, high lane = bytes 8..23
+ PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr11 // pixels 0..15
+ xvld xr4, a4, 32 // src2[16..31]
+ PUT_HEVC_BI_EPEL_H16_LASX xr9, xr9, xr2, xr3, xr12 // pixels 16..31
+ xvld xr4, a4, 64 // src2[32..47]
+ PUT_HEVC_BI_EPEL_H16_LASX xr10, xr10, xr2, xr3, xr13 // pixels 32..47
+ xvssrarni.bu.h xr12, xr11, 7 // per lane: round >> 7, saturate to bytes (xr11 -> low 8, xr12 -> high 8)
+ xvpermi.d xr12, xr12, 0xd8 // reorder doublewords 0,2,1,3 -> linear pixel order
+ xvpermi.q xr14, xr13, 0x01 // low 128 bits of xr14 = high 128 bits of xr13
+ vssrarni.bu.h vr14, vr13, 7 // pixels 32..47: vr13 -> low 8 bytes, vr14 -> high 8 bytes
+ xvst xr12, a0, 0 // output bytes 0..31
+ vst vr14, a0, 32 // output bytes 32..47
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H48_LASX
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h64_8_lsx // bi-pred horizontal EPEL, 64 pixels per row; a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6// filter
+ vreplvei.h vr1, vr0, 1 // taps 2,3 in every halfword
+ vreplvei.h vr0, vr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ vld vr2, t0, 48// mask
+ vaddi.bu vr3, vr2, 2 // mask with every byte index + 2
+ vaddi.bu vr21, vr2, 8 // mask + 8: used for the group starting 8 bytes into a 16-byte chunk
+ vaddi.bu vr22, vr2, 10 // mask + 10: companion of vr21 for the second tap pair
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H64:
+ vld vr4, a4, 0 // src2
+ vld vr5, a2, 0 // source bytes 0..15
+ vld vr9, a2, 16 // source bytes 16..31
+ vld vr10, a2, 32 // source bytes 32..47
+ vld vr11, a2, 48 // source bytes 48..63
+ vld vr12, a2, 64 // source bytes 64..79
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr13 // pixels 0..7
+ vld vr4, a4, 16 // src2[8..15]
+ PUT_HEVC_BI_EPEL_H8_LSX vr5, vr9, vr21, vr22, vr14 // pixels 8..15 (window spans vr5 and vr9)
+ vld vr4, a4, 32 // src2[16..23]
+ PUT_HEVC_BI_EPEL_H8_LSX vr9, vr9, vr2, vr3, vr15 // pixels 16..23
+ vld vr4, a4, 48 // src2[24..31]
+ PUT_HEVC_BI_EPEL_H8_LSX vr9, vr10, vr21, vr22, vr16 // pixels 24..31
+ vld vr4, a4, 64 // src2[32..39]
+ PUT_HEVC_BI_EPEL_H8_LSX vr10, vr10, vr2, vr3, vr17 // pixels 32..39
+ vld vr4, a4, 80 // src2[40..47]
+ PUT_HEVC_BI_EPEL_H8_LSX vr10, vr11, vr21, vr22, vr18 // pixels 40..47
+ vld vr4, a4, 96 // src2[48..55]
+ PUT_HEVC_BI_EPEL_H8_LSX vr11, vr11, vr2, vr3, vr19 // pixels 48..55
+ vld vr4, a4, 112 // src2[56..63]
+ PUT_HEVC_BI_EPEL_H8_LSX vr11, vr12, vr21, vr22, vr20 // pixels 56..63
+ vssrarni.bu.h vr14, vr13, 7 // round >> 7, saturate to bytes: pixels 0..15
+ vssrarni.bu.h vr16, vr15, 7 // pixels 16..31
+ vssrarni.bu.h vr18, vr17, 7 // pixels 32..47
+ vssrarni.bu.h vr20, vr19, 7 // pixels 48..63
+ vst vr14, a0, 0
+ vst vr16, a0, 16
+ vst vr18, a0, 32
+ vst vr20, a0, 48
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H64
+endfunc
+
+function ff_hevc_put_hevc_bi_epel_h64_8_lasx // bi-pred horizontal EPEL, 64 pixels per row (256-bit path); a6 = mx, a5 = height, a4 = src2
+ addi.d a6, a6, -1
+ slli.w a6, a6, 2 // (mx - 1) * 4 = byte offset of the filter row
+ la.local t0, ff_hevc_epel_filters
+ vldx vr0, t0, a6 // filter
+ xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
+ xvrepl128vei.h xr1, xr0, 1 // taps 2,3 in every halfword
+ xvrepl128vei.h xr0, xr0, 0 // taps 0,1 in every halfword
+ la.local t0, shufb
+ xvld xr2, t0, 96// mask
+ xvaddi.bu xr3, xr2, 2 // mask with every byte index + 2
+ addi.d a2, a2, -1 // src -= 1
+.LOOP_BI_EPEL_H64_LASX:
+ xvld xr4, a4, 0 // src2
+ xvld xr5, a2, 0 // source bytes 0..31
+ xvld xr9, a2, 32 // source bytes 32..63
+ xvld xr11, a2, 48 // source bytes 48..79
+ xvpermi.d xr11, xr11, 0x94 // xr11: low lane = bytes 48..63, high lane = bytes 56..71
+ xvpermi.d xr10, xr9, 0x94 // xr10: low lane = bytes 32..47, high lane = bytes 40..55
+ xvpermi.q xr9, xr5, 0x21 // xr9 = source bytes 16..47 (high half of xr5, low half of xr9)
+ xvpermi.d xr9, xr9, 0x94 // xr9: low lane = bytes 16..31, high lane = bytes 24..39
+ xvpermi.d xr5, xr5, 0x94 // xr5: low lane = bytes 0..15, high lane = bytes 8..23
+ PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr12 // pixels 0..15
+ xvld xr4, a4, 32 // src2[16..31]
+ PUT_HEVC_BI_EPEL_H16_LASX xr9, xr9, xr2, xr3, xr13 // pixels 16..31
+ xvld xr4, a4, 64 // src2[32..47]
+ PUT_HEVC_BI_EPEL_H16_LASX xr10, xr10, xr2, xr3, xr14 // pixels 32..47
+ xvld xr4, a4, 96 // src2[48..63]
+ PUT_HEVC_BI_EPEL_H16_LASX xr11, xr11, xr2, xr3, xr15 // pixels 48..63
+ xvssrarni.bu.h xr13, xr12, 7 // per lane: round >> 7, saturate to bytes (xr12 -> low 8, xr13 -> high 8)
+ xvssrarni.bu.h xr15, xr14, 7 // same for pixels 32..63
+ xvpermi.d xr13, xr13, 0xd8 // reorder doublewords 0,2,1,3 -> linear pixel order
+ xvpermi.d xr15, xr15, 0xd8
+ xvst xr13, a0, 0 // output bytes 0..31
+ xvst xr15, a0, 32 // output bytes 32..63
+ add.d a2, a2, a3 // src += srcstride
+ addi.d a4, a4, 128 // src2 advances 128 bytes (64 int16) per row
+ add.d a0, a0, a1 // dst += dststride
+ addi.d a5, a5, -1
+ bnez a5, .LOOP_BI_EPEL_H64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 245a833947..2756755733 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -124,8 +124,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_lsx;
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_lsx;
+ c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_lsx;
+ c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_lsx;
+ c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_lsx;
+ c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_lsx;
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_lsx;
c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_lsx;
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lsx;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lsx;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lsx;
c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_lsx;
c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_lsx;
@@ -138,6 +145,14 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+ c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_lsx;
+ c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_uni_qpel_h6_8_lsx;
+ c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_lsx;
+ c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_lsx;
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_lsx;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_lsx;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_lsx;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_lsx;
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lsx;
c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_lsx;
@@ -191,6 +206,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+ c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_epel_uni_w_h4_8_lsx;
+ c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_epel_uni_w_h6_8_lsx;
+ c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_epel_uni_w_h8_8_lsx;
+ c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_epel_uni_w_h12_8_lsx;
+ c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_epel_uni_w_h16_8_lsx;
+ c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_epel_uni_w_h24_8_lsx;
+ c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_epel_uni_w_h32_8_lsx;
+ c->put_hevc_epel_uni_w[8][0][1] = ff_hevc_put_hevc_epel_uni_w_h48_8_lsx;
+ c->put_hevc_epel_uni_w[9][0][1] = ff_hevc_put_hevc_epel_uni_w_h64_8_lsx;
+
+ c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_epel_uni_w_v4_8_lsx;
+ c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_epel_uni_w_v6_8_lsx;
+ c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_epel_uni_w_v8_8_lsx;
+ c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_epel_uni_w_v12_8_lsx;
+ c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_epel_uni_w_v16_8_lsx;
+ c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_epel_uni_w_v24_8_lsx;
+ c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_epel_uni_w_v32_8_lsx;
+ c->put_hevc_epel_uni_w[8][1][0] = ff_hevc_put_hevc_epel_uni_w_v48_8_lsx;
+ c->put_hevc_epel_uni_w[9][1][0] = ff_hevc_put_hevc_epel_uni_w_v64_8_lsx;
+
c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -277,6 +312,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx;
c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx;
+ c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_epel_uni_w_h6_8_lasx;
+ c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_epel_uni_w_h8_8_lasx;
+ c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_epel_uni_w_h12_8_lasx;
+ c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_epel_uni_w_h16_8_lasx;
+ c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_epel_uni_w_h24_8_lasx;
+ c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_epel_uni_w_h32_8_lasx;
+ c->put_hevc_epel_uni_w[8][0][1] = ff_hevc_put_hevc_epel_uni_w_h48_8_lasx;
+ c->put_hevc_epel_uni_w[9][0][1] = ff_hevc_put_hevc_epel_uni_w_h64_8_lasx;
+
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;
@@ -285,6 +329,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_qpel_uni_w_v48_8_lasx;
c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_qpel_uni_w_v64_8_lasx;
+ c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_epel_uni_w_v6_8_lasx;
+ c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_epel_uni_w_v8_8_lasx;
+ c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_epel_uni_w_v12_8_lasx;
+ c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_epel_uni_w_v16_8_lasx;
+ c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_epel_uni_w_v24_8_lasx;
+ c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_epel_uni_w_v32_8_lasx;
+ c->put_hevc_epel_uni_w[8][1][0] = ff_hevc_put_hevc_epel_uni_w_v48_8_lasx;
+ c->put_hevc_epel_uni_w[9][1][0] = ff_hevc_put_hevc_epel_uni_w_v64_8_lasx;
+
c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_qpel_uni_w_h4_8_lasx;
c->put_hevc_qpel_uni_w[2][0][1] = ff_hevc_put_hevc_qpel_uni_w_h6_8_lasx;
c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_qpel_uni_w_h8_8_lasx;
@@ -294,6 +347,19 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_qpel_uni_w_h32_8_lasx;
c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_qpel_uni_w_h48_8_lasx;
c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx;
+
+ c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_lasx;
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_lasx;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_lasx;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_lasx;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_lasx;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lasx;
+
+ c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_lasx;
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_lasx;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lasx;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lasx;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lasx;
}
}
}
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
index 7f09d0943a..5db35eed47 100644
--- a/libavcodec/loongarch/hevcdsp_lasx.h
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -75,6 +75,60 @@ PEL_UNI_W(epel, hv, 32);
PEL_UNI_W(epel, hv, 48);
PEL_UNI_W(epel, hv, 64);
+PEL_UNI_W(epel, v, 6);
+PEL_UNI_W(epel, v, 8);
+PEL_UNI_W(epel, v, 12);
+PEL_UNI_W(epel, v, 16);
+PEL_UNI_W(epel, v, 24);
+PEL_UNI_W(epel, v, 32);
+PEL_UNI_W(epel, v, 48);
+PEL_UNI_W(epel, v, 64);
+
+PEL_UNI_W(epel, h, 6);
+PEL_UNI_W(epel, h, 8);
+PEL_UNI_W(epel, h, 12);
+PEL_UNI_W(epel, h, 16);
+PEL_UNI_W(epel, h, 24);
+PEL_UNI_W(epel, h, 32);
+PEL_UNI_W(epel, h, 48);
+PEL_UNI_W(epel, h, 64);
+
#undef PEL_UNI_W
+#define UNI_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lasx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) // expands to the prototype of ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_lasx
+UNI_MC(qpel, h, 12); // one prototype per horizontal qpel block width
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
+UNI_MC(qpel, h, 64);
+
+#undef UNI_MC
+
+#define BI_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lasx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ const int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) // expands to the prototype of ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_lasx
+BI_MC(epel, h, 12); // one prototype per horizontal epel block width (no 24 here)
+BI_MC(epel, h, 16);
+BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
+
+#undef BI_MC
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 7769cf25ae..a5ef237b5d 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -126,8 +126,15 @@ BI_MC(qpel, hv, 32);
BI_MC(qpel, hv, 48);
BI_MC(qpel, hv, 64);
+BI_MC(epel, h, 4);
+BI_MC(epel, h, 6);
+BI_MC(epel, h, 8);
+BI_MC(epel, h, 12);
+BI_MC(epel, h, 16);
BI_MC(epel, h, 24);
BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
BI_MC(epel, v, 12);
BI_MC(epel, v, 16);
@@ -151,7 +158,14 @@ void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
intptr_t mx, \
intptr_t my, \
int width)
-
+UNI_MC(qpel, h, 4);
+UNI_MC(qpel, h, 6);
+UNI_MC(qpel, h, 8);
+UNI_MC(qpel, h, 12);
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
UNI_MC(qpel, h, 64);
UNI_MC(qpel, v, 24);
@@ -287,6 +301,26 @@ PEL_UNI_W(epel, hv, 32);
PEL_UNI_W(epel, hv, 48);
PEL_UNI_W(epel, hv, 64);
+PEL_UNI_W(epel, h, 4);
+PEL_UNI_W(epel, h, 6);
+PEL_UNI_W(epel, h, 8);
+PEL_UNI_W(epel, h, 12);
+PEL_UNI_W(epel, h, 16);
+PEL_UNI_W(epel, h, 24);
+PEL_UNI_W(epel, h, 32);
+PEL_UNI_W(epel, h, 48);
+PEL_UNI_W(epel, h, 64);
+
+PEL_UNI_W(epel, v, 4);
+PEL_UNI_W(epel, v, 6);
+PEL_UNI_W(epel, v, 8);
+PEL_UNI_W(epel, v, 12);
+PEL_UNI_W(epel, v, 16);
+PEL_UNI_W(epel, v, 24);
+PEL_UNI_W(epel, v, 32);
+PEL_UNI_W(epel, v, 48);
+PEL_UNI_W(epel, v, 64);
+
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1
More information about the ffmpeg-devel
mailing list