[FFmpeg-devel] [PATCH v2 5/7] avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt
jinbo
jinbo at loongson.cn
Wed Dec 27 06:50:17 EET 2023
tests/checkasm/checkasm:                 C      LSX     LASX
put_hevc_epel_uni_w_hv4_8_c:           9.5      2.2        -
put_hevc_epel_uni_w_hv6_8_c:          18.5      5.0      3.7
put_hevc_epel_uni_w_hv8_8_c:          30.7      6.0      4.5
put_hevc_epel_uni_w_hv12_8_c:         63.7     14.0     10.7
put_hevc_epel_uni_w_hv16_8_c:        107.5     22.7     17.0
put_hevc_epel_uni_w_hv24_8_c:        236.7     50.2     31.7
put_hevc_epel_uni_w_hv32_8_c:        414.5     88.0     53.0
put_hevc_epel_uni_w_hv48_8_c:        917.5    197.7    118.5
put_hevc_epel_uni_w_hv64_8_c:       1617.0    349.5    203.0
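
(The figures above are timings reported by checkasm's benchmark mode, lower is
better; the hv4 row has no LASX entry because only an LSX version is added for
that width, see hevcdsp_init_loongarch.c below.)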
After this patch, the performance of decoding H.265 4K 30FPS 30Mbps
on a 3A6000 with 8 threads improves by 3 fps (52 fps --> 55 fps).
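
For reference, each hv function runs a horizontal 4-tap EPEL pass over
height + 3 rows into a 16-bit intermediate buffer, then a vertical 4-tap pass
combined with the unidirectional weighting. A minimal scalar sketch for 8-bit
input follows; the helper names and the direct filter-pointer arguments are
illustrative only, the real function receives mx/my and looks up
ff_hevc_epel_filters[mx - 1] and ff_hevc_epel_filters[my - 1]:

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_PB_SIZE 64
    #define EPEL_EXTRA   3   /* 1 extra row above + 2 below for the 4-tap filter */

    static inline uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    static void epel_uni_w_hv_ref(uint8_t *dst, ptrdiff_t dststride,
                                  const uint8_t *src, ptrdiff_t srcstride,
                                  int height, int denom, int wx, int ox,
                                  const int8_t *fx, const int8_t *fy, int width)
    {
        int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
        int16_t *tmp = tmp_array;
        int shift  = denom + 6;        /* denom + 14 - bit depth (8) */
        int offset = 1 << (shift - 1);

        src -= srcstride + 1;          /* back up one row and one column */

        /* horizontal 4-tap pass into the 16-bit intermediate buffer */
        for (int y = 0; y < height + EPEL_EXTRA; y++) {
            for (int x = 0; x < width; x++)
                tmp[x] = fx[0] * src[x]     + fx[1] * src[x + 1] +
                         fx[2] * src[x + 2] + fx[3] * src[x + 3];
            src += srcstride;
            tmp += MAX_PB_SIZE;
        }

        /* vertical 4-tap pass, then >> 6, * wx, + offset, >> shift, + ox, clip */
        tmp = tmp_array + MAX_PB_SIZE; /* skip the extra row above */
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int v = fy[0] * tmp[x - MAX_PB_SIZE] + fy[1] * tmp[x] +
                        fy[2] * tmp[x + MAX_PB_SIZE] +
                        fy[3] * tmp[x + 2 * MAX_PB_SIZE];
                dst[x] = clip_u8((((v >> 6) * wx + offset) >> shift) + ox);
            }
            tmp += MAX_PB_SIZE;
            dst += dststride;
        }
    }

The LSX/LASX code below vectorizes both passes and keeps the three previous
rows of horizontal results in registers, so each loop iteration only needs
one new horizontal pass per output row.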
Change-Id: If067e394cec4685c62193e7adb829ac93ba4804d
---
libavcodec/loongarch/hevc_mc.S | 821 ++++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 19 +
libavcodec/loongarch/hevcdsp_lasx.h | 9 +
libavcodec/loongarch/hevcdsp_lsx.h | 10 +
4 files changed, 859 insertions(+)
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
index 2ee338fb8e..0b0647546b 100644
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -22,6 +22,7 @@
#include "loongson_asm.S"
.extern ff_hevc_qpel_filters
+.extern ff_hevc_epel_filters
.macro LOAD_VAR bit
addi.w t1, a5, 6 //shift
@@ -206,6 +207,12 @@
.endif
.endm
+/*
+ * void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
LOAD_VAR 128
srli.w t0, a4, 1
@@ -482,6 +489,12 @@ endfunc
xvhaddw.d.w \in0, \in0, \in0
.endm
+/*
+ * void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
LOAD_VAR 128
ld.d t0, sp, 8 //my
@@ -1253,6 +1266,12 @@ endfunc
xvssrani.bu.h \out0, xr11, 0
.endm
+/*
+ * void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx
LOAD_VAR 128
ld.d t0, sp, 0 //mx
@@ -1763,3 +1782,805 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
addi.d a4, a4, -1
bnez a4, .LOOP_H64_LASX
endfunc
+
+const shufb
+    .byte 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6
+    .byte 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10
+endconst
+
+.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
+ fld.d f7, a2, 0 // start to load src
+ fldx.d f8, a2, a3
+ alsl.d a2, a3, a2, 1
+ fld.d f9, a2, 0
+ vshuf.b vr7, vr7, vr7, vr0 // 0123 1234 2345 3456
+ vshuf.b vr8, vr8, vr8, vr0
+ vshuf.b vr9, vr9, vr9, vr0
+ vdp2.h.bu.b vr10, vr7, vr5 // EPEL_FILTER(src, 1)
+ vdp2.h.bu.b vr11, vr8, vr5
+ vdp2.h.bu.b vr12, vr9, vr5
+ vhaddw.w.h vr10, vr10, vr10 // tmp[0/1/2/3]
+ vhaddw.w.h vr11, vr11, vr11 // vr10,vr11,vr12 corresponding to EPEL_EXTRA
+ vhaddw.w.h vr12, vr12, vr12
+.LOOP_HV4_\w:
+ add.d a2, a2, a3
+ fld.d f14, a2, 0 // height loop begin
+ vshuf.b vr14, vr14, vr14, vr0
+ vdp2.h.bu.b vr13, vr14, vr5
+ vhaddw.w.h vr13, vr13, vr13
+ vmul.w vr14, vr10, vr16 // EPEL_FILTER(tmp, MAX_PB_SIZE)
+ vmadd.w vr14, vr11, vr17
+ vmadd.w vr14, vr12, vr18
+ vmadd.w vr14, vr13, vr19
+ vaddi.wu vr10, vr11, 0 //back up previous value
+ vaddi.wu vr11, vr12, 0
+ vaddi.wu vr12, vr13, 0
+ vsrai.w vr14, vr14, 6 // >> 6
+ vmul.w vr14, vr14, vr1 // * wx
+ vadd.w vr14, vr14, vr2 // + offset
+ vsra.w vr14, vr14, vr3 // >> shift
+ vadd.w vr14, vr14, vr4 // + ox
+ vssrani.h.w vr14, vr14, 0
+ vssrani.bu.h vr14, vr14, 0 // clip
+ fst.s f14, a0, 0
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV4_\w
+.endm
+
+/*
+ * void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
+ * const uint8_t *_src, ptrdiff_t _srcstride,
+ * int height, int denom, int wx, int ox,
+ * intptr_t mx, intptr_t my, int width)
+ */
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV4_LSX 4
+endfunc
+
+.macro PUT_HEVC_EPEL_UNI_W_HV8_LSX w
+ vld vr7, a2, 0 // start to load src
+ vldx vr8, a2, a3
+ alsl.d a2, a3, a2, 1
+ vld vr9, a2, 0
+ vshuf.b vr10, vr7, vr7, vr0 // 0123 1234 2345 3456
+ vshuf.b vr11, vr8, vr8, vr0
+ vshuf.b vr12, vr9, vr9, vr0
+    vshuf.b         vr7,     vr7,    vr7,    vr22 // 4567 5678 6789 78910
+ vshuf.b vr8, vr8, vr8, vr22
+ vshuf.b vr9, vr9, vr9, vr22
+ vdp2.h.bu.b vr13, vr10, vr5 // EPEL_FILTER(src, 1)
+ vdp2.h.bu.b vr14, vr11, vr5
+ vdp2.h.bu.b vr15, vr12, vr5
+ vdp2.h.bu.b vr23, vr7, vr5
+ vdp2.h.bu.b vr20, vr8, vr5
+ vdp2.h.bu.b vr21, vr9, vr5
+ vhaddw.w.h vr7, vr13, vr13
+ vhaddw.w.h vr8, vr14, vr14
+ vhaddw.w.h vr9, vr15, vr15
+ vhaddw.w.h vr10, vr23, vr23
+ vhaddw.w.h vr11, vr20, vr20
+ vhaddw.w.h vr12, vr21, vr21
+.LOOP_HV8_HORI_\w:
+ add.d a2, a2, a3
+ vld vr15, a2, 0
+ vshuf.b vr23, vr15, vr15, vr0
+ vshuf.b vr15, vr15, vr15, vr22
+ vdp2.h.bu.b vr13, vr23, vr5
+ vdp2.h.bu.b vr14, vr15, vr5
+ vhaddw.w.h vr13, vr13, vr13 //789--13
+ vhaddw.w.h vr14, vr14, vr14 //101112--14
+ vmul.w vr15, vr7, vr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
+ vmadd.w vr15, vr8, vr17
+ vmadd.w vr15, vr9, vr18
+ vmadd.w vr15, vr13, vr19
+ vmul.w vr20, vr10, vr16
+ vmadd.w vr20, vr11, vr17
+ vmadd.w vr20, vr12, vr18
+ vmadd.w vr20, vr14, vr19
+ vaddi.wu vr7, vr8, 0 //back up previous value
+ vaddi.wu vr8, vr9, 0
+ vaddi.wu vr9, vr13, 0
+ vaddi.wu vr10, vr11, 0
+ vaddi.wu vr11, vr12, 0
+ vaddi.wu vr12, vr14, 0
+ vsrai.w vr15, vr15, 6 // >> 6
+ vsrai.w vr20, vr20, 6
+ vmul.w vr15, vr15, vr1 // * wx
+ vmul.w vr20, vr20, vr1
+ vadd.w vr15, vr15, vr2 // + offset
+ vadd.w vr20, vr20, vr2
+ vsra.w vr15, vr15, vr3 // >> shift
+ vsra.w vr20, vr20, vr3
+ vadd.w vr15, vr15, vr4 // + ox
+ vadd.w vr20, vr20, vr4
+ vssrani.h.w vr20, vr15, 0
+ vssrani.bu.h vr20, vr20, 0
+.if \w > 6
+ fst.d f20, a0, 0
+.else
+ fst.s f20, a0, 0
+ vstelm.h vr20, a0, 4, 2
+.endif
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV8_HORI_\w
+.endm
+
+.macro PUT_HEVC_EPEL_UNI_W_HV8_LASX w
+ vld vr7, a2, 0 // start to load src
+ vldx vr8, a2, a3
+ alsl.d a2, a3, a2, 1
+ vld vr9, a2, 0
+ xvreplve0.q xr7, xr7
+ xvreplve0.q xr8, xr8
+ xvreplve0.q xr9, xr9
+ xvshuf.b xr10, xr7, xr7, xr0 // 0123 1234 2345 3456
+ xvshuf.b xr11, xr8, xr8, xr0
+ xvshuf.b xr12, xr9, xr9, xr0
+ xvdp2.h.bu.b xr13, xr10, xr5 // EPEL_FILTER(src, 1)
+ xvdp2.h.bu.b xr14, xr11, xr5
+ xvdp2.h.bu.b xr15, xr12, xr5
+ xvhaddw.w.h xr7, xr13, xr13
+ xvhaddw.w.h xr8, xr14, xr14
+ xvhaddw.w.h xr9, xr15, xr15
+.LOOP_HV8_HORI_LASX_\w:
+ add.d a2, a2, a3
+ vld vr15, a2, 0
+ xvreplve0.q xr15, xr15
+ xvshuf.b xr23, xr15, xr15, xr0
+ xvdp2.h.bu.b xr10, xr23, xr5
+ xvhaddw.w.h xr10, xr10, xr10
+ xvmul.w xr15, xr7, xr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
+ xvmadd.w xr15, xr8, xr17
+ xvmadd.w xr15, xr9, xr18
+ xvmadd.w xr15, xr10, xr19
+ xvaddi.wu xr7, xr8, 0 //back up previous value
+ xvaddi.wu xr8, xr9, 0
+ xvaddi.wu xr9, xr10, 0
+ xvsrai.w xr15, xr15, 6 // >> 6
+ xvmul.w xr15, xr15, xr1 // * wx
+ xvadd.w xr15, xr15, xr2 // + offset
+ xvsra.w xr15, xr15, xr3 // >> shift
+ xvadd.w xr15, xr15, xr4 // + ox
+ xvpermi.q xr20, xr15, 0x01
+ vssrani.h.w vr20, vr15, 0
+ vssrani.bu.h vr20, vr20, 0
+.if \w > 6
+ fst.d f20, a0, 0
+.else
+ fst.s f20, a0, 0
+ vstelm.h vr20, a0, 4, 2
+.endif
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV8_HORI_LASX_\w
+.endm
+
+.macro PUT_HEVC_EPEL_UNI_W_HV16_LASX w
+ xvld xr7, a2, 0 // start to load src
+ xvldx xr8, a2, a3
+ alsl.d a2, a3, a2, 1
+ xvld xr9, a2, 0
+ xvpermi.d xr10, xr7, 0x09 //8..18
+ xvpermi.d xr11, xr8, 0x09
+ xvpermi.d xr12, xr9, 0x09
+ xvreplve0.q xr7, xr7
+ xvreplve0.q xr8, xr8
+ xvreplve0.q xr9, xr9
+ xvshuf.b xr13, xr7, xr7, xr0 // 0123 1234 2345 3456
+ xvshuf.b xr14, xr8, xr8, xr0
+ xvshuf.b xr15, xr9, xr9, xr0
+ xvdp2.h.bu.b xr20, xr13, xr5 // EPEL_FILTER(src, 1)
+ xvdp2.h.bu.b xr21, xr14, xr5
+ xvdp2.h.bu.b xr22, xr15, xr5
+ xvhaddw.w.h xr7, xr20, xr20
+ xvhaddw.w.h xr8, xr21, xr21
+ xvhaddw.w.h xr9, xr22, xr22
+ xvreplve0.q xr10, xr10
+ xvreplve0.q xr11, xr11
+ xvreplve0.q xr12, xr12
+ xvshuf.b xr13, xr10, xr10, xr0
+ xvshuf.b xr14, xr11, xr11, xr0
+ xvshuf.b xr15, xr12, xr12, xr0
+ xvdp2.h.bu.b xr20, xr13, xr5
+ xvdp2.h.bu.b xr21, xr14, xr5
+ xvdp2.h.bu.b xr22, xr15, xr5
+ xvhaddw.w.h xr10, xr20, xr20
+ xvhaddw.w.h xr11, xr21, xr21
+ xvhaddw.w.h xr12, xr22, xr22
+.LOOP_HV16_HORI_LASX_\w:
+ add.d a2, a2, a3
+ xvld xr15, a2, 0
+ xvpermi.d xr20, xr15, 0x09 //8...18
+ xvreplve0.q xr15, xr15
+ xvreplve0.q xr20, xr20
+ xvshuf.b xr21, xr15, xr15, xr0
+ xvshuf.b xr22, xr20, xr20, xr0
+ xvdp2.h.bu.b xr13, xr21, xr5
+ xvdp2.h.bu.b xr14, xr22, xr5
+ xvhaddw.w.h xr13, xr13, xr13
+ xvhaddw.w.h xr14, xr14, xr14
+ xvmul.w xr15, xr7, xr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
+ xvmadd.w xr15, xr8, xr17
+ xvmadd.w xr15, xr9, xr18
+ xvmadd.w xr15, xr13, xr19
+ xvmul.w xr20, xr10, xr16
+ xvmadd.w xr20, xr11, xr17
+ xvmadd.w xr20, xr12, xr18
+ xvmadd.w xr20, xr14, xr19
+ xvaddi.wu xr7, xr8, 0 //back up previous value
+ xvaddi.wu xr8, xr9, 0
+ xvaddi.wu xr9, xr13, 0
+ xvaddi.wu xr10, xr11, 0
+ xvaddi.wu xr11, xr12, 0
+ xvaddi.wu xr12, xr14, 0
+ xvsrai.w xr15, xr15, 6 // >> 6
+ xvsrai.w xr20, xr20, 6 // >> 6
+ xvmul.w xr15, xr15, xr1 // * wx
+ xvmul.w xr20, xr20, xr1 // * wx
+ xvadd.w xr15, xr15, xr2 // + offset
+ xvadd.w xr20, xr20, xr2 // + offset
+ xvsra.w xr15, xr15, xr3 // >> shift
+ xvsra.w xr20, xr20, xr3 // >> shift
+ xvadd.w xr15, xr15, xr4 // + ox
+ xvadd.w xr20, xr20, xr4 // + ox
+ xvssrani.h.w xr20, xr15, 0
+ xvpermi.q xr21, xr20, 0x01
+ vssrani.bu.h vr21, vr20, 0
+ vpermi.w vr21, vr21, 0xd8
+.if \w < 16
+ fst.d f21, a0, 0
+ vstelm.w vr21, a0, 8, 2
+.else
+ vst vr21, a0, 0
+.endif
+ add.d a0, a0, a1
+ addi.d a4, a4, -1
+ bnez a4, .LOOP_HV16_HORI_LASX_\w
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 6
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LASX 6
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 8
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV8_LASX 8
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 12
+ addi.d a0, t2, 8
+ addi.d a2, t3, 8
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_HV4_LSX 12
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 12
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 2
+.LOOP_HV16:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 16
+ addi.d a0, t2, 8
+ addi.d a2, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV16
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 16
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 3
+.LOOP_HV24:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 24
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV24
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 24
+ addi.d a0, t2, 16
+ addi.d a2, t3, 16
+ addi.d a4, t4, 0
+ PUT_HEVC_EPEL_UNI_W_HV8_LASX 24
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 4
+.LOOP_HV32:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 32
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV32
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 2
+.LOOP_HV32_LASX:
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 32
+ addi.d a0, t2, 16
+ addi.d t2, t2, 16
+ addi.d a2, t3, 16
+ addi.d t3, t3, 16
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV32_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 6
+.LOOP_HV48:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 48
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV48
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 3
+.LOOP_HV48_LASX:
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 48
+ addi.d a0, t2, 16
+ addi.d t2, t2, 16
+ addi.d a2, t3, 16
+ addi.d t3, t3, 16
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV48_LASX
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx
+ LOAD_VAR 128
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ vreplvei.w vr5, vr5, 0
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ vreplvei.w vr16, vr6, 0
+ vreplvei.w vr17, vr6, 1
+ vreplvei.w vr18, vr6, 2
+ vreplvei.w vr19, vr6, 3
+ la.local t1, shufb
+ vld vr0, t1, 0
+ vaddi.bu vr22, vr0, 4 // update shufb to get high part
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 8
+.LOOP_HV64:
+ PUT_HEVC_EPEL_UNI_W_HV8_LSX 64
+ addi.d a0, t2, 8
+ addi.d t2, t2, 8
+ addi.d a2, t3, 8
+ addi.d t3, t3, 8
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV64
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
+ LOAD_VAR 256
+ ld.d t0, sp, 0 // mx
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ la.local t1, ff_hevc_epel_filters
+ vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
+ xvreplve0.w xr5, xr5
+ ld.d t0, sp, 8 // my
+ addi.d t0, t0, -1
+ slli.w t0, t0, 2
+ vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
+ vsllwil.h.b vr6, vr6, 0
+ vsllwil.w.h vr6, vr6, 0
+ xvreplve0.q xr6, xr6
+ xvrepl128vei.w xr16, xr6, 0
+ xvrepl128vei.w xr17, xr6, 1
+ xvrepl128vei.w xr18, xr6, 2
+ xvrepl128vei.w xr19, xr6, 3
+ la.local t1, shufb
+ xvld xr0, t1, 0
+ sub.d a2, a2, a3 // src -= srcstride
+ addi.d a2, a2, -1
+ addi.d t2, a0, 0
+ addi.d t3, a2, 0
+ addi.d t4, a4, 0
+ addi.d t5, zero, 4
+.LOOP_HV64_LASX:
+ PUT_HEVC_EPEL_UNI_W_HV16_LASX 64
+ addi.d a0, t2, 16
+ addi.d t2, t2, 16
+ addi.d a2, t3, 16
+ addi.d t3, t3, 16
+ addi.d a4, t4, 0
+ addi.d t5, t5, -1
+ bnez t5, .LOOP_HV64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 3cdb3fb2d7..245a833947 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -171,6 +171,16 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+ c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx;
+ c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx;
+ c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx;
+ c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx;
+ c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx;
+ c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx;
+ c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx;
+ c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx;
+ c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx;
+
c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
@@ -258,6 +268,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+ c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx;
+ c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx;
+ c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx;
+ c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx;
+ c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx;
+ c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx;
+ c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx;
+ c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx;
+
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
index 8a9266d375..7f09d0943a 100644
--- a/libavcodec/loongarch/hevcdsp_lasx.h
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -66,6 +66,15 @@ PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
+PEL_UNI_W(epel, hv, 6);
+PEL_UNI_W(epel, hv, 8);
+PEL_UNI_W(epel, hv, 12);
+PEL_UNI_W(epel, hv, 16);
+PEL_UNI_W(epel, hv, 24);
+PEL_UNI_W(epel, hv, 32);
+PEL_UNI_W(epel, hv, 48);
+PEL_UNI_W(epel, hv, 64);
+
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3291294ed9..7769cf25ae 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -277,6 +277,16 @@ PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
+PEL_UNI_W(epel, hv, 4);
+PEL_UNI_W(epel, hv, 6);
+PEL_UNI_W(epel, hv, 8);
+PEL_UNI_W(epel, hv, 12);
+PEL_UNI_W(epel, hv, 16);
+PEL_UNI_W(epel, hv, 24);
+PEL_UNI_W(epel, hv, 32);
+PEL_UNI_W(epel, hv, 48);
+PEL_UNI_W(epel, hv, 64);
+
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1