[FFmpeg-devel] [PATCH] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv and qpel_h
Jean-Baptiste Kempf
jb at videolan.org
Tue May 2 15:32:08 EEST 2023
Hello,
Just 2 questions:
- could you split this patch into several (3,4 or 5)
- are all those functions checked by checkasm?
Thanks,
jb
On Sun, 30 Apr 2023, at 10:57, myais wrote:
> Hi,
> This is a patch for the aarch64, which completes the neon versions of
> the hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv
> interfaces.
>
> put_hevc_pel_uni_w_pixels4_8_c: 54.3
> put_hevc_pel_uni_w_pixels4_8_neon: 24.1
> put_hevc_pel_uni_w_pixels6_8_c: 105.3
> put_hevc_pel_uni_w_pixels6_8_neon: 53.1
> put_hevc_pel_uni_w_pixels8_8_c: 176.6
> put_hevc_pel_uni_w_pixels8_8_neon: 63.8
> put_hevc_pel_uni_w_pixels12_8_c: 391.1
> put_hevc_pel_uni_w_pixels12_8_neon: 193.3
> put_hevc_pel_uni_w_pixels16_8_c: 688.1
> put_hevc_pel_uni_w_pixels16_8_neon: 226.1
> put_hevc_pel_uni_w_pixels24_8_c: 1542.3
> put_hevc_pel_uni_w_pixels24_8_neon: 536.8
> put_hevc_pel_uni_w_pixels32_8_c: 2753.1
> put_hevc_pel_uni_w_pixels32_8_neon: 875.8
> put_hevc_pel_uni_w_pixels48_8_c: 6251.1
> put_hevc_pel_uni_w_pixels48_8_neon: 1966.1
> put_hevc_pel_uni_w_pixels64_8_c: 11047.1
> put_hevc_pel_uni_w_pixels64_8_neon: 3449.8
>
> put_hevc_qpel_uni_w_h4_8_c: 156.6
> put_hevc_qpel_uni_w_h4_8_neon: 44.6
> put_hevc_qpel_uni_w_h6_8_c: 324.6
> put_hevc_qpel_uni_w_h6_8_neon: 103.1
> put_hevc_qpel_uni_w_h8_8_c: 549.3
> put_hevc_qpel_uni_w_h8_8_neon: 138.6
> put_hevc_qpel_uni_w_h12_8_c: 1240.3
> put_hevc_qpel_uni_w_h12_8_neon: 277.3
> put_hevc_qpel_uni_w_h16_8_c: 2161.8
> put_hevc_qpel_uni_w_h16_8_neon: 394.1
> put_hevc_qpel_uni_w_h24_8_c: 4874.8
> put_hevc_qpel_uni_w_h24_8_neon: 972.6
> put_hevc_qpel_uni_w_h32_8_c: 8517.8
> put_hevc_qpel_uni_w_h32_8_neon: 1517.3
> put_hevc_qpel_uni_w_h48_8_c: 19856.1
> put_hevc_qpel_uni_w_h48_8_neon: 3429.8
> put_hevc_qpel_uni_w_h64_8_c: 35159.3
> put_hevc_qpel_uni_w_h64_8_neon: 6018.1
>
> put_hevc_qpel_uni_w_v4_8_c: 180.6
> put_hevc_qpel_uni_w_v4_8_neon: 63.8
> put_hevc_qpel_uni_w_v6_8_c: 318.6
> put_hevc_qpel_uni_w_v6_8_neon: 117.8
> put_hevc_qpel_uni_w_v8_8_c: 547.6
> put_hevc_qpel_uni_w_v8_8_neon: 132.1
> put_hevc_qpel_uni_w_v12_8_c: 1202.8
> put_hevc_qpel_uni_w_v12_8_neon: 350.1
> put_hevc_qpel_uni_w_v16_8_c: 2109.6
> put_hevc_qpel_uni_w_v16_8_neon: 442.1
> put_hevc_qpel_uni_w_v24_8_c: 4748.8
> put_hevc_qpel_uni_w_v24_8_neon: 1287.1
> put_hevc_qpel_uni_w_v32_8_c: 8487.3
> put_hevc_qpel_uni_w_v32_8_neon: 1704.3
> put_hevc_qpel_uni_w_v48_8_c: 18798.8
> put_hevc_qpel_uni_w_v48_8_neon: 3790.8
> put_hevc_qpel_uni_w_v64_8_c: 35614.6
> put_hevc_qpel_uni_w_v64_8_neon: 6725.6
>
>
> put_hevc_qpel_uni_w_hv4_8_c: 498.8
> put_hevc_qpel_uni_w_hv4_8_neon: 139.3
> put_hevc_qpel_uni_w_hv6_8_c: 874.6
> put_hevc_qpel_uni_w_hv6_8_neon: 295.3
> put_hevc_qpel_uni_w_hv8_8_c: 1372.1
> put_hevc_qpel_uni_w_hv8_8_neon: 387.1
> put_hevc_qpel_uni_w_hv12_8_c: 2721.8
> put_hevc_qpel_uni_w_hv12_8_neon: 804.8
> put_hevc_qpel_uni_w_hv16_8_c: 4503.1
> put_hevc_qpel_uni_w_hv16_8_neon: 1038.1
> put_hevc_qpel_uni_w_hv24_8_c: 9321.8
> put_hevc_qpel_uni_w_hv24_8_neon: 2962.1
> put_hevc_qpel_uni_w_hv32_8_c: 15926.8
> put_hevc_qpel_uni_w_hv32_8_neon: 3858.6
> put_hevc_qpel_uni_w_hv48_8_c: 35051.1
> put_hevc_qpel_uni_w_hv48_8_neon: 9301.1
> put_hevc_qpel_uni_w_hv64_8_c: 61215.3
> put_hevc_qpel_uni_w_hv64_8_neon: 14920.1
>
> put_hevc_qpel_uni_h4_8_c: 143.3
> put_hevc_qpel_uni_h4_8_neon: 55.3
> put_hevc_qpel_uni_h6_8_c: 304.6
> put_hevc_qpel_uni_h6_8_neon: 82.3
> put_hevc_qpel_uni_h8_8_c: 557.8
> put_hevc_qpel_uni_h8_8_neon: 99.3
> put_hevc_qpel_uni_h12_8_c: 1228.3
> put_hevc_qpel_uni_h12_8_neon: 251.6
> put_hevc_qpel_uni_h16_8_c: 2210.3
> put_hevc_qpel_uni_h16_8_neon: 324.6
> put_hevc_qpel_uni_h24_8_c: 4859.1
> put_hevc_qpel_uni_h24_8_neon: 962.3
> put_hevc_qpel_uni_h32_8_c: 8728.6
> put_hevc_qpel_uni_h32_8_neon: 1249.6
> put_hevc_qpel_uni_h48_8_c: 20346.3
> put_hevc_qpel_uni_h48_8_neon: 2824.1
> put_hevc_qpel_uni_h64_8_c: 36702.6
> put_hevc_qpel_uni_h64_8_neon: 5012.1
>
>
>
>
> Signed-off-by: myais <Logan.Lyu at myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 96 +
> libavcodec/aarch64/hevcdsp_qpel_neon.S | 2223 +++++++++++++++++++++
> 2 files changed, 2319 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index be1049a2ec..42b8e9169d 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -128,6 +128,91 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t
> *_dst, ptrdiff_t _dststride, co
> ptrdiff_t _srcstride, const
> int16_t *src2, int height, intptr_t
> mx, intptr_t my, int width);
> +#define NEON8_FNPROTO(fn, args, ext) \
> + void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
> +
> +#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
> + void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
> +
> +#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
> + void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
> + void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
> +
> +
> +NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + intptr_t mx, intptr_t my, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t
> _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + intptr_t mx, intptr_t my, int width),);
> +
> +#if defined(__ARM_FEATURE_DOTPROD)
> +NEON8_FNPROTO(qpel_h, (int16_t *dst,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, intptr_t mx, intptr_t my, int width), _dotprod);
> +
> +NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + intptr_t mx, intptr_t my, int width), _dotprod);
> +
> +NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t
> _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + intptr_t mx, intptr_t my, int width), _dotprod);
> +
> +#endif
> +
> +#define NEON8_FNASSIGN(member, v, h, fn, ext) \
> + member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
> + member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
> + member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
> + member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
> + member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> + member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
> + member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
> + member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
> + member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
> +
> +#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
> + member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
> + member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
> + member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
> + member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> + member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> + member[6][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> + member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> + member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> + member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
> +
> +#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
> + member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
> + member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
> + member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
> + member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> + member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
> + member[6][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
> + member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
> + member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
> + member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
> +
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int
> bit_depth)
> {
> if (!have_neon(av_get_cpu_flags())) return;
> @@ -185,6 +270,17 @@ av_cold void
> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> c->put_hevc_qpel_bi[7][0][1] =
> c->put_hevc_qpel_bi[8][0][1] =
> c->put_hevc_qpel_bi[9][0][1] =
> ff_hevc_put_hevc_qpel_bi_h16_8_neon;
> +
> + NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
> + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
> +
> + NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
> +
> + #if defined(__ARM_FEATURE_DOTPROD)
> + NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _dotprod);
> + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,
> _dotprod);
> + NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1,
> qpel_uni_w_hv, _dotprod);
> +
> + #endif
> }
> if (bit_depth == 10) {
> c->hevc_h_loop_filter_chroma =
> ff_hevc_h_loop_filter_chroma_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
> b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> index 0e7b912678..e30ac1b465 100644
> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -30,6 +30,13 @@ const qpel_filters, align=4
> .byte 0, 1, -5, 17, 58,-10, 4, -1
> endconst
> +const qpel_filters_abs, align=4
> + .byte 0, 0, 0, 0, 0, 0, 0, 0
> + .byte 1, 4, 10, 58, 17, 5, 1, 0
> + .byte 1, 4, 11, 40, 40, 11, 4, 1
> + .byte 0, 1, 5, 17, 58, 10, 4, 1
> +endconst
> +
> .macro load_filter m
> movrel x15, qpel_filters
> add x15, x15, \m, lsl #3
> @@ -482,3 +489,2219 @@ endfunc
> put_hevc qpel
> put_hevc qpel_uni
> put_hevc qpel_bi
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.8h, w7
> +1:
> + ldr s0, [x2]
> + ldr s1, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ushll v0.8h, v0.8b, #6
> + ushll v1.8h, v1.8b, #6
> + smull v0.4s, v0.4h, v30.4h
> + smull v1.4s, v1.4h, v30.4h
> + sqrshl v0.4s, v0.4s, v31.4s
> + sqrshl v1.4s, v1.4s, v31.4s
> + sqadd v0.4s, v0.4s, v29.4s
> + sqadd v1.4s, v1.4s, v29.4s
> + sqxtn v0.4h, v0.4s
> + sqxtn v1.4h, v1.4s
> + sqxtun v0.8b, v0.8h
> + sqxtun v1.8b, v1.8h
> + str s0, [x0]
> + str s1, [x0, x1]
> + add x0, x0, x1, lsl 1
> + subs w4, w4, #2
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> + sub x1, x1, #4
> +1:
> + ldr d0, [x2]
> + ldr d1, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ushll v0.8h, v0.8b, #6
> + ushll v1.8h, v1.8b, #6
> + smull v4.4s, v0.4h, v30.4h
> + smull2 v5.4s, v0.8h, v30.8h
> + smull v6.4s, v1.4h, v30.4h
> + smull2 v7.4s, v1.8h, v30.8h
> + sqrshl v4.4s, v4.4s, v31.4s
> + sqrshl v5.4s, v5.4s, v31.4s
> + sqrshl v6.4s, v6.4s, v31.4s
> + sqrshl v7.4s, v7.4s, v31.4s
> + sqadd v4.4s, v4.4s, v29.4s
> + sqadd v5.4s, v5.4s, v29.4s
> + sqadd v6.4s, v6.4s, v29.4s
> + sqadd v7.4s, v7.4s, v29.4s
> + sqxtn v0.4h, v4.4s
> + sqxtn2 v0.8h, v5.4s
> + sqxtn v1.4h, v6.4s
> + sqxtn2 v1.8h, v7.4s
> + sqxtun v0.8b, v0.8h
> + sqxtun v1.8b, v1.8h
> + str s0, [x0], #4
> + st1 {v0.h}[2], [x0], x1
> + str s1, [x0], #4
> + st1 {v1.h}[2], [x0], x1
> + subs w4, w4, #2
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> +1:
> + ldr d0, [x2]
> + ldr d1, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ushll v0.8h, v0.8b, #6
> + ushll v1.8h, v1.8b, #6
> + smull v4.4s, v0.4h, v30.4h
> + smull2 v5.4s, v0.8h, v30.8h
> + smull v6.4s, v1.4h, v30.4h
> + smull2 v7.4s, v1.8h, v30.8h
> + sqrshl v4.4s, v4.4s, v31.4s
> + sqrshl v5.4s, v5.4s, v31.4s
> + sqrshl v6.4s, v6.4s, v31.4s
> + sqrshl v7.4s, v7.4s, v31.4s
> + sqadd v4.4s, v4.4s, v29.4s
> + sqadd v5.4s, v5.4s, v29.4s
> + sqadd v6.4s, v6.4s, v29.4s
> + sqadd v7.4s, v7.4s, v29.4s
> + sqxtn v0.4h, v4.4s
> + sqxtn2 v0.8h, v5.4s
> + sqxtn v1.4h, v6.4s
> + sqxtn2 v1.8h, v7.4s
> + sqxtun v0.8b, v0.8h
> + sqxtun v1.8b, v1.8h
> + str d0, [x0]
> + str d1, [x0, x1]
> + add x0, x0, x1, lsl 1
> + subs w4, w4, #2
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> + sub x1, x1, #8
> +1:
> + ldr q0, [x2]
> + ldr q1, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ushll v4.8h, v0.8b, #6
> + ushll2 v5.8h, v0.16b, #6
> + ushll v6.8h, v1.8b, #6
> + ushll2 v7.8h, v1.16b, #6
> + smull v16.4s, v4.4h, v30.4h
> + smull2 v17.4s, v4.8h, v30.8h
> + smull v18.4s, v5.4h, v30.4h
> + smull2 v19.4s, v5.8h, v30.8h
> + smull v20.4s, v6.4h, v30.4h
> + smull2 v21.4s, v6.8h, v30.8h
> + smull v22.4s, v7.4h, v30.4h
> + smull2 v23.4s, v7.8h, v30.8h
> +
> + sqrshl v16.4s, v16.4s, v31.4s
> + sqrshl v17.4s, v17.4s, v31.4s
> + sqrshl v18.4s, v18.4s, v31.4s
> + sqrshl v19.4s, v19.4s, v31.4s
> + sqrshl v20.4s, v20.4s, v31.4s
> + sqrshl v21.4s, v21.4s, v31.4s
> + sqrshl v22.4s, v22.4s, v31.4s
> + sqrshl v23.4s, v23.4s, v31.4s
> + sqadd v16.4s, v16.4s, v29.4s
> + sqadd v17.4s, v17.4s, v29.4s
> + sqadd v18.4s, v18.4s, v29.4s
> + sqadd v19.4s, v19.4s, v29.4s
> + sqadd v20.4s, v20.4s, v29.4s
> + sqadd v21.4s, v21.4s, v29.4s
> + sqadd v22.4s, v22.4s, v29.4s
> + sqadd v23.4s, v23.4s, v29.4s
> + sqxtn v0.4h, v16.4s
> + sqxtn2 v0.8h, v17.4s
> + sqxtn v1.4h, v18.4s
> + sqxtn2 v1.8h, v19.4s
> + sqxtn v2.4h, v20.4s
> + sqxtn2 v2.8h, v21.4s
> + sqxtn v3.4h, v22.4s
> + sqxtn2 v3.8h, v23.4s
> + sqxtun v0.8b, v0.8h
> + sqxtun2 v0.16b, v1.8h
> + sqxtun v2.8b, v2.8h
> + sqxtun2 v2.16b, v3.8h
> + str d0, [x0], #8
> + st1 {v0.s}[2], [x0], x1
> + str d2, [x0], #8
> + st1 {v2.s}[2], [x0], x1
> + subs w4, w4, #2
> + b.ne 1b
> + ret
> +endfunc
> +
> +.macro PEL_UNI_W_PIXEL_CALC s0, t0, t1, d0, d1, d2, d3
> + ushll \t0\().8h, \s0\().8b, #6
> + ushll2 \t1\().8h, \s0\().16b, #6
> + smull \d0\().4s, \t0\().4h, v30.4h
> + smull2 \d1\().4s, \t0\().8h, v30.8h
> + smull \d2\().4s, \t1\().4h, v30.4h
> + smull2 \d3\().4s, \t1\().8h, v30.8h
> + sqrshl \d0\().4s, \d0\().4s, v31.4s
> + sqrshl \d1\().4s, \d1\().4s, v31.4s
> + sqrshl \d2\().4s, \d2\().4s, v31.4s
> + sqrshl \d3\().4s, \d3\().4s, v31.4s
> + sqadd \d0\().4s, \d0\().4s, v29.4s
> + sqadd \d1\().4s, \d1\().4s, v29.4s
> + sqadd \d2\().4s, \d2\().4s, v29.4s
> + sqadd \d3\().4s, \d3\().4s, v29.4s
> + sqxtn \t0\().4h, \d0\().4s
> + sqxtn2 \t0\().8h, \d1\().4s
> + sqxtn \t1\().4h, \d2\().4s
> + sqxtn2 \t1\().8h, \d3\().4s
> + sqxtun \s0\().8b, \t0\().8h
> + sqxtun2 \s0\().16b, \t1\().8h
> +.endm
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> +1:
> + ldr q0, [x2]
> + ldr q1, [x2, x3]
> + add x2, x2, x3, lsl 1
> + PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> + PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> + str q0, [x0]
> + str q1, [x0, x1]
> + add x0, x0, x1, lsl 1
> + subs w4, w4, #2
> + b.ne 1b
> + ret
> +endfunc
> +
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> +1:
> + ld1 {v0.16b, v1.16b}, [x2], x3
> + ushll v4.8h, v0.8b, #6
> + ushll2 v5.8h, v0.16b, #6
> + ushll v6.8h, v1.8b, #6
> + smull v16.4s, v4.4h, v30.4h
> + smull2 v17.4s, v4.8h, v30.8h
> + smull v18.4s, v5.4h, v30.4h
> + smull2 v19.4s, v5.8h, v30.8h
> + smull v20.4s, v6.4h, v30.4h
> + smull2 v21.4s, v6.8h, v30.8h
> + sqrshl v16.4s, v16.4s, v31.4s
> + sqrshl v17.4s, v17.4s, v31.4s
> + sqrshl v18.4s, v18.4s, v31.4s
> + sqrshl v19.4s, v19.4s, v31.4s
> + sqrshl v20.4s, v20.4s, v31.4s
> + sqrshl v21.4s, v21.4s, v31.4s
> + sqadd v16.4s, v16.4s, v29.4s
> + sqadd v17.4s, v17.4s, v29.4s
> + sqadd v18.4s, v18.4s, v29.4s
> + sqadd v19.4s, v19.4s, v29.4s
> + sqadd v20.4s, v20.4s, v29.4s
> + sqadd v21.4s, v21.4s, v29.4s
> + sqxtn v0.4h, v16.4s
> + sqxtn2 v0.8h, v17.4s
> + sqxtn v1.4h, v18.4s
> + sqxtn2 v1.8h, v19.4s
> + sqxtn v2.4h, v20.4s
> + sqxtn2 v2.8h, v21.4s
> + sqxtun v0.8b, v0.8h
> + sqxtun v1.8b, v1.8h
> + sqxtun v2.8b, v2.8h
> + st1 {v0.8b, v1.8b, v2.8b}, [x0], x1
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> +1:
> + ld1 {v0.16b, v1.16b}, [x2], x3
> + PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> + PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> + st1 {v0.16b, v1.16b}, [x0], x1
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> +1:
> + ld1 {v0.16b, v1.16b, v2.16b}, [x2], x3
> + PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> + PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> + PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
> + st1 {v0.16b, v1.16b, v2.16b}, [x0], x1
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6
> + dup v31.4s, w10
> + dup v29.4s, w7
> +1:
> + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
> + PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
> + PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
> + PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
> + PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
> + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +.macro QPEL_UNI_W_V_HEADER
> + ldur x12, [sp, #8] // my
> + sub x2, x2, x3, lsl #1
> + sub x2, x2, x3
> + movrel x9, qpel_filters_abs
> + add x9, x9, x12, lsl 3
> + ldr d28, [x9]
> + dup v0.16b, v28.b[0]
> + dup v1.16b, v28.b[1]
> + dup v2.16b, v28.b[2]
> + dup v3.16b, v28.b[3]
> + dup v4.16b, v28.b[4]
> + dup v5.16b, v28.b[5]
> + dup v6.16b, v28.b[6]
> + dup v7.16b, v28.b[7]
> +
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.8h, w6 // wx
> + dup v31.4s, w10 // shift
> + dup v29.4s, w7 // ox
> +.endm
> +
> +.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
> + umull \dst\().8h, \src1\().8b, v1.8b
> + umlsl \dst\().8h, \src0\().8b, v0.8b
> + umlsl \dst\().8h, \src2\().8b, v2.8b
> + umlal \dst\().8h, \src3\().8b, v3.8b
> + umlal \dst\().8h, \src4\().8b, v4.8b
> + umlsl \dst\().8h, \src5\().8b, v5.8b
> + umlal \dst\().8h, \src6\().8b, v6.8b
> + umlsl \dst\().8h, \src7\().8b, v7.8b
> +.endm
> +
> +.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
> + umull2 \dst\().8h, \src1\().16b, v1.16b
> + umlsl2 \dst\().8h, \src0\().16b, v0.16b
> + umlsl2 \dst\().8h, \src2\().16b, v2.16b
> + umlal2 \dst\().8h, \src3\().16b, v3.16b
> + umlal2 \dst\().8h, \src4\().16b, v4.16b
> + umlsl2 \dst\().8h, \src5\().16b, v5.16b
> + umlal2 \dst\().8h, \src6\().16b, v6.16b
> + umlsl2 \dst\().8h, \src7\().16b, v7.16b
> +.endm
> +
> +.macro QPEL_UNI_W_V_4
> + smull v24.4s, v24.4h, v30.4h
> + sqrshl v24.4s, v24.4s, v31.4s
> + sqadd v24.4s, v24.4s, v29.4s
> + sqxtn v24.4h, v24.4s
> + sqxtun v24.8b, v24.8h
> + st1 {v24.s}[0], [x0], x1
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
> + QPEL_UNI_W_V_HEADER
> + ldr s16, [x2]
> + ldr s17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr s18, [x2]
> + ldr s19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr s20, [x2]
> + ldr s21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr s22, [x2]
> +
> +1: ldr s23, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v24, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s16, [x2]
> + QPEL_FILTER_B v24, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v24, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s18, [x2]
> + QPEL_FILTER_B v24, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v24, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s20, [x2]
> + QPEL_FILTER_B v24, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v24, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr s22, [x2]
> + QPEL_FILTER_B v24, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_UNI_W_V_4
> + subs w4, w4, #1
> + b.ne 1b
> +2:
> + ret
> +endfunc
> +
> +.macro QPEL_UNI_W_V_8
> + smull v24.4s, v26.4h, v30.4h
> + smull2 v25.4s, v26.8h, v30.8h
> + sqrshl v24.4s, v24.4s, v31.4s
> + sqrshl v25.4s, v25.4s, v31.4s
> + sqadd v24.4s, v24.4s, v29.4s
> + sqadd v25.4s, v25.4s, v29.4s
> + sqxtn v24.4h, v24.4s
> + sqxtn2 v24.8h, v25.4s
> + sqxtun v24.8b, v24.8h
> + st1 {v24.d}[0], [x0], x1
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
> + QPEL_UNI_W_V_HEADER
> + ldr d16, [x2]
> + ldr d17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr d18, [x2]
> + ldr d19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr d20, [x2]
> + ldr d21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr d22, [x2]
> +
> +1: ldr d23, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d16, [x2]
> + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d18, [x2]
> + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d20, [x2]
> + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr d22, [x2]
> + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_UNI_W_V_8
> + subs w4, w4, #1
> + b.ne 1b
> +2:
> + ret
> +endfunc
> +
> +.macro QPEL_UNI_W_V_16
> + smull v24.4s, v26.4h, v30.4h
> + smull2 v25.4s, v26.8h, v30.8h
> + smull v26.4s, v27.4h, v30.4h
> + smull2 v27.4s, v27.8h, v30.8h
> + sqrshl v24.4s, v24.4s, v31.4s
> + sqrshl v25.4s, v25.4s, v31.4s
> + sqrshl v26.4s, v26.4s, v31.4s
> + sqrshl v27.4s, v27.4s, v31.4s
> + sqadd v24.4s, v24.4s, v29.4s
> + sqadd v25.4s, v25.4s, v29.4s
> + sqadd v26.4s, v26.4s, v29.4s
> + sqadd v27.4s, v27.4s, v29.4s
> + sqxtn v24.4h, v24.4s
> + sqxtn2 v24.8h, v25.4s
> + sqxtn v26.4h, v26.4s
> + sqxtn2 v26.8h, v27.4s
> + sqxtun v24.8b, v24.8h
> + sqxtun2 v24.16b, v26.8h
> + st1 {v24.16b}, [x0], x1
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
> + QPEL_UNI_W_V_HEADER
> + ldr q16, [x2]
> + ldr q17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr q18, [x2]
> + ldr q19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr q20, [x2]
> + ldr q21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr q22, [x2]
> +
> +1: ldr q23, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q16, [x2]
> + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q18, [x2]
> + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q20, [x2]
> + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q22, [x2]
> + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.ne 1b
> +2:
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
> + QPEL_UNI_W_V_HEADER
> + ldur w13, [sp, #16]
> + mov x14, x0
> + mov x15, x2
> + mov w11, w4
> +
> +3:
> + ldr q16, [x2]
> + ldr q17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr q18, [x2]
> + ldr q19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr q20, [x2]
> + ldr q21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + ldr q22, [x2]
> +
> +
> +1: ldr q23, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q16, [x2]
> + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q17, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q18, [x2]
> + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q19, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q20, [x2]
> + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q21, [x2, x3]
> + add x2, x2, x3, lsl 1
> + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.eq 2f
> +
> + ldr q22, [x2]
> + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_UNI_W_V_16
> + subs w4, w4, #1
> + b.ne 1b
> +2:
> + subs w13, w13, #16
> + add x14, x14, #16
> + add x15, x15, #16
> + mov x0, x14
> + mov x2, x15
> + mov w4, w11
> + b.hi 3b
> + ret
> +endfunc
> +
> +#if __ARM_FEATURE_DOTPROD
> +.macro QPEL_UNI_W_H_HEADER
> + ldr x12, [sp]
> + sub x2, x2, #3
> + movrel x9, qpel_filters
> + add x9, x9, x12, lsl 3
> + ldr x11, [x9]
> + dup v28.2d, x11
> + mov w10, #-6
> + sub w10, w10, w5
> + dup v30.4s, w6 // wx
> + dup v31.4s, w10 // shift
> + dup v29.4s, w7 // ox
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> +1:
> + ld1 {v0.16b}, [x2], x3
> + ext v1.16b, v0.16b, v0.16b, #1
> + ext v2.16b, v0.16b, v0.16b, #2
> + ext v3.16b, v0.16b, v0.16b, #3
> + zip1 v0.2d, v0.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + movi v16.2d, #0
> + movi v17.2d, #0
> + usdot v16.4s, v0.16b, v28.16b
> + usdot v17.4s, v2.16b, v28.16b
> + addp v16.4s, v16.4s, v17.4s
> + mul v16.4s, v16.4s, v30.4s
> + sqrshl v16.4s, v16.4s, v31.4s
> + sqadd v16.4s, v16.4s, v29.4s
> + sqxtn v16.4h, v16.4s
> + sqxtun v16.8b, v16.8h
> + str s16, [x0]
> + add x0, x0, x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> + sub x1, x1, #4
> +1:
> + ld1 {v0.16b}, [x2], x3
> + ext v1.16b, v0.16b, v0.16b, #1
> + ext v2.16b, v0.16b, v0.16b, #2
> + ext v3.16b, v0.16b, v0.16b, #3
> + ext v4.16b, v0.16b, v0.16b, #4
> + ext v5.16b, v0.16b, v0.16b, #5
> + zip1 v0.2d, v0.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + zip1 v4.2d, v4.2d, v5.2d
> + movi v16.2d, #0
> + movi v17.2d, #0
> + movi v18.2d, #0
> + usdot v16.4s, v0.16b, v28.16b
> + usdot v17.4s, v2.16b, v28.16b
> + usdot v18.4s, v4.16b, v28.16b
> + addp v16.4s, v16.4s, v17.4s
> + addp v18.4s, v18.4s, v18.4s
> + mul v16.4s, v16.4s, v30.4s
> + mul v18.2s, v18.2s, v30.2s
> + sqrshl v16.4s, v16.4s, v31.4s
> + sqrshl v18.2s, v18.2s, v31.2s
> + sqadd v16.4s, v16.4s, v29.4s
> + sqadd v18.2s, v18.2s, v29.2s
> + sqxtn v16.4h, v16.4s
> + sqxtn2 v16.8h, v18.4s
> + sqxtun v16.8b, v16.8h
> + str s16, [x0], #4
> + st1 {v16.h}[2], [x0], x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +
> +.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
> + movi \d0\().2d, #0
> + movi \d1\().2d, #0
> + movi \d2\().2d, #0
> + movi \d3\().2d, #0
> + usdot \d0\().4s, \s0\().16b, v28.16b
> + usdot \d1\().4s, \s1\().16b, v28.16b
> + usdot \d2\().4s, \s2\().16b, v28.16b
> + usdot \d3\().4s, \s3\().16b, v28.16b
> + addp \d0\().4s, \d0\().4s, \d1\().4s
> + addp \d2\().4s, \d2\().4s, \d3\().4s
> + mul \d0\().4s, \d0\().4s, v30.4s
> + mul \d2\().4s, \d2\().4s, v30.4s
> + sqrshl \d0\().4s, \d0\().4s, v31.4s
> + sqrshl \d2\().4s, \d2\().4s, v31.4s
> + sqadd \d0\().4s, \d0\().4s, v29.4s
> + sqadd \d2\().4s, \d2\().4s, v29.4s
> +.endm
> +
> +.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
> + movi \d0\().2d, #0
> + movi \d1\().2d, #0
> + usdot \d0\().4s, \s0\().16b, v28.16b
> + usdot \d1\().4s, \s1\().16b, v28.16b
> + addp \d0\().4s, \d0\().4s, \d1\().4s
> + mul \d0\().4s, \d0\().4s, v30.4s
> + sqrshl \d0\().4s, \d0\().4s, v31.4s
> + sqadd \d0\().4s, \d0\().4s, v29.4s
> +.endm
> +
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> +1:
> + ld1 {v16.16b, v17.16b}, [x2], x3
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + zip1 v0.2d, v16.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + zip1 v4.2d, v4.2d, v5.2d
> + zip1 v6.2d, v6.2d, v7.2d
> + QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
> + sqxtn v18.4h, v18.4s
> + sqxtn2 v18.8h, v20.4s
> + sqxtun v18.8b, v18.8h
> + str d18, [x0]
> + add x0, x0, x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> + add x13, x0, #8
> +1:
> + ld1 {v16.16b, v17.16b}, [x2], x3
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + zip1 v18.2d, v16.2d, v1.2d
> + zip1 v19.2d, v2.2d, v3.2d
> + zip1 v20.2d, v4.2d, v5.2d
> + zip1 v21.2d, v6.2d, v7.2d
> + zip2 v22.2d, v16.2d, v1.2d
> + zip2 v23.2d, v2.2d, v3.2d
> + QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6
> + QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
> + sqxtn v0.4h, v0.4s
> + sqxtn2 v0.8h, v4.4s
> + sqxtn v1.4h, v24.4s
> + sqxtun v0.8b, v0.8h
> + sqxtun v1.8b, v1.8h
> +
> + str d0, [x0]
> + str s1, [x13]
> + add x0, x0, x1
> + add x13, x13, x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> +1:
> + ld1 {v16.16b, v17.16b}, [x2], x3
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> +        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10   v20: 1, 9, 3, 11
> +        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14  v24: 5, 13, 7, 15
> + sqxtn v0.4h, v18.4s
> + sqxtn2 v0.8h, v22.4s
> + sqxtn v1.4h, v20.4s
> + sqxtn2 v1.8h, v24.4s
> + trn1 v2.8h, v0.8h, v1.8h
> + trn2 v3.8h, v0.8h, v1.8h
> + sqxtun v0.8b, v2.8h
> + sqxtun2 v0.16b, v3.8h
> + st1 {v0.16b}, [x0], x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> + sub x1, x1, #16
> +1:
> + ld1 {v16.16b, v17.16b}, [x2], x3
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
> + sqxtn v18.4h, v18.4s
> + sqxtn2 v18.8h, v22.4s
> + sqxtn v19.4h, v20.4s
> + sqxtn2 v19.8h, v24.4s
> + trn1 v20.8h, v18.8h, v19.8h
> + trn2 v21.8h, v18.8h, v19.8h
> + sqxtun v26.8b, v20.8h
> + sqxtun2 v26.16b, v21.8h // 0-15
> + ext v1.16b, v17.16b, v17.16b, #1
> + ext v2.16b, v17.16b, v17.16b, #2
> + ext v3.16b, v17.16b, v17.16b, #3
> + ext v4.16b, v17.16b, v17.16b, #4
> + ext v5.16b, v17.16b, v17.16b, #5
> + ext v6.16b, v17.16b, v17.16b, #6
> + ext v7.16b, v17.16b, v17.16b, #7
> + zip1 v0.2d, v17.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + zip1 v4.2d, v4.2d, v5.2d
> + zip1 v6.2d, v6.2d, v7.2d
> + QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
> + sqxtn v18.4h, v18.4s
> + sqxtn2 v18.8h, v20.4s
> + sqxtun v27.8b, v18.8h
> +
> + st1 {v26.16b}, [x0], #16
> + st1 {v27.8b}, [x0], x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> +1:
> + ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
> + sqxtn v0.4h, v0.4s
> + sqxtn2 v0.8h, v22.4s
> + sqxtn v19.4h, v20.4s
> + sqxtn2 v19.8h, v24.4s
> + trn1 v20.8h, v0.8h, v19.8h
> + trn2 v21.8h, v0.8h, v19.8h
> + sqxtun v26.8b, v20.8h
> + sqxtun2 v26.16b, v21.8h // 0-15
> + ext v1.16b, v17.16b, v18.16b, #1
> + ext v2.16b, v17.16b, v18.16b, #2
> + ext v3.16b, v17.16b, v18.16b, #3
> + ext v4.16b, v17.16b, v18.16b, #4
> + ext v5.16b, v17.16b, v18.16b, #5
> + ext v6.16b, v17.16b, v18.16b, #6
> + ext v7.16b, v17.16b, v18.16b, #7
> + QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
> + sqxtn v0.4h, v0.4s
> + sqxtn2 v0.8h, v22.4s
> + sqxtn v19.4h, v20.4s
> + sqxtn2 v19.8h, v24.4s
> + trn1 v20.8h, v0.8h, v19.8h
> + trn2 v21.8h, v0.8h, v19.8h
> + sqxtun v27.8b, v20.8h
> + sqxtun2 v27.16b, v21.8h // 16-31
> + st1 {v26.16b, v27.16b}, [x0], x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> +1:
> + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v25.8b, v22.8h
> + sqxtun2 v25.16b, v23.8h // 0-15
> + ext v1.16b, v17.16b, v18.16b, #1
> + ext v2.16b, v17.16b, v18.16b, #2
> + ext v3.16b, v17.16b, v18.16b, #3
> + ext v4.16b, v17.16b, v18.16b, #4
> + ext v5.16b, v17.16b, v18.16b, #5
> + ext v6.16b, v17.16b, v18.16b, #6
> + ext v7.16b, v17.16b, v18.16b, #7
> + QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v26.8b, v22.8h
> + sqxtun2 v26.16b, v23.8h // 16-31
> + ext v1.16b, v18.16b, v19.16b, #1
> + ext v2.16b, v18.16b, v19.16b, #2
> + ext v3.16b, v18.16b, v19.16b, #3
> + ext v4.16b, v18.16b, v19.16b, #4
> + ext v5.16b, v18.16b, v19.16b, #5
> + ext v6.16b, v18.16b, v19.16b, #6
> + ext v7.16b, v18.16b, v19.16b, #7
> + QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v27.8b, v22.8h
> + sqxtun2 v27.16b, v23.8h // 32-47
> + st1 {v25.16b, v26.16b, v27.16b}, [x0], x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +
> +
> +function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
> + QPEL_UNI_W_H_HEADER
> + sub x3, x3, #64
> +1:
> + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v16.8b, v22.8h
> + sqxtun2 v16.16b, v23.8h // 0-15
> + ext v1.16b, v17.16b, v18.16b, #1
> + ext v2.16b, v17.16b, v18.16b, #2
> + ext v3.16b, v17.16b, v18.16b, #3
> + ext v4.16b, v17.16b, v18.16b, #4
> + ext v5.16b, v17.16b, v18.16b, #5
> + ext v6.16b, v17.16b, v18.16b, #6
> + ext v7.16b, v17.16b, v18.16b, #7
> + QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v17.8b, v22.8h
> + sqxtun2 v17.16b, v23.8h // 16-31
> + ext v1.16b, v18.16b, v19.16b, #1
> + ext v2.16b, v18.16b, v19.16b, #2
> + ext v3.16b, v18.16b, v19.16b, #3
> + ext v4.16b, v18.16b, v19.16b, #4
> + ext v5.16b, v18.16b, v19.16b, #5
> + ext v6.16b, v18.16b, v19.16b, #6
> + ext v7.16b, v18.16b, v19.16b, #7
> + QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + ld1 {v0.16b}, [x2], x3
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v18.8b, v22.8h
> + sqxtun2 v18.16b, v23.8h // 32-47
> + ext v1.16b, v19.16b, v0.16b, #1
> + ext v2.16b, v19.16b, v0.16b, #2
> + ext v3.16b, v19.16b, v0.16b, #3
> + ext v4.16b, v19.16b, v0.16b, #4
> + ext v5.16b, v19.16b, v0.16b, #5
> + ext v6.16b, v19.16b, v0.16b, #6
> + ext v7.16b, v19.16b, v0.16b, #7
> + QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0
> + QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + sqxtn v21.4h, v21.4s
> + sqxtn2 v21.8h, v23.4s
> + trn1 v22.8h, v20.8h, v21.8h
> + trn2 v23.8h, v20.8h, v21.8h
> + sqxtun v19.8b, v22.8h
> + sqxtun2 v19.16b, v23.8h // 48-63
> +
> + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
> + subs w4, w4, #1
> + b.hi 1b
> + ret
> +endfunc
> +
> +
> +.macro QPEL_H_HEADER
> + movrel x9, qpel_filters
> + add x9, x9, x4, lsl 3
> + ldr x11, [x9]
> + dup v31.2d, x11
> + sub x1, x1, #3
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_h4_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> +1:
> + ld1 {v0.16b}, [x1], x2
> + ext v1.16b, v0.16b, v0.16b, #1
> + ext v2.16b, v0.16b, v0.16b, #2
> + ext v3.16b, v0.16b, v0.16b, #3
> + zip1 v0.2d, v0.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + movi v16.2d, #0
> + movi v17.2d, #0
> + usdot v16.4s, v0.16b, v31.16b
> + usdot v17.4s, v2.16b, v31.16b
> + addp v16.4s, v16.4s, v17.4s
> + sqxtn v16.4h, v16.4s
> + str d16, [x0]
> + add x0, x0, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h6_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> + add x15, x0, #8
> +1:
> + ld1 {v0.16b}, [x1], x2
> + ext v1.16b, v0.16b, v0.16b, #1
> + ext v2.16b, v0.16b, v0.16b, #2
> + ext v3.16b, v0.16b, v0.16b, #3
> + ext v4.16b, v0.16b, v0.16b, #4
> + ext v5.16b, v0.16b, v0.16b, #5
> + zip1 v0.2d, v0.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + zip1 v4.2d, v4.2d, v5.2d
> + movi v16.2d, #0
> + movi v17.2d, #0
> + movi v18.2d, #0
> + usdot v16.4s, v0.16b, v31.16b
> + usdot v17.4s, v2.16b, v31.16b
> + usdot v18.4s, v4.16b, v31.16b
> + addp v16.4s, v16.4s, v17.4s
> + addp v18.4s, v18.4s, v18.4s
> + sqxtn v16.4h, v16.4s
> + sqxtn v18.4h, v18.4s
> + str d16, [x0]
> + str s18, [x15]
> + add x0, x0, x10
> + add x15, x15, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h8_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> +1:
> + ld1 {v0.16b}, [x1], x2
> + ext v1.16b, v0.16b, v0.16b, #1
> + ext v2.16b, v0.16b, v0.16b, #2
> + ext v3.16b, v0.16b, v0.16b, #3
> + ext v4.16b, v0.16b, v0.16b, #4
> + ext v5.16b, v0.16b, v0.16b, #5
> + ext v6.16b, v0.16b, v0.16b, #6
> + ext v7.16b, v0.16b, v0.16b, #7
> + zip1 v0.2d, v0.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + zip1 v4.2d, v4.2d, v5.2d
> + zip1 v6.2d, v6.2d, v7.2d
> + movi v16.2d, #0
> + movi v17.2d, #0
> + movi v18.2d, #0
> + movi v19.2d, #0
> + usdot v16.4s, v0.16b, v31.16b
> + usdot v17.4s, v2.16b, v31.16b
> + usdot v18.4s, v4.16b, v31.16b
> + usdot v19.4s, v6.16b, v31.16b
> + addp v16.4s, v16.4s, v17.4s
> + addp v18.4s, v18.4s, v19.4s
> + sqxtn v16.4h, v16.4s
> + sqxtn2 v16.8h, v18.4s
> + str q16, [x0]
> + add x0, x0, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
> +        movi \d0\().2d, #0
> + movi \d1\().2d, #0
> + movi \d2\().2d, #0
> + movi \d3\().2d, #0
> + usdot \d0\().4s, \s0\().16b, v31.16b
> + usdot \d1\().4s, \s1\().16b, v31.16b
> + usdot \d2\().4s, \s2\().16b, v31.16b
> + usdot \d3\().4s, \s3\().16b, v31.16b
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_h12_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> + add x15, x0, #16
> +1:
> + ld1 {v16.16b, v17.16b}, [x1], x2
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + zip1 v18.2d, v4.2d, v5.2d
> + zip1 v19.2d, v6.2d, v7.2d
> + QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + movi v24.2d, #0
> + movi v25.2d, #0
> + usdot v24.4s, v18.16b, v31.16b
> + usdot v25.4s, v19.16b, v31.16b
> + addp v24.4s, v24.4s, v25.4s
> + trn1 v26.4s, v20.4s, v21.4s
> + trn2 v27.4s, v20.4s, v21.4s
> + sqxtn v26.4h, v26.4s
> + sqxtn v27.4h, v27.4s
> + sqxtn2 v26.8h, v24.4s
> +
> + str q26, [x0]
> + str d27, [x15]
> + add x0, x0, x10
> + add x15, x15, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h16_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> +1:
> + ld1 {v16.16b, v17.16b}, [x1], x2
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> +
> + QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> +
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> +
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> +
> + sqxtn v18.4h, v22.4s
> + sqxtn2 v18.8h, v26.4s
> + sqxtn v19.4h, v23.4s
> + sqxtn2 v19.8h, v27.4s
> +
> +        stp q18, q19, [x0]
> + add x0, x0, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h24_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> + add x15, x0, #32
> +1:
> + ld1 {v16.16b, v17.16b}, [x1], x2
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v18.4h, v22.4s
> + sqxtn2 v18.8h, v26.4s
> + sqxtn v19.4h, v23.4s
> + sqxtn2 v19.8h, v27.4s
> + stp q18, q19, [x0]
> + add x0, x0, x10
> + ext v1.16b, v17.16b, v17.16b, #1
> + ext v2.16b, v17.16b, v17.16b, #2
> + ext v3.16b, v17.16b, v17.16b, #3
> + ext v4.16b, v17.16b, v17.16b, #4
> + ext v5.16b, v17.16b, v17.16b, #5
> + ext v6.16b, v17.16b, v17.16b, #6
> + ext v7.16b, v17.16b, v17.16b, #7
> + zip1 v0.2d, v17.2d, v1.2d
> + zip1 v2.2d, v2.2d, v3.2d
> + zip1 v4.2d, v4.2d, v5.2d
> + zip1 v6.2d, v6.2d, v7.2d
> + QPEL_H_CALC v0, v2, v4, v5, v20, v21, v22, v23
> + addp v20.4s, v20.4s, v21.4s
> + addp v22.4s, v22.4s, v23.4s
> + sqxtn v20.4h, v20.4s
> + sqxtn2 v20.8h, v22.4s
> + str q20, [x15]
> + add x15, x15, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h32_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2
> + add x15, x0, #32
> +1:
> + ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0]
> + add x0, x0, x10
> + ext v1.16b, v17.16b, v18.16b, #1
> + ext v2.16b, v17.16b, v18.16b, #2
> + ext v3.16b, v17.16b, v18.16b, #3
> + ext v4.16b, v17.16b, v18.16b, #4
> + ext v5.16b, v17.16b, v18.16b, #5
> + ext v6.16b, v17.16b, v18.16b, #6
> + ext v7.16b, v17.16b, v18.16b, #7
> + QPEL_H_CALC v17, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x15]
> + add x15, x15, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h48_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + mov x10, #MAX_PB_SIZE * 2 - 64
> +1:
> + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0], #32
> +
> + ext v1.16b, v17.16b, v18.16b, #1
> + ext v2.16b, v17.16b, v18.16b, #2
> + ext v3.16b, v17.16b, v18.16b, #3
> + ext v4.16b, v17.16b, v18.16b, #4
> + ext v5.16b, v17.16b, v18.16b, #5
> + ext v6.16b, v17.16b, v18.16b, #6
> + ext v7.16b, v17.16b, v18.16b, #7
> + QPEL_H_CALC v17, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0], #32
> + ext v1.16b, v18.16b, v19.16b, #1
> + ext v2.16b, v18.16b, v19.16b, #2
> + ext v3.16b, v18.16b, v19.16b, #3
> + ext v4.16b, v18.16b, v19.16b, #4
> + ext v5.16b, v18.16b, v19.16b, #5
> + ext v6.16b, v18.16b, v19.16b, #6
> + ext v7.16b, v18.16b, v19.16b, #7
> + QPEL_H_CALC v18, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0]
> + add x0, x0, x10
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h64_8_neon_dotprod, export=1
> + QPEL_H_HEADER
> + sub x2, x2, #64
> +1:
> + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
> + ext v1.16b, v16.16b, v17.16b, #1
> + ext v2.16b, v16.16b, v17.16b, #2
> + ext v3.16b, v16.16b, v17.16b, #3
> + ext v4.16b, v16.16b, v17.16b, #4
> + ext v5.16b, v16.16b, v17.16b, #5
> + ext v6.16b, v16.16b, v17.16b, #6
> + ext v7.16b, v16.16b, v17.16b, #7
> + QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0], #32
> +
> + ext v1.16b, v17.16b, v18.16b, #1
> + ext v2.16b, v17.16b, v18.16b, #2
> + ext v3.16b, v17.16b, v18.16b, #3
> + ext v4.16b, v17.16b, v18.16b, #4
> + ext v5.16b, v17.16b, v18.16b, #5
> + ext v6.16b, v17.16b, v18.16b, #6
> + ext v7.16b, v17.16b, v18.16b, #7
> + QPEL_H_CALC v17, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0], #32
> + ext v1.16b, v18.16b, v19.16b, #1
> + ext v2.16b, v18.16b, v19.16b, #2
> + ext v3.16b, v18.16b, v19.16b, #3
> + ext v4.16b, v18.16b, v19.16b, #4
> + ext v5.16b, v18.16b, v19.16b, #5
> + ext v6.16b, v18.16b, v19.16b, #6
> + ext v7.16b, v18.16b, v19.16b, #7
> + QPEL_H_CALC v18, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0], #32
> + ld1 {v28.8b}, [x1], x2
> + ext v1.16b, v19.16b, v28.16b, #1
> + ext v2.16b, v19.16b, v28.16b, #2
> + ext v3.16b, v19.16b, v28.16b, #3
> + ext v4.16b, v19.16b, v28.16b, #4
> + ext v5.16b, v19.16b, v28.16b, #5
> + ext v6.16b, v19.16b, v28.16b, #6
> + ext v7.16b, v19.16b, v28.16b, #7
> + QPEL_H_CALC v19, v1, v2, v3, v20, v21, v22, v23
> + QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
> + addp v20.4s, v20.4s, v22.4s
> + addp v21.4s, v21.4s, v23.4s
> + addp v24.4s, v24.4s, v26.4s
> + addp v25.4s, v25.4s, v27.4s
> + trn1 v22.4s, v20.4s, v21.4s
> + trn2 v23.4s, v20.4s, v21.4s
> + trn1 v26.4s, v24.4s, v25.4s
> + trn2 v27.4s, v24.4s, v25.4s
> + sqxtn v20.4h, v22.4s
> + sqxtn2 v20.8h, v26.4s
> + sqxtn v21.4h, v23.4s
> + sqxtn2 v21.8h, v27.4s
> + stp q20, q21, [x0], #32
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +.macro QPEL_UNI_W_HV_HEADER width
> + ldp x14, x15, [sp] // mx, my
> + ldr w13, [sp, #16] // width
> + stp x20, x21, [sp, #-16]!
> + stp x22, x23, [sp, #-16]!
> + stp x24, x25, [sp, #-16]!
> + stp x26, x27, [sp, #-16]!
> + stp x28, x30, [sp, #-16]!
> + mov x28, sp
> + mov x11, #9088
> + sub sp, sp, x11
> + mov x20, x0
> + mov x21, x1
> + mov x0, sp
> + sub x1, x2, x3, lsl 1
> + sub x1, x1, x3
> + mov x2, x3
> + add w3, w4, #7
> + mov w22, w4 // height
> + mov x4, x14 // mx
> + mov x23, x15 // my
> + mov w24, w6 // wx
> + mov w25, w7 // ox
> + mov w26, #-6
> + sub w26, w26, w5 // -shift
> + mov w27, w13 // width
> + bl X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_dotprod)
> + movrel x9, qpel_filters
> + add x9, x9, x23, lsl 3
> + ld1 {v0.8b}, [x9]
> + sxtl v0.8h, v0.8b
> + mov x10, #(MAX_PB_SIZE * 2)
> + dup v28.4s, w24
> + dup v29.4s, w25
> + dup v30.4s, w26
> +.endm
> +
> +.macro QPEL_UNI_W_HV_END
> + mov sp, x28
> + ldp x28, x30, [sp], #16
> + ldp x26, x27, [sp], #16
> + ldp x24, x25, [sp], #16
> + ldp x22, x23, [sp], #16
> + ldp x20, x21, [sp], #16
> +.endm
> +
> +.macro QPEL_UNI_W_HV_4
> + sshr v26.4s, v26.4s, #6
> + mul v24.4s, v26.4s, v28.4s
> + sqrshl v24.4s, v24.4s, v30.4s
> + sqadd v24.4s, v24.4s, v29.4s
> + sqxtn v24.4h, v24.4s
> + sqxtun v24.8b, v24.8h
> + st1 {v24.s}[0], [x20], x21
> +.endm
> +
> +.macro QPEL_FILTER_H dst, src0, src1, src2, src3, src4, src5, src6, src7
> + smull \dst\().4s, \src0\().4h, v0.h[0]
> + smlal \dst\().4s, \src1\().4h, v0.h[1]
> + smlal \dst\().4s, \src2\().4h, v0.h[2]
> + smlal \dst\().4s, \src3\().4h, v0.h[3]
> + smlal \dst\().4s, \src4\().4h, v0.h[4]
> + smlal \dst\().4s, \src5\().4h, v0.h[5]
> + smlal \dst\().4s, \src6\().4h, v0.h[6]
> + smlal \dst\().4s, \src7\().4h, v0.h[7]
> +.endm
> +
> +.macro QPEL_FILTER_H2 dst, src0, src1, src2, src3, src4, src5, src6, src7
> + smull2 \dst\().4s, \src0\().8h, v0.h[0]
> + smlal2 \dst\().4s, \src1\().8h, v0.h[1]
> + smlal2 \dst\().4s, \src2\().8h, v0.h[2]
> + smlal2 \dst\().4s, \src3\().8h, v0.h[3]
> + smlal2 \dst\().4s, \src4\().8h, v0.h[4]
> + smlal2 \dst\().4s, \src5\().8h, v0.h[5]
> + smlal2 \dst\().4s, \src6\().8h, v0.h[6]
> + smlal2 \dst\().4s, \src7\().8h, v0.h[7]
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_dotprod, export=1
> + QPEL_UNI_W_HV_HEADER 4
> + ldr d16, [sp]
> + ldr d17, [sp, x10]
> + add sp, sp, x10, lsl 1
> + ldr d18, [sp]
> + ldr d19, [sp, x10]
> + add sp, sp, x10, lsl 1
> + ldr d20, [sp]
> + ldr d21, [sp, x10]
> + add sp, sp, x10, lsl 1
> + ldr d22, [sp]
> + add sp, sp, x10
> +1:
> + ldr d23, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d16, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d17, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d18, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d19, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d20, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d21, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr d22, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_UNI_W_HV_4
> + subs w22, w22, #1
> + b.hi 1b
> +
> +2:
> + QPEL_UNI_W_HV_END
> + ret
> +endfunc
> +
> +.macro QPEL_UNI_W_HV_8
> + sshr v26.4s, v26.4s, #6
> + sshr v27.4s, v27.4s, #6
> + mul v24.4s, v26.4s, v28.4s
> + mul v25.4s, v27.4s, v28.4s
> + sqrshl v24.4s, v24.4s, v30.4s
> + sqrshl v25.4s, v25.4s, v30.4s
> + sqadd v24.4s, v24.4s, v29.4s
> + sqadd v25.4s, v25.4s, v29.4s
> + sqxtn v24.4h, v24.4s
> + sqxtn2 v24.8h, v25.4s
> + sqxtun v24.8b, v24.8h
> + st1 {v24.d}[0], [x20], x21
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_dotprod, export=1
> + QPEL_UNI_W_HV_HEADER 8
> + ldr q16, [sp]
> + ldr q17, [sp, x10]
> + add sp, sp, x10, lsl 1
> + ldr q18, [sp]
> + ldr q19, [sp, x10]
> + add sp, sp, x10, lsl 1
> + ldr q20, [sp]
> + ldr q21, [sp, x10]
> + add sp, sp, x10, lsl 1
> + ldr q22, [sp]
> + add sp, sp, x10
> +1:
> + ldr q23, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H2 v27, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q16, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H2 v27, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q17, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H2 v27, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q18, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H2 v27, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q19, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H2 v27, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q20, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H2 v27, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q21, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H2 v27, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldr q22, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v26, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H2 v27, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_UNI_W_HV_8
> + subs w22, w22, #1
> + b.hi 1b
> +
> +2:
> + QPEL_UNI_W_HV_END
> + ret
> +endfunc
> +
> +.macro QPEL_UNI_W_HV_16
> + sshr v24.4s, v24.4s, #6
> + sshr v25.4s, v25.4s, #6
> + sshr v26.4s, v26.4s, #6
> + sshr v27.4s, v27.4s, #6
> + mul v24.4s, v24.4s, v28.4s
> + mul v25.4s, v25.4s, v28.4s
> + mul v26.4s, v26.4s, v28.4s
> + mul v27.4s, v27.4s, v28.4s
> + sqrshl v24.4s, v24.4s, v30.4s
> + sqrshl v25.4s, v25.4s, v30.4s
> + sqrshl v26.4s, v26.4s, v30.4s
> + sqrshl v27.4s, v27.4s, v30.4s
> + sqadd v24.4s, v24.4s, v29.4s
> + sqadd v25.4s, v25.4s, v29.4s
> + sqadd v26.4s, v26.4s, v29.4s
> + sqadd v27.4s, v27.4s, v29.4s
> + sqxtn v24.4h, v24.4s
> + sqxtn2 v24.8h, v25.4s
> + sqxtn v26.4h, v26.4s
> + sqxtn2 v26.8h, v27.4s
> + sqxtun v24.8b, v24.8h
> + sqxtun2 v24.16b, v26.8h
> +
> + st1 {v24.16b}, [x20], x21
> +.endm
> +
> +// Vertical pass + weighting of the 16-wide uni_w HV qpel filter, 8-bit.
> +// QPEL_UNI_W_HV_HEADER/END and QPEL_FILTER_H/H2 are defined earlier in
> +// the file (outside this hunk).  Each ldp reads one row of 16 int16
> +// intermediates from a temp buffer addressed *through sp*, advancing sp
> +// by the row stride in x10 after every load; an 8-row sliding window is
> +// kept in v16..v23 (columns 0-7) and v1..v7,v31 (columns 8-15).
> +// The loop is unrolled 8x so the window rotates through the registers
> +// without any vector moves.  w22 counts remaining output rows.
> +// NOTE(review): using sp as a moving read pointer relies on x10 keeping
> +// sp 16-byte aligned (AArch64 SP-alignment checking faults otherwise)
> +// and on QPEL_UNI_W_HV_END restoring sp -- confirm against the macros.
> +function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_dotprod, export=1
> + QPEL_UNI_W_HV_HEADER 16
> + // prime the window with the first 7 rows of intermediates
> + ldp q16, q1, [sp]
> + add sp, sp, x10
> + ldp q17, q2, [sp]
> + add sp, sp, x10
> + ldp q18, q3, [sp]
> + add sp, sp, x10
> + ldp q19, q4, [sp]
> + add sp, sp, x10
> + ldp q20, q5, [sp]
> + add sp, sp, x10
> + ldp q21, q6, [sp]
> + add sp, sp, x10
> + ldp q22, q7, [sp]
> + add sp, sp, x10
> +// row loop, unrolled 8x: each step loads one new row into the slot of
> +// the oldest, filters vertically (FILTER_H = cols 0-7, FILTER_H2 = the
> +// other 4 lanes), then weights/stores via QPEL_UNI_W_HV_16
> +1:
> + ldp q23, q31, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
> + QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q16, q1, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
> + QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q17, q2, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
> + QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q18, q3, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
> + QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q19, q4, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
> + QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q20, q5, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
> + QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q21, q6, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
> + QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q22, q7, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
> + QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.hi 1b
> +
> +2:
> + QPEL_UNI_W_HV_END
> + ret
> +endfunc
> +
> +
> +// 32-wide variant: the same 8x-unrolled row loop as the hv16 version,
> +// wrapped in an outer loop (label 3) that walks the image in 16-column
> +// strips.  x11/x13/w12 save the strip-start source pointer (sp), dst
> +// pointer and row count; after each strip w27 (presumably remaining
> +// width -- set by the HEADER macro outside this hunk) is reduced by 16,
> +// the source advances 32 bytes (16 x int16 intermediates) and dst 16
> +// bytes.  See the hv16 function for per-row details.
> +function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_dotprod, export=1
> + QPEL_UNI_W_HV_HEADER 32
> + mov x11, sp
> + mov w12, w22
> + mov x13, x20
> +3:
> + // prime the 8-row sliding window for this strip
> + ldp q16, q1, [sp]
> + add sp, sp, x10
> + ldp q17, q2, [sp]
> + add sp, sp, x10
> + ldp q18, q3, [sp]
> + add sp, sp, x10
> + ldp q19, q4, [sp]
> + add sp, sp, x10
> + ldp q20, q5, [sp]
> + add sp, sp, x10
> + ldp q21, q6, [sp]
> + add sp, sp, x10
> + ldp q22, q7, [sp]
> + add sp, sp, x10
> +1:
> + ldp q23, q31, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
> + QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q16, q1, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
> + QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q17, q2, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
> + QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q18, q3, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
> + QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q19, q4, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
> + QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q20, q5, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
> + QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q21, q6, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
> + QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q22, q7, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
> + QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.hi 1b
> +2:
> + // advance to the next 16-column strip and reset the row state
> + subs w27, w27, #16
> + add sp, x11, #32
> + add x20, x13, #16
> + mov w22, w12
> + mov x11, sp
> + mov x13, x20
> + b.hi 3b
> + QPEL_UNI_W_HV_END
> + ret
> +endfunc
> +
> +// 64-wide variant: walks the image in 16-column strips exactly like the
> +// 32-wide function; only the HEADER width argument differs.
> +// NOTE(review): the body is a line-for-line duplicate of the hv32
> +// function -- the shared strip loop could be factored into a macro
> +// parameterized on width to avoid the duplication.
> +function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_dotprod, export=1
> + QPEL_UNI_W_HV_HEADER 64
> + mov x11, sp
> + mov w12, w22
> + mov x13, x20
> +3:
> + // prime the 8-row sliding window for this strip
> + ldp q16, q1, [sp]
> + add sp, sp, x10
> + ldp q17, q2, [sp]
> + add sp, sp, x10
> + ldp q18, q3, [sp]
> + add sp, sp, x10
> + ldp q19, q4, [sp]
> + add sp, sp, x10
> + ldp q20, q5, [sp]
> + add sp, sp, x10
> + ldp q21, q6, [sp]
> + add sp, sp, x10
> + ldp q22, q7, [sp]
> + add sp, sp, x10
> +1:
> + ldp q23, q31, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
> + QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
> + QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q16, q1, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
> + QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
> + QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q17, q2, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
> + QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
> + QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q18, q3, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
> + QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
> + QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q19, q4, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
> + QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
> + QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q20, q5, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
> + QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
> + QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q21, q6, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
> + QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
> + QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.eq 2f
> +
> + ldp q22, q7, [sp]
> + add sp, sp, x10
> + QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
> + QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
> + QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
> + QPEL_UNI_W_HV_16
> + subs w22, w22, #1
> + b.hi 1b
> +2:
> + // advance to the next 16-column strip and reset the row state
> + subs w27, w27, #16
> + add sp, x11, #32
> + add x20, x13, #16
> + mov w22, w12
> + mov x11, sp
> + mov x13, x20
> + b.hi 3b
> + QPEL_UNI_W_HV_END
> + ret
> +endfunc
> +
> +#endif // __ARM_FEATURE_DOTPROD
> \ No newline at end of file
> --
> 2.38.0.windows.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
--
Jean-Baptiste Kempf - President
+33 672 704 734
More information about the ffmpeg-devel
mailing list