[FFmpeg-devel] [PATCH 1/3] avcodec: [loongarch] Optimize hpeldsp with LASX.
殷时友
yinshiyou-hf at loongson.cn
Wed Dec 29 11:38:37 EET 2021
> 2021年12月24日 下午5:49,Hao Chen <chenhao at loongson.cn> 写道:
>
> From: Shiyou Yin <yinshiyou-hf at loongson.cn>
>
> ./ffmpeg -i 8_mpeg4_1080p_24fps_12Mbps.avi -f rawvideo -y /dev/null -an
> before:376fps
> after :433fps
>
> Change-Id: Ic8018562093154887323b508b81d0f489c0d265d
> Signed-off-by: Hao Chen <chenhao at loongson.cn>
> ---
> libavcodec/hpeldsp.c | 2 +
> libavcodec/hpeldsp.h | 1 +
> libavcodec/loongarch/Makefile | 2 +
> libavcodec/loongarch/hpeldsp_init_loongarch.c | 50 +
> libavcodec/loongarch/hpeldsp_lasx.c | 1289 +++++++++++++++++
> libavcodec/loongarch/hpeldsp_lasx.h | 58 +
> 6 files changed, 1402 insertions(+)
> create mode 100644 libavcodec/loongarch/hpeldsp_init_loongarch.c
> create mode 100644 libavcodec/loongarch/hpeldsp_lasx.c
> create mode 100644 libavcodec/loongarch/hpeldsp_lasx.h
>
>
> diff --git a/libavcodec/loongarch/hpeldsp_lasx.c b/libavcodec/loongarch/hpeldsp_lasx.c
> new file mode 100644
> index 0000000000..07f0c4b517
> --- /dev/null
> +++ b/libavcodec/loongarch/hpeldsp_lasx.c
> @@ -0,0 +1,1289 @@
>
> +void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h)
> +{
> + if (h == 16) {
> + common_vt_bil_no_rnd_16x16_lasx(pixels, line_size, block, line_size);
> + } else if (h == 8) {
> + common_vt_bil_no_rnd_8x16_lasx(pixels, line_size, block, line_size);
> + }
> +}
> +
> +static void common_hv_bil_no_rnd_16x16_lasx(const uint8_t *src,
> + int32_t src_stride,
> + uint8_t *dst, int32_t dst_stride)
> +{
> + __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
> + __m256i src10, src11, src12, src13, src14, src15, src16, src17;
> + __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
> + int32_t src_stride_2x = src_stride << 1;
> + int32_t src_stride_4x = src_stride << 2;
> + int32_t src_stride_3x = src_stride_2x + src_stride;
> + uint8_t* _src = (uint8_t*)src;
> +
> + src0 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2);
> + src3 = __lasx_xvldx(_src, src_stride_3x);
> + _src += src_stride_4x;
> + src4 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6);
> + src7 = __lasx_xvldx(_src, src_stride_3x);
> + _src += (1 - src_stride_4x);
> + src9 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
> + src10, src11);
> + src12 = __lasx_xvldx(_src, src_stride_3x);
> + _src += src_stride_4x;
> + src13 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
> + src14, src15);
> + src16 = __lasx_xvldx(_src, src_stride_3x);
> + _src += (src_stride_4x - 1);
> + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17);
> +
> + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2,
> + src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3);
> + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10,
> + src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7);
> + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02,
> + src8, src9);
> + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3,
> + sum0, sum2, sum4, sum6);
> + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3,
> + sum1, sum3, sum5, sum7);
> + src8 = __lasx_xvilvl_h(src9, src4);
> + src9 = __lasx_xvilvh_h(src9, src4);
> +
> + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2,
> + sum3, sum3, src0, src1, src2, src3);
> + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6,
> + sum7, sum7, src4, src5, src6, src7);
> + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9);
> +
> + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5,
> + sum0, sum1, sum2, sum3);
> + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9,
> + sum4, sum5, sum6, sum7);
> + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1,
> + sum0, sum1, sum2, sum3);
> + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1,
> + sum4, sum5, sum6, sum7);
> + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2,
> + sum7, sum6, 2, sum0, sum1, sum2, sum3);
> + __lasx_xvstelm_d(sum0, dst, 0, 0);
> + __lasx_xvstelm_d(sum0, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum1, dst, 0, 0);
> + __lasx_xvstelm_d(sum1, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum2, dst, 0, 0);
> + __lasx_xvstelm_d(sum2, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum3, dst, 0, 0);
> + __lasx_xvstelm_d(sum3, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum0, dst, 0, 2);
> + __lasx_xvstelm_d(sum0, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum1, dst, 0, 2);
> + __lasx_xvstelm_d(sum1, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum2, dst, 0, 2);
> + __lasx_xvstelm_d(sum2, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum3, dst, 0, 2);
> + __lasx_xvstelm_d(sum3, dst, 8, 3);
> + dst += dst_stride;
> +
> + src0 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2);
> + src3 = __lasx_xvldx(_src, src_stride_3x);
> + _src += src_stride_4x;
> + src4 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6);
> + src7 = __lasx_xvldx(_src, src_stride_3x);
> + _src += (1 - src_stride_4x);
> + src9 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
> + src10, src11);
> + src12 = __lasx_xvldx(_src, src_stride_3x);
> + _src += src_stride_4x;
> + src13 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
> + src14, src15);
> + src16 = __lasx_xvldx(_src, src_stride_3x);
> + _src += (src_stride_4x - 1);
> + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17);
> +
> + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, src6, 0x02,
> + src3, src7, 0x02, src0, src1, src2, src3);
> + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, src14, 0x02,
> + src11, src15, 0x02, src4, src5, src6, src7);
> + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, src8, src9);
> +
> + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3,
> + sum0, sum2, sum4, sum6);
> + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3,
> + sum1, sum3, sum5, sum7);
> + src8 = __lasx_xvilvl_h(src9, src4);
> + src9 = __lasx_xvilvh_h(src9, src4);
> +
> + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2,
> + sum3, sum3, src0, src1, src2, src3);
> + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6,
> + sum7, sum7, src4, src5, src6, src7);
> + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9);
> +
> + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5,
> + sum0, sum1, sum2, sum3);
> + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9,
> + sum4, sum5, sum6, sum7);
> + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1,
> + sum0, sum1, sum2, sum3);
> + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1,
> + sum4, sum5, sum6, sum7);
> + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2,
> + sum7, sum6, 2, sum0, sum1, sum2, sum3);
> + __lasx_xvstelm_d(sum0, dst, 0, 0);
> + __lasx_xvstelm_d(sum0, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum1, dst, 0, 0);
> + __lasx_xvstelm_d(sum1, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum2, dst, 0, 0);
> + __lasx_xvstelm_d(sum2, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum3, dst, 0, 0);
> + __lasx_xvstelm_d(sum3, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum0, dst, 0, 2);
> + __lasx_xvstelm_d(sum0, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum1, dst, 0, 2);
> + __lasx_xvstelm_d(sum1, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum2, dst, 0, 2);
> + __lasx_xvstelm_d(sum2, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum3, dst, 0, 2);
> + __lasx_xvstelm_d(sum3, dst, 8, 3);
> + dst += dst_stride;
The final `dst += dst_stride;` is not needed: dst is not used again after the last store, so the increment is dead code.
> +}
> +
> +static void common_hv_bil_no_rnd_8x16_lasx(const uint8_t *src,
> + int32_t src_stride,
> + uint8_t *dst, int32_t dst_stride)
> +{
> + __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
> + __m256i src10, src11, src12, src13, src14, src15, src16, src17;
> + __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
> + int32_t src_stride_2x = src_stride << 1;
> + int32_t src_stride_4x = src_stride << 2;
> + int32_t src_stride_3x = src_stride_2x + src_stride;
> + uint8_t* _src = (uint8_t*)src;
> +
> + src0 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2);
> + src3 = __lasx_xvldx(_src, src_stride_3x);
> + _src += src_stride_4x;
> + src4 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6);
> + src7 = __lasx_xvldx(_src, src_stride_3x);
> + _src += (1 - src_stride_4x);
> + src9 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
> + src10, src11);
> + src12 = __lasx_xvldx(_src, src_stride_3x);
> + _src += src_stride_4x;
> + src13 = __lasx_xvld(_src, 0);
> + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
> + src14, src15);
> + src16 = __lasx_xvldx(_src, src_stride_3x);
> + _src += (src_stride_4x - 1);
> + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17);
> +
> + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2,
> + src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3);
> + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10,
> + src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7);
> + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, src8, src9);
> +
> + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3,
> + sum0, sum2, sum4, sum6);
> + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3,
> + sum1, sum3, sum5, sum7);
> + src8 = __lasx_xvilvl_h(src9, src4);
> + src9 = __lasx_xvilvh_h(src9, src4);
> +
> + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2,
> + sum3, sum3, src0, src1, src2, src3);
> + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6,
> + sum7, sum7, src4, src5, src6, src7);
> + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9);
> +
> + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5,
> + sum0, sum1, sum2, sum3);
> + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9,
> + sum4, sum5, sum6, sum7);
> + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1,
> + sum0, sum1, sum2, sum3);
> + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1,
> + sum4, sum5, sum6, sum7);
> + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2,
> + sum7, sum6, 2, sum0, sum1, sum2, sum3);
> + __lasx_xvstelm_d(sum0, dst, 0, 0);
> + __lasx_xvstelm_d(sum0, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum1, dst, 0, 0);
> + __lasx_xvstelm_d(sum1, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum2, dst, 0, 0);
> + __lasx_xvstelm_d(sum2, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum3, dst, 0, 0);
> + __lasx_xvstelm_d(sum3, dst, 8, 1);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum0, dst, 0, 2);
> + __lasx_xvstelm_d(sum0, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum1, dst, 0, 2);
> + __lasx_xvstelm_d(sum1, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum2, dst, 0, 2);
> + __lasx_xvstelm_d(sum2, dst, 8, 3);
> + dst += dst_stride;
> + __lasx_xvstelm_d(sum3, dst, 0, 2);
> + __lasx_xvstelm_d(sum3, dst, 8, 3);
> + dst += dst_stride;
This trailing `dst += dst_stride;` is not needed either, for the same reason: dst is not referenced after the final store.
More information about the ffmpeg-devel
mailing list