[FFmpeg-devel] [PATCH] libavcodec/mips: Optimize avc idct 4x4 for msa
Manojkumar Bhosale
Manojkumar.Bhosale at imgtec.com
Mon Jul 24 20:20:26 EEST 2017
LGTM
________________________________________
From: ffmpeg-devel [ffmpeg-devel-bounces at ffmpeg.org] on behalf of kaustubh.raste at imgtec.com [kaustubh.raste at imgtec.com]
Sent: Monday, July 24, 2017 6:11 PM
To: ffmpeg-devel at ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] libavcodec/mips: Optimize avc idct 4x4 for msa
From: Kaustubh Raste <kaustubh.raste at imgtec.com>
Removed memset call and improved performance.
Signed-off-by: Kaustubh Raste <kaustubh.raste at imgtec.com>
---
libavcodec/mips/h264idct_msa.c | 104 +++++++++++++++++++----------------
libavutil/mips/generic_macros_msa.h | 18 ++++++
2 files changed, 74 insertions(+), 48 deletions(-)
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index fac1e7a..81e09e9 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
*
* This file is part of FFmpeg.
*
@@ -36,48 +36,6 @@
BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3); \
}
-static void avc_idct4x4_addblk_msa(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- v8i16 src0, src1, src2, src3;
- v8i16 hres0, hres1, hres2, hres3;
- v8i16 vres0, vres1, vres2, vres3;
- v8i16 zeros = { 0 };
-
- LD4x4_SH(src, src0, src1, src2, src3);
- AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
- TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
- AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
- SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
- ADDBLK_ST4x4_UB(vres0, vres1, vres2, vres3, dst, dst_stride);
- ST_SH2(zeros, zeros, src, 8);
-}
-
-static void avc_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- int16_t dc;
- uint32_t src0, src1, src2, src3;
- v16u8 pred = { 0 };
- v16i8 out;
- v8i16 input_dc, pred_r, pred_l;
-
- dc = (src[0] + 32) >> 6;
- input_dc = __msa_fill_h(dc);
- src[0] = 0;
-
- LW4(dst, dst_stride, src0, src1, src2, src3);
- INSERT_W4_UB(src0, src1, src2, src3, pred);
- UNPCK_UB_SH(pred, pred_r, pred_l);
-
- pred_r += input_dc;
- pred_l += input_dc;
-
- CLIP_SH2_0_255(pred_r, pred_l);
- out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
int32_t de_q_val)
{
@@ -317,11 +275,45 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
ST8x4_UB(dst2, dst3, dst, dst_stride);
}
-void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
- avc_idct4x4_addblk_msa(dst, src, dst_stride);
- memset(src, 0, 16 * sizeof(dctcoef));
+ uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
+ v16i8 dst0_m = { 0 };
+ v16i8 dst1_m = { 0 };
+ v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
+ v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
+ const v8i16 src0 = LD_SH(src); /* vector loaded from the start of src */
+ const v8i16 src2 = LD_SH(src + 8); /* vector loaded 8 coefficients further in */
+ const v8i16 zero = { 0 };
+ uint8_t *const dst1 = dst + dst_stride; /* row pointers are both read (LW) and written (SW) */
+ uint8_t *const dst2 = dst + 2 * dst_stride;
+ uint8_t *const dst3 = dst + 3 * dst_stride;
+ /* Transform the block in src, add it to four rows of dst, and zero src. */
+ ILVL_D2_SH(src0, src0, src2, src2, src1, src3); /* src1/src3 = left double words of src0/src2 */
+ ST_SH2(zero, zero, src, 8); /* clear the coefficients; takes the place of the removed memset() */
+ AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3); /* first transform pass */
+ TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+ AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3); /* second pass on the transposed data */
+ src0_m = LW(dst); /* dst row loads are interleaved with the arithmetic below */
+ src1_m = LW(dst1);
+ SRARI_H4_SH(vres0, vres1, vres2, vres3, 6); /* shift right by 6 (presumably with rounding) */
+ src2_m = LW(dst2);
+ src3_m = LW(dst3);
+ ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m); /* presumably pairs results 0/1 and 2/3 into one vector each */
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); /* dst words 0/1 into dst0_m */
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); /* dst words 2/3 into dst1_m */
+ ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m); /* presumably widens the dst bytes to 16 bit */
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); /* dst pixels + transformed values */
+ CLIP_SH2_0_255(res0_m, res1_m); /* keep sums within 0..255, going by the macro name */
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); /* presumably packs back to bytes */
+ out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); /* word elements 0 and 1 of each packed vector */
+ out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
+ out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
+ out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
+ SW(out0_m, dst); /* write the four result words back, one per dst row */
+ SW(out1_m, dst1);
+ SW(out2_m, dst2);
+ SW(out3_m, dst3);
}
void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
@@ -334,7 +326,23 @@ void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
int32_t dst_stride)
{
- avc_idct4x4_addblk_dc_msa(dst, src, dst_stride);
+ v16u8 pred = { 0 };
+ v16i8 out;
+ v8i16 pred_r, pred_l;
+ const uint32_t src0 = LW(dst); /* one 32-bit word from each of four dst rows */
+ const uint32_t src1 = LW(dst + dst_stride);
+ const uint32_t src2 = LW(dst + 2 * dst_stride);
+ const uint32_t src3 = LW(dst + 3 * dst_stride);
+ const int16_t dc = (src[0] + 32) >> 6; /* DC term: add 32, then shift right by 6 */
+ const v8i16 input_dc = __msa_fill_h(dc); /* presumably dc replicated into every halfword */
+ /* DC-only case: add one dc value to the 4x4 block at dst and clear src[0]. */
+ src[0] = 0; /* dc was already read above, so the coefficient can be cleared */
+ INSERT_W4_UB(src0, src1, src2, src3, pred); /* gather the four dst words into one vector */
+ UNPCK_UB_SH(pred, pred_r, pred_l); /* presumably widens the bytes into two 16-bit halves */
+ ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l); /* add dc to both halves */
+ CLIP_SH2_0_255(pred_r, pred_l); /* keep sums within 0..255, going by the macro name */
+ out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r); /* pack the even bytes of both halves into one vector */
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* store words 0..3 of out to dst, dst_stride apart (presumably) */
}
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index 61a8ee0..407d46e 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -1531,6 +1531,24 @@
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+/* Description : Interleave left half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of double word elements of in0 and left half of
+ double word elements of in1 are interleaved and copied to out0,
+ i.e. out0 = (RTYPE) __msa_ilvl_d(in0, in1) with inputs cast to
+ v2i64. in2 and in3 are interleaved into out1 the same way.
+*/
+#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
+}
+#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
+#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
+#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
+
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
--
1.7.9.5
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel at ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
More information about the ffmpeg-devel mailing list