[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc weighted mc msa functions
kaustubh.raste at imgtec.com
kaustubh.raste at imgtec.com
Tue Sep 26 08:21:12 EEST 2017
From: Kaustubh Raste <kaustubh.raste at imgtec.com>
Replace the generic, height-looping weighted-MC functions with block-size-specific functions.
Signed-off-by: Kaustubh Raste <kaustubh.raste at imgtec.com>
---
libavcodec/mips/h264dsp_msa.c | 423 ++++++++++++++++++++++-------------
libavutil/mips/generic_macros_msa.h | 36 +++
2 files changed, 306 insertions(+), 153 deletions(-)
diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 422703d..5b06bd9 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -25,187 +25,201 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
int32_t log2_denom, int32_t src_weight,
int32_t offset_in)
{
- uint32_t data0, data1;
+ uint32_t tp0, tp1, offset_val;
v16u8 zero = { 0 };
- v16u8 src0, src1;
- v4i32 res0, res1;
- v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;
- v8u16 out0, out1;
+ v16u8 src0 = { 0 };
+ v8i16 src0_r, tmp0, wgt, denom, offset;
- offset_in <<= (log2_denom);
-
- if (log2_denom) {
- offset_in += (1 << (log2_denom - 1));
- }
+ offset_val = (unsigned) offset_in << log2_denom;
wgt = __msa_fill_h(src_weight);
- offset = __msa_fill_h(offset_in);
+ offset = __msa_fill_h(offset_val);
denom = __msa_fill_h(log2_denom);
- data0 = LW(data);
- data1 = LW(data + stride);
-
- src0 = (v16u8) __msa_fill_w(data0);
- src1 = (v16u8) __msa_fill_w(data1);
+ LW2(data, stride, tp0, tp1);
+ INSERT_W2_UB(tp0, tp1, src0);
+ src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
+ tmp0 = wgt * src0_r;
+ tmp0 = __msa_adds_s_h(tmp0, offset);
+ tmp0 = __msa_maxi_s_h(tmp0, 0);
+ tmp0 = __msa_srlr_h(tmp0, denom);
+ tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
+ src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+ ST4x2_UB(src0, data, stride);
+}
- ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
- MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
- ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);
- MAXI_SH2_SH(temp0, temp1, 0);
+static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+ int32_t src_weight, int32_t offset_in)
+{
+ uint32_t tp0, tp1, tp2, tp3, offset_val;
+ v16u8 src0 = { 0 };
+ v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
- out0 = (v8u16) __msa_srl_h(temp0, denom);
- out1 = (v8u16) __msa_srl_h(temp1, denom);
+ offset_val = (unsigned) offset_in << log2_denom;
- SAT_UH2_UH(out0, out1, 7);
- PCKEV_B2_SW(out0, out0, out1, out1, res0, res1);
+ wgt = __msa_fill_h(src_weight);
+ offset = __msa_fill_h(offset_val);
+ denom = __msa_fill_h(log2_denom);
- data0 = __msa_copy_u_w(res0, 0);
- data1 = __msa_copy_u_w(res1, 0);
- SW(data0, data);
- data += stride;
- SW(data1, data);
+ LW4(data, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+ UNPCK_UB_SH(src0, src0_r, src1_r);
+ MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
+ ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
+ MAXI_SH2_SH(tmp0, tmp1, 0);
+ tmp0 = __msa_srlr_h(tmp0, denom);
+ tmp1 = __msa_srlr_h(tmp1, denom);
+ SAT_UH2_SH(tmp0, tmp1, 7);
+ src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+ ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
}
-static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t offset_in)
+static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+ int32_t src_weight, int32_t offset_in)
{
- uint8_t cnt;
- uint32_t data0, data1, data2, data3;
- v16u8 zero = { 0 };
- v16u8 src0, src1, src2, src3;
- v8u16 temp0, temp1, temp2, temp3, wgt;
- v8i16 denom, offset;
+ uint32_t tp0, tp1, tp2, tp3, offset_val;
+ v16u8 src0 = { 0 }, src1 = { 0 };
+ v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
+ v8i16 wgt, denom, offset;
- offset_in <<= (log2_denom);
+ offset_val = (unsigned) offset_in << log2_denom;
- if (log2_denom) {
- offset_in += (1 << (log2_denom - 1));
- }
-
- wgt = (v8u16) __msa_fill_h(src_weight);
- offset = __msa_fill_h(offset_in);
+ wgt = __msa_fill_h(src_weight);
+ offset = __msa_fill_h(offset_val);
denom = __msa_fill_h(log2_denom);
- for (cnt = height / 4; cnt--;) {
- LW4(data, stride, data0, data1, data2, data3);
-
- src0 = (v16u8) __msa_fill_w(data0);
- src1 = (v16u8) __msa_fill_w(data1);
- src2 = (v16u8) __msa_fill_w(data2);
- src3 = (v16u8) __msa_fill_w(data3);
-
- ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
- temp0, temp1, temp2, temp3);
- MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
- temp0, temp1, temp2, temp3);
- ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
- temp0, temp1, temp2, temp3);
- MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
- SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
- SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
- PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride);
- data += (4 * stride);
- }
+ LW4(data, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+ LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
+ UNPCK_UB_SH(src0, src0_r, src1_r);
+ UNPCK_UB_SH(src1, src2_r, src3_r);
+ MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
+ tmp3);
+ ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+ tmp1, tmp2, tmp3);
+ MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
+ SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
+ SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ ST4x8_UB(src0, src1, data, stride);
}
-static void avc_wgt_4width_msa(uint8_t *data, int32_t stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t offset_in)
+static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+ int32_t src_weight, int32_t offset_in)
{
- if (2 == height) {
- avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in);
- } else {
- avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight,
- offset_in);
- }
+ uint32_t offset_val;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16u8 src0 = { 0 }, src1 = { 0 };
+ v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
+ v8i16 wgt, denom, offset;
+
+ offset_val = (unsigned) offset_in << log2_denom;
+
+ wgt = __msa_fill_h(src_weight);
+ offset = __msa_fill_h(offset_val);
+ denom = __msa_fill_h(log2_denom);
+
+ LD4(data, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src0);
+ INSERT_D2_UB(tp2, tp3, src1);
+ UNPCK_UB_SH(src0, src0_r, src1_r);
+ UNPCK_UB_SH(src1, src2_r, src3_r);
+ MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
+ tmp3);
+ ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+ tmp1, tmp2, tmp3);
+ MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
+ SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
+ SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ ST8x4_UB(src0, src1, data, stride);
}
-static void avc_wgt_8width_msa(uint8_t *data, int32_t stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t offset_in)
+static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+ int32_t src_weight, int32_t offset_in)
{
- uint8_t cnt;
- v16u8 zero = { 0 };
- v16u8 src0, src1, src2, src3;
- v8u16 src0_r, src1_r, src2_r, src3_r;
- v8u16 temp0, temp1, temp2, temp3;
- v8u16 wgt, denom, offset;
- v16i8 out0, out1;
+ uint32_t offset_val;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 wgt, denom, offset;
- offset_in <<= (log2_denom);
+ offset_val = (unsigned) offset_in << log2_denom;
- if (log2_denom) {
- offset_in += (1 << (log2_denom - 1));
- }
-
- wgt = (v8u16) __msa_fill_h(src_weight);
- offset = (v8u16) __msa_fill_h(offset_in);
- denom = (v8u16) __msa_fill_h(log2_denom);
+ wgt = __msa_fill_h(src_weight);
+ offset = __msa_fill_h(offset_val);
+ denom = __msa_fill_h(log2_denom);
- for (cnt = height / 4; cnt--;) {
- LD_UB4(data, stride, src0, src1, src2, src3);
- ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
- src0_r, src1_r, src2_r, src3_r);
- MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r,
- temp0, temp1, temp2, temp3);
- ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
- temp0, temp1, temp2, temp3);
- MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
- SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
- SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
- PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
- ST8x4_UB(out0, out1, data, stride);
- data += (4 * stride);
- }
+ LD4(data, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src0);
+ INSERT_D2_UB(tp2, tp3, src1);
+ LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src2);
+ INSERT_D2_UB(tp2, tp3, src3);
+ UNPCK_UB_SH(src0, src0_r, src1_r);
+ UNPCK_UB_SH(src1, src2_r, src3_r);
+ UNPCK_UB_SH(src2, src4_r, src5_r);
+ UNPCK_UB_SH(src3, src6_r, src7_r);
+ MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
+ tmp3);
+ MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
+ tmp7);
+ ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+ tmp1, tmp2, tmp3);
+ ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
+ tmp5, tmp6, tmp7);
+ MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+ SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+ SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+ PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
+ src2, src3);
+ ST8x8_UB(src0, src1, src2, src3, data, stride);
}
-static void avc_wgt_16width_msa(uint8_t *data, int32_t stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t offset_in)
+static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+ int32_t src_weight, int32_t offset_in)
{
- uint8_t cnt;
- v16i8 zero = { 0 };
- v16u8 src0, src1, src2, src3;
- v16u8 dst0, dst1, dst2, dst3;
- v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
- v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- v8u16 wgt, denom, offset;
-
- offset_in <<= (log2_denom);
+ uint32_t offset_val, cnt;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 wgt, denom, offset;
- if (log2_denom) {
- offset_in += (1 << (log2_denom - 1));
- }
+ offset_val = (unsigned) offset_in << log2_denom;
- wgt = (v8u16) __msa_fill_h(src_weight);
- offset = (v8u16) __msa_fill_h(offset_in);
- denom = (v8u16) __msa_fill_h(log2_denom);
+ wgt = __msa_fill_h(src_weight);
+ offset = __msa_fill_h(offset_val);
+ denom = __msa_fill_h(log2_denom);
- for (cnt = height / 4; cnt--;) {
- LD_UB4(data, stride, src0, src1, src2, src3);
- ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
- src0_r, src1_r, src2_r, src3_r);
- ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
- src0_l, src1_l, src2_l, src3_l);
- MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l,
- temp0, temp1, temp2, temp3);
- MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l,
- temp4, temp5, temp6, temp7);
- ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
- temp0, temp1, temp2, temp3);
- ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset,
- temp4, temp5, temp6, temp7);
- MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
- MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0);
- SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
- SRL_H4_UH(temp4, temp5, temp6, temp7, denom);
- SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
- SAT_UH4_UH(temp4, temp5, temp6, temp7, 7);
- PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
- dst0, dst1, dst2, dst3);
- ST_UB4(dst0, dst1, dst2, dst3, data, stride);
- data += 4 * stride;
+ for (cnt = 2; cnt--;) {
+ LD4(data, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src0);
+ INSERT_D2_UB(tp2, tp3, src1);
+ LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src2);
+ INSERT_D2_UB(tp2, tp3, src3);
+ UNPCK_UB_SH(src0, src0_r, src1_r);
+ UNPCK_UB_SH(src1, src2_r, src3_r);
+ UNPCK_UB_SH(src2, src4_r, src5_r);
+ UNPCK_UB_SH(src3, src6_r, src7_r);
+ MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
+ tmp2, tmp3);
+ MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
+ tmp6, tmp7);
+ ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
+ tmp0, tmp1, tmp2, tmp3);
+ ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
+ tmp4, tmp5, tmp6, tmp7);
+ MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+ SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+ SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+ PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
+ src2, src3);
+ ST8x8_UB(src0, src1, src2, src3, data, stride);
+ data += 8 * stride;
}
}
@@ -2291,23 +2305,126 @@ void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
int height, int log2_denom,
- int weight_src, int offset)
+ int weight_src, int offset_in)
{
- avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset);
+ uint32_t offset_val;
+ v16i8 zero = { 0 };
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
+ v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ v8i16 wgt, denom, offset;
+
+ offset_val = (unsigned) offset_in << log2_denom;
+
+ wgt = __msa_fill_h(weight_src);
+ offset = __msa_fill_h(offset_val);
+ denom = __msa_fill_h(log2_denom);
+
+ LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
+ src2_r, src3_r);
+ ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
+ src2_l, src3_l);
+ ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
+ src6_r, src7_r);
+ ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
+ src6_l, src7_l);
+ MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
+ tmp3);
+ MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
+ tmp7);
+ MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
+ tmp11);
+ MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
+ tmp14, tmp15);
+ ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+ tmp1, tmp2, tmp3);
+ ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
+ tmp5, tmp6, tmp7);
+ ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
+ tmp9, tmp10, tmp11);
+ ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
+ tmp12, tmp13, tmp14, tmp15);
+ MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+ MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
+ SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+ SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
+ SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+ SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
+ PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+ dst2, dst3);
+ PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+ dst5, dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
+ src += 8 * stride;
+
+ if (16 == height) {
+ LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
+ src1_r, src2_r, src3_r);
+ ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
+ src1_l, src2_l, src3_l);
+ ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
+ src5_r, src6_r, src7_r);
+ ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
+ src5_l, src6_l, src7_l);
+ MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
+ tmp2, tmp3);
+ MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
+ tmp6, tmp7);
+ MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
+ tmp10, tmp11);
+ MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
+ tmp14, tmp15);
+ ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
+ tmp0, tmp1, tmp2, tmp3);
+ ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
+ tmp4, tmp5, tmp6, tmp7);
+ ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
+ tmp8, tmp9, tmp10, tmp11);
+ ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
+ tmp12, tmp13, tmp14, tmp15);
+ MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+ MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
+ SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+ SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
+ SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+ SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
+ PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+ dst2, dst3);
+ PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+ dst5, dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
+ }
}
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
int height, int log2_denom,
int weight_src, int offset)
{
- avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset);
+ if (4 == height) {
+ avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
+ } else if (8 == height) {
+ avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
+ } else {
+ avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
+ }
}
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
int height, int log2_denom,
int weight_src, int offset)
{
- avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset);
+ if (2 == height) {
+ avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
+ } else if (4 == height) {
+ avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
+ } else {
+ avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
+ }
}
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index bda3ed2..7de97dd 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -1635,6 +1635,15 @@
MAXI_SH2(RTYPE, in2, in3, max_val); \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
+#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
+
+#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
+{ \
+ MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
+ MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
+}
+#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
+#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
unsigned value of (sat_val+1 bits)
@@ -1660,6 +1669,15 @@
SAT_UH2(RTYPE, in2, in3, sat_val); \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
+
+#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
+{ \
+ SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
+ SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
+}
+#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
+#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
unsigned value of (sat_val+1 bits)
@@ -2040,6 +2058,24 @@
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
+#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
+{ \
+ in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift); \
+ in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift); \
+ in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift); \
+ in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift); \
+}
+#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
+#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
+
+#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
+{ \
+ SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
+ SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
+}
+#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
+#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
+
/* Description : Shift right arithmetic rounded halfwords
Arguments : Inputs - in0, in1, shift
Outputs - in0, in1, (in place)
--
1.7.9.5
More information about the ffmpeg-devel mailing list