27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/*
 * HEVC_BIW_RND_CLIP2: weighted combination of two pairs of halfword vectors.
 *
 * Steps performed by the visible body:
 *   1. (in0, vec0) and (in1, vec1) are interleaved with ILVR_H2_SW and
 *      ILVL_H2_SW into four v4i32 temporaries (out0_r, out1_r, out0_l,
 *      out1_l) -- presumably the right-half and left-half halfword
 *      interleaves; confirm against the helper macro definitions.
 *   2. Each temporary, reinterpreted as v8i16, is passed together with wgt
 *      to __msa_dpadd_s_w, using offset as the accumulator operand.
 *   3. The four results go through SRAR_W4_SW with rnd as the shift operand.
 *   4. PCKEV_H2_SH packs the left/right results into out0 and out1.
 *   5. CLIP_SH2_0_255 is applied to out0 and out1.
 *
 * NOTE(review): wgt is expected to hold the two weights as packed 16-bit
 * lanes so that the dot product pairs them with the interleaved samples --
 * verify against the callers that build weight_vec.
 */
31 #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \
34 v4i32 out0_r, out1_r, out0_l, out1_l; \
36 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
37 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
39 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
40 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
41 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
42 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
44 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
45 PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \
46 CLIP_SH2_0_255(out0, out1); \
/*
 * HEVC_BIW_RND_CLIP4: four-vector form of HEVC_BIW_RND_CLIP2.
 *
 * Expands to two HEVC_BIW_RND_CLIP2 invocations that share wgt, rnd and
 * offset: the first maps (in0, in1, vec0, vec1) to (out0, out1), the second
 * maps (in2, in3, vec2, vec3) to (out2, out3).
 */
49 #define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \
50 wgt, rnd, offset, out0, out1, out2, out3) \
52 HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
53 HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
/*
 * HEVC_BIW_RND_CLIP2_MAX_SATU: weighted combination of two pairs of
 * halfword vectors.
 *
 * The visible body runs the same sequence as HEVC_BIW_RND_CLIP2:
 * ILVR_H2_SW / ILVL_H2_SW interleave (in0, vec0) and (in1, vec1) into four
 * v4i32 temporaries, __msa_dpadd_s_w combines each temporary with wgt using
 * offset as the accumulator operand, SRAR_W4_SW applies rnd, PCKEV_H2_SH
 * packs the results into out0/out1, and CLIP_SH2_0_255 is applied last.
 *
 * NOTE(review): the name suggests a distinct saturation variant, yet the
 * statements shown here match HEVC_BIW_RND_CLIP2 one for one -- confirm
 * whether the two macros are still meant to differ.
 */
56 #define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, \
59 v4i32 out0_r, out1_r, out0_l, out1_l; \
61 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
62 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
63 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
64 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
65 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
66 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
67 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
68 PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \
69 CLIP_SH2_0_255(out0, out1); \
/*
 * HEVC_BIW_RND_CLIP4_MAX_SATU: four-vector form of
 * HEVC_BIW_RND_CLIP2_MAX_SATU.
 *
 * Invokes HEVC_BIW_RND_CLIP2_MAX_SATU once for (in0, in1, vec0, vec1) and
 * once for (in2, in3, vec2, vec3), passing the same wgt, rnd and offset to
 * both invocations.
 */
72 #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
73 vec3, wgt, rnd, offset, out0, out1, \
76 HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, \
78 HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset, \
84 const int16_t *src1_ptr,
95 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96 uint64_t tpd0, tpd1, tpd2, tpd3;
101 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102 v8i16 dst0, dst1, dst2, dst3, weight_vec;
103 v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
105 offset = (offset0 + offset1) << rnd_val;
106 weight0 = weight0 & 0x0000FFFF;
107 weight = weight0 | (weight1 << 16);
109 offset_vec = __msa_fill_w(
offset);
110 weight_vec = (v8i16) __msa_fill_w(
weight);
111 rnd_vec = __msa_fill_w(rnd_val + 1);
114 LW2(src0_ptr, src_stride, tp0, tp1);
116 LD2(src1_ptr, src2_stride, tpd0, tpd1);
119 dst0 = (v8i16) __msa_ilvr_b(
zero,
src0);
123 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
126 dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
128 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129 ST_W2(out0, 0, 1, dst, dst_stride);
131 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
133 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
139 offset_vec, dst0, dst1);
140 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
142 }
else if (0 ==
height % 8) {
143 for (loop_cnt = (
height >> 3); loop_cnt--;) {
144 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145 src0_ptr += 4 * src_stride;
147 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148 src0_ptr += 4 * src_stride;
150 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151 src1_ptr += (4 * src2_stride);
154 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155 src1_ptr += (4 * src2_stride);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
162 in3, weight_vec, rnd_vec, offset_vec,
163 dst0, dst1, dst2, dst3);
165 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
166 dst += (8 * dst_stride);
173 const int16_t *src1_ptr,
186 uint64_t tp0, tp1, tp2, tp3;
190 v8i16 in0, in1, in2, in3;
191 v8i16 dst0, dst1, dst2, dst3;
192 v4i32 offset_vec, weight_vec, rnd_vec;
194 offset = (offset0 + offset1) << rnd_val;
195 weight0 = weight0 & 0x0000FFFF;
196 weight = weight0 | (weight1 << 16);
198 weight_vec = __msa_fill_w(
weight);
199 offset_vec = __msa_fill_w(
offset);
200 rnd_vec = __msa_fill_w(rnd_val + 1);
202 for (loop_cnt = (
height >> 2); loop_cnt--;) {
203 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
204 src0_ptr += (4 * src_stride);
207 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
208 src1_ptr += (4 * src2_stride);
211 SLLI_4V(dst0, dst1, dst2, dst3, 6);
214 weight_vec, rnd_vec, offset_vec,
215 dst0, dst1, dst2, dst3);
217 ST_W2(out0, 0, 2, dst, dst_stride);
218 ST_H2(out0, 2, 6, dst + 4, dst_stride);
219 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
220 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
221 dst += (4 * dst_stride);
227 const int16_t *src1_ptr,
238 uint64_t tp0, tp1, tp2, tp3;
240 v16u8 out0, out1, out2;
243 v8i16 in0, in1, in2, in3, in4, in5;
244 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
245 v4i32 offset_vec, weight_vec, rnd_vec;
247 offset = (offset0 + offset1) << rnd_val;
248 weight0 = weight0 & 0x0000FFFF;
249 weight = weight0 | (weight1 << 16);
251 offset_vec = __msa_fill_w(
offset);
252 weight_vec = __msa_fill_w(
weight);
253 rnd_vec = __msa_fill_w(rnd_val + 1);
256 LD2(src0_ptr, src_stride, tp0, tp1);
258 LD_SH2(src1_ptr, src2_stride, in0, in1);
263 weight_vec, rnd_vec, offset_vec,
266 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
267 ST_D2(out0, 0, 1, dst, dst_stride);
269 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
270 src0_ptr += 4 * src_stride;
273 LD2(src0_ptr, src_stride, tp0, tp1);
278 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
279 SLLI_4V(dst0, dst1, dst2, dst3, 6);
282 weight_vec, rnd_vec, offset_vec, dst0, dst1,
285 offset_vec, dst4, dst5);
286 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
287 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
288 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
289 }
else if (0 ==
height % 4) {
292 for (loop_cnt = (
height >> 2); loop_cnt--;) {
293 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
294 src0_ptr += (4 * src_stride);
299 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
300 src1_ptr += (4 * src2_stride);
302 SLLI_4V(dst0, dst1, dst2, dst3, 6);
304 in3, weight_vec, rnd_vec, offset_vec,
305 dst0, dst1, dst2, dst3);
307 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
308 dst += (4 * dst_stride);
315 const int16_t *src1_ptr,
329 v16u8 out0, out1, out2;
331 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
332 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
333 v4i32 offset_vec, weight_vec, rnd_vec;
335 offset = (offset0 + offset1) << rnd_val;
336 weight0 = weight0 & 0x0000FFFF;
337 weight = weight0 | (weight1 << 16);
339 offset_vec = __msa_fill_w(
offset);
340 weight_vec = __msa_fill_w(
weight);
341 rnd_vec = __msa_fill_w(rnd_val + 1);
343 for (loop_cnt = (16 >> 2); loop_cnt--;) {
345 src0_ptr += (4 * src_stride);
346 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
347 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
348 src1_ptr += (4 * src2_stride);
352 dst0, dst1, dst2, dst3);
354 SLLI_4V(dst0, dst1, dst2, dst3, 6);
361 weight_vec, rnd_vec, offset_vec, dst0, dst1,
364 offset_vec, dst4, dst5);
365 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
366 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
367 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
368 dst += (4 * dst_stride);
374 const int16_t *src1_ptr,
387 v16u8 out0, out1, out2, out3;
390 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
391 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
392 v4i32 offset_vec, weight_vec, rnd_vec;
394 offset = (offset0 + offset1) << rnd_val;
395 weight0 = weight0 & 0x0000FFFF;
396 weight = weight0 | (weight1 << 16);
398 offset_vec = __msa_fill_w(
offset);
399 weight_vec = __msa_fill_w(
weight);
400 rnd_vec = __msa_fill_w(rnd_val + 1);
402 for (loop_cnt = (
height >> 2); loop_cnt--;) {
404 src0_ptr += (4 * src_stride);
405 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
406 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
407 src1_ptr += (4 * src2_stride);
408 ILVR_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp0, tmp1,
410 ILVL_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp4, tmp5,
412 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
413 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
415 weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
418 weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
422 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
423 dst += (4 * dst_stride);
429 const int16_t *src1_ptr,
442 v16u8 out0, out1, out2, out3, out4, out5;
444 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
445 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
446 v4i32 offset_vec, weight_vec, rnd_vec;
448 offset = (offset0 + offset1) << rnd_val;
449 weight0 = weight0 & 0x0000FFFF;
450 weight = weight0 | (weight1 << 16);
452 offset_vec = __msa_fill_w(
offset);
453 weight_vec = __msa_fill_w(
weight);
454 rnd_vec = __msa_fill_w(rnd_val + 1);
456 for (loop_cnt = 8; loop_cnt--;) {
458 LD_SB4(src0_ptr + 16, src_stride,
src2, src3, src6, src7);
459 src0_ptr += (4 * src_stride);
460 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
461 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
462 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
463 src1_ptr += (4 * src2_stride);
471 SLLI_4V(dst0, dst1, dst2, dst3, 6);
472 SLLI_4V(dst4, dst5, dst6, dst7, 6);
473 SLLI_4V(dst8, dst9, dst10, dst11, 6);
475 weight_vec, rnd_vec, offset_vec, dst0, dst1,
478 weight_vec, rnd_vec, offset_vec, dst4, dst5,
481 in11, weight_vec, rnd_vec, offset_vec,
482 dst8, dst9, dst10, dst11);
483 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
484 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
485 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
486 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
487 dst += (4 * dst_stride);
493 const int16_t *src1_ptr,
506 v16u8 out0, out1, out2, out3;
509 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
510 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
511 v4i32 offset_vec, weight_vec, rnd_vec;
513 offset = (offset0 + offset1) << rnd_val;
514 weight0 = weight0 & 0x0000FFFF;
515 weight = weight0 | (weight1 << 16);
517 offset_vec = __msa_fill_w(
offset);
518 weight_vec = __msa_fill_w(
weight);
519 rnd_vec = __msa_fill_w(rnd_val + 1);
521 for (loop_cnt = (
height >> 1); loop_cnt--;) {
523 src0_ptr += src_stride;
525 src0_ptr += src_stride;
526 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
527 src1_ptr += src2_stride;
528 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
529 src1_ptr += src2_stride;
535 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
536 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
538 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
541 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
545 ST_UB2(out0, out1, dst, 16);
547 ST_UB2(out2, out3, dst, 16);
554 const int16_t *src1_ptr,
567 v16u8 out0, out1, out2;
570 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
571 v4i32 offset_vec, weight_vec, rnd_vec;
573 offset = (offset0 + offset1) << rnd_val;
574 weight0 = weight0 & 0x0000FFFF;
575 weight = weight0 | (weight1 << 16);
577 offset_vec = __msa_fill_w(
offset);
578 weight_vec = __msa_fill_w(
weight);
579 rnd_vec = __msa_fill_w(rnd_val + 1);
581 for (loop_cnt = 64; loop_cnt--;) {
583 src0_ptr += src_stride;
584 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
585 src1_ptr += src2_stride;
590 SLLI_4V(dst0, dst1, dst2, dst3, 6);
593 weight_vec, rnd_vec, offset_vec, dst0, dst1,
596 offset_vec, dst4, dst5);
597 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
598 ST_UB2(out0, out1, dst, 16);
599 ST_UB(out2, dst + 32);
606 const int16_t *src1_ptr,
619 v16u8 out0, out1, out2, out3;
622 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
623 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
624 v4i32 offset_vec, weight_vec, rnd_vec;
626 offset = (offset0 + offset1) << rnd_val;
627 weight0 = weight0 & 0x0000FFFF;
628 weight = weight0 | (weight1 << 16);
630 offset_vec = __msa_fill_w(
offset);
631 weight_vec = __msa_fill_w(
weight);
632 rnd_vec = __msa_fill_w(rnd_val + 1);
634 for (loop_cnt =
height; loop_cnt--;) {
636 src0_ptr += src_stride;
637 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
638 src1_ptr += src2_stride;
640 ILVR_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp0, tmp1,
642 ILVL_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp4, tmp5,
644 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
645 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
647 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
650 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
654 ST_UB4(out0, out1, out2, out3, dst, 16);
661 const int16_t *src1_ptr,
675 v8i16 filt0, filt1, filt2, filt3;
677 v16i8 mask1, mask2, mask3;
678 v16i8 vec0, vec1, vec2, vec3;
680 v8i16 in0, in1, in2, in3;
681 v8i16 filter_vec, out0, out1;
682 v4i32 weight_vec, offset_vec, rnd_vec;
687 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
693 offset = (offset0 + offset1) << rnd_val;
694 weight0 = weight0 & 0x0000FFFF;
695 weight = weight0 | (weight1 << 16);
696 constant = 128 * weight1;
700 offset_vec = __msa_fill_w(
offset);
701 weight_vec = __msa_fill_w(
weight);
702 rnd_vec = __msa_fill_w(rnd_val + 1);
704 for (loop_cnt = (
height >> 2); loop_cnt--;) {
706 src0_ptr += (4 * src_stride);
707 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
708 src1_ptr += (4 * src2_stride);
713 vec0, vec1, vec2, vec3);
717 vec0, vec1, vec2, vec3);
722 weight_vec, rnd_vec, offset_vec,
725 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
726 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
727 dst += (4 * dst_stride);
733 const int16_t *src1_ptr,
747 v8i16 filt0, filt1, filt2, filt3;
749 v16i8 mask1, mask2, mask3;
750 v16i8 vec0, vec1, vec2, vec3;
751 v8i16 dst0, dst1, dst2, dst3;
752 v8i16 in0, in1, in2, in3;
753 v8i16 filter_vec, out0, out1, out2, out3;
754 v4i32 weight_vec, offset_vec, rnd_vec;
758 offset = (offset0 + offset1) << rnd_val;
759 weight0 = weight0 & 0x0000FFFF;
760 weight = weight0 | (weight1 << 16);
761 constant = 128 * weight1;
765 offset_vec = __msa_fill_w(
offset);
766 weight_vec = __msa_fill_w(
weight);
767 rnd_vec = __msa_fill_w(rnd_val + 1);
770 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
776 for (loop_cnt = (
height >> 2); loop_cnt--;) {
778 src0_ptr += (4 * src_stride);
779 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
780 src1_ptr += (4 * src2_stride);
784 vec0, vec1, vec2, vec3);
788 vec0, vec1, vec2, vec3);
792 vec0, vec1, vec2, vec3);
795 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
796 vec0, vec1, vec2, vec3);
802 weight_vec, rnd_vec, offset_vec,
803 out0, out1, out2, out3);
806 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
807 dst += (4 * dst_stride);
813 const int16_t *src1_ptr,
828 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
829 v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
830 v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
831 v4i32 weight_vec, offset_vec, rnd_vec;
835 weight0 = weight0 & 0x0000FFFF;
836 weight = weight0 | (weight1 << 16);
837 constant = 128 * weight1;
839 offset = (offset0 + offset1) << rnd_val;
842 offset_vec = __msa_fill_w(
offset);
843 weight_vec = __msa_fill_w(
weight);
844 rnd_vec = __msa_fill_w(rnd_val + 1);
847 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
858 for (loop_cnt = 4; loop_cnt--;) {
860 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
874 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
879 weight_vec, rnd_vec, offset_vec, out0, out1, out2,
882 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
885 src0_ptr += (4 * src_stride);
886 LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
887 src1_ptr += (4 * src2_stride);
894 VSHF_B4_SB(
src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
899 offset_vec, out0, out1);
900 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
901 ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
902 dst += (4 * dst_stride);
908 const int16_t *src1_ptr,
923 v8i16 in0, in1, in2, in3;
924 v8i16 filt0, filt1, filt2, filt3;
925 v16i8 mask1, mask2, mask3;
926 v8i16 filter_vec, out0, out1, out2, out3;
927 v16i8 vec0, vec1, vec2, vec3;
928 v8i16 dst0, dst1, dst2, dst3;
929 v4i32 weight_vec, offset_vec, rnd_vec;
930 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
933 offset = (offset0 + offset1) << rnd_val;
934 weight0 = weight0 & 0x0000FFFF;
935 weight = weight0 | (weight1 << 16);
936 constant = 128 * weight1;
940 offset_vec = __msa_fill_w(
offset);
941 weight_vec = __msa_fill_w(
weight);
942 rnd_vec = __msa_fill_w(rnd_val + 1);
945 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
951 for (loop_cnt = (
height >> 1); loop_cnt--;) {
953 src0_ptr += src_stride;
955 src0_ptr += src_stride;
956 LD_SH2(src1_ptr, 8, in0, in1);
957 src1_ptr += src2_stride;
958 LD_SH2(src1_ptr, 8, in2, in3);
959 src1_ptr += src2_stride;
963 vec0, vec1, vec2, vec3);
967 vec0, vec1, vec2, vec3);
971 vec0, vec1, vec2, vec3);
974 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
975 vec0, vec1, vec2, vec3);
981 weight_vec, rnd_vec, offset_vec,
982 out0, out1, out2, out3);
985 ST_SH2(out0, out1, dst, dst_stride);
986 dst += (2 * dst_stride);
992 const int16_t *src1_ptr,
1008 v8i16 in0, in1, in2;
1009 v8i16 filt0, filt1, filt2, filt3;
1010 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1011 v16i8 vec0, vec1, vec2, vec3;
1012 v8i16 dst0, dst1, dst2;
1013 v4i32 dst2_r, dst2_l;
1014 v8i16 filter_vec, out0, out1, out2;
1015 v4i32 weight_vec, offset_vec, rnd_vec;
1018 src0_ptr = src0_ptr - 3;
1019 offset = (offset0 + offset1) << rnd_val;
1020 weight0 = weight0 & 0x0000FFFF;
1021 weight = weight0 | (weight1 << 16);
1022 constant = 128 * weight1;
1026 offset_vec = __msa_fill_w(
offset);
1027 weight_vec = __msa_fill_w(
weight);
1028 rnd_vec = __msa_fill_w(rnd_val + 1);
1031 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1042 src0_ptr += src_stride;
1043 LD_SH2(src1_ptr, 8, in0, in1);
1044 in2 =
LD_SH(src1_ptr + 16);
1045 src1_ptr += src2_stride;
1048 for (loop_cnt = 31; loop_cnt--;) {
1050 vec0, vec1, vec2, vec3);
1054 vec0, vec1, vec2, vec3);
1058 vec0, vec1, vec2, vec3);
1063 weight_vec, rnd_vec, offset_vec,
1067 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1068 (v8i16) weight_vec);
1069 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1070 (v8i16) weight_vec);
1072 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1076 src0_ptr += src_stride;
1077 LD_SH2(src1_ptr, 8, in0, in1);
1078 in2 =
LD_SH(src1_ptr + 16);
1079 src1_ptr += src2_stride;
1082 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1084 SD(dst_val0, dst + 16);
1100 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1101 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1103 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1106 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1108 SD(dst_val0, dst + 16);
1114 const int16_t *src1_ptr,
1129 v8i16 in0, in1, in2, in3;
1130 v8i16 filt0, filt1, filt2, filt3;
1132 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1133 v16i8 vec0, vec1, vec2, vec3;
1134 v8i16 dst0, dst1, dst2, dst3;
1135 v8i16 filter_vec, out0, out1, out2, out3;
1136 v4i32 weight_vec, offset_vec, rnd_vec;
1139 offset = (offset0 + offset1) << rnd_val;
1140 weight0 = weight0 & 0x0000FFFF;
1141 weight = weight0 | (weight1 << 16);
1142 constant = 128 * weight1;
1146 offset_vec = __msa_fill_w(
offset);
1147 weight_vec = __msa_fill_w(
weight);
1148 rnd_vec = __msa_fill_w(rnd_val + 1);
1151 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1161 for (loop_cnt =
height; loop_cnt--;) {
1164 src0_ptr += src_stride;
1165 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1166 src1_ptr += src2_stride;
1171 vec0, vec1, vec2, vec3);
1175 vec0, vec1, vec2, vec3);
1179 vec0, vec1, vec2, vec3);
1183 vec0, vec1, vec2, vec3);
1189 weight_vec, rnd_vec, offset_vec,
1190 out0, out1, out2, out3);
1193 ST_SH2(out0, out1, dst, 16);
1200 const int16_t *src1_ptr,
1215 v8i16 in0, in1, in2, in3;
1216 v8i16 filt0, filt1, filt2, filt3;
1218 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1219 v16i8 vec0, vec1, vec2, vec3;
1220 v8i16 dst0, dst1, dst2, dst3;
1221 v8i16 filter_vec, out0, out1, out2, out3;
1222 v4i32 weight_vec, offset_vec, rnd_vec;
1225 offset = (offset0 + offset1) << rnd_val;
1226 weight0 = weight0 & 0x0000FFFF;
1227 weight = weight0 | (weight1 << 16);
1228 constant = 128 * weight1;
1232 offset_vec = __msa_fill_w(
offset);
1233 weight_vec = __msa_fill_w(
weight);
1234 rnd_vec = __msa_fill_w(rnd_val + 1);
1237 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1247 for (loop_cnt = 64; loop_cnt--;) {
1250 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1252 LD_SB2(src0_ptr + 32, 8, src3, src4);
1253 src0_ptr += src_stride;
1257 vec0, vec1, vec2, vec3);
1261 vec0, vec1, vec2, vec3);
1265 vec0, vec1, vec2, vec3);
1269 vec0, vec1, vec2, vec3);
1274 weight_vec, rnd_vec, offset_vec,
1275 out0, out1, out2, out3);
1278 ST_SH2(out0, out1, dst, 16);
1280 LD_SH2(src1_ptr + 32, 8, in2, in3);
1281 src1_ptr += src2_stride;
1283 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1284 vec0, vec1, vec2, vec3);
1287 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1288 vec0, vec1, vec2, vec3);
1293 weight_vec, rnd_vec, offset_vec,
1296 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1297 ST_SH(out0, dst + 32);
1304 const int16_t *src1_ptr,
1316 const uint8_t *src0_ptr_tmp;
1318 const int16_t *src1_ptr_tmp;
1319 uint32_t loop_cnt, cnt;
1322 v8i16 in0, in1, in2, in3;
1323 v8i16 filt0, filt1, filt2, filt3;
1325 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1326 v16i8 vec0, vec1, vec2, vec3;
1327 v8i16 dst0, dst1, dst2, dst3;
1328 v8i16 filter_vec, out0, out1, out2, out3;
1329 v4i32 weight_vec, offset_vec, rnd_vec;
1332 offset = (offset0 + offset1) << rnd_val;
1333 weight0 = weight0 & 0x0000FFFF;
1334 weight = weight0 | (weight1 << 16);
1335 constant = 128 * weight1;
1339 offset_vec = __msa_fill_w(
offset);
1340 weight_vec = __msa_fill_w(
weight);
1341 rnd_vec = __msa_fill_w(rnd_val + 1);
1344 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1354 for (loop_cnt =
height; loop_cnt--;) {
1355 src0_ptr_tmp = src0_ptr;
1357 src1_ptr_tmp = src1_ptr;
1359 for (cnt = 2; cnt--;) {
1363 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1368 vec0, vec1, vec2, vec3);
1372 vec0, vec1, vec2, vec3);
1376 vec0, vec1, vec2, vec3);
1380 vec0, vec1, vec2, vec3);
1386 weight_vec, rnd_vec, offset_vec,
1387 out0, out1, out2, out3);
1390 ST_SH2(out0, out1, dst_tmp, 16);
1394 src0_ptr += src_stride;
1395 src1_ptr += src2_stride;
1403 const int16_t *src1_ptr,
1417 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1418 v16i8 src11, src12, src13, src14;
1419 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1420 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1421 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1422 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1423 v16i8 src2110, src4332, src6554, src8776, src10998;
1424 v16i8 src12111110, src14131312;
1425 v8i16 dst10, dst32, dst54, dst76;
1426 v8i16 filt0, filt1, filt2, filt3;
1427 v8i16 filter_vec, out0, out1, out2, out3;
1428 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1430 src0_ptr -= (3 * src_stride);
1431 offset = (offset0 + offset1) << rnd_val;
1432 weight0 = weight0 & 0x0000FFFF;
1433 weight = weight0 | (weight1 << 16);
1435 const_vec = __msa_ldi_w(128);
1437 offset_vec = __msa_fill_w(
offset);
1438 weight_vec = __msa_fill_w(
weight);
1439 rnd_vec = __msa_fill_w(rnd_val + 1);
1440 weight1_vec = __msa_fill_w(weight1);
1441 offset_vec += const_vec * weight1_vec;
1444 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1447 src0_ptr += (7 * src_stride);
1450 src10_r, src32_r, src54_r, src21_r);
1451 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1452 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1453 src2110, src4332, src6554);
1456 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1457 LD_SB8(src0_ptr, src_stride,
1458 src7, src8, src9, src10, src11, src12, src13, src14);
1459 src0_ptr += (8 * src_stride);
1460 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1461 src1_ptr += (8 * src2_stride);
1465 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1466 src76_r, src87_r, src98_r, src109_r);
1467 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1468 src1110_r, src1211_r, src1312_r, src1413_r);
1469 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1470 src1413_r, src1312_r,
1471 src8776, src10998, src12111110, src14131312);
1474 DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1475 filt0, dst10, dst32, dst54, dst76);
1476 DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1477 filt1, dst10, dst32, dst54, dst76);
1478 DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1479 filt2, filt2, dst10, dst32, dst54, dst76);
1480 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1481 filt3, filt3, dst10, dst32, dst54, dst76);
1485 weight_vec, rnd_vec, offset_vec,
1486 out0, out1, out2, out3);
1489 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1490 dst += (8 * dst_stride);
1493 src4332 = src12111110;
1494 src6554 = src14131312;
1501 const int16_t *src1_ptr,
1516 v16i8 src6, src7, src8, src9, src10;
1517 v8i16 in0, in1, in2, in3;
1518 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1519 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1520 v8i16 tmp0, tmp1, tmp2, tmp3;
1521 v8i16 filt0, filt1, filt2, filt3;
1522 v8i16 filter_vec, out0, out1, out2, out3;
1523 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1525 src0_ptr -= (3 * src_stride);
1526 offset = (offset0 + offset1) << rnd_val;
1527 weight0 = weight0 & 0x0000FFFF;
1528 weight = weight0 | (weight1 << 16);
1530 const_vec = __msa_ldi_w(128);
1532 offset_vec = __msa_fill_w(
offset);
1533 weight_vec = __msa_fill_w(
weight);
1534 rnd_vec = __msa_fill_w(rnd_val + 1);
1535 weight1_vec = __msa_fill_w(weight1);
1536 offset_vec += const_vec * weight1_vec;
1539 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1542 src0_ptr += (7 * src_stride);
1546 src10_r, src32_r, src54_r, src21_r);
1547 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1549 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1550 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1551 src0_ptr += (4 * src_stride);
1552 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1553 src1_ptr += (4 * src2_stride);
1556 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1557 src76_r, src87_r, src98_r, src109_r);
1559 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1560 filt0, tmp0, tmp1, tmp2, tmp3);
1561 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1562 filt1, tmp0, tmp1, tmp2, tmp3);
1563 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1564 filt2, tmp0, tmp1, tmp2, tmp3);
1565 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1566 filt3, tmp0, tmp1, tmp2, tmp3);
1570 weight_vec, rnd_vec, offset_vec,
1571 out0, out1, out2, out3);
1574 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1575 dst += (4 * dst_stride);
1589 const int16_t *src1_ptr,
1603 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1604 v8i16 in0, in1, in2, in3;
1605 v16i8 src10_r, src32_r, src54_r, src76_r;
1606 v16i8 src21_r, src43_r, src65_r, src87_r;
1607 v8i16 tmp0, tmp1, tmp2;
1608 v16i8 src10_l, src32_l, src54_l, src76_l;
1609 v16i8 src21_l, src43_l, src65_l, src87_l;
1610 v16i8 src2110, src4332, src6554, src8776;
1611 v8i16 filt0, filt1, filt2, filt3;
1612 v8i16 out0, out1, out2, filter_vec;
1613 v4i32 dst2_r, dst2_l;
1614 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1616 src0_ptr -= (3 * src_stride);
1617 offset = (offset0 + offset1) << rnd_val;
1618 weight0 = weight0 & 0x0000FFFF;
1619 weight = weight0 | (weight1 << 16);
1621 const_vec = __msa_ldi_w(128);
1623 offset_vec = __msa_fill_w(
offset);
1624 weight_vec = __msa_fill_w(
weight);
1625 rnd_vec = __msa_fill_w(rnd_val + 1);
1626 weight1_vec = __msa_fill_w(weight1);
1627 offset_vec += const_vec * weight1_vec;
1630 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1633 src0_ptr += (7 * src_stride);
1637 src10_r, src32_r, src54_r, src21_r);
1638 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1640 src10_l, src32_l, src54_l, src21_l);
1641 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1642 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1643 src2110, src4332, src6554);
1645 for (loop_cnt = 8; loop_cnt--;) {
1646 LD_SB2(src0_ptr, src_stride, src7, src8);
1647 src0_ptr += (2 * src_stride);
1648 LD_SH2(src1_ptr, src2_stride, in0, in1);
1649 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1650 src1_ptr += (2 * src2_stride);
1651 in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1654 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1655 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1656 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1658 DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1660 DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1661 tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1662 DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1663 tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1664 DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1665 tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1668 weight_vec, rnd_vec, offset_vec,
1672 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1673 (v8i16) weight_vec);
1674 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1675 (v8i16) weight_vec);
1677 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1680 ST_D2(out0, 0, 1, dst, dst_stride);
1681 ST_W2(out2, 0, 1, dst + 8, dst_stride);
1682 dst += (2 * dst_stride);
1699 const int16_t *src1_ptr,
1712 const uint8_t *src0_ptr_tmp;
1713 const int16_t *src1_ptr_tmp;
1715 uint32_t loop_cnt, cnt;
1717 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1718 v8i16 in0, in1, in2, in3;
1719 v16i8 src10_r, src32_r, src54_r, src76_r;
1720 v16i8 src21_r, src43_r, src65_r, src87_r;
1721 v16i8 src10_l, src32_l, src54_l, src76_l;
1722 v16i8 src21_l, src43_l, src65_l, src87_l;
1723 v8i16 tmp0, tmp1, tmp2, tmp3;
1724 v8i16 filt0, filt1, filt2, filt3;
1726 v8i16 out0, out1, out2, out3;
1727 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1729 src0_ptr -= (3 * src_stride);
1731 offset = (offset0 + offset1) << rnd_val;
1732 weight0 = weight0 & 0x0000FFFF;
1733 weight = weight0 | (weight1 << 16);
1735 const_vec = __msa_ldi_w(128);
1737 offset_vec = __msa_fill_w(
offset);
1738 weight_vec = __msa_fill_w(
weight);
1739 rnd_vec = __msa_fill_w(rnd_val + 1);
1740 weight1_vec = __msa_fill_w(weight1);
1741 offset_vec += const_vec * weight1_vec;
1744 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1746 for (cnt = (
width >> 4); cnt--;) {
1747 src0_ptr_tmp = src0_ptr;
1748 src1_ptr_tmp = src1_ptr;
1751 LD_SB7(src0_ptr_tmp, src_stride,
1753 src0_ptr_tmp += (7 * src_stride);
1757 src10_r, src32_r, src54_r, src21_r);
1758 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1760 src10_l, src32_l, src54_l, src21_l);
1761 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1763 for (loop_cnt = (
height >> 1); loop_cnt--;) {
1764 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1765 src0_ptr_tmp += (2 * src_stride);
1766 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1767 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1768 src1_ptr_tmp += (2 * src2_stride);
1771 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1772 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1774 DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1775 filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1776 DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1777 filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1778 DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1779 filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1780 DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1781 filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1785 weight_vec, rnd_vec, offset_vec,
1786 out0, out1, out2, out3);
1789 ST_SH2(out0, out1, dst_tmp, dst_stride);
1790 dst_tmp += (2 * dst_stride);
1815 const int16_t *src1_ptr,
1828 src1_ptr, src2_stride,
1830 weight0, weight1, offset0, offset1,
1836 const int16_t *src1_ptr,
1849 src1_ptr, src2_stride,
1851 weight0, weight1, offset0, offset1,
1854 src1_ptr + 16, src2_stride,
1856 weight0, weight1, offset0, offset1, rnd_val);
1861 const int16_t *src1_ptr,
1874 src1_ptr, src2_stride,
1876 weight0, weight1, offset0, offset1,
1882 const int16_t *src1_ptr,
1895 src1_ptr, src2_stride,
1897 weight0, weight1, offset0, offset1,
1903 const int16_t *src1_ptr,
1916 src1_ptr, src2_stride,
1918 weight0, weight1, offset0, offset1,
1924 const int16_t *src1_ptr,
1928 const int8_t *filter_x,
1929 const int8_t *filter_y,
1941 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1942 v8i16 in0 = { 0 }, in1 = { 0 };
1943 v8i16 filt0, filt1, filt2, filt3;
1944 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1945 v16i8 mask1, mask2, mask3;
1946 v8i16 filter_vec, weight_vec;
1947 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1948 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1949 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950 v8i16 tmp0, tmp1, tmp2, tmp3;
1951 v8i16 dst10, dst32, dst54, dst76;
1952 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1953 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1956 src0_ptr -= ((3 * src_stride) + 3);
1958 filter_vec =
LD_SH(filter_x);
1959 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1961 filter_vec =
LD_SH(filter_y);
1964 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1970 offset = (offset0 + offset1) << rnd_val;
1971 weight0 = weight0 & 0x0000FFFF;
1972 weight = weight0 | (weight1 << 16);
1974 const_vec = __msa_fill_w((128 * weight1));
1976 offset_vec = __msa_fill_w(
offset);
1977 rnd_vec = __msa_fill_w(rnd_val + 1);
1978 offset_vec += const_vec;
1979 weight_vec = (v8i16) __msa_fill_w(
weight);
1982 src0_ptr += (7 * src_stride);
1986 VSHF_B4_SB(
src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1987 VSHF_B4_SB(
src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1989 vec8, vec9, vec10, vec11);
1990 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1991 vec12, vec13, vec14, vec15);
2006 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2008 for (loop_cnt =
height >> 2; loop_cnt--;) {
2009 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2010 src0_ptr += (4 * src_stride);
2013 LD2(src1_ptr, src2_stride, tp0, tp1);
2015 src1_ptr += (2 * src2_stride);
2016 LD2(src1_ptr, src2_stride, tp0, tp1);
2018 src1_ptr += (2 * src2_stride);
2020 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2021 vec0, vec1, vec2, vec3);
2022 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2023 vec4, vec5, vec6, vec7);
2029 dst76 = __msa_ilvr_h(dst97, dst66);
2031 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2032 dst98 = __msa_ilvr_h(dst66, dst108);
2034 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2036 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2038 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2040 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2042 SRA_4V(dst0, dst1, dst2, dst3, 6);
2046 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2047 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2048 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2049 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2053 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2054 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2055 dst += (4 * dst_stride);
2063 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2069 const int16_t *src1_ptr,
2073 const int8_t *filter_x,
2074 const int8_t *filter_y,
2083 uint32_t loop_cnt, cnt;
2085 const uint8_t *src0_ptr_tmp;
2086 const int16_t *src1_ptr_tmp;
2089 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
2091 v8i16 filt0, filt1, filt2, filt3;
2092 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2094 v16i8 mask1, mask2, mask3;
2095 v8i16 filter_vec, weight_vec;
2096 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100 v8i16 tmp0, tmp1, tmp2, tmp3;
2101 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105 v4i32 offset_vec, rnd_vec, const_vec;
2107 src0_ptr -= ((3 * src_stride) + 3);
2109 offset = (offset0 + offset1) << rnd_val;
2110 weight0 = weight0 & 0x0000FFFF;
2111 weight = weight0 | (weight1 << 16);
2113 const_vec = __msa_fill_w((128 * weight1));
2115 offset_vec = __msa_fill_w(
offset);
2116 rnd_vec = __msa_fill_w(rnd_val + 1);
2117 offset_vec += const_vec;
2118 weight_vec = (v8i16) __msa_fill_w(
weight);
2120 filter_vec =
LD_SH(filter_x);
2121 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2123 filter_vec =
LD_SH(filter_y);
2126 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2132 for (cnt = width8mult; cnt--;) {
2133 src0_ptr_tmp = src0_ptr;
2134 src1_ptr_tmp = src1_ptr;
2137 LD_SB7(src0_ptr_tmp, src_stride,
2139 src0_ptr_tmp += (7 * src_stride);
2145 vec0, vec1, vec2, vec3);
2147 vec4, vec5, vec6, vec7);
2149 vec8, vec9, vec10, vec11);
2150 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151 vec12, vec13, vec14, vec15);
2163 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164 vec0, vec1, vec2, vec3);
2165 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166 vec4, vec5, vec6, vec7);
2167 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168 vec8, vec9, vec10, vec11);
2177 for (loop_cnt =
height >> 1; loop_cnt--;) {
2178 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2180 src0_ptr_tmp += 2 * src_stride;
2182 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183 src1_ptr_tmp += (2 * src2_stride);
2185 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186 dst32_r, dst54_r, dst21_r);
2187 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188 dst32_l, dst54_l, dst21_l);
2189 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2192 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193 vec0, vec1, vec2, vec3);
2199 filt_h0, filt_h1, filt_h2, filt_h3);
2201 filt_h0, filt_h1, filt_h2, filt_h3);
2207 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208 vec0, vec1, vec2, vec3);
2214 filt_h0, filt_h1, filt_h2, filt_h3);
2216 filt_h0, filt_h1, filt_h2, filt_h3);
2221 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2224 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228 SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2230 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
2233 dst_tmp += (2 * dst_stride);
2252 const int16_t *src1_ptr,
2256 const int8_t *filter_x,
2257 const int8_t *filter_y,
2266 src1_ptr, src2_stride,
2267 dst, dst_stride, filter_x, filter_y,
2268 height, weight0, weight1, offset0,
2269 offset1, rnd_val, 1);
2274 const int16_t *src1_ptr,
2278 const int8_t *filter_x,
2279 const int8_t *filter_y,
2288 const uint8_t *src0_ptr_tmp;
2290 const int16_t *src1_ptr_tmp;
2294 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
2295 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2296 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2297 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2298 v8i16 in0 = { 0 }, in1 = { 0 };
2299 v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2300 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2301 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2302 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2303 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2304 v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2305 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2306 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2308 src0_ptr -= ((3 * src_stride) + 3);
2310 offset = (offset0 + offset1) << rnd_val;
2311 weight0 = weight0 & 0x0000FFFF;
2312 weight = weight0 | (weight1 << 16);
2314 const_vec = __msa_fill_w((128 * weight1));
2316 offset_vec = __msa_fill_w(
offset);
2317 rnd_vec = __msa_fill_w(rnd_val + 1);
2318 offset_vec += const_vec;
2319 weight_vec = (v8i16) __msa_fill_w(
weight);
2321 filter_vec =
LD_SH(filter_x);
2322 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2324 filter_vec =
LD_SH(filter_y);
2327 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2334 src0_ptr_tmp = src0_ptr;
2335 src1_ptr_tmp = src1_ptr;
2339 src0_ptr_tmp += (7 * src_stride);
2346 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2356 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2357 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2358 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2367 for (loop_cnt = 8; loop_cnt--;) {
2368 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2369 src0_ptr_tmp += (2 * src_stride);
2372 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2373 src1_ptr_tmp += (2 * src2_stride);
2375 ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2376 dst10_r, dst32_r, dst54_r, dst21_r);
2377 ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2378 dst10_l, dst32_l, dst54_l, dst21_l);
2379 ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2380 ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2382 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2388 dst0 =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2389 filt_h1, filt_h2, filt_h3);
2390 dst1 =
HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2391 filt_h1, filt_h2, filt_h3);
2395 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2401 dst2 =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2402 filt_h1, filt_h2, filt_h3);
2403 dst3 =
HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2404 filt_h1, filt_h2, filt_h3);
2411 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2412 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2413 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2414 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2418 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2419 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
2420 dst_tmp += (2 * dst_stride);
2441 src0_ptr += (7 * src_stride);
2444 VSHF_B4_SB(
src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2445 VSHF_B4_SB(
src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2446 VSHF_B4_SB(
src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2448 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2462 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2464 for (loop_cnt = 4; loop_cnt--;) {
2465 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2466 src0_ptr += (4 * src_stride);
2469 LD2(src1_ptr, src2_stride, tp0, tp1);
2471 src1_ptr += (2 * src2_stride);
2472 LD2(src1_ptr, src2_stride, tp0, tp1);
2474 src1_ptr += (2 * src2_stride);
2476 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2478 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2485 dst76 = __msa_ilvr_h(dst97, dst66);
2487 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2488 dst98 = __msa_ilvr_h(dst66, dst108);
2490 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2492 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2494 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2496 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2498 SRA_4V(dst0, dst1, dst2, dst3, 6);
2502 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2503 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2504 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2505 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2509 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2510 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2511 dst += (4 * dst_stride);
2519 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2525 const int16_t *src1_ptr,
2529 const int8_t *filter_x,
2530 const int8_t *filter_y,
2539 src1_ptr, src2_stride,
2540 dst, dst_stride, filter_x, filter_y,
2541 height, weight0, weight1, offset0,
2542 offset1, rnd_val, 2);
2547 const int16_t *src1_ptr,
2551 const int8_t *filter_x,
2552 const int8_t *filter_y,
2561 src1_ptr, src2_stride,
2562 dst, dst_stride, filter_x, filter_y,
2563 height, weight0, weight1, offset0,
2564 offset1, rnd_val, 3);
2569 const int16_t *src1_ptr,
2573 const int8_t *filter_x,
2574 const int8_t *filter_y,
2583 src1_ptr, src2_stride,
2584 dst, dst_stride, filter_x, filter_y,
2585 height, weight0, weight1, offset0,
2586 offset1, rnd_val, 4);
2591 const int16_t *src1_ptr,
2595 const int8_t *filter_x,
2596 const int8_t *filter_y,
2605 src1_ptr, src2_stride,
2606 dst, dst_stride, filter_x, filter_y,
2607 height, weight0, weight1, offset0,
2608 offset1, rnd_val, 6);
2613 const int16_t *src1_ptr,
2617 const int8_t *filter_x,
2618 const int8_t *filter_y,
2627 src1_ptr, src2_stride,
2628 dst, dst_stride, filter_x, filter_y,
2629 height, weight0, weight1, offset0,
2630 offset1, rnd_val, 8);
2635 const int16_t *src1_ptr,
2651 v16i8 mask1, vec0, vec1;
2653 v4i32 dst0_r, dst0_l;
2654 v8i16 out0, filter_vec;
2655 v4i32 weight_vec, offset_vec, rnd_vec;
2664 offset = (offset0 + offset1) << rnd_val;
2665 weight0 = weight0 & 0x0000FFFF;
2666 weight = weight0 | (weight1 << 16);
2667 constant = 128 * weight1;
2671 offset_vec = __msa_fill_w(
offset);
2672 weight_vec = __msa_fill_w(
weight);
2673 rnd_vec = __msa_fill_w(rnd_val + 1);
2676 LD_SH2(src1_ptr, src2_stride, in0, in1);
2677 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2684 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2685 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2687 out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2689 out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2690 ST_W2(out0, 0, 1, dst, dst_stride);
2695 const int16_t *src1_ptr,
2713 v8i16 in0, in1, in2, in3;
2715 v4i32 weight_vec, offset_vec, rnd_vec;
2725 offset = (offset0 + offset1) << rnd_val;
2726 weight0 = weight0 & 0x0000FFFF;
2727 weight = weight0 | (weight1 << 16);
2728 constant = 128 * weight1;
2732 offset_vec = __msa_fill_w(
offset);
2733 weight_vec = __msa_fill_w(
weight);
2734 rnd_vec = __msa_fill_w(rnd_val + 1);
2738 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2746 weight_vec, rnd_vec, offset_vec,
2749 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2750 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2755 const int16_t *src1_ptr,
2774 v8i16 dst0, dst1, dst2, dst3;
2775 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2777 v4i32 weight_vec, offset_vec, rnd_vec;
2784 offset = (offset0 + offset1) << rnd_val;
2785 weight0 = weight0 & 0x0000FFFF;
2786 weight = weight0 | (weight1 << 16);
2787 constant = 128 * weight1;
2791 offset_vec = __msa_fill_w(
offset);
2792 weight_vec = __msa_fill_w(
weight);
2793 rnd_vec = __msa_fill_w(rnd_val + 1);
2797 for (loop_cnt = (
height >> 3); loop_cnt--;) {
2798 LD_SB8(src0_ptr, src_stride,
2800 src0_ptr += (8 * src_stride);
2801 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2802 src1_ptr += (4 * src2_stride);
2803 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2804 src1_ptr += (4 * src2_stride);
2813 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2815 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2819 weight_vec, rnd_vec, offset_vec,
2820 dst0, dst1, dst2, dst3);
2823 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2824 dst += (8 * dst_stride);
2830 const int16_t *src1_ptr,
2845 weight0, weight1, offset0, offset1, rnd_val);
2846 }
else if (4 ==
height) {
2849 weight0, weight1, offset0, offset1, rnd_val);
2850 }
else if (0 == (
height % 8)) {
2852 src1_ptr, src2_stride,
2854 weight0, weight1, offset0, offset1,
2861 const int16_t *src1_ptr,
2880 v8i16 in0, in1, in2, in3;
2881 v8i16 dst0, dst1, dst2, dst3;
2883 v4i32 weight_vec, offset_vec, rnd_vec;
2890 offset = (offset0 + offset1) << rnd_val;
2891 weight0 = weight0 & 0x0000FFFF;
2892 weight = weight0 | (weight1 << 16);
2893 constant = 128 * weight1;
2897 offset_vec = __msa_fill_w(
offset);
2898 weight_vec = __msa_fill_w(
weight);
2899 rnd_vec = __msa_fill_w(rnd_val + 1);
2903 for (loop_cnt = 2; loop_cnt--;) {
2905 src0_ptr += (4 * src_stride);
2906 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2907 src1_ptr += (4 * src2_stride);
2916 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2921 weight_vec, rnd_vec, offset_vec,
2922 dst0, dst1, dst2, dst3);
2925 ST_W2(dst0, 0, 2, dst, dst_stride);
2926 ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2927 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2928 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2929 dst += (4 * dst_stride);
2935 const int16_t *src1_ptr,
2951 v16i8 mask1, vec0, vec1;
2954 v4i32 weight_vec, offset_vec, rnd_vec;
2961 offset = (offset0 + offset1) << rnd_val;
2962 weight0 = weight0 & 0x0000FFFF;
2963 weight = weight0 | (weight1 << 16);
2964 constant = 128 * weight1;
2968 offset_vec = __msa_fill_w(
offset);
2969 weight_vec = __msa_fill_w(
weight);
2970 rnd_vec = __msa_fill_w(rnd_val + 1);
2975 LD_SH2(src1_ptr, src2_stride, in0, in1);
2982 weight_vec, rnd_vec, offset_vec,
2985 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2986 ST_D2(dst0, 0, 1, dst, dst_stride);
2991 const int16_t *src1_ptr,
3005 v8i16 in0, in1, in2, in3, in4, in5;
3009 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3011 v4i32 weight_vec, offset_vec, rnd_vec;
3018 offset = (offset0 + offset1) << rnd_val;
3019 weight0 = weight0 & 0x0000FFFF;
3020 weight = weight0 | (weight1 << 16);
3021 constant = 128 * weight1;
3025 offset_vec = __msa_fill_w(
offset);
3026 weight_vec = __msa_fill_w(
weight);
3027 rnd_vec = __msa_fill_w(rnd_val + 1);
3033 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3034 src1_ptr += (4 * src2_stride);
3035 LD_SH2(src1_ptr, src2_stride, in4, in5);
3043 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3045 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3047 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3051 weight_vec, rnd_vec, offset_vec,
3052 dst0, dst1, dst2, dst3);
3054 weight_vec, rnd_vec, offset_vec,
3058 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3059 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3060 ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3065 const int16_t *src1_ptr,
3084 v8i16 in0, in1, in2, in3;
3085 v8i16 dst0, dst1, dst2, dst3;
3087 v4i32 weight_vec, offset_vec, rnd_vec;
3094 offset = (offset0 + offset1) << rnd_val;
3095 weight0 = weight0 & 0x0000FFFF;
3096 weight = weight0 | (weight1 << 16);
3097 constant = 128 * weight1;
3101 offset_vec = __msa_fill_w(
offset);
3102 weight_vec = __msa_fill_w(
weight);
3103 rnd_vec = __msa_fill_w(rnd_val + 1);
3107 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3109 src0_ptr += (4 * src_stride);
3110 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3111 src1_ptr += (4 * src2_stride);
3120 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3124 weight_vec, rnd_vec, offset_vec,
3125 dst0, dst1, dst2, dst3);
3128 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3129 dst += (4 * dst_stride);
3135 const int16_t *src1_ptr,
3150 weight0, weight1, offset0, offset1, rnd_val);
3151 }
else if (6 ==
height) {
3154 weight0, weight1, offset0, offset1, rnd_val);
3155 }
else if (0 == (
height % 4)) {
3157 src1_ptr, src2_stride,
3159 weight0, weight1, offset0, offset1,
3166 const int16_t *src1_ptr,
3182 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3185 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3189 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3191 v4i32 weight_vec, offset_vec, rnd_vec;
3198 offset = (offset0 + offset1) << rnd_val;
3199 weight0 = weight0 & 0x0000FFFF;
3200 weight = weight0 | (weight1 << 16);
3201 constant = 128 * weight1;
3205 offset_vec = __msa_fill_w(
offset);
3206 weight_vec = __msa_fill_w(
weight);
3207 rnd_vec = __msa_fill_w(rnd_val + 1);
3212 for (loop_cnt = 4; loop_cnt--;) {
3214 src0_ptr += (4 * src_stride);
3215 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3216 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3217 src1_ptr += (4 * src2_stride);
3227 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3236 weight_vec, rnd_vec, offset_vec,
3237 dst0, dst1, dst2, dst3);
3239 weight_vec, rnd_vec, offset_vec,
3243 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3244 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3245 ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3246 dst += (4 * dst_stride);
3252 const int16_t *src1_ptr,
3267 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3271 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3274 v4i32 weight_vec, offset_vec, rnd_vec;
3281 offset = (offset0 + offset1) << rnd_val;
3282 weight0 = weight0 & 0x0000FFFF;
3283 weight = weight0 | (weight1 << 16);
3284 constant = 128 * weight1;
3288 offset_vec = __msa_fill_w(
offset);
3289 weight_vec = __msa_fill_w(
weight);
3290 rnd_vec = __msa_fill_w(rnd_val + 1);
3294 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3296 LD_SB4(src0_ptr + 8, src_stride,
src1, src3, src5, src7);
3297 src0_ptr += (4 * src_stride);
3298 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3299 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3300 src1_ptr += (4 * src2_stride);
3309 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3311 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3313 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3315 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3317 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3321 weight_vec, rnd_vec, offset_vec,
3322 dst0, dst1, dst2, dst3);
3325 ST_SH2(dst0, dst1, dst, dst_stride);
3326 dst += (2 * dst_stride);
3330 weight_vec, rnd_vec, offset_vec,
3331 dst0, dst1, dst2, dst3);
3334 ST_SH2(dst0, dst1, dst, dst_stride);
3335 dst += (2 * dst_stride);
3341 const int16_t *src1_ptr,
3358 v16i8 mask1, mask2, mask3;
3360 v8i16 dst0, dst1, dst2, dst3;
3361 v8i16 in0, in1, in2, in3, in4, in5;
3363 v4i32 weight_vec, offset_vec, rnd_vec;
3370 offset = (offset0 + offset1) << rnd_val;
3371 weight0 = weight0 & 0x0000FFFF;
3372 weight = weight0 | (weight1 << 16);
3373 constant = 128 * weight1;
3377 offset_vec = __msa_fill_w(
offset);
3378 weight_vec = __msa_fill_w(
weight);
3379 rnd_vec = __msa_fill_w(rnd_val + 1);
3385 for (loop_cnt = 16; loop_cnt--;) {
3387 LD_SB2(src0_ptr + 16, src_stride,
src1, src3);
3388 src0_ptr += (2 * src_stride);
3389 LD_SH2(src1_ptr, src2_stride, in0, in2);
3390 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3391 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3392 src1_ptr += (2 * src2_stride);
3405 weight_vec, rnd_vec, offset_vec,
3406 dst0, dst1, dst2, dst3);
3409 ST_SH2(dst0, dst1, dst, dst_stride);
3414 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3417 weight_vec, rnd_vec, offset_vec,
3420 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3421 ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3422 dst += (2 * dst_stride);
3428 const int16_t *src1_ptr,
3445 v16i8 mask1, mask2, mask3;
3446 v8i16 dst0, dst1, dst2, dst3;
3448 v8i16 in0, in1, in2, in3;
3450 v4i32 weight_vec, offset_vec, rnd_vec;
3457 offset = (offset0 + offset1) << rnd_val;
3458 weight0 = weight0 & 0x0000FFFF;
3459 weight = weight0 | (weight1 << 16);
3460 constant = 128 * weight1;
3464 offset_vec = __msa_fill_w(
offset);
3465 weight_vec = __msa_fill_w(
weight);
3466 rnd_vec = __msa_fill_w(rnd_val + 1);
3472 for (loop_cnt =
height; loop_cnt--;) {
3475 src0_ptr += src_stride;
3476 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3477 src1_ptr += src2_stride;
3490 weight_vec, rnd_vec, offset_vec,
3491 dst0, dst1, dst2, dst3);
3494 ST_SH2(dst0, dst1, dst, 16);
3501 const int16_t *src1_ptr,
3514 v8i16 in0, in1, dst10;
3515 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3516 v4i32 dst10_r, dst10_l;
3518 v8i16 filter_vec,
out;
3519 v4i32 weight_vec, offset_vec, rnd_vec;
3521 src0_ptr -= src_stride;
3523 offset = (offset0 + offset1) << rnd_val;
3524 weight0 = weight0 & 0x0000FFFF;
3525 weight = weight0 | (weight1 << 16);
3526 constant = 128 * weight1;
3530 offset_vec = __msa_fill_w(
offset);
3531 weight_vec = __msa_fill_w(
weight);
3532 rnd_vec = __msa_fill_w(rnd_val + 1);
3538 src0_ptr += (3 * src_stride);
3540 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3541 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3542 LD_SB2(src0_ptr, src_stride, src3, src4);
3543 src0_ptr += (2 * src_stride);
3544 LD_SH2(src1_ptr, src2_stride, in0, in1);
3545 src1_ptr += (2 * src2_stride);
3547 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3549 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3550 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3555 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3556 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3558 out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3560 out = (v8i16) __msa_pckev_b((v16i8)
out, (v16i8)
out);
3566 const int16_t *src1_ptr,
3579 v8i16 in0, in1, in2, in3;
3580 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3581 v16i8 src2110, src4332, src6554;
3585 v4i32 weight_vec, offset_vec, rnd_vec;
3587 src0_ptr -= src_stride;
3589 offset = (offset0 + offset1) << rnd_val;
3590 weight0 = weight0 & 0x0000FFFF;
3591 weight = weight0 | (weight1 << 16);
3592 constant = 128 * weight1;
3596 offset_vec = __msa_fill_w(
offset);
3597 weight_vec = __msa_fill_w(
weight);
3598 rnd_vec = __msa_fill_w(rnd_val + 1);
3604 src0_ptr += (3 * src_stride);
3606 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3607 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3609 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3610 src0_ptr += (4 * src_stride);
3611 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3612 src1_ptr += (4 * src2_stride);
3615 src32_r, src43_r, src54_r, src65_r);
3616 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3623 weight_vec, rnd_vec, offset_vec,
3626 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3627 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3628 dst += (4 * dst_stride);
3633 const int16_t *src1_ptr,
3647 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9;
3648 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3649 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3650 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3651 v16i8 src2110, src4332, src6554, src8776;
3652 v8i16 dst10, dst32, dst54, dst76;
3655 v4i32 weight_vec, offset_vec, rnd_vec;
3657 src0_ptr -= src_stride;
3659 offset = (offset0 + offset1) << rnd_val;
3660 weight0 = weight0 & 0x0000FFFF;
3661 weight = weight0 | (weight1 << 16);
3662 constant = 128 * weight1;
3666 offset_vec = __msa_fill_w(
offset);
3667 weight_vec = __msa_fill_w(
weight);
3668 rnd_vec = __msa_fill_w(rnd_val + 1);
3674 src0_ptr += (3 * src_stride);
3676 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3677 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3679 for (loop_cnt = (
height >> 3); loop_cnt--;) {
3680 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3681 src0_ptr += (6 * src_stride);
3682 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3683 src1_ptr += (8 * src2_stride);
3689 src32_r, src43_r, src54_r, src65_r);
3690 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3691 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3692 src4332, src6554, src8776);
3700 src0_ptr += (2 * src_stride);
3702 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3703 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3708 weight_vec, rnd_vec, offset_vec,
3709 dst10, dst32, dst54, dst76);
3711 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3712 ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3713 dst += (8 * dst_stride);
3719 const int16_t *src1_ptr,
3734 weight0, weight1, offset0, offset1, rnd_val);
3735 }
else if (4 ==
height) {
3738 weight0, weight1, offset0, offset1, rnd_val);
3739 }
else if (0 == (
height % 8)) {
3741 src1_ptr, src2_stride,
3743 weight0, weight1, offset0, offset1,
3750 const int16_t *src1_ptr,
3765 v8i16 in0, in1, in2, in3;
3766 v16i8 src10_r, src32_r, src21_r, src43_r;
3767 v8i16 tmp0, tmp1, tmp2, tmp3;
3770 v4i32 weight_vec, offset_vec, rnd_vec;
3772 src0_ptr -= src_stride;
3774 offset = (offset0 + offset1) << rnd_val;
3775 weight0 = weight0 & 0x0000FFFF;
3776 weight = weight0 | (weight1 << 16);
3777 constant = 128 * weight1;
3781 offset_vec = __msa_fill_w(
offset);
3782 weight_vec = __msa_fill_w(
weight);
3783 rnd_vec = __msa_fill_w(rnd_val + 1);
3789 src0_ptr += (3 * src_stride);
3793 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3794 LD_SB2(src0_ptr, src_stride, src3, src4);
3795 src0_ptr += (2 * src_stride);
3796 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3797 src1_ptr += (4 * src2_stride);
3805 src0_ptr += (2 * src_stride);
3813 weight_vec, rnd_vec, offset_vec,
3814 tmp0, tmp1, tmp2, tmp3);
3817 ST_W2(tmp0, 0, 2, dst, dst_stride);
3818 ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3819 ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3820 ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3821 dst += (4 * dst_stride);
3827 const int16_t *src1_ptr,
3840 v8i16 in0, in1, tmp0, tmp1;
3841 v16i8 src10_r, src32_r, src21_r, src43_r;
3844 v4i32 weight_vec, offset_vec, rnd_vec;
3846 src0_ptr -= src_stride;
3848 offset = (offset0 + offset1) << rnd_val;
3849 weight0 = weight0 & 0x0000FFFF;
3850 weight = weight0 | (weight1 << 16);
3851 constant = 128 * weight1;
3855 offset_vec = __msa_fill_w(
offset);
3856 weight_vec = __msa_fill_w(
weight);
3857 rnd_vec = __msa_fill_w(rnd_val + 1);
3863 src0_ptr += (3 * src_stride);
3867 LD_SB2(src0_ptr, src_stride, src3, src4);
3868 LD_SH2(src1_ptr, src2_stride, in0, in1);
3875 weight_vec, rnd_vec, offset_vec,
3878 tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3879 ST_D2(tmp0, 0, 1, dst, dst_stride);
3884 const int16_t *src1_ptr,
3896 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
3897 v8i16 in0, in1, in2, in3, in4, in5;
3898 v16i8 src10_r, src32_r, src54_r, src76_r;
3899 v16i8 src21_r, src43_r, src65_r, src87_r;
3900 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3903 v4i32 weight_vec, offset_vec, rnd_vec;
/* Six-row variant: no loop is visible; six 16-bit rows are read from
 * src1_ptr and the stores below address dst rows 0..3 and 4..5.
 * Only part of the body is visible in this excerpt. */
/* Begin one row above the position passed in. */
3905 src0_ptr -= src_stride;
/* Both offsets are summed, then scaled up by rnd_val bits. */
3907 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
3908 weight0 = weight0 & 0x0000FFFF;
3909 weight = weight0 | (weight1 << 16);
/* NOTE(review): the consumer of `constant` lies outside this excerpt. */
3910 constant = 128 * weight1;
/* Broadcast offset, packed weights and the shift count (rnd_val + 1). */
3914 offset_vec = __msa_fill_w(
offset);
3915 weight_vec = __msa_fill_w(
weight);
3916 rnd_vec = __msa_fill_w(rnd_val + 1);
/* Skip three rows; presumably they were loaded on lines not shown here. */
3922 src0_ptr += (3 * src_stride);
/* Remaining six 8-bit rows and all six rows of 16-bit samples. */
3926 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3927 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3930 src32_r, src43_r, src54_r, src65_r);
/* Each row is paired with the row before it (7 with 6, 8 with 7). */
3931 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3941 weight_vec, rnd_vec, offset_vec,
3942 tmp0, tmp1, tmp2, tmp3);
3944 weight_vec, rnd_vec, offset_vec,
/* Narrow tmp4/tmp5 to bytes, then store: ST_D4 starts at dst row 0 and
 * ST_D2 starts at dst row 4. */
3948 tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3949 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3950 ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
3955 const int16_t *src1_ptr,
3970 v8i16 in0, in1, in2, in3;
3971 v16i8 src10_r, src32_r, src21_r, src43_r;
3972 v8i16 tmp0, tmp1, tmp2, tmp3;
3975 v4i32 weight_vec, offset_vec, rnd_vec;
/* Looping variant: each iteration consumes four rows of both inputs and
 * advances dst by four rows.  Only part of the body is visible here. */
/* Begin one row above the position passed in. */
3977 src0_ptr -= src_stride;
/* Both offsets are summed, then scaled up by rnd_val bits. */
3979 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
3980 weight0 = weight0 & 0x0000FFFF;
3981 weight = weight0 | (weight1 << 16);
/* NOTE(review): the consumer of `constant` lies outside this excerpt. */
3982 constant = 128 * weight1;
/* Broadcast offset, packed weights and the shift count (rnd_val + 1). */
3986 offset_vec = __msa_fill_w(
offset);
3987 weight_vec = __msa_fill_w(
weight);
3988 rnd_vec = __msa_fill_w(rnd_val + 1);
/* Skip three rows; presumably they were loaded on lines not shown here. */
3994 src0_ptr += (3 * src_stride);
/* height / 4 iterations; any remainder rows are not handled by this loop. */
3998 for (loop_cnt = (
height >> 2); loop_cnt--;) {
/* First pair of new 8-bit rows for this iteration. */
3999 LD_SB2(src0_ptr, src_stride, src3, src4);
4000 src0_ptr += (2 * src_stride);
/* All four rows of 16-bit samples for this iteration. */
4001 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4002 src1_ptr += (4 * src2_stride);
/* Second two-row advance; the matching load is not in this excerpt. */
4010 src0_ptr += (2 * src_stride);
4018 weight_vec, rnd_vec, offset_vec,
4019 tmp0, tmp1, tmp2, tmp3);
/* Store starting at the current dst row, then move dst down four rows. */
4022 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4023 dst += (4 * dst_stride);
4029 const int16_t *src1_ptr,
4044 weight0, weight1, offset0, offset1, rnd_val);
4045 }
/* Dispatch on height: a height of exactly 6 has its own branch.  The
 * condition of the preceding branch and the names of the callees are not
 * visible in this excerpt; the weights, offsets and rnd_val are forwarded
 * unchanged in every visible call. */
else if (6 ==
height) {
4048 weight0, weight1, offset0, offset1, rnd_val);
4051 src1_ptr, src2_stride,
4053 weight0, weight1, offset0, offset1,
4060 const int16_t *src1_ptr,
4075 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4076 v16i8 src10_r, src32_r, src21_r, src43_r;
4077 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4078 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4079 v16i8 src2110, src4332;
4082 v4i32 weight_vec, offset_vec, rnd_vec;
/* Looping variant with two stores per iteration: one at dst and one at
 * dst + 8, four rows at a time.  Only part of the body is visible here. */
/* Begin one row above the position passed in. */
4084 src0_ptr -= (1 * src_stride);
/* Both offsets are summed, then scaled up by rnd_val bits. */
4086 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4087 weight0 = weight0 & 0x0000FFFF;
4088 weight = weight0 | (weight1 << 16);
/* NOTE(review): the consumer of `constant` lies outside this excerpt. */
4089 constant = 128 * weight1;
/* Broadcast offset, packed weights and the shift count (rnd_val + 1). */
4093 offset_vec = __msa_fill_w(
offset);
4094 weight_vec = __msa_fill_w(
weight);
4095 rnd_vec = __msa_fill_w(rnd_val + 1);
/* Skip three rows; presumably they were loaded on lines not shown here. */
4101 src0_ptr += (3 * src_stride);
/* Join the low doublewords of src10_l and src21_l into one vector. */
4105 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
/* height / 4 iterations. */
4107 for (loop_cnt = (
height >> 2); loop_cnt--;) {
4108 LD_SB2(src0_ptr, src_stride, src3, src4);
4109 src0_ptr += (2 * src_stride);
/* Four rows of 16-bit samples at element offsets 0 and 8. */
4110 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4111 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4112 src1_ptr += (4 * src2_stride);
4118 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
/* Second two-row advance; the matching load is not in this excerpt. */
4125 src0_ptr += (2 * src_stride);
/* src2110 is rebuilt from the newest rows so the next iteration can
 * reuse it. */
4129 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4136 weight_vec, rnd_vec, offset_vec,
4137 tmp0, tmp1, tmp2, tmp3);
4139 weight_vec, rnd_vec, offset_vec,
/* Narrow tmp4/tmp5 to bytes; ST_D4 writes at dst, ST_W4 writes the four
 * words of tmp2 at byte offset 8, one per row (presumably). */
4143 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4144 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4145 ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4146 dst += (4 * dst_stride);
4152 const int16_t *src1_ptr,
4167 v8i16 in0, in1, in2, in3;
4168 v16i8 src10_r, src32_r, src21_r, src43_r;
4169 v16i8 src10_l, src32_l, src21_l, src43_l;
4170 v8i16 tmp0, tmp1, tmp2, tmp3;
4173 v4i32 weight_vec, offset_vec, rnd_vec;
/* Looping variant whose body is unrolled into two halves; each half reads
 * two rows of 16-bit samples and stores two rows.  Only part of the body is
 * visible in this excerpt. */
/* Begin one row above the position passed in. */
4175 src0_ptr -= src_stride;
/* Both offsets are summed, then scaled up by rnd_val bits. */
4177 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4178 weight0 = weight0 & 0x0000FFFF;
4179 weight = weight0 | (weight1 << 16);
/* NOTE(review): the consumer of `constant` lies outside this excerpt. */
4180 constant = 128 * weight1;
/* Broadcast offset, packed weights and the shift count (rnd_val + 1). */
4184 offset_vec = __msa_fill_w(
offset);
4185 weight_vec = __msa_fill_w(
weight);
4186 rnd_vec = __msa_fill_w(rnd_val + 1);
/* Skip three rows; presumably they were loaded on lines not shown here. */
4192 src0_ptr += (3 * src_stride);
/* height / 4 iterations, four rows per iteration. */
4197 for (loop_cnt = (
height >> 2); loop_cnt--;) {
/* First half: two new 8-bit rows, two rows of 16-bit samples at element
 * offsets 0 and 8. */
4198 LD_SB2(src0_ptr, src_stride, src3, src4);
4199 src0_ptr += (2 * src_stride);
4200 LD_SH2(src1_ptr, src2_stride, in0, in1);
4201 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4202 src1_ptr += (2 * src2_stride);
4214 weight_vec, rnd_vec, offset_vec,
4215 tmp0, tmp1, tmp2, tmp3);
/* tmp0/tmp1 go to two consecutive dst rows. */
4217 ST_SH2(tmp0, tmp1, dst, dst_stride);
4218 dst += (2 * dst_stride);
/* Second half: same steps for the next two rows; the 8-bit row load that
 * pairs with this advance is not in this excerpt. */
4220 src0_ptr += (2 * src_stride);
4222 LD_SH2(src1_ptr, src2_stride, in0, in1);
4223 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4224 src1_ptr += (2 * src2_stride);
4235 weight_vec, rnd_vec, offset_vec,
4236 tmp0, tmp1, tmp2, tmp3);
4239 ST_SH2(tmp0, tmp1, dst, dst_stride);
4240 dst += (2 * dst_stride);
4246 const int16_t *src1_ptr,
4261 v16i8 src6, src7, src8, src9, src10, src11;
4262 v8i16 in0, in1, in2, in3, in4, in5;
4263 v16i8 src10_r, src32_r, src76_r, src98_r;
4264 v16i8 src10_l, src32_l, src21_l, src43_l;
4265 v16i8 src21_r, src43_r, src87_r, src109_r;
4266 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4269 v4i32 weight_vec, offset_vec, rnd_vec;
/* Looping variant that works on two column ranges per row: one starting at
 * byte 0 and one starting at byte 16 of src0_ptr/dst.  The loop body is
 * unrolled into two two-row halves.  Only part of the body is visible. */
/* Begin one row above the position passed in. */
4271 src0_ptr -= src_stride;
/* Both offsets are summed, then scaled up by rnd_val bits. */
4273 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4274 weight0 = weight0 & 0x0000FFFF;
4275 weight = weight0 | (weight1 << 16);
/* NOTE(review): the consumer of `constant` lies outside this excerpt. */
4276 constant = 128 * weight1;
/* Broadcast offset, packed weights and the shift count (rnd_val + 1). */
4280 offset_vec = __msa_fill_w(
offset);
4281 weight_vec = __msa_fill_w(
weight);
4282 rnd_vec = __msa_fill_w(rnd_val + 1);
/* First three rows of the range that starts at byte 16. */
4293 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4294 src0_ptr += (3 * src_stride);
/* Each row is paired with the row before it (7 with 6, 8 with 7). */
4296 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
/* height / 4 iterations, four rows per iteration. */
4298 for (loop_cnt = (
height >> 2); loop_cnt--;) {
/* First half, range at byte 0: two 8-bit rows plus 16-bit samples at
 * element offsets 0 and 8. */
4300 LD_SB2(src0_ptr, src_stride, src3, src4);
4301 LD_SH2(src1_ptr, src2_stride, in0, in1);
4302 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
/* First half, range at byte 16: two 8-bit rows plus 16-bit samples at
 * element offset 16. */
4308 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4309 src0_ptr += (2 * src_stride);
4310 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4311 src1_ptr += (2 * src2_stride);
4313 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4325 weight_vec, rnd_vec, offset_vec,
4326 tmp0, tmp1, tmp4, tmp5);
4329 weight_vec, rnd_vec, offset_vec,
/* Narrow tmp2/tmp3 to bytes; full vectors go to dst, the narrowed pair
 * goes to dst + 16. */
4334 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4335 ST_SH2(tmp0, tmp1, dst, dst_stride);
4336 ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4337 dst += (2 * dst_stride);
/* Second half: same steps for the next two rows. */
4341 LD_SH2(src1_ptr, src2_stride, in0, in1);
4342 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
/* The last row loaded lands in src8, and src76_r/src87_r are rebuilt, so
 * the next iteration starts from the same register layout as the first. */
4347 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4348 src0_ptr += (2 * src_stride);
4349 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4350 src1_ptr += (2 * src2_stride);
4352 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4364 weight_vec, rnd_vec, offset_vec,
4365 tmp0, tmp1, tmp4, tmp5);
4368 weight_vec, rnd_vec, offset_vec,
4374 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4375 ST_SH2(tmp0, tmp1, dst, dst_stride);
4376 ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4377 dst += (2 * dst_stride);
4383 const int16_t *src1_ptr,
/* Second output cursor, 16 bytes to the right of dst. */
4396 uint8_t *dst_tmp = dst + 16;
4398 v16i8
src0,
src1,
src2, src3, src4, src6, src7, src8, src9, src10;
4399 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4400 v16i8 src10_r, src32_r, src76_r, src98_r;
4401 v16i8 src21_r, src43_r, src87_r, src109_r;
4402 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4403 v16i8 src10_l, src32_l, src76_l, src98_l;
4404 v16i8 src21_l, src43_l, src87_l, src109_l;
4407 v4i32 weight_vec, offset_vec, rnd_vec;
/* Looping variant that handles two 16-byte column ranges per row, writing
 * them through dst and dst_tmp, two rows per iteration.  Only part of the
 * body is visible in this excerpt. */
/* Begin one row above the position passed in. */
4409 src0_ptr -= src_stride;
/* Both offsets are summed, then scaled up by rnd_val bits. */
4411 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4412 weight0 = weight0 & 0x0000FFFF;
4413 weight = weight0 | (weight1 << 16);
/* NOTE(review): the consumer of `constant` lies outside this excerpt. */
4414 constant = 128 * weight1;
/* Broadcast offset, packed weights and the shift count (rnd_val + 1). */
4418 offset_vec = __msa_fill_w(
offset);
4419 weight_vec = __msa_fill_w(
weight);
4420 rnd_vec = __msa_fill_w(rnd_val + 1);
/* First three rows of the range that starts at byte 16. */
4431 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4432 src0_ptr += (3 * src_stride);
/* Pair each row with the row before it, for both vector halves. */
4434 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4435 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* height / 2 iterations, two rows per iteration. */
4437 for (loop_cnt = (
height >> 1); loop_cnt--;) {
/* Range at byte 0: two 8-bit rows plus 16-bit samples at element
 * offsets 0 and 8. */
4439 LD_SB2(src0_ptr, src_stride, src3, src4);
4440 LD_SH2(src1_ptr, src2_stride, in0, in1);
4441 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4454 weight_vec, rnd_vec, offset_vec,
4455 tmp0, tmp1, tmp4, tmp5);
4458 ST_SH2(tmp0, tmp1, dst, dst_stride);
4459 dst += (2 * dst_stride);
/* Range at byte 16: two 8-bit rows plus 16-bit samples at element
 * offsets 16 and 24; the source pointers advance only after this. */
4468 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4469 src0_ptr += (2 * src_stride);
4470 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4471 LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4472 src1_ptr += (2 * src2_stride);
4474 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4475 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4484 weight_vec, rnd_vec, offset_vec,
4485 tmp2, tmp3, tmp6, tmp7);
4489 ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4490 dst_tmp += (2 * dst_stride);
4502 const int16_t *src1_ptr,
4506 const int8_t *filter_x,
4507 const int8_t *filter_y,
4520 v8i16 filt_h0, filt_h1;
4523 v8i16 filter_vec,
tmp, weight_vec;
4524 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4525 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4526 v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
/* Variant taking separate filter_x and filter_y coefficient pointers; the
 * visible tail performs two weighted accumulations with no loop.
 * Only part of the body is visible in this excerpt. */
/* Begin one row above and one byte to the left of the given position. */
4528 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
4530 filter_vec =
LD_SH(filter_x);
4533 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
4540 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4541 weight0 = weight0 & 0x0000FFFF;
4542 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
4544 const_vec = __msa_fill_w((128 * weight1));
4546 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
4547 weight_vec = (v8i16) __msa_fill_w(
weight);
4548 rnd_vec = __msa_fill_w(rnd_val + 1);
4549 offset_vec += const_vec;
/* Keep the even halfwords of dst0 and dst1 in one vector. */
4569 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
/* One value from each of two rows of src1_ptr. */
4571 LD2(src1_ptr, src2_stride, tp0, tp1);
/* Signed dot product of each halfword pair in tmp0/tmp1 with the weight
 * pair, accumulated on top of offset_vec. */
4575 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4576 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
/* Narrow 32-bit lanes to halfwords, then halfwords to bytes. */
4578 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4580 out = (v16u8) __msa_pckev_b((v16i8)
tmp, (v16i8)
tmp);
4586 const int16_t *src1_ptr,
4590 const int8_t *filter_x,
4591 const int8_t *filter_y,
/* Zero-initialised; the lines that fill in0/in1 are not in this excerpt. */
4601 v8i16 in0 = { 0 }, in1 = { 0 };
4604 v8i16 filt_h0, filt_h1;
4607 v8i16 filter_vec, weight_vec;
4608 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4609 v8i16 tmp0, tmp1, tmp2, tmp3;
4610 v8i16 dst30, dst41, dst52, dst63;
4611 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4612 v4i32 offset_vec, rnd_vec, const_vec;
4613 v4i32 dst0, dst1, dst2, dst3;
/* Variant taking separate filter_x and filter_y coefficient pointers; the
 * visible tail performs four weighted accumulations and one ST_W4 store,
 * with no loop.  Only part of the body is visible in this excerpt. */
/* Begin one row above and one byte to the left of the given position. */
4615 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
4617 filter_vec =
LD_SH(filter_x);
4620 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
4627 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4628 weight0 = weight0 & 0x0000FFFF;
4629 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
4631 const_vec = __msa_fill_w((128 * weight1));
4633 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
4634 weight_vec = (v8i16) __msa_fill_w(
weight);
4635 rnd_vec = __msa_fill_w(rnd_val + 1);
4636 offset_vec += const_vec;
/* Rows 3 and 6 are shuffled together in one operation. */
4644 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
/* Shift the four intermediate vectors right by 6 bits. */
4658 SRA_4V(dst0, dst1, dst2, dst3, 6);
/* Two rows from src1_ptr, then the next two after advancing by two rows;
 * tp0/tp1 are overwritten by the second load. */
4661 LD2(src1_ptr, src2_stride, tp0, tp1);
4663 src1_ptr += (2 * src2_stride);
4664 LD2(src1_ptr, src2_stride, tp0, tp1);
/* Signed dot product of each halfword pair with the weight pair,
 * accumulated on top of offset_vec. */
4670 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4671 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4672 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4673 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
/* Narrow to bytes; the four words of `out` are then stored through ST_W4
 * (presumably one word per dst row). */
4677 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4678 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
4683 const int16_t *src1_ptr,
4687 const int8_t *filter_x,
4688 const int8_t *filter_y,
/* Zero-initialised; the lines that fill in0..in3 are not in this excerpt. */
4700 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4701 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
4703 v8i16 filt_h0, filt_h1;
4704 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4707 v8i16 filter_vec, weight_vec;
4708 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4709 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4710 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4711 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4712 v8i16 dst98_r, dst109_r;
4713 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4714 v4i32 offset_vec, rnd_vec, const_vec;
/* Looping variant taking separate filter_x and filter_y coefficient
 * pointers; each iteration consumes eight rows of both inputs and advances
 * dst by eight rows.  Only part of the body is visible in this excerpt. */
/* Begin one row above and one byte to the left of the given position. */
4716 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
4718 filter_vec =
LD_SH(filter_x);
4721 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
4728 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4729 weight0 = weight0 & 0x0000FFFF;
4730 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
4732 const_vec = __msa_fill_w((128 * weight1));
4734 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
4735 weight_vec = (v8i16) __msa_fill_w(
weight);
4736 rnd_vec = __msa_fill_w(rnd_val + 1);
4737 offset_vec += const_vec;
/* Skip three rows; presumably they were loaded on lines not shown here. */
4740 src0_ptr += (3 * src_stride);
/* Replicate doubleword 1 of dst21; dst22 is carried into the loop. */
4748 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* height / 8 iterations. */
4750 for (loop_cnt =
height >> 3; loop_cnt--;) {
4751 LD_SB8(src0_ptr, src_stride,
4752 src3, src4, src5, src6, src7, src8, src9, src10);
4753 src0_ptr += (8 * src_stride);
/* Each shuffle combines row k with row k + 4 (3 with 7, 4 with 8, ...). */
4755 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4756 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4757 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4758 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
/* dst22 is consumed first with its carried-in value, then replaced by
 * doubleword 1 of dst73 before forming dst76_r. */
4765 dst32_r = __msa_ilvr_h(dst73, dst22);
4769 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4770 dst76_r = __msa_ilvr_h(dst22, dst106);
/* Four two-row reads from src1_ptr, eight rows in total; the lines that
 * move tp0/tp1 into vectors between the reads are not in this excerpt. */
4772 LD2(src1_ptr, src2_stride, tp0, tp1);
4773 src1_ptr += 2 * src2_stride;
4775 LD2(src1_ptr, src2_stride, tp0, tp1);
4776 src1_ptr += 2 * src2_stride;
4779 LD2(src1_ptr, src2_stride, tp0, tp1);
4780 src1_ptr += 2 * src2_stride;
4782 LD2(src1_ptr, src2_stride, tp0, tp1);
4783 src1_ptr += 2 * src2_stride;
/* Shift all eight intermediate vectors right by 6 bits. */
4794 SRA_4V(dst0, dst1, dst2, dst3, 6);
4795 SRA_4V(dst4, dst5, dst6, dst7, 6);
4796 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
/* Signed dot product of each halfword pair with the weight pair,
 * accumulated on top of offset_vec. */
4802 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4803 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4804 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4805 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4806 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4807 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4808 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4809 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4812 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
/* Store through ST_W8, then move dst down eight rows. */
4816 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4817 dst += (8 * dst_stride);
/* Refresh the carried value from the newest data for the next iteration. */
4821 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4827 const int16_t *src1_ptr,
4831 const int8_t *filter_x,
4832 const int8_t *filter_y,
4842 dst, dst_stride, filter_x, filter_y,
4843 weight0, weight1, offset0, offset1, rnd_val);
4844 }
/* Dispatch on height: exactly 4 takes this branch.  The condition of the
 * preceding branch and the names of the callees are not visible in this
 * excerpt. */
else if (4 ==
height) {
4846 dst, dst_stride, filter_x, filter_y,
4847 weight0, weight1, offset0, offset1, rnd_val);
4848 }
/* Any height divisible by 8 takes this branch; it is the only visible call
 * that also forwards height. */
else if (0 == (
height % 8)) {
4850 src1_ptr, src2_stride,
4851 dst, dst_stride, filter_x, filter_y,
4852 height, weight0, weight1,
4853 offset0, offset1, rnd_val);
4859 const int16_t *src1_ptr,
4863 const int8_t *filter_x,
4864 const int8_t *filter_y,
4872 uint32_t tpw0, tpw1, tpw2, tpw3;
4875 v16u8 out0, out1, out2;
4876 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
/* Zero-initialised; the lines that fill in0..in5 are not in this excerpt. */
4877 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4878 v8i16 in4 = { 0 }, in5 = { 0 };
4880 v8i16 filt_h0, filt_h1, filter_vec;
4881 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4884 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4885 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4886 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4887 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4888 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4889 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4890 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4891 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4892 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4893 v4i32 offset_vec, rnd_vec, const_vec;
/* Variant taking separate filter_x and filter_y coefficient pointers; no
 * loop is visible.  Output is written in two passes: ST_W8 at dst and ST_H8
 * at dst + 4.  Only part of the body is visible in this excerpt. */
/* Begin one row above and one byte to the left of the given position. */
4895 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
4897 filter_vec =
LD_SH(filter_x);
4900 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
4907 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
4908 weight0 = weight0 & 0x0000FFFF;
4909 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
4911 const_vec = __msa_fill_w((128 * weight1));
4913 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
4914 weight_vec = (v8i16) __msa_fill_w(
weight);
4915 rnd_vec = __msa_fill_w(rnd_val + 1);
4916 offset_vec += const_vec;
/* Skip three rows; presumably they were loaded on lines not shown here. */
4919 src0_ptr += (3 * src_stride);
4932 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
/* Unlike the row-pairing shuffles seen elsewhere in this file, each of the
 * eight rows is shuffled against itself here. */
4936 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4937 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4938 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4939 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4946 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4947 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4948 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4949 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
/* Pair up the "_l" vectors two at a time (even doublewords of each). */
4964 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4965 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4966 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4979 dst3_l =
HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
/* Shift every intermediate vector right by 6 bits. */
4980 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4981 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4982 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4983 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4984 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
/* First pass: eight rows of src1_ptr read two at a time (rows 0, 2, 4, 6
 * as starting rows) without moving src1_ptr. */
4986 LD2(src1_ptr, src2_stride, tp0, tp1);
4988 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4991 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4993 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
/* Signed dot product of each halfword pair with the weight pair,
 * accumulated on top of offset_vec. */
5000 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5001 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5002 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5003 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5004 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5005 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5006 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5007 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5010 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5014 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
/* Second pass: the "_l" results, paired with 32-bit reads taken four
 * int16 elements into each src1_ptr row (four rows, advance, four more). */
5016 PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5018 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5019 src1_ptr += (4 * src2_stride);
5021 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
/* dst0..dst3 are reused for the second pass. */
5027 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5028 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5029 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5030 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
/* Narrow to bytes and store the eight halfwords of out2 at byte offset 4
 * of dst (presumably one per row). */
5035 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5036 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5041 const int16_t *src1_ptr,
5045 const int8_t *filter_x,
5046 const int8_t *filter_y,
5057 v8i16 filt_h0, filt_h1;
5060 v8i16 filter_vec, weight_vec;
5061 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5062 v8i16 dst0, dst1, dst2, dst3, dst4;
5064 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5065 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5066 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5067 v8i16 tmp0, tmp1, tmp2, tmp3;
5068 v4i32 offset_vec, rnd_vec, const_vec;
/* Variant taking separate filter_x and filter_y coefficient pointers; two
 * rows of 16-bit samples are read and no loop is visible.
 * Only part of the body is visible in this excerpt. */
/* Begin one row above and one byte to the left of the given position. */
5070 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
5072 filter_vec =
LD_SH(filter_x);
5075 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
5082 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
5083 weight0 = weight0 & 0x0000FFFF;
5084 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
5086 const_vec = __msa_fill_w((128 * weight1));
5088 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
5089 weight_vec = (v8i16) __msa_fill_w(
weight);
5090 rnd_vec = __msa_fill_w(rnd_val + 1);
5091 offset_vec += const_vec;
/* Two rows of 16-bit samples. */
5096 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Rows 3 and 4 are each shuffled against themselves. */
5101 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5102 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
/* Shift right by 6 bits, then pack each r/l pair into one halfword vector. */
5118 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5119 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
/* Signed dot product of each halfword pair with the weight pair,
 * accumulated on top of offset_vec. */
5124 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5125 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5126 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5127 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
/* Rounding right shift by the count held in rnd_vec (rnd_val + 1), then
 * narrow to halfwords and finally to bytes. */
5128 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5129 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5131 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5137 const int16_t *src1_ptr,
5141 const int8_t *filter_x,
5142 const int8_t *filter_y,
5153 v16i8
src0,
src1,
src2, src3, src4, src5, src6, mask0, mask1;
5154 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5155 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5156 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5157 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5158 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5159 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5160 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5161 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5162 v4i32 offset_vec, rnd_vec, const_vec;
/* Variant taking separate filter_x and filter_y coefficient pointers; it
 * loops width8mult times and each pass reads four rows of 16-bit samples.
 * Only part of the body is visible in this excerpt. */
/* Begin one row above and one byte to the left of the given position. */
5164 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
5166 filter_vec =
LD_SH(filter_x);
5169 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
5177 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
5178 weight0 = weight0 & 0x0000FFFF;
5179 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
5181 const_vec = __msa_fill_w((128 * weight1));
5183 offset_vec = __msa_fill_w(
offset);
5184 rnd_vec = __msa_fill_w(rnd_val + 1);
5185 offset_vec += const_vec;
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
5186 weight_vec = (v8i16) __msa_fill_w(
weight);
/* One pass per unit of width8mult; the per-pass pointer advances are not
 * in this excerpt. */
5188 for (cnt = width8mult; cnt--;) {
5193 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* Rows 3..6 are each shuffled against themselves. */
5207 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5208 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5209 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5210 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
/* Shift right by 6 bits, then pack each r/l pair into one vector. */
5231 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5232 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5233 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5234 dst3_r, dst0, dst1, dst2, dst3);
/* Signed dot product of each halfword pair with the weight pair,
 * accumulated on top of offset_vec. */
5240 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5241 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5242 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5243 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5244 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5245 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5246 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5247 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5250 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5251 tmp0, tmp1, tmp2, tmp3);
/* Doublewords 0 and 1 of out0 and out1 are stored through ST_D4. */
5254 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5261 const int16_t *src1_ptr,
5265 const int8_t *filter_x,
5266 const int8_t *filter_y,
5274 v16u8 out0, out1, out2;
5275 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
5277 v8i16 filt_h0, filt_h1;
5280 v8i16 filter_vec, weight_vec;
5281 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5282 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5283 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5284 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5285 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5286 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5287 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5288 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5289 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5290 v8i16 in0, in1, in2, in3, in4, in5;
5291 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5292 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5293 v4i32 offset_vec, rnd_vec, const_vec;
/* Variant taking separate filter_x and filter_y coefficient pointers; six
 * rows of 16-bit samples are read, no loop is visible, and the stores
 * address dst rows 0..3 and 4..5.  Only part of the body is visible. */
/* Begin one row above and one byte to the left of the given position. */
5295 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded from filter_x and later reloaded from filter_y; the
 * lines that consume each load are not in this excerpt. */
5297 filter_vec =
LD_SH(filter_x);
5300 filter_vec =
LD_SH(filter_y);
/* Both offsets are summed, then scaled up by rnd_val bits. */
5307 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies bits 0-15 and weight1 bits 16-31 of the packed word. */
5308 weight0 = weight0 & 0x0000FFFF;
5309 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is broadcast and folded into offset_vec below. */
5311 const_vec = __msa_fill_w((128 * weight1));
5313 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, every 32-bit lane holds the (weight0, weight1) pair. */
5314 weight_vec = (v8i16) __msa_fill_w(
weight);
5315 rnd_vec = __msa_fill_w(rnd_val + 1);
5316 offset_vec += const_vec;
/* Skip five rows (presumably loaded on a line not shown here), then read
 * the remaining four. */
5319 src0_ptr += (5 * src_stride);
5320 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
/* All six rows of 16-bit samples. */
5325 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
/* Rows 3..8 are each shuffled against themselves. */
5330 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5331 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5332 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5333 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5334 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5335 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
/* Shift every intermediate vector right by 6 bits. */
5369 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5370 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5371 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5372 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5373 dst0, dst1, dst2, dst3);
/* First group: signed dot product of each halfword pair with the weight
 * pair, accumulated on top of offset_vec. */
5379 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5380 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5381 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5382 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5383 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5384 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5385 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5386 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5389 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5390 tmp0, tmp1, tmp2, tmp3);
/* Second group: dst0..dst3 are reused for the dst4_*/dst5_* results. */
5394 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5397 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5398 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5399 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5400 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
/* Narrow to bytes; ST_D4 starts at dst row 0, ST_D2 starts at dst row 4. */
5404 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5405 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5406 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5411 const int16_t *src1_ptr,
5415 const int8_t *filter_x,
5416 const int8_t *filter_y,
5428 const uint8_t *src0_ptr_tmp;
5429 const int16_t *src1_ptr_tmp;
5433 v8i16 in0, in1, in2, in3;
5435 v8i16 filt_h0, filt_h1;
5439 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5440 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5441 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5442 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5443 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5444 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5445 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5446 v4i32 offset_vec, rnd_vec, const_vec;
5448 src0_ptr -= (src_stride + 1);
5450 filter_vec =
LD_SH(filter_x);
5453 filter_vec =
LD_SH(filter_y);
5460 offset = (offset0 + offset1) << rnd_val;
5461 weight0 = weight0 & 0x0000FFFF;
5462 weight = weight0 | (weight1 << 16);